# Training KNN Model

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [12]:
# Download the iris dataset from a public gist.
# header=None keeps the CSV's own header line as data row 0; the feature/target
# cell skips that row with iloc[1:].
data_url = "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
try:
    df = pd.read_csv(data_url, header=None)
except OSError as err:
    # A failed URL download raises urllib's URLError/HTTPError (both OSError
    # subclasses), not FileNotFoundError. Re-raise so execution stops here
    # instead of continuing with `df` undefined or left over from an earlier run.
    raise RuntimeError(f"Error: could not load the iris dataset from {data_url}") from err

In [14]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,0,1,2,3,4
0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa


In [17]:
# Separate features (X) and target (y).
# Row 0 holds the column names (the CSV was read with header=None), so it is
# skipped. Because that header row was parsed as data, the measurement columns
# hold strings; cast them to float explicitly rather than relying on
# scikit-learn's implicit conversion at fit time.
X = df.iloc[1:, :-1].astype(float)  # All columns except the last one (species)
y = df.iloc[1:, -1]                 # The last column (species)

In [18]:
# Map the species names to integer class ids.
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 30% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Build the 5-nearest-neighbours classifier; fit() returns the estimator itself.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Score the predictions. Precision, recall and F1 all use the weighted average
# because this is a multiclass problem.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report every metric, one per line.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Accuracy: 0.9777777777777777
Precision: 0.9796296296296295
Recall: 0.9777777777777777
F1 Score: 0.9779434092477569
Confusion Matrix:
[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]


In [19]:
# prompt: train a KNN on a credit score problem

# Download the credit-risk dataset. Everything this section needs (pandas,
# train_test_split, StandardScaler, KNeighborsClassifier and the metric helpers)
# is imported in the import cell at the top of the notebook.
data_url = "https://raw.githubusercontent.com/k8xu/credit-risk-classification/refs/heads/master/credit_data.csv"
try:
    df = pd.read_csv(data_url)
except OSError as err:
    # A failed URL download raises urllib's URLError/HTTPError (both OSError
    # subclasses), not FileNotFoundError. Raise instead of calling exit(),
    # which would shut down the notebook kernel rather than report the failure.
    raise RuntimeError(f"Error: could not load the credit dataset from {data_url}") from err


In [21]:
# Preprocessing: 'Risk' is the target variable; every other column is a feature.
X = df.drop('Risk', axis=1)
y = df['Risk']

# Convert categorical features to numerical using one-hot encoding (if any)
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets BEFORE scaling, so that no
# statistic computed from the test rows can influence training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (important for KNN, which is distance-based).
# The scaler learns its mean/variance from the training rows only and the same
# transform is then applied to the test rows. `X` itself stays unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values for n_neighbors
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Evaluate the model: overall accuracy, per-class report, confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.64
              precision    recall  f1-score   support

         bad       0.30      0.17      0.22        59
        good       0.71      0.84      0.77       141

    accuracy                           0.64       200
   macro avg       0.50      0.50      0.49       200
weighted avg       0.59      0.64      0.60       200

[[ 10  49]
 [ 23 118]]
