# Implement the K-Nearest Neighbors algorithm on the diabetes.csv dataset. Compute the confusion matrix, accuracy, error rate, precision, and recall on the given dataset.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the diabetes dataset from a CSV file (path is relative to the working directory).
data = pd.read_csv('7458_diabetes.csv')
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Count null (NaN/None) entries in every column.
# Note: this only detects true nulls. Zeros used as placeholders for missing
# readings (e.g. the 0 values visible in the Insulin column) are not counted here.
data.isna().sum()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Pedigree         0
Age              0
Outcome          0
dtype: int64

In [4]:
# Inspect a larger sample of rows. Some measurement columns contain 0 where it
# appears to stand in for a missing reading (e.g. BloodPressure, SkinThickness,
# Insulin and BMI in the rows below).
# Those zeros are handled later, after the train/test split, so that the fill
# values used for imputation are learned from the training rows only.
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [5]:
# Separate predictors from the label by column position:
#   X -> every column except the last (2-D DataFrame of features)
#   Y -> the last column only (1-D Series of labels)
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

# Sanity-check the dimensions of both pieces
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: (768, 8)
Y shape: (768,)


In [6]:
from sklearn.model_selection import train_test_split

# Split the feature DataFrame X and the target Series Y into train and test parts.
# Four objects come back, in this order: train features, test features,
# train labels, test labels.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,    # hold out 20% of the rows for testing
    random_state=42,  # fixed seed so the split is reproducible
    stratify=Y        # keep the class proportions of Y the same in both parts
)

In [7]:
# --- Preprocess the features, then fit the KNN classifier and predict ---
#
# This cell expects X_train, X_test and Y_train to exist already; they are
# produced by the stratified train_test_split cell above. The split is
# deliberately not repeated here, so its parameters (test_size, random_state,
# stratify) are defined in exactly one place.

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 1. Columns where a value of 0 is treated as a missing measurement
cols_with_zero_nan = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# 2. Mark those zeros as NaN so the imputer below can fill them.
# Work on copies so the assignment never writes into a frame that may share
# data with X (this also avoids a SettingWithCopyWarning).
# The replacement is vectorized over all listed columns at once, and it is
# idempotent: re-running this cell leaves already-converted frames unchanged.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[cols_with_zero_nan] = X_train[cols_with_zero_nan].replace(0, np.nan)
X_test[cols_with_zero_nan] = X_test[cols_with_zero_nan].replace(0, np.nan)

# 3. Preprocessing pipeline: fill NaNs with the column mean, then standardize
# every feature (KNN is distance-based, so features need a comparable scale).
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# 4. Fit the preprocessing on the training rows ONLY, then apply the same
# fitted transformation to the test rows. This keeps test-set statistics
# (means, standard deviations) out of the model.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 5. Train the KNN model on the processed training data
knn = KNeighborsClassifier(n_neighbors=5)  # 5 is also the scikit-learn default
knn.fit(X_train_processed, Y_train)

# 6. Predict labels for the processed test data
knn_pred = knn.predict(X_test_processed)

In [8]:
# --- Evaluate the KNN predictions against the held-out test labels ---

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of true labels (Y_test) versus predicted labels (knn_pred)
cm = confusion_matrix(Y_test, knn_pred)

# Scalar metrics computed from the same pair of label vectors
acc = accuracy_score(Y_test, knn_pred)
prec = precision_score(Y_test, knn_pred)
rec = recall_score(Y_test, knn_pred)
f1 = f1_score(Y_test, knn_pred)
err_rate = 1 - acc  # error rate is the complement of accuracy

# Emit the whole report with a single print call, one item per line
print(
    "--- KNN Model Evaluation ---",
    "Confusion Matrix:",
    cm,
    f"Accuracy Score:  {acc:.4f}",
    f"Error Rate:      {err_rate:.4f}",
    f"Precision Score: {prec:.4f}",
    f"Recall Score:    {rec:.4f}",
    f"F1 Score:        {f1:.4f}",
    sep="\n",
)

--- KNN Model Evaluation ---
Confusion Matrix:
[[80 20]
 [20 34]]
Accuracy Score:  0.7403
Error Rate:      0.2597
Precision Score: 0.6296
Recall Score:    0.6296
F1 Score:        0.6296
