In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 
        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, header=None, names=cols)

print("Initial Data Sample:\n", df.head())


Initial Data Sample:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [2]:
# Replace 0 with NaN for certain columns
to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[to_replace] = df[to_replace].replace(0, np.nan)

# Fill missing values with median
df.fillna(df.median(numeric_only=True), inplace=True)

# Normalize
scaler = MinMaxScaler()
features = df.drop('Outcome', axis=1)
X = scaler.fit_transform(features)
y = df['Outcome'].values

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

def manhattan_distance(a, b):
    return np.sum(np.abs(a - b))

def knn_predict(X_train, y_train, x_test, k=5, metric='euclidean'):
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(X_train[i], x_test) if metric == 'euclidean' else manhattan_distance(X_train[i], x_test)
        distances.append((dist, y_train[i]))
    distances.sort(key=lambda x: x[0])
    k_nearest = [label for (_, label) in distances[:k]]
    most_common = Counter(k_nearest).most_common(1)[0][0]
    return most_common

def knn_model(X_train, y_train, X_test, k=5, metric='euclidean'):
    predictions = []
    for x in X_test:
        predictions.append(knn_predict(X_train, y_train, x, k=k, metric=metric))
    return np.array(predictions)


In [4]:
# Try with default K=5 and Euclidean distance
y_pred = knn_model(X_train, y_train, X_test, k=5, metric='euclidean')
print(f"Accuracy (K=5, Euclidean): {accuracy_score(y_test, y_pred):.4f}")


Accuracy (K=5, Euclidean): 0.7468


In [5]:
for k in [3, 5, 7, 9, 11]:
    for metric in ['euclidean', 'manhattan']:
        y_pred = knn_model(X_train, y_train, X_test, k=k, metric=metric)
        acc = accuracy_score(y_test, y_pred)
        print(f"K={k}, Metric={metric}, Accuracy={acc:.4f}")


K=3, Metric=euclidean, Accuracy=0.7208
K=3, Metric=manhattan, Accuracy=0.7208
K=5, Metric=euclidean, Accuracy=0.7468
K=5, Metric=manhattan, Accuracy=0.7338
K=7, Metric=euclidean, Accuracy=0.7468
K=7, Metric=manhattan, Accuracy=0.7532
K=9, Metric=euclidean, Accuracy=0.7597
K=9, Metric=manhattan, Accuracy=0.7468
K=11, Metric=euclidean, Accuracy=0.7468
K=11, Metric=manhattan, Accuracy=0.7403
