7. Build a KNN model for predicting if a person will have diabetes or not with a high accuracy
score. Perform some appropriate Pre-Processing steps on the given dataset for better results.
Implement the KNN algorithm on your own. Try other possible processes that can be done to
dataset and tuning the model to increase accuracy such as Increase K value, Normalization and
Different Distance Metrics.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load dataset
data = load_diabetes()
X = data.data
y = (data.target > 140).astype(int)  #convert to classification as dataset is for regression
df = pd.DataFrame(X,y)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
0,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
1,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
1,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
0,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [3]:
#Preprocessing

# Normalize
scaler = MinMaxScaler()
X = scaler.fit_transform(X)


In [4]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Custom KNN Implementation
def compute_distance(x1, x2, metric):
    if metric == 'euclidean':
        return np.sqrt(np.sum((x1 - x2) ** 2))
    elif metric == 'manhattan':
        return np.sum(np.abs(x1 - x2))
    else:
        raise ValueError("Unsupported distance metric")

def knn_predict(X_train, y_train, X_test, k=5, metric='euclidean'):
    predictions = []
    for test_point in X_test:
        # Compute distances to all training points
        distances = [compute_distance(test_point, x, metric) for x in X_train]
        # Get indices of k-nearest neighbors
        k_indices = np.argsort(distances)[:k]
        # Get the labels of the k-nearest neighbors
        k_labels = [y_train[i] for i in k_indices]
        # Majority vote
        predicted_label = int(np.round(np.mean(k_labels)))
        predictions.append(predicted_label)
    return np.array(predictions)


In [8]:
for k in [3, 5, 7, 9, 11]:
    for metric in ['euclidean', 'manhattan']:
        y_pred = knn_predict(X_train, y_train, X_test, k=k, metric=metric)
        acc = accuracy_score(y_test, y_pred)
        print(f"K = {k}, Metric = {metric}, Accuracy = {acc:.4f}")


K = 3, Metric = euclidean, Accuracy = 0.7191
K = 3, Metric = manhattan, Accuracy = 0.7191
K = 5, Metric = euclidean, Accuracy = 0.7079
K = 5, Metric = manhattan, Accuracy = 0.7416
K = 7, Metric = euclidean, Accuracy = 0.7191
K = 7, Metric = manhattan, Accuracy = 0.7079
K = 9, Metric = euclidean, Accuracy = 0.7079
K = 9, Metric = manhattan, Accuracy = 0.6966
K = 11, Metric = euclidean, Accuracy = 0.6966
K = 11, Metric = manhattan, Accuracy = 0.7191
