In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.decomposition import PCA

In [3]:
# Read the diabetes dataset; the path is resolved relative to the working directory
DATA_PATH = 'diabetes.csv'
data = pd.read_csv(DATA_PATH)

In [15]:
# Preview the first rows (up to 20) of the raw frame to sanity-check columns and values.
# NOTE(review): this cell's execution count (15) is out of sequence with the cells
# around it, so it appears to have been run after the rest - confirm the notebook
# still works with Restart Kernel -> Run All.
# NOTE(review): the preview shows zeros in BloodPressure, SkinThickness, Insulin and
# BMI; presumably these are missing-value placeholders - verify before modelling.
data.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(data.drop('Outcome', axis=1))
y = data['Outcome']

In [5]:
# Dimensionality Reduction using PCA
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)


In [7]:
# Hyperparameter Tuning using GridSearchCV
# 5-fold cross-validated search over the neighbourhood size and the distance metric.
# 'minkowski' is deliberately not listed: KNeighborsClassifier uses p=2 by default,
# which makes minkowski the same metric as 'euclidean', so listing both only
# repeats every fit without adding a distinct candidate.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'metric': ['euclidean', 'manhattan']
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Winning parameter combination (dict with 'n_neighbors' and 'metric')
best_params = grid_search.best_params_

In [8]:
# Best KNN Model
# GridSearchCV is created with its default refit=True, so after fit() it already
# holds the best configuration refitted on the full training split. Reusing it
# avoids a second, identical fit and keeps the model in sync with every
# hyperparameter in the grid instead of a hand-copied subset.
classifier = grid_search.best_estimator_

In [9]:
# Predict the test set results
# One predicted class label per row of X_test; these are compared against y_test
# in the evaluation cell.
y_pred = classifier.predict(X_test)

In [10]:
# Score the held-out predictions: confusion matrix first, then the scalar metrics
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

# F1 uses sklearn's default binary setting (positive class = 1)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

Confusion Matrix:
[[78 21]
 [28 27]]
F1 Score: 0.5242718446601942
Accuracy: 0.6818181818181818


In [11]:
# Example Prediction
# Assuming you want to check if a person with certain health metrics has diabetes
# (these values match row 5 of the data preview, whose recorded Outcome is 0)
example_patient = [[5, 116, 74, 0, 0, 25.6, 0.201, 30]]  # Example feature values

# Wrap the raw values in a DataFrame that carries the training column names.
# The scaler was fitted on a named DataFrame, so passing a bare list makes sklearn
# warn about missing feature names; this also raises immediately if the number of
# values does not match the number of feature columns.
feature_names = data.drop('Outcome', axis=1).columns
example_patient_df = pd.DataFrame(example_patient, columns=feature_names)

# Apply the same fitted preprocessing chain the model was trained with
example_patient_scaled = scaler.transform(example_patient_df)
example_patient_pca = pca.transform(example_patient_scaled)
example_prediction = classifier.predict(example_patient_pca)

# The classifier returns one label per input row; 1 is the positive (diabetes) class
if example_prediction[0] == 1:
    print("The person has diabetes.")
else:
    print("The person does not have diabetes.")


The person does not have diabetes.


