**Objective**: Predict whether a person will be diagnosed with diabetes, using a k-nearest-neighbours (K-NN) classifier.

In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the diabetes records into a DataFrame and report how many rows were loaded
dataset = pd.read_csv('diabetes.csv')
print(dataset.shape[0])

768


In [3]:
# Preview the first five rows of the loaded data
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Replacing zeros
# In these columns a zero is treated as a missing measurement, not a real value.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in zero_not_accepted:
    # Mark zeros as missing. `np.nan` is the supported spelling: the `np.NaN`
    # alias was removed in NumPy 2.0 and raises AttributeError there.
    observed = dataset[column].replace(0, np.nan)
    # The fill value is the mean of the non-missing entries, truncated to an
    # integer (kept as in the original analysis so downstream numbers match).
    mean = int(observed.mean(skipna=True))
    dataset[column] = observed.fillna(mean)

# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split performed later, so test rows contribute to the fill values.

In [5]:
# Mean of the Glucose column
dataset.loc[:, 'Glucose'].mean()

121.68229166666667

In [6]:
# Print the Glucose column (pandas abbreviates the middle rows)
glucose = dataset.loc[:, 'Glucose']
print(glucose)

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64


In [7]:
# Split the dataset
# Predictors are the first eight columns; the target is the column at position 8
X, y = dataset.iloc[:, :8], dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [8]:
# Row counts of the four pieces produced by the split, one per line
for part in (X_train, y_train, X_test, y_test):
    print(len(part))

614
614
154
154


In [9]:
# Feature scaling
# The scaler learns its statistics from the training features only and the same
# transformation is then applied to both feature sets. The targets are not scaled.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

In [10]:
# Square root of the number of test samples, shown as a rule-of-thumb guide
# for picking the number of neighbours k.
# (`math` is imported in the notebook's import cell.)
math.sqrt(len(y_test))

12.409673645990857

In [11]:
# Define the model: a K-NN classifier that looks at the 11 nearest
# neighbours, measured by Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

In [12]:
# Fit the model on the training features and their labels
classifier.fit(X=X_train, y=y_train)

In [13]:
# Predict a label for every row of the test set and display the result
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
# Evaluate the model: confusion matrix first, then the F1 score on the next line
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
test_f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(cm, test_f1, sep='\n')

[[94 13]
 [15 32]]
0.6956521739130436


In [15]:
# Accuracy of the predictions on the test set
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.8181818181818182


In [19]:
# Example individual's data, listed in the same order as the feature columns
individual_data = [2, 138, 62, 35, 0, 33.6, 0.127, 47]

# Convert the data to a numpy array and reshape it
# because the model expects a 2D array (one row per individual)
individual_data_np = np.array(individual_data).reshape(1, -1)

# Label the values with the feature column names. The scaler was fitted on a
# DataFrame, so handing it a bare array makes scikit-learn warn that the input
# has no valid feature names.
individual_df = pd.DataFrame(individual_data_np, columns=dataset.columns[0:8])

# Apply the same zero handling as the training data: in these columns a zero
# means "not measured" and was replaced by the truncated column mean.
# The dataset columns are already filled, and filling with the truncated mean
# leaves the truncated mean unchanged, so it can be recomputed here.
# NOTE(review): this relies on the zero-replacement cell having been run.
for column in zero_not_accepted:
    fill_value = int(dataset[column].mean())
    individual_df[column] = individual_df[column].replace(0, fill_value)

# We have scaled our data during training, so scale this new data as well
individual_data_np_scaled = sc_X.transform(individual_df)

# Make a prediction
prediction = classifier.predict(individual_data_np_scaled)

# Interpret the prediction (label 1 is reported as "at risk")
if prediction[0] == 1:
    print("The model predicts this individual is at risk of diabetes.")
else:
    print("The model predicts this individual is not at risk of diabetes.")


The model predicts this individual is at risk of diabetes.


