In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
url = "https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/diabetes.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

In [4]:
# Replacing zero values with NaN values

columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)


# In order to make it easy to replace NaN with median values
# I can't delete zero values because they are too a lot.



In [5]:
df.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [6]:
for column in columns_to_replace:
    df[column].fillna(df[column].median(), inplace=True)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47,1


In [8]:
# Making sure that it isn't happening overfitting or underfitting
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [9]:
# Balance the values
from sklearn.utils import resample

majority_class = df[df['Outcome']==0]
minority_class = df[df['Outcome']==1]

minority_class_upsampled = resample(minority_class,
                                   replace=True,
                                   n_samples=len(majority_class),
                                   random_state=42
                                   )
df_ = pd.concat([majority_class, minority_class_upsampled])
df_

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
5,5,116.0,74.0,29.0,125.0,25.6,0.201,30,0
7,10,115.0,72.0,29.0,125.0,35.3,0.134,29,0
10,4,110.0,92.0,29.0,125.0,37.6,0.191,30,0
...,...,...,...,...,...,...,...,...,...
612,7,168.0,88.0,42.0,321.0,38.2,0.787,40,1
586,8,143.0,66.0,29.0,125.0,34.9,0.129,41,1
730,3,130.0,78.0,23.0,79.0,28.4,0.323,34,1
664,6,115.0,60.0,39.0,125.0,33.7,0.245,40,1


In [10]:
df_['Outcome'].value_counts()
print(df_.shape)

(1000, 9)


In [11]:
corr = df_.corr().abs()
corr

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.150677,0.182001,0.05368,0.053915,0.024151,0.066229,0.55226,0.250498
Glucose,0.150677,1.0,0.239686,0.170898,0.414768,0.198606,0.124863,0.262145,0.478401
BloodPressure,0.182001,0.239686,1.0,0.212028,0.061421,0.290994,0.05274,0.303678,0.199252
SkinThickness,0.05368,0.170898,0.212028,1.0,0.230157,0.556992,0.076089,0.073992,0.230109
Insulin,0.053915,0.414768,0.061421,0.230157,1.0,0.184402,0.098286,0.096184,0.209262
BMI,0.024151,0.198606,0.290994,0.556992,0.184402,1.0,0.0909,0.016587,0.28487
DiabetesPedigreeFunction,0.066229,0.124863,0.05274,0.076089,0.098286,0.0909,1.0,0.09568,0.143301
Age,0.55226,0.262145,0.303678,0.073992,0.096184,0.016587,0.09568,1.0,0.265366
Outcome,0.250498,0.478401,0.199252,0.230109,0.209262,0.28487,0.143301,0.265366,1.0


In [12]:
df_.corrwith(df_['Outcome']).abs().sort_values(ascending=False)

Outcome                     1.000000
Glucose                     0.478401
BMI                         0.284870
Age                         0.265366
Pregnancies                 0.250498
SkinThickness               0.230109
Insulin                     0.209262
BloodPressure               0.199252
DiabetesPedigreeFunction    0.143301
dtype: float64

In [13]:
X = df_.drop(columns="Outcome")
y = df_['Outcome']

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# I want to use k-NN, this will be good practise
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [17]:
y_pred = knn.predict(X_train)

Now let's measure jaccard, confusion_matrix, MSE and RMSE scores

In [18]:
from sklearn.metrics import jaccard_score, confusion_matrix, mean_squared_error, root_mean_squared_error

jaccard = jaccard_score(y_train, y_pred)
confusion = confusion_matrix(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print(f"{jaccard=}")
print(f"{confusion=}")
print(f"{mse=}")
print(f"{rmse=}")

jaccard=0.7259713701431493
confusion=array([[311,  91],
       [ 43, 355]], dtype=int64)
mse=0.1675
rmse=0.40926763859362253


In [22]:
from sklearn.metrics import classification_report

clasrep = classification_report(y_train, y_pred)
print(clasrep)

              precision    recall  f1-score   support

           0       0.88      0.77      0.82       402
           1       0.80      0.89      0.84       398

    accuracy                           0.83       800
   macro avg       0.84      0.83      0.83       800
weighted avg       0.84      0.83      0.83       800



In [23]:
from sklearn.model_selection import cross_val_predict
predict = cross_val_predict(estimator=knn, X = X, y=y, cv=5)

print("Cross Validation Report", classification_report(y, predict))

Cross Validation Report               precision    recall  f1-score   support

           0       0.78      0.70      0.74       500
           1       0.73      0.81      0.77       500

    accuracy                           0.75      1000
   macro avg       0.76      0.75      0.75      1000
weighted avg       0.76      0.75      0.75      1000



In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(1, 51)}

knn_gscv = GridSearchCV(knn, param_grid, cv=5)

knn_gscv.fit(X, y)

In [26]:
knn_gscv.cv_results_['rank_test_score']

array([ 1,  2,  3,  4,  7, 20,  6, 20, 12,  5, 10, 16, 20, 11, 12, 25, 18,
       42, 35, 18, 34, 42, 33, 20, 32, 30, 20,  9, 17, 14, 14,  8, 25, 39,
       27, 42, 38, 49, 46, 39, 46, 46, 39, 30, 49, 27, 27, 37, 42, 35])

In [27]:
knn_gscv.best_params_

{'n_neighbors': 1}

In [28]:
knn_gscv.best_score_

0.8219999999999998