## Load the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [3]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.shape

(768, 9)

In [5]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## Create a pipeline for Scaling and applying Logistic Regression

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline
pipe = make_pipeline(MinMaxScaler(), LogisticRegression())
pipe

## Seperate X and y from the data

In [7]:
X = data.drop('Outcome', axis = 1)
y = data['Outcome']

## Split the data into train test sets

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## Apply pipeline on X_train and y_train

In [9]:
pipe.fit(X_train, y_train)

## Perform predictions

In [10]:
y_pred = pipe.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [11]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7662337662337663

## I want to check whether 76.6% is the highest accuracy possible on this data?

In [15]:
from sklearn.model_selection import cross_val_score
log_reg_scores = cross_val_score(LogisticRegression(), X, y, cv = 5)
log_reg_scores

array([0.77272727, 0.74675325, 0.75974026, 0.81699346, 0.75816993])

## Observation:

- Using cross_val_score function we create 5 sets of train and test set to check which train and test set produces the highest accuracy
- We observed that the 4th set produces the highest accuracy

## I want ot know whether LogisticRegression is best suitable for this data or KNN is best suitable for this data
(Model Selection using cross_val_score)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn_scores = cross_val_score(KNeighborsClassifier(), X, y, cv = 5)
knn_scores

array([0.72727273, 0.72727273, 0.7012987 , 0.75816993, 0.70588235])

In [17]:
log_reg_scores.mean()

0.7708768355827178

In [18]:
knn_scores.mean()

0.723979288685171

## Observations:

1. Looking at the mean accuracy scores of LogisticRegression and KNN, it is clear that logreg is having highest mean score. Hence logreg is best suitable for diabetes data

## Hyperparameter Optimization using GridSearchCV

In [39]:
parameters_list = {'n_neighbors' : list(range(10,20)), 'p' : [1, 2]}
parameters_list

{'n_neighbors': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'p': [1, 2]}

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
gscv = GridSearchCV(KNeighborsClassifier(), parameters_list, cv = 5)
gscv

In [25]:
gscv.fit(X_train, y_train)

In [26]:
gscv.cv_results_

{'mean_fit_time': array([0.0013443 , 0.00079846, 0.00311656, 0.00197244, 0.00232801,
        0.0022213 , 0.00215802, 0.00209289, 0.00163598, 0.00286369,
        0.00313492, 0.00224195, 0.00344605, 0.00287895, 0.00301485,
        0.00226202, 0.00233574, 0.00242667, 0.00204048, 0.00366716,
        0.00237875, 0.00203938, 0.00310802, 0.00299826, 0.00202732,
        0.00264125, 0.00304809, 0.00187454, 0.00324373, 0.0027328 ,
        0.00276771, 0.00203094, 0.00363369, 0.00240636, 0.00185747,
        0.00215869, 0.00210752, 0.00352211, 0.00279999, 0.00282478]),
 'std_fit_time': array([0.0002955 , 0.00039925, 0.00061901, 0.00042875, 0.00041898,
        0.00132069, 0.00077063, 0.00069834, 0.00047023, 0.00111964,
        0.00131095, 0.00085816, 0.00237053, 0.00100142, 0.00071315,
        0.00082061, 0.00044672, 0.00038176, 0.0006926 , 0.00328776,
        0.00083948, 0.00073394, 0.00102416, 0.00103849, 0.00066018,
        0.00080539, 0.00120798, 0.00081233, 0.00081475, 0.00151186,
        0.000

In [28]:
res = pd.DataFrame(gscv.cv_results_)
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001344,0.000296,0.0,0.0,0,1,"{'n_neighbors': 0, 'p': 1}",,,,,,,,39
1,0.000798,0.000399,0.0,0.0,0,2,"{'n_neighbors': 0, 'p': 2}",,,,,,,,39
2,0.003117,0.000619,0.006273,0.001091,1,1,"{'n_neighbors': 1, 'p': 1}",0.731481,0.75,0.654206,0.64486,0.700935,0.696296,0.041379,36
3,0.001972,0.000429,0.004448,0.000595,1,2,"{'n_neighbors': 1, 'p': 2}",0.675926,0.75,0.691589,0.654206,0.654206,0.685185,0.035347,38
4,0.002328,0.000419,0.005014,0.000639,2,1,"{'n_neighbors': 2, 'p': 1}",0.722222,0.722222,0.672897,0.682243,0.719626,0.703842,0.021674,34
5,0.002221,0.001321,0.004796,0.000789,2,2,"{'n_neighbors': 2, 'p': 2}",0.731481,0.703704,0.663551,0.700935,0.691589,0.698252,0.021862,35
6,0.002158,0.000771,0.00457,0.000621,3,1,"{'n_neighbors': 3, 'p': 1}",0.712963,0.712963,0.71028,0.64486,0.738318,0.703877,0.031226,33
7,0.002093,0.000698,0.004224,0.000818,3,2,"{'n_neighbors': 3, 'p': 2}",0.675926,0.675926,0.672897,0.691589,0.719626,0.687193,0.01749,37
8,0.001636,0.00047,0.00514,0.000651,4,1,"{'n_neighbors': 4, 'p': 1}",0.740741,0.740741,0.719626,0.672897,0.700935,0.714988,0.025757,30
9,0.002864,0.00112,0.006031,0.002371,4,2,"{'n_neighbors': 4, 'p': 2}",0.740741,0.712963,0.672897,0.672897,0.757009,0.711301,0.034376,32


In [29]:
res[['param_n_neighbors', 'param_p', 'mean_test_score']]

Unnamed: 0,param_n_neighbors,param_p,mean_test_score
0,0,1,
1,0,2,
2,1,1,0.696296
3,1,2,0.685185
4,2,1,0.703842
5,2,2,0.698252
6,3,1,0.703877
7,3,2,0.687193
8,4,1,0.714988
9,4,2,0.711301


## Observations:

- The optimum value of k is 11
- The optimum value of distance (p) is 1 (Manhattan distance)

- The accuracy is high when p = 1 (Manhattan distance is best suitable distance metric for this data)

## Better Alternative to GridSearchCV - RandomizedSearchCV

In [32]:
from sklearn.model_selection import RandomizedSearchCV
rscv = RandomizedSearchCV(KNeighborsClassifier(), parameters_list, cv = 5)
rscv

In [33]:
rscv.fit(X_train, y_train)

In [34]:
rscv.cv_results_

{'mean_fit_time': array([0.00287862, 0.0020308 , 0.00223246, 0.00268302, 0.00229077,
        0.00210328, 0.00170345, 0.00232034, 0.00218878, 0.00369515]),
 'std_fit_time': array([0.00164767, 0.00065277, 0.00080374, 0.0013915 , 0.00068601,
        0.000666  , 0.00040135, 0.0008939 , 0.0001927 , 0.00120621]),
 'mean_score_time': array([0.0060184 , 0.00618539, 0.00479698, 0.00503283, 0.00522752,
        0.0040144 , 0.00443039, 0.00591116, 0.00487285, 0.00630326]),
 'std_score_time': array([0.00125645, 0.00127222, 0.00028859, 0.00139843, 0.00153427,
        0.0003372 , 0.00055055, 0.00170748, 0.00094573, 0.00148913]),
 'param_p': masked_array(data=[2, 1, 2, 1, 2, 2, 2, 2, 2, 1],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_neighbors': masked_array(data=[5, 13, 18, 2, 10, 11, 16, 3, 2, 1],
              mask=[False, False, False, False, False, False, False, False,


In [36]:
res = pd.DataFrame(rscv.cv_results_)
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_p,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002879,0.001648,0.006018,0.001256,2,5,"{'p': 2, 'n_neighbors': 5}",0.731481,0.712963,0.700935,0.616822,0.794393,0.711319,0.05718,6
1,0.002031,0.000653,0.006185,0.001272,1,13,"{'p': 1, 'n_neighbors': 13}",0.75,0.731481,0.785047,0.691589,0.775701,0.746764,0.03344,1
2,0.002232,0.000804,0.004797,0.000289,2,18,"{'p': 2, 'n_neighbors': 18}",0.731481,0.75,0.738318,0.700935,0.775701,0.739287,0.024387,3
3,0.002683,0.001392,0.005033,0.001398,1,2,"{'p': 1, 'n_neighbors': 2}",0.722222,0.722222,0.672897,0.682243,0.719626,0.703842,0.021674,7
4,0.002291,0.000686,0.005228,0.001534,2,10,"{'p': 2, 'n_neighbors': 10}",0.694444,0.75,0.785047,0.682243,0.775701,0.737487,0.041913,4
5,0.002103,0.000666,0.004014,0.000337,2,11,"{'p': 2, 'n_neighbors': 11}",0.731481,0.740741,0.794393,0.663551,0.794393,0.744912,0.048402,2
6,0.001703,0.000401,0.00443,0.000551,2,16,"{'p': 2, 'n_neighbors': 16}",0.722222,0.722222,0.719626,0.728972,0.747664,0.728141,0.010241,5
7,0.00232,0.000894,0.005911,0.001707,2,3,"{'p': 2, 'n_neighbors': 3}",0.675926,0.675926,0.672897,0.691589,0.719626,0.687193,0.01749,10
8,0.002189,0.000193,0.004873,0.000946,2,2,"{'p': 2, 'n_neighbors': 2}",0.731481,0.703704,0.663551,0.700935,0.691589,0.698252,0.021862,8
9,0.003695,0.001206,0.006303,0.001489,1,1,"{'p': 1, 'n_neighbors': 1}",0.731481,0.75,0.654206,0.64486,0.700935,0.696296,0.041379,9


In [37]:
res[['param_p', 'param_n_neighbors', 'mean_test_score']]

Unnamed: 0,param_p,param_n_neighbors,mean_test_score
0,2,5,0.711319
1,1,13,0.746764
2,2,18,0.739287
3,1,2,0.703842
4,2,10,0.737487
5,2,11,0.744912
6,2,16,0.728141
7,2,3,0.687193
8,2,2,0.698252
9,1,1,0.696296
