## Import and Download




In [150]:
import sys 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline

In [151]:
!unzip './data/Pima Indians Diabetes Database.zip' -d './content/Pima-Indians-Diabetes-Database'

Archive:  ./data/Pima Indians Diabetes Database.zip
  inflating: ./content/Pima-Indians-Diabetes-Database/Pima Indians Diabetes Database.csv  
  inflating: ./content/Pima-Indians-Diabetes-Database/__MACOSX/._Pima Indians Diabetes Database.csv  


In [152]:
train=pd.read_csv('./content/Pima-Indians-Diabetes-Database/Pima Indians Diabetes Database.csv', index_col=False)


# Preprocessing

## Analyzing

In [153]:
# credit: https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction. 

def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [154]:
train.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [155]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Number of times pregnant                                                  768 non-null    int64  
 1   Plasma glucose concentration a 2 hours in an oral glucose tolerance test  768 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                                          768 non-null    int64  
 3   Triceps skin fold thickness (mm)                                          768 non-null    int64  
 4   2-Hour serum insulin (mu U/ml)                                            768 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)                            768 non-null    float64
 6   Diabetes pedigree function                                         

In [156]:
train.isnull().sum()

Number of times pregnant                                                    0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    0
Diastolic blood pressure (mm Hg)                                            0
Triceps skin fold thickness (mm)                                            0
2-Hour serum insulin (mu U/ml)                                              0
Body mass index (weight in kg/(height in m)^2)                              0
Diabetes pedigree function                                                  0
Age (years)                                                                 0
Class variable (0 or 1)                                                     0
dtype: int64

In [157]:
train.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Although there is no missing Value in a sense that there is no Nan Value we can cleraly see that here 0 represent missing value in most categories. So we will need to change these 0 to Nan so we can see the amount of missing value more easily.

In [158]:
cols = ['Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', 
     '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 
     'Age (years)']
train[cols] = train[cols].replace(0,np.nan)


In [159]:
missing_values_table(train)

Your selected dataframe has 9 columns.
There are 5 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
2-Hour serum insulin (mu U/ml),374,48.7
Triceps skin fold thickness (mm),227,29.6
Diastolic blood pressure (mm Hg),35,4.6
Body mass index (weight in kg/(height in m)^2),11,1.4
Plasma glucose concentration a 2 hours in an oral glucose tolerance test,5,0.7


In [160]:
train["2-Hour serum insulin (mu U/ml)"].unique()

array([ nan,  94., 168.,  88., 543., 846., 175., 230.,  83.,  96., 235.,
       146., 115., 140., 110., 245.,  54., 192., 207.,  70., 240.,  82.,
        36.,  23., 300., 342., 304., 142., 128.,  38., 100.,  90., 270.,
        71., 125., 176.,  48.,  64., 228.,  76., 220.,  40., 152.,  18.,
       135., 495.,  37.,  51.,  99., 145., 225.,  49.,  50.,  92., 325.,
        63., 284., 119., 204., 155., 485.,  53., 114., 105., 285., 156.,
        78., 130.,  55.,  58., 160., 210., 318.,  44., 190., 280.,  87.,
       271., 129., 120., 478.,  56.,  32., 744., 370.,  45., 194., 680.,
       402., 258., 375., 150.,  67.,  57., 116., 278., 122., 545.,  75.,
        74., 182., 360., 215., 184.,  42., 132., 148., 180., 205.,  85.,
       231.,  29.,  68.,  52., 255., 171.,  73., 108.,  43., 167., 249.,
       293.,  66., 465.,  89., 158.,  84.,  72.,  59.,  81., 196., 415.,
       275., 165., 579., 310.,  61., 474., 170., 277.,  60.,  14.,  95.,
       237., 191., 328., 250., 480., 265., 193.,  7

# Treat Missings Values

## Deleting Rows / Listwise Deletion

Delete all row with missing values

In [161]:
train_delete_row = train.copy()
train_delete_row.dropna(inplace=True)
train_delete_row.isnull().sum()


Number of times pregnant                                                    0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    0
Diastolic blood pressure (mm Hg)                                            0
Triceps skin fold thickness (mm)                                            0
2-Hour serum insulin (mu U/ml)                                              0
Body mass index (weight in kg/(height in m)^2)                              0
Diabetes pedigree function                                                  0
Age (years)                                                                 0
Class variable (0 or 1)                                                     0
dtype: int64

## Replacing With Mean

In [162]:
def filter1(dataset):
    for col in dataset.columns:
        #print(dataset[col].dtypes)
        dataset[col].fillna(dataset[col].mean(),inplace=True)            

    return dataset

In [163]:
train_mean = train.copy()

train_mean = filter1(train_mean)
train_mean


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


In [164]:
missing_values_table(train_mean)

Your selected dataframe has 9 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


## Assigning An Unique Category

In [165]:
train_unique_categ = train.copy()
missing_values_table(train_unique_categ)


Your selected dataframe has 9 columns.
There are 5 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
2-Hour serum insulin (mu U/ml),374,48.7
Triceps skin fold thickness (mm),227,29.6
Diastolic blood pressure (mm Hg),35,4.6
Body mass index (weight in kg/(height in m)^2),11,1.4
Plasma glucose concentration a 2 hours in an oral glucose tolerance test,5,0.7


In [166]:
train_unique_categ["2-Hour serum insulin (mu U/ml)"].describe()

count    394.000000
mean     155.548223
std      118.775855
min       14.000000
25%       76.250000
50%      125.000000
75%      190.000000
max      846.000000
Name: 2-Hour serum insulin (mu U/ml), dtype: float64

In [167]:
cat = "2-Hour serum insulin (mu U/ml)"
train_unique_categ[cat] = train_unique_categ[cat].replace(np.nan,-1)

train_unique_categ.loc[ train_unique_categ[cat] < 0, cat] = 0
train_unique_categ.loc[(train_unique_categ[cat] >= 0) & (train_unique_categ[cat] <= 76.25), cat] = 1
train_unique_categ.loc[(train_unique_categ[cat] > 76.25) & (train_unique_categ[cat] <= 125), cat] = 2
train_unique_categ.loc[(train_unique_categ[cat] > 125) & (train_unique_categ[cat] <= 190), cat] = 3
train_unique_categ.loc[(train_unique_categ[cat] > 190) & (train_unique_categ[cat] <= 900), cat] = 4


In [168]:
cat = "Triceps skin fold thickness (mm)"

train_unique_categ[cat].describe()

count    541.000000
mean      29.153420
std       10.476982
min        7.000000
25%       22.000000
50%       29.000000
75%       36.000000
max       99.000000
Name: Triceps skin fold thickness (mm), dtype: float64

In [169]:
train_unique_categ[cat] = train_unique_categ[cat].replace(np.nan,-1)

train_unique_categ.loc[ train_unique_categ[cat] < 0, cat] = 0
train_unique_categ.loc[(train_unique_categ[cat] > 0) & (train_unique_categ[cat] <= 22), cat] = 1
train_unique_categ.loc[(train_unique_categ[cat] > 22) & (train_unique_categ[cat] <= 29), cat] = 2
train_unique_categ.loc[(train_unique_categ[cat] > 29) & (train_unique_categ[cat] <= 36), cat] = 3
train_unique_categ.loc[(train_unique_categ[cat] > 36) & (train_unique_categ[cat] <= 100), cat] = 4


In [170]:
cat = "Diastolic blood pressure (mm Hg)"

train_unique_categ[cat].describe()

count    733.000000
mean      72.405184
std       12.382158
min       24.000000
25%       64.000000
50%       72.000000
75%       80.000000
max      122.000000
Name: Diastolic blood pressure (mm Hg), dtype: float64

In [171]:
train_unique_categ[cat] = train_unique_categ[cat].replace(np.nan,-1)

train_unique_categ.loc[ train_unique_categ[cat] < 0, cat] = 0
train_unique_categ.loc[(train_unique_categ[cat] > 0) & (train_unique_categ[cat] <= 64), cat] = 1
train_unique_categ.loc[(train_unique_categ[cat] > 64) & (train_unique_categ[cat] <= 72), cat] = 2
train_unique_categ.loc[(train_unique_categ[cat] > 72) & (train_unique_categ[cat] <= 80), cat] = 3
train_unique_categ.loc[(train_unique_categ[cat] > 80) & (train_unique_categ[cat] <= 122), cat] = 4


In [172]:
cat = "Body mass index (weight in kg/(height in m)^2)"

train_unique_categ[cat].describe()

count    757.000000
mean      32.457464
std        6.924988
min       18.200000
25%       27.500000
50%       32.300000
75%       36.600000
max       67.100000
Name: Body mass index (weight in kg/(height in m)^2), dtype: float64

In [173]:
train_unique_categ[cat] = train_unique_categ[cat].replace(np.nan,-1)

train_unique_categ.loc[ train_unique_categ[cat] < 0, cat] = 0
train_unique_categ.loc[(train_unique_categ[cat] > 0) & (train_unique_categ[cat] <= 27.5), cat] = 1
train_unique_categ.loc[(train_unique_categ[cat] > 27.5) & (train_unique_categ[cat] <= 32.3), cat] = 2
train_unique_categ.loc[(train_unique_categ[cat] > 32.3) & (train_unique_categ[cat] <= 36.6), cat] = 3
train_unique_categ.loc[(train_unique_categ[cat] > 36.6) & (train_unique_categ[cat] <= 67.1), cat] = 4


In [174]:
cat = "Plasma glucose concentration a 2 hours in an oral glucose tolerance test"

train_unique_categ[cat].describe()

count    763.000000
mean     121.686763
std       30.535641
min       44.000000
25%       99.000000
50%      117.000000
75%      141.000000
max      199.000000
Name: Plasma glucose concentration a 2 hours in an oral glucose tolerance test, dtype: float64

In [175]:
train_unique_categ[cat] = train_unique_categ[cat].replace(np.nan,-1)

train_unique_categ.loc[ train_unique_categ[cat] < 0, cat] = 0
train_unique_categ.loc[(train_unique_categ[cat] > 0) & (train_unique_categ[cat] <= 99), cat] = 1
train_unique_categ.loc[(train_unique_categ[cat] > 99) & (train_unique_categ[cat] <= 117), cat] = 2
train_unique_categ.loc[(train_unique_categ[cat] > 117) & (train_unique_categ[cat] <= 141), cat] = 3
train_unique_categ.loc[(train_unique_categ[cat] > 141) & (train_unique_categ[cat] <= 199), cat] = 4


## Using Algorithms Which Support Missing Values | KNN Imputer 5 nearest neighbours

In [176]:
train_knn = train.copy()


In [177]:
missing_values_table(train_knn)

Your selected dataframe has 9 columns.
There are 5 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
2-Hour serum insulin (mu U/ml),374,48.7
Triceps skin fold thickness (mm),227,29.6
Diastolic blood pressure (mm Hg),35,4.6
Body mass index (weight in kg/(height in m)^2),11,1.4
Plasma glucose concentration a 2 hours in an oral glucose tolerance test,5,0.7


In [178]:
train_knn

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,,,30.1,0.349,47,1


In [179]:

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_knn = pd.DataFrame(scaler.fit_transform(train_knn), columns = train_knn.columns)
train_knn

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,0.352941,0.670968,0.489796,0.304348,,0.314928,0.234415,0.483333,1.0
1,0.058824,0.264516,0.428571,0.239130,,0.171779,0.116567,0.166667,0.0
2,0.470588,0.896774,0.408163,,,0.104294,0.253629,0.183333,1.0
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.000000,0.0
4,0.000000,0.600000,0.163265,0.304348,0.185096,0.509202,0.943638,0.200000,1.0
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.367742,0.530612,0.445652,0.199519,0.300613,0.039710,0.700000,0.0
764,0.117647,0.503226,0.469388,0.217391,,0.380368,0.111870,0.100000,0.0
765,0.294118,0.496774,0.489796,0.173913,0.117788,0.163599,0.071307,0.150000,0.0
766,0.058824,0.529032,0.367347,,,0.243354,0.115713,0.433333,1.0


In [180]:
imputer = KNNImputer(n_neighbors=5)
train_knn = pd.DataFrame(imputer.fit_transform(train_knn),columns = train_knn.columns)
train_knn

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,0.352941,0.670968,0.489796,0.304348,0.384856,0.314928,0.234415,0.483333,1.0
1,0.058824,0.264516,0.428571,0.239130,0.050721,0.171779,0.116567,0.166667,0.0
2,0.470588,0.896774,0.408163,0.273913,0.269231,0.104294,0.253629,0.183333,1.0
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.000000,0.0
4,0.000000,0.600000,0.163265,0.304348,0.185096,0.509202,0.943638,0.200000,1.0
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.367742,0.530612,0.445652,0.199519,0.300613,0.039710,0.700000,0.0
764,0.117647,0.503226,0.469388,0.217391,0.163462,0.380368,0.111870,0.100000,0.0
765,0.294118,0.496774,0.489796,0.173913,0.117788,0.163599,0.071307,0.150000,0.0
766,0.058824,0.529032,0.367347,0.210870,0.130288,0.243354,0.115713,0.433333,1.0


In [181]:
train_knn.isna().any()
train_knn

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,0.352941,0.670968,0.489796,0.304348,0.384856,0.314928,0.234415,0.483333,1.0
1,0.058824,0.264516,0.428571,0.239130,0.050721,0.171779,0.116567,0.166667,0.0
2,0.470588,0.896774,0.408163,0.273913,0.269231,0.104294,0.253629,0.183333,1.0
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.000000,0.0
4,0.000000,0.600000,0.163265,0.304348,0.185096,0.509202,0.943638,0.200000,1.0
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.367742,0.530612,0.445652,0.199519,0.300613,0.039710,0.700000,0.0
764,0.117647,0.503226,0.469388,0.217391,0.163462,0.380368,0.111870,0.100000,0.0
765,0.294118,0.496774,0.489796,0.173913,0.117788,0.163599,0.071307,0.150000,0.0
766,0.058824,0.529032,0.367347,0.210870,0.130288,0.243354,0.115713,0.433333,1.0


# Accuracy

In [182]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def model(train):
  values = train.values
  X = values[:,0:8]
  y = values[:,8]
  model = LinearDiscriminantAnalysis()
  kfold = KFold(n_splits=6)
  result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
  print(result.mean())

In [183]:
train_delete_row.shape

(392, 9)

In [184]:
# Logistic Regression :
print("logistic Regression: Replacing With Mean")

model(train_mean)
print("logistic Regression: Deleting Rows")
model(train_delete_row)
print("logistic Regression: Unique Category")
model(train_unique_categ)
print("logistic Regression: KNN")
model(train_knn)

logistic Regression: Replacing With Mean
0.7669270833333334
logistic Regression: Deleting Rows
0.7832944832944834
logistic Regression: Unique Category
0.7526041666666666
logistic Regression: KNN
0.7747395833333334
