# Import Libs

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from google.colab import drive

drive.mount('/content/gdrive')
print(tf.version)

Mounted at /content/gdrive
<module 'tensorflow._api.v2.version' from '/usr/local/lib/python3.7/dist-packages/tensorflow/_api/v2/version/__init__.py'>


# Data Preprocessing

In [None]:
dataset = pd.read_csv('gdrive/My Drive/Dataset/diabetes.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
dataset.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [None]:
cor_matrix = dataset.corr().abs()
print(cor_matrix)

                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.000000  0.129459       0.141282       0.081672   
Glucose                      0.129459  1.000000       0.152590       0.057328   
BloodPressure                0.141282  0.152590       1.000000       0.207371   
SkinThickness                0.081672  0.057328       0.207371       1.000000   
Insulin                      0.073535  0.331357       0.088933       0.436783   
BMI                          0.017683  0.221071       0.281805       0.392573   
DiabetesPedigreeFunction     0.033523  0.137337       0.041265       0.183928   
Age                          0.544341  0.263514       0.239528       0.113970   
Outcome                      0.221898  0.466581       0.065068       0.074752   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies               0.073535  0.017683                  0.033523   
Glucose                   0.331357  0.221

In [None]:
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in zero_not_accepted:
  dataset[column] = dataset[column].replace(0, np.NaN)
  mean = int(dataset[column].mean(skipna=True))
  dataset[column] = dataset[column].replace(np.NaN, mean)

print(dataset)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6    148.0           72.0           35.0    155.0  33.6   
1              1     85.0           66.0           29.0    155.0  26.6   
2              8    183.0           64.0           29.0    155.0  23.3   
3              1     89.0           66.0           23.0     94.0  28.1   
4              0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10    101.0           76.0           48.0    180.0  32.9   
764            2    122.0           70.0           27.0    155.0  36.8   
765            5    121.0           72.0           23.0    112.0  26.2   
766            1    126.0           60.0           29.0    155.0  30.1   
767            1     93.0           70.0           31.0    155.0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [None]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
print(X)
print(y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6    148.0           72.0           35.0    155.0  33.6   
1              1     85.0           66.0           29.0    155.0  26.6   
2              8    183.0           64.0           29.0    155.0  23.3   
3              1     89.0           66.0           23.0     94.0  28.1   
4              0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10    101.0           76.0           48.0    180.0  32.9   
764            2    122.0           70.0           27.0    155.0  36.8   
765            5    121.0           72.0           23.0    112.0  26.2   
766            1    126.0           60.0           29.0    155.0  30.1   
767            1     93.0           70.0           31.0    155.0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(X_test)

[[0.41176471 0.68831169 0.55102041 ... 0.34764826 0.26216909 0.55      ]
 [0.23529412 0.34415584 0.36734694 ... 0.20449898 0.1558497  0.01666667]
 [0.         0.78571429 0.67346939 ... 0.69734151 0.14901793 0.03333333]
 ...
 [0.23529412 0.32467532 0.41836735 ... 0.13292434 0.02988898 0.        ]
 [0.64705882 0.26623377 0.51020408 ... 0.24335378 0.09479078 0.23333333]
 [0.29411765 0.5974026  0.59183673 ... 0.28220859 0.23996584 0.8       ]]
[[0.05882353 1.00649351 0.53061224 ... 0.50511247 0.56191289 0.01666667]
 [0.11764706 0.40909091 0.51020408 ... 0.31492843 0.13919727 0.03333333]
 [0.23529412 0.20779221 0.3877551  ... 0.32310838 0.13364646 0.06666667]
 ...
 [0.23529412 0.63636364 0.63265306 ... 0.52760736 0.24210077 0.01666667]
 [0.17647059 0.46753247 0.51020408 ... 0.16564417 0.01238258 0.05      ]
 [0.05882353 0.40909091 0.48979592 ... 0.25766871 0.31725021 0.05      ]]


# Decision Tree

In [None]:
params = {
    "criterion":("gini", "entropy"),
    "splitter":("best", "random"),
    "max_depth":(list(range(1, 20))),
    "min_samples_split":[2, 3, 4],
    "min_samples_leaf":list(range(1, 20)),
}

In [None]:
D_tree = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(D_tree, params, scoring='accuracy', n_jobs=1, verbose=1, cv=3)

In [None]:
tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best Params: {best_params}")

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits
Best Params: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'splitter': 'random'}


In [None]:
D_tree = DecisionTreeClassifier(**best_params)
D_tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=3,
                       splitter='random')

In [None]:
#Training Accuracy
train_pred = D_tree.predict(X_train)
cm1 = confusion_matrix(y_train, train_pred)
print(classification_report(y_train, train_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_train, train_pred))
print("Accuracy: ", accuracy_score(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.83      0.78      0.81       393
           1       0.65      0.72      0.68       221

    accuracy                           0.76       614
   macro avg       0.74      0.75      0.74       614
weighted avg       0.77      0.76      0.76       614

Confusion Matrix: 
 [[307  86]
 [ 62 159]]
F1 Score:  0.6824034334763949
Accuracy:  0.758957654723127


In [None]:
#Test Accuracy
y_pred = D_tree.predict(X_test)
cm1 = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.77      0.79       107
           1       0.54      0.62      0.57        47

    accuracy                           0.72       154
   macro avg       0.68      0.69      0.68       154
weighted avg       0.73      0.72      0.73       154

Confusion Matrix: 
 [[82 25]
 [18 29]]
F1 Score:  0.5742574257425743
Accuracy:  0.7207792207792207


# Gaussian Naive Bayes

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

In [None]:
#Training Accuracy
train_pred = gnb.predict(X_train)
cm1 = confusion_matrix(y_train, train_pred)
print(classification_report(y_train, train_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_train, train_pred))
print("Accuracy: ", accuracy_score(y_train, train_pred)) 

              precision    recall  f1-score   support

           0       0.79      0.82      0.81       393
           1       0.66      0.62      0.64       221

    accuracy                           0.75       614
   macro avg       0.73      0.72      0.72       614
weighted avg       0.75      0.75      0.75       614

Confusion Matrix: 
 [[324  69]
 [ 85 136]]
F1 Score:  0.6384976525821596
Accuracy:  0.749185667752443


In [None]:
#Test Accuracy
y_pred = gnb.predict(X_test)
cm1 = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       107
           1       0.66      0.62      0.64        47

    accuracy                           0.79       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.78      0.79      0.78       154

Confusion Matrix: 
 [[92 15]
 [18 29]]
F1 Score:  0.6373626373626374
Accuracy:  0.7857142857142857


# Artificial Neural Network

In [None]:
ann = tf.keras.models.Sequential()
#Hidden Layer 1
ann.add(tf.keras.layers.Dense(units=5, activation='relu'))
#Output Layer
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
ann.fit(X_train, y_train, batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb4789f4110>

In [None]:
#Training Accuracy
train_pred = ann.predict(X_train) > 0.5
cm1 = confusion_matrix(y_train, train_pred)
print(classification_report(y_train, train_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_train, train_pred))
print("Accuracy: ", accuracy_score(y_train, train_pred)) 

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       393
           1       0.71      0.53      0.61       221

    accuracy                           0.75       614
   macro avg       0.74      0.70      0.71       614
weighted avg       0.75      0.75      0.74       614

Confusion Matrix: 
 [[344  49]
 [103 118]]
F1 Score:  0.6082474226804124
Accuracy:  0.752442996742671


In [None]:
#Test Accuracy
y_pred = ann.predict(X_test) > 0.5
cm1 = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("Confusion Matrix: \n", cm1)
print("F1 Score: ", f1_score(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85       107
           1       0.68      0.55      0.61        47

    accuracy                           0.79       154
   macro avg       0.75      0.72      0.73       154
weighted avg       0.78      0.79      0.78       154

Confusion Matrix: 
 [[95 12]
 [21 26]]
F1 Score:  0.611764705882353
Accuracy:  0.7857142857142857
