In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [3]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
data_path = 'Data/Preprocessed.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.813399,148.0,72.0,0.696814,0.183531,0.260117,0.821764,1.36418,1
1,-0.833906,85.0,66.0,0.036596,0.183531,-0.843284,-0.168409,0.126452,0
2,1.188996,183.0,64.0,0.054048,0.183531,-1.457745,0.935284,0.230161,1
3,-0.833906,89.0,66.0,-0.674132,-0.791683,-0.586404,-1.298725,-1.480075,0
4,-1.603317,137.0,40.0,0.696814,0.34391,1.462682,2.33668,0.327328,1


In [4]:
# Separate the target column ('Outcome') from the predictor columns.
label = df['Outcome']
features = df.drop(columns='Outcome')
print(features.shape, label.shape)

(768, 8) (768,)


In [5]:
from sklearn.model_selection import StratifiedKFold

# Shared cross-validation splitter: 5 shuffled, stratified folds with a fixed seed.
cv_options = dict(n_splits=5, shuffle=True, random_state=63)
skf = StratifiedKFold(**cv_options)

In [15]:
# Summary table of the models' scores; each model cell below adds one row
# ('Model', 'Max', 'Avg'). Re-running this cell resets the table to empty.
model_report = pd.DataFrame()

In [30]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): in the df.head() output Glucose and BloodPressure look unscaled
# while the other columns look standardized. KNN distances are scale-sensitive,
# so confirm that the preprocessing is intended.
knn_model = KNeighborsClassifier(n_neighbors=7)

accuracy = []  # held-out accuracy of each fold
print('[INFO] KNN Model Performance...\n')
for i, (train_index, test_index) in enumerate(skf.split(features, label)):
    # data split
    x_train_fold, x_test_fold = features.iloc[train_index], features.iloc[test_index]
    y_train_fold, y_test_fold = label.iloc[train_index], label.iloc[test_index]

    # Train on this fold, predict the held-out part once, and reuse the
    # predictions for every metric below (accuracy = share of correct labels).
    knn_model.fit(x_train_fold, y_train_fold)
    y_pred = knn_model.predict(x_test_fold)
    fold_accuracy = float(np.mean(y_pred == y_test_fold.to_numpy()))
    accuracy.append(fold_accuracy)

    print('[INFO] Performance on', i, 'fold')
    # model performance report
    print(fold_accuracy)
    print(confusion_matrix(y_test_fold, y_pred))
    print(classification_report(y_test_fold, y_pred))
    print('--'*30, '\n')

# Record this model's summary row. DataFrame.append was removed in pandas 2.0,
# so build a one-row frame and concatenate it. Any earlier 'KNN' row is dropped
# first so that re-running this cell does not duplicate the entry.
if 'Model' in model_report.columns:
    model_report = model_report[model_report['Model'] != 'KNN']
model_report = pd.concat(
    [model_report,
     pd.DataFrame([{'Model': 'KNN', 'Max': max(accuracy), 'Avg': np.mean(accuracy)}])],
    ignore_index=True)

print('All accuracy', accuracy)
print('Max:', max(accuracy))
print('Average', np.mean(accuracy))

[INFO] KNN Model Performance...

[INFO] Performance on 0 fold
0.7012987012987013
[[82 18]
 [28 26]]
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.59      0.48      0.53        54

    accuracy                           0.70       154
   macro avg       0.67      0.65      0.66       154
weighted avg       0.69      0.70      0.69       154

------------------------------------------------------------ 

[INFO] Performance on 1 fold
0.7727272727272727
[[86 14]
 [21 33]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       100
           1       0.70      0.61      0.65        54

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.77      0.77      0.77       154

------------------------------------------------------------ 

[INFO] Performance on 2 fold
0.6883116883116883
[[78 

In [38]:
from sklearn.linear_model import LogisticRegression

# `multi_class='ovr'` was dropped: the argument is deprecated in recent
# scikit-learn releases, and for the binary 0/1 target used here the default
# already fits a single one-vs-rest problem.
lr_model = LogisticRegression(C=6, max_iter=100)

accuracy = []  # held-out accuracy of each fold
print('[INFO] Logistic Regression Model Performance...\n')
for i, (train_index, test_index) in enumerate(skf.split(features, label)):
    # data split
    x_train_fold, x_test_fold = features.iloc[train_index], features.iloc[test_index]
    y_train_fold, y_test_fold = label.iloc[train_index], label.iloc[test_index]

    # Train on this fold, predict the held-out part once, and reuse the
    # predictions for every metric below (accuracy = share of correct labels).
    lr_model.fit(x_train_fold, y_train_fold)
    y_pred = lr_model.predict(x_test_fold)
    fold_accuracy = float(np.mean(y_pred == y_test_fold.to_numpy()))
    accuracy.append(fold_accuracy)

    print('[INFO] Performance on', i, 'fold')
    # model performance report
    print(fold_accuracy)
    print(confusion_matrix(y_test_fold, y_pred))
    print(classification_report(y_test_fold, y_pred))
    print('--'*30, '\n')

# Record this model's summary row. DataFrame.append was removed in pandas 2.0,
# so build a one-row frame and concatenate it. Any earlier row for this model
# is dropped first so that re-running this cell does not duplicate the entry.
if 'Model' in model_report.columns:
    model_report = model_report[model_report['Model'] != 'Logistic Regression']
model_report = pd.concat(
    [model_report,
     pd.DataFrame([{'Model': 'Logistic Regression', 'Max': max(accuracy),
                    'Avg': np.mean(accuracy)}])],
    ignore_index=True)

print('All accuracy', accuracy)
print('Max:', max(accuracy))
print('Average', np.mean(accuracy))

[INFO] Logistic Regression Model Performance...

[INFO] Performance on 0 fold
0.7272727272727273
[[83 17]
 [25 29]]
              precision    recall  f1-score   support

           0       0.77      0.83      0.80       100
           1       0.63      0.54      0.58        54

    accuracy                           0.73       154
   macro avg       0.70      0.68      0.69       154
weighted avg       0.72      0.73      0.72       154

------------------------------------------------------------ 

[INFO] Performance on 1 fold
0.7727272727272727
[[87 13]
 [22 32]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       100
           1       0.71      0.59      0.65        54

    accuracy                           0.77       154
   macro avg       0.75      0.73      0.74       154
weighted avg       0.77      0.77      0.77       154

------------------------------------------------------------ 

[INFO] Performance on 2 fold
0.720779

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Shallow forest (depth 3, at least 4 samples per leaf); random_state makes
# the fitted trees reproducible across runs.
rf_model = RandomForestClassifier(n_estimators=300, min_samples_leaf=4, n_jobs=-1,
                                  random_state=21, max_depth=3)

accuracy = []  # held-out accuracy of each fold
print('[INFO] Random Forest Model Performance...\n')
for i, (train_index, test_index) in enumerate(skf.split(features, label)):
    # data split
    x_train_fold, x_test_fold = features.iloc[train_index], features.iloc[test_index]
    y_train_fold, y_test_fold = label.iloc[train_index], label.iloc[test_index]

    # Train on this fold, predict the held-out part once, and reuse the
    # predictions for every metric below (accuracy = share of correct labels).
    rf_model.fit(x_train_fold, y_train_fold)
    y_pred = rf_model.predict(x_test_fold)
    fold_accuracy = float(np.mean(y_pred == y_test_fold.to_numpy()))
    accuracy.append(fold_accuracy)

    print('[INFO] Performance on', i, 'fold')
    # model performance report
    print(fold_accuracy)
    print(confusion_matrix(y_test_fold, y_pred))
    print(classification_report(y_test_fold, y_pred))
    print('--'*30, '\n')

# Record this model's summary row. DataFrame.append was removed in pandas 2.0,
# so build a one-row frame and concatenate it. Any earlier row for this model
# is dropped first so that re-running this cell does not duplicate the entry.
if 'Model' in model_report.columns:
    model_report = model_report[model_report['Model'] != 'Random Forest']
model_report = pd.concat(
    [model_report,
     pd.DataFrame([{'Model': 'Random Forest', 'Max': max(accuracy),
                    'Avg': np.mean(accuracy)}])],
    ignore_index=True)

print('All accuracy', accuracy)
print('Max:', max(accuracy))
print('Average', np.mean(accuracy))

[INFO] Random Forest Model Performance...

[INFO] Performance on 0 fold
0.6818181818181818
[[86 14]
 [35 19]]
              precision    recall  f1-score   support

           0       0.71      0.86      0.78       100
           1       0.58      0.35      0.44        54

    accuracy                           0.68       154
   macro avg       0.64      0.61      0.61       154
weighted avg       0.66      0.68      0.66       154

------------------------------------------------------------ 

[INFO] Performance on 1 fold
0.7597402597402597
[[89 11]
 [26 28]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       100
           1       0.72      0.52      0.60        54

    accuracy                           0.76       154
   macro avg       0.75      0.70      0.72       154
weighted avg       0.75      0.76      0.75       154

------------------------------------------------------------ 

[INFO] Performance on 2 fold
0.720779220779

In [32]:
from sklearn.svm import SVC

# `degree` only applies to the 'poly' kernel and is ignored by a linear kernel,
# so the misleading `degree=20` argument was removed; the fitted model is unchanged.
svm_model = SVC(kernel='linear', random_state=0)

accuracy = []  # held-out accuracy of each fold
print('[INFO] SVM Model Performance...\n')
for i, (train_index, test_index) in enumerate(skf.split(features, label)):
    # data split
    x_train_fold, x_test_fold = features.iloc[train_index], features.iloc[test_index]
    y_train_fold, y_test_fold = label.iloc[train_index], label.iloc[test_index]

    # Train on this fold, predict the held-out part once, and reuse the
    # predictions for every metric below (accuracy = share of correct labels).
    svm_model.fit(x_train_fold, y_train_fold)
    y_pred = svm_model.predict(x_test_fold)
    fold_accuracy = float(np.mean(y_pred == y_test_fold.to_numpy()))
    accuracy.append(fold_accuracy)

    print('[INFO] Performance on', i, 'fold')
    # model performance report
    print(fold_accuracy)
    print(confusion_matrix(y_test_fold, y_pred))
    print(classification_report(y_test_fold, y_pred))
    print('--'*30, '\n')

# Record this model's summary row. DataFrame.append was removed in pandas 2.0,
# so build a one-row frame and concatenate it. Any earlier 'SVM' row is dropped
# first so that re-running this cell does not duplicate the entry.
if 'Model' in model_report.columns:
    model_report = model_report[model_report['Model'] != 'SVM']
model_report = pd.concat(
    [model_report,
     pd.DataFrame([{'Model': 'SVM', 'Max': max(accuracy), 'Avg': np.mean(accuracy)}])],
    ignore_index=True)

print('All accuracy', accuracy)
print('Max:', max(accuracy))
print('Average', np.mean(accuracy))

[INFO] SVM Model Performance...

[INFO] Performance on 0 fold
0.7142857142857143
[[82 18]
 [26 28]]
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       100
           1       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154

------------------------------------------------------------ 

[INFO] Performance on 1 fold
0.7792207792207793
[[87 13]
 [21 33]]
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       100
           1       0.72      0.61      0.66        54

    accuracy                           0.78       154
   macro avg       0.76      0.74      0.75       154
weighted avg       0.77      0.78      0.77       154

------------------------------------------------------------ 

[INFO] Performance on 2 fold
0.7337662337662337
[[85 

In [33]:
from xgboost import XGBClassifier

# The target is binary (0/1), so the matching evaluation metric is 'logloss';
# 'mlogloss' is the multiclass variant. No eval_set is passed to fit() below,
# so this does not change the fitted models.
xgb_model = XGBClassifier(n_estimators=200, random_state=0, eval_metric='logloss',
                          learning_rate=0.01, gamma=3)

accuracy = []  # held-out accuracy of each fold
print('[INFO] XGBoost Model Performance...\n')
for i, (train_index, test_index) in enumerate(skf.split(features, label)):
    # data split
    x_train_fold, x_test_fold = features.iloc[train_index], features.iloc[test_index]
    y_train_fold, y_test_fold = label.iloc[train_index], label.iloc[test_index]

    # Train on this fold, predict the held-out part once, and reuse the
    # predictions for every metric below (accuracy = share of correct labels).
    xgb_model.fit(x_train_fold, y_train_fold)
    y_pred = xgb_model.predict(x_test_fold)
    fold_accuracy = float(np.mean(y_pred == y_test_fold.to_numpy()))
    accuracy.append(fold_accuracy)

    print('[INFO] Performance on', i, 'fold')
    # model performance report
    print(fold_accuracy)
    print(confusion_matrix(y_test_fold, y_pred))
    print(classification_report(y_test_fold, y_pred))
    print('--'*30, '\n')

# Record this model's summary row. DataFrame.append was removed in pandas 2.0,
# so build a one-row frame and concatenate it. Any earlier 'XGBoost' row is
# dropped first so that re-running this cell does not duplicate the entry.
if 'Model' in model_report.columns:
    model_report = model_report[model_report['Model'] != 'XGBoost']
model_report = pd.concat(
    [model_report,
     pd.DataFrame([{'Model': 'XGBoost', 'Max': max(accuracy), 'Avg': np.mean(accuracy)}])],
    ignore_index=True)

print('All accuracy', accuracy)
print('Max:', max(accuracy))
print('Average', np.mean(accuracy))

[INFO] XGBoost Model Performance...

[INFO] Performance on 0 fold
0.7337662337662337
[[84 16]
 [25 29]]
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       100
           1       0.64      0.54      0.59        54

    accuracy                           0.73       154
   macro avg       0.71      0.69      0.69       154
weighted avg       0.73      0.73      0.73       154

------------------------------------------------------------ 

[INFO] Performance on 1 fold
0.7987012987012987
[[86 14]
 [17 37]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       100
           1       0.73      0.69      0.70        54

    accuracy                           0.80       154
   macro avg       0.78      0.77      0.78       154
weighted avg       0.80      0.80      0.80       154

------------------------------------------------------------ 

[INFO] Performance on 2 fold
0.7402597402597403
[

In [39]:
# Highlight the best value per score column. `subset` restricts the comparison
# to the numeric columns; without it, recent pandas versions may also style
# the text 'Model' column (its "maximum" is just the alphabetically last name).
model_report.style.highlight_max(subset=['Avg', 'Max'], color='lightgreen', axis=0)

Unnamed: 0,Avg,Max,Model
0,0.721356,0.772727,KNN
1,0.754036,0.843137,Random Forest
2,0.764409,0.843137,SVM
3,0.764349,0.798701,XGBoost
4,0.76311,0.843137,Logistic Regression


<span style="color: orange;">__*Insights*__</span>
* `SVM` is the best performer overall.
* `SVM`, `Random Forest`, and `Logistic Regression` reached the same maximum score; however, `SVM` has the highest average score.
* In terms of average score, `XGBoost` and `Logistic Regression` also performed well.
* `KNN` was the worst performer in all aspects.