In [1]:
#Set Working Directory for saving files
import os
print(os.getcwd()) #Current working directory
os.chdir("C:\\Users\\aldwi\\OneDrive\\Desktop\\thesis\\notebook") #Set working directory
print(os.getcwd()) #Check new working directory

C:\Users\aldwi\0ADC_THESIS
C:\Users\aldwi\OneDrive\Desktop\thesis\notebook


# Hybrid Machine Learning using Ensemble Stacking Algorithm for Type II/ Gestational Diabetes Mellitus Detection

In [2]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Importing the dataset
data = pd.read_csv("C:\\Users\\aldwi\\OneDrive\\Desktop\\thesis\\notebook\\data\\4SMOTEENN_IQR_BRR.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Pedigree,Age,Outcome
0,1.0,85.0,66.0,29.0,78.051178,26.6,0.351,31.0,0
1,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
2,5.0,116.0,74.0,22.240142,107.556504,25.6,0.201,30.0,0
3,10.0,139.0,80.0,25.748488,152.653911,27.1,0.433877,57.0,0
4,1.0,97.0,66.0,15.0,140.0,23.2,0.487,22.0,0


In [3]:
#Setup the data preprocessing methods
from pycaret.classification import *
s = setup(data,                               #Input data
          target='Outcome',                   #Target Variable
          train_size=0.7,                     #Train-test split 70:30 ratio
          preprocess=False,                   #Turn off automated data preprocessing
          pca=True,                           #Dimensionality Reduction
          data_split_stratify=True,           #Stratified k-fold CV
          fold_strategy='stratifiedkfold',    #Stratified k-fold CV
          fold=10,                            #10-fold CV
          fold_shuffle=True,                  #Shuffle 10-fold CV
          numeric_features=['Pregnancies'],
          session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Outcome
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(3421, 9)"
5,Missing Values,False
6,Numeric Features,8
7,Categorical Features,0
8,Transformed Train Set,"(2394, 8)"
9,Transformed Test Set,"(1027, 8)"


In [4]:
from sklearn.metrics import log_loss
add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)
remove_metric('kappa')

In [5]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,MCC,Log Loss,TT (Sec)
et,Extra Trees Classifier,0.9921,0.9997,0.9952,0.9897,0.9924,0.9841,0.2741,0.048
rf,Random Forest Classifier,0.9908,0.999,0.9936,0.9889,0.9912,0.9816,0.3174,0.056
lightgbm,Light Gradient Boosting Machine,0.9904,0.9971,0.9952,0.9866,0.9908,0.9809,0.3318,0.115
xgboost,Extreme Gradient Boosting,0.9887,0.9968,0.9912,0.9873,0.9892,0.9775,0.3893,0.055
catboost,CatBoost Classifier,0.9858,0.9972,0.9912,0.9818,0.9865,0.9716,0.4903,1.073
dt,Decision Tree Classifier,0.9841,0.9842,0.9815,0.988,0.9847,0.9683,0.5483,0.006
knn,K Neighbors Classifier,0.9219,0.9848,0.9663,0.8931,0.928,0.8467,2.6981,0.209
gbc,Gradient Boosting Classifier,0.9056,0.9696,0.9342,0.8901,0.9116,0.8117,3.2603,0.042
ada,Ada Boost Classifier,0.8313,0.9182,0.8564,0.8262,0.8407,0.6625,5.8284,0.022
qda,Quadratic Discriminant Analysis,0.7828,0.8749,0.7706,0.806,0.7868,0.5677,7.5027,0.006


In [6]:
et = create_model('et')
rf = create_model('rf')
lightgbm = create_model('lightgbm')
xgboost = create_model('xgboost')
catboost = create_model('catboost')
dt = create_model('dt')
knn = create_model('knn')
gbc = create_model('gbc')
ada = create_model('ada')
qda = create_model('qda')
lr = create_model('lr')
nb = create_model('nb')
lda = create_model('lda')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,MCC,Log Loss
0,0.6375,0.0,0.976,0.5922,0.7372,0.3518,12.5206
1,0.6125,0.0,0.984,0.5748,0.7257,0.3097,13.3841
2,0.6875,0.0,0.568,0.7717,0.6544,0.396,10.7934
3,0.6458,0.0,0.416,0.8125,0.5503,0.3521,12.2325
4,0.4812,0.0,0.0,0.0,0.0,0.0,17.9197
5,0.682,0.0,0.5081,0.8077,0.6238,0.4024,10.9831
6,0.5481,0.0,1.0,0.5345,0.6966,0.1804,15.6078
7,0.7113,0.0,0.904,0.6647,0.7661,0.4453,9.9716
8,0.7197,0.0,0.888,0.6768,0.7682,0.4554,9.6826
9,0.7113,0.0,0.568,0.8256,0.673,0.4542,9.9715


In [7]:
et_rf_lightgbm_xgboost_catboost = stack_models([et,rf,lightgbm,xgboost,catboost], meta_model = et)
lr_nb_ridge_lda_svm = stack_models([lr,nb,ridge,lda,svm], meta_model = et)
et_rf_lightgbm_xgboost_catboost_lr_nb_ridge_lda_svm = stack_models([et,rf,lightgbm,xgboost,catboost, lr,nb,ridge,lda,svm], meta_model = et)
dt_knn_gbc_ada_qda = stack_models([dt,knn,gbc,ada,qda], meta_model = et)
et_rf_lightgbm = stack_models([et, rf, lightgbm], meta_model = et)
ridge_lda_svm = stack_models([ridge,lda,svm], meta_model = et)
et_rf_lightgbm_ridge_lda_svm = stack_models([et, rf, lightgbm, ridge,lda,svm], meta_model = et)
dt_knn_gbc_ada_qda_lr_nb_ridge_lda_svm = stack_models([dt,knn,gbc,ada,qda,lr,nb,ridge,lda,svm], meta_model = et)
et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda = stack_models([et,rf,lightgbm,xgboost,catboost,dt,knn,gbc,ada,qda],meta_model = et)
et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda_lr_nb_ridge_lda_svm = stack_models([et,rf,lightgbm,xgboost,catboost,dt,knn,gbc,ada,qda,lr,nb,ridge,lda,svm], meta_model = et)

stacking = [et_rf_lightgbm_xgboost_catboost,                                        #Top 5 Strongest
            lr_nb_ridge_lda_svm,                                                    #Top 5 Weakest
            et_rf_lightgbm_xgboost_catboost_lr_nb_ridge_lda_svm,                    #Combining Top 5 Strong and Weak
            dt_knn_gbc_ada_qda,                                                     #Top 5 Moderate
            et_rf_lightgbm,                                                         #Top 3 Strongest
            ridge_lda_svm,                                                          #Top 3 Weakest
            et_rf_lightgbm_ridge_lda_svm,                                           #Combining Top 3 Strong and Weak
            dt_knn_gbc_ada_qda_lr_nb_ridge_lda_svm,                                 #Top 10 Strongest
            et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda,                     #Top 10 Weakest
            et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda_lr_nb_ridge_lda_svm] #ALL

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,MCC,Log Loss
0,0.9833,0.999,0.984,0.984,0.984,0.9666,0.5757
1,0.9917,0.9992,1.0,0.9843,0.9921,0.9834,0.2878
2,0.9958,1.0,1.0,0.9921,0.996,0.9917,0.1439
3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.9916,0.9999,1.0,0.9841,0.992,0.9834,0.289
5,0.9958,0.9997,1.0,0.992,0.996,0.9917,0.1445
6,0.9916,0.9999,0.9919,0.9919,0.9919,0.9832,0.289
7,0.9958,1.0,0.992,1.0,0.996,0.9917,0.1445
8,0.9916,0.9995,0.992,0.992,0.992,0.9832,0.289
9,0.9791,1.0,1.0,0.9615,0.9804,0.9588,0.7226


In [10]:
for i in stacking:
    print("\n----------------------------------------------------------------------------------\n\n  ", i.estimators_)
print("\n----------------------------------------------------------------------------------")


----------------------------------------------------------------------------------

   [ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=123, verbose=0, warm_start=False), RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
 

In [22]:
et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda

In [31]:
plot_model(et_rf_lightgbm_xgboost_catboost_dt_knn_gbc_ada_qda,plot='learning',save=True)

'Learning Curve.png'