In [132]:
# import general libraries & data preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# import different models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# import evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# import model tuning methods
from sklearn.model_selection import GridSearchCV
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import hyperopt


## 0. Data Preparation

In [133]:
# read data from csv file
df = pd.read_csv('cardio_train_cleaned.csv')

df.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,72,1,110,80,1,1,0,0,1,0,21.967
1,67,0,140,90,3,1,0,0,1,1,34.928
2,71,0,130,70,3,1,0,0,0,1,23.508
3,74,1,150,100,1,1,0,0,1,1,28.71
4,75,0,100,60,1,1,0,0,0,0,23.011


In [134]:
# split dataframe into variables and label
X = df.drop('cardio', axis=1)
y = df['cardio']

# split data into train and validation and test
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, stratify=y_rem, random_state=42)

## 1. Naive Bayes 

In [135]:
# hyperparameter tuning with GridSearchCV
gnb = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=gnb, 
                 param_grid=params_NB,
                 verbose=2,
                 cv=10,
                 scoring='accuracy') 
gs_NB.fit(X_valid, y_valid)
best_params_base = gs_NB.best_params_
gs_NB.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ..................................var_smoothing=1.0; total time=   0.0s
[CV] END ...................var_smoothing=0.8111308307896871; total time=   0.0s
[CV] END ...................var_smoothing=0.

[CV] END ..................var_smoothing=0.08111308307896872; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0657933224657568; total time=   0.0s
[CV] END ...................var_smoothing=0.0533669923120631; total time=   0.0s
[CV] END ...................

[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.005336699231206307; total time=   0.0s
[CV] END .................var_smoothing=0.004328761281083057; total time=   0.0s
[CV] END .................var_smoothing=0.004328761281083057; total time=   0.0s
[CV] END .................var_smoothing=0.004328761281083057; total time=   0.0s
[CV] END .................var_smoothing=0.004328761281083057; total time=   0.0s
[CV] END .................va

[CV] END ................var_smoothing=0.0006579332246575676; total time=   0.0s
[CV] END ................var_smoothing=0.0006579332246575676; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var_smoothing=0.0005336699231206307; total time=   0.0s
[CV] END ................var

[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=4.328761281083062e-05; total time=   0.0s
[CV] END ................var_smoothing=3.511191734215127e-05; total time=   0.0s
[CV] END ................var_smoothing=3.511191734215127e-05; total time=   0.0s
[CV] END ................var_smoothing=3.511191734215127e-05; total time=   0.0s
[CV] END ................var_smoothing=3.511191734215127e-05; total time=   0.0s
[CV] END ................var

[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ...............var_smoothing=3.5111917342151275e-06; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-06; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-06; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-06; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-06; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-06; total time=   0.0s
[CV] END ................var

[CV] END ................var_smoothing=2.848035868435799e-07; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-07; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-07; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-07; total time=   0.0s
[CV] END ................var_smoothing=2.848035868435799e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var_smoothing=2.310129700083158e-07; total time=   0.0s
[CV] END ................var

[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var_smoothing=1.232846739442066e-08; total time=   0.0s
[CV] END ................var

[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ................var_smoothing=1.519911082952933e-09; total time=   0.0s
[CV] END ...............var_smoothing=1.2328467394420635e-09; total time=   0.0s
[CV] END ...............var_smoothing=1.2328467394420635e-09; total time=   0.0s
[CV] END ...............var_smoothing=1.2328467394420635e-09; total time=   0.0s
[CV] END ...............var_

{'var_smoothing': 0.0005336699231206307}

In [139]:
# fit hyperparamater and train data into the model
gnb = GaussianNB(var_smoothing=best_params_base['var_smoothing'])
gnb.fit(X_train, y_train)
y_pred_train = gnb.predict(X_train)
y_pred_test = gnb.predict(X_test)


# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

# Results without hyperparameter tuning
# Accuracy:  0.7150426847218133
# Precision:  0.7561863173216885
# Recall:  0.6214114832535885
# f1_score: 0.6822061720288904

Train Results
Accuracy:  0.7167608635226536
Precision:  0.7613956105796286
Recall:  0.6142322097378277
f1_score: 0.6799422074250894

Test Results
Accuracy:  0.7204269196560925
Precision:  0.7667419112114372
Recall:  0.6168280871670703
f1_score: 0.6836632002683664


## 2. Bagging with Naive Bayes

In [140]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    gnb = GaussianNB(var_smoothing=best_params_base['var_smoothing'])
    clf = BaggingClassifier(**search_space, base_estimator=gnb)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(1, 1000))
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)
    

0.7213232223321244                                                                                                     
{'n_estimators': 727}                                                                                                  
0.7207306297395318                                                                                                     
{'n_estimators': 26}                                                                                                   
0.7213232223321244                                                                                                     
{'n_estimators': 447}                                                                                                  
0.7211750741839762                                                                                                     
{'n_estimators': 308}                                                                                                  
0.7210269260358281                      

In [141]:
# hyperparameter tuning with GridSearchCV
gnb = GaussianNB(var_smoothing=best_params_base['var_smoothing'])
clf = BaggingClassifier(base_estimator=gnb)

params_NB = {'n_estimators': [10, 100, 1000]}
gs_NB = GridSearchCV(estimator=clf, 
                 param_grid=params_NB,
                 verbose=2,
                 cv=10,
                 scoring='accuracy') 
gs_NB.fit(X_valid, y_valid)

gs_NB.best_params_

Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ....................................n_estimators=10; total time=   0.0s
[CV] END ...................................n_estimators=100; total time=   0.2s
[CV] END ...................................n_es

{'n_estimators': 100}

In [142]:
# fit hyperparamater(Hyperopt) and train data into the model
gnb = GaussianNB(var_smoothing=best_params_base['var_smoothing'])
clf = BaggingClassifier(base_estimator=gnb, n_estimators=best_params['n_estimators'], random_state=42)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('\nHyperopt')
print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

# fit hyperparamater(GridSearchCV) and train data into the model
gnb = GaussianNB()
clf = BaggingClassifier(base_estimator=gnb, n_estimators=gs_NB.best_params_['n_estimators'], random_state=42)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('\nGridSearchCV')
print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

# Results without hyperparameter tuning
# Accuracy:  0.7147483073299971
# Precision:  0.7560087399854334
# Recall:  0.6208133971291866
# f1_score: 0.6817733990147784


Hyperopt
Train Results
Accuracy:  0.7167793940517002
Precision:  0.761357775798209
Recall:  0.6143457042333447
f1_score: 0.6799966500565302

Test Results
Accuracy:  0.720130447672695
Precision:  0.7661654135338346
Recall:  0.6168280871670703
f1_score: 0.6834339369550637

GridSearchCV
Train Results
Accuracy:  0.7132585935328454
Precision:  0.7551096419758834
Recall:  0.6135890742632316
f1_score: 0.6770328936383371

Test Results
Accuracy:  0.7122739401126593
Precision:  0.7549569771791994
Recall:  0.610774818401937
f1_score: 0.6752551447214322


Since Accuracy, Recall and F1 score with Hyperopt tuning is higher than GridSearchCV, we will continue to use Hyperopt throughout the remaining models.

## 3. AdaBoost with Naive Bayes

In [143]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    gnb = GaussianNB(var_smoothing=best_params_base['var_smoothing'])
    clf = AdaBoostClassifier(**search_space, base_estimator=gnb)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(50, 1000)),
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001),
            'algorithm': hp.choice('algorithm', ['SAMME', 'SAMME.R'])
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)
    

0.7257707440378064                                                                                                     
{'algorithm': 'SAMME.R', 'learning_rate': 0.0466, 'n_estimators': 785}                                                 
0.7248827343664137                                                                                                     
{'algorithm': 'SAMME.R', 'learning_rate': 0.0102, 'n_estimators': 796}                                                 
0.7208774590614353                                                                                                     
{'algorithm': 'SAMME', 'learning_rate': 0.0094, 'n_estimators': 53}                                                    
0.7269572480492361                                                                                                     
{'algorithm': 'SAMME.R', 'learning_rate': 0.002, 'n_estimators': 932}                                                  
0.7250293438839432                      

In [146]:
#### Get algorithm string as hp.choice only can display the integer
algo_dic = {0:'SAMME', 1:'SAMME.R'}
algo = algo_dic[best_params['algorithm']]

# fit hyperparamater(Hyperopt) and train data into the model
clf = AdaBoostClassifier(base_estimator=gnb, n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], algorithm=algo)
model = clf.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

# Results without hyperparameter tuning
# Accuracy:  0.5079481895790403
# Precision:  0.6666666666666666
# Recall:  0.0005980861244019139
# f1_score: 0.0011951000896325067

Train Results
Accuracy:  0.7247660520707866
Precision:  0.7549537648612946
Recall:  0.6486210418794689
f1_score: 0.6977595995360479

Test Results
Accuracy:  0.7229469315149718
Precision:  0.7536231884057971
Recall:  0.6452784503631961
f1_score: 0.6952551769117886


Generally...
1. Model evaluation results is better after hyperparameter tuning by arounf 1%
2. Bagging did not improve the results significantly
3. Adaboost increase recall by around 4.5% and f1-score by 1.5%, however, precision decrease by 2%
