In [56]:
# import general libraries & data preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import different models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# import evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# import model tuning methods
from sklearn.model_selection import GridSearchCV
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import hyperopt

## 0. Data Preparation

In [39]:
# read data from csv file
df = pd.read_csv('cardio_train_cleaned.csv')

df.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,72,1,110,80,1,1,0,0,1,0,21.967
1,67,0,140,90,3,1,0,0,1,1,34.928
2,71,0,130,70,3,1,0,0,0,1,23.508
3,74,1,150,100,1,1,0,0,1,1,28.71
4,75,0,100,60,1,1,0,0,0,0,23.011


In [40]:
# split dataframe into variables and label
X = df.drop('cardio', axis=1)
y = df['cardio']

# scaling with StandardScalar
standardScaler = StandardScaler() 
X_norm = standardScaler.fit_transform(X)

# split data into train and validation and test
X_train, X_rem, y_train, y_rem = train_test_split(X_norm,y, train_size=0.8, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, stratify=y_rem, random_state=42)

## 1. Logistic Regression

In [41]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    lr = LogisticRegression(**search_space)
    current_score = cross_val_score(lr, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'C': hp.lognormal('C', 0, 1.0),
            'intercept_scaling': hp.lognormal('intercept_scaling', 0, 1.0),
            'solver': hp.choice('solver', ['liblinear', 'lbfgs','sag', 'saga']),
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
best_params_base = best_params
print('Best:')
print(best_params)

0.7250313221233103                                                                                                     
{'C': 9.123905186454783, 'intercept_scaling': 0.2245817498152391, 'solver': 'liblinear'}                               
0.7248831739751622                                                                                                     
{'C': 1.3146961170846343, 'intercept_scaling': 1.3003300925505725, 'solver': 'lbfgs'}                                  
0.7251796900758325                                                                                                     
{'C': 2.0624295903392547, 'intercept_scaling': 0.11267906904729524, 'solver': 'sag'}                                   
0.7250313221233103                                                                                                     
{'C': 1.0487412824295712, 'intercept_scaling': 1.5647593294147564, 'solver': 'liblinear'}                              
0.7250313221233103                      

In [42]:
# Get algorithm string as hp.choice only can display the integer
solver_dic = {0:'liblinear', 1:'lbfgs', 2:'sag', 3:'saga'}
solver = solver_dic[best_params_base['solver']]

# fit hyperparamater(Hyperopt) and train data into the model
lr = LogisticRegression(C=best_params_base['C'], intercept_scaling=best_params_base['intercept_scaling'], solver=solver)
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

Train Results
Accuracy:  0.7278421198925229
Precision:  0.7524718424898977
Recall:  0.6622025498429993
f1_score: 0.7044571888519972

Test Results
Accuracy:  0.7245775274236584
Precision:  0.7476027397260274
Recall:  0.6607142857142857
f1_score: 0.7014781491002571


## 2. Bagging with Logistic Regression

In [55]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    lr = LogisticRegression(C=best_params_base['C'], intercept_scaling=best_params_base['intercept_scaling'], solver=solver)
    clf = BaggingClassifier(**search_space, base_estimator=lr)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(1, 1000))
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)
    

0.725178810858336                                                                                                      
{'n_estimators': 245}                                                                                                  
0.7253269590064841                                                                                                     
{'n_estimators': 99}                                                                                                   
0.7244378503132213                                                                                                     
{'n_estimators': 126}                                                                                                  
0.7244382899219696                                                                                                     
{'n_estimators': 850}                                                                                                  
0.7251790306627102                      

In [44]:
# fit hyperparamater(Hyperopt) and train data into the model
lr = LogisticRegression(C=best_params_base['C'], intercept_scaling=best_params_base['intercept_scaling'], solver=solver)
clf = BaggingClassifier(base_estimator=lr, n_estimators=best_params['n_estimators'], random_state=42)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

Train Results
Accuracy:  0.7278050588344297
Precision:  0.7524505588993982
Recall:  0.662126886845988
f1_score: 0.7044050469885095

Test Results
Accuracy:  0.7244292914319597
Precision:  0.7471775573041396
Recall:  0.6610169491525424
f1_score: 0.7014613778705637


## 3. AdaBoost with Logistic Regression

In [53]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    lr = LogisticRegression(C=best_params_base['C'], intercept_scaling=best_params_base['intercept_scaling'], solver=solver)
    clf = AdaBoostClassifier(**search_space, base_estimator=lr)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(50, 1000)),
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001),
            'algorithm': hp.choice('algorithm', ['SAMME.R','SAMME'])
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)

0.7225090669304319                                                                                                     
{'algorithm': 'SAMME', 'learning_rate': 0.0425, 'n_estimators': 194}                                                   
0.7250319815364326                                                                                                     
{'algorithm': 'SAMME.R', 'learning_rate': 0.0369, 'n_estimators': 776}                                                 
0.7245868776788658                                                                                                     
{'algorithm': 'SAMME.R', 'learning_rate': 0.02, 'n_estimators': 606}                                                   
0.723845917133751                                                                                                      
{'algorithm': 'SAMME.R', 'learning_rate': 0.0115, 'n_estimators': 414}                                                 
0.7239942850862732                      

In [60]:
# Get algorithm string as hp.choice only can display the integer
algo_dic = {0:'SAMME', 1:'SAMME.R'}
algo = algo_dic[best_params['algorithm']]

# fit hyperparamater(Hyperopt) and train data into the model
clf = AdaBoostClassifier(base_estimator=lr, n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], algorithm=algo)
model = clf.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

Train Results
Accuracy:  0.7256184564069305
Precision:  0.7277285904567892
Recall:  0.7027579162410623
f1_score: 0.7150253084162513

Test Results
Accuracy:  0.718055143788912
Precision:  0.7209962168978562
Recall:  0.6921912832929782
f1_score: 0.7063001852995676


All metrics decrease slighly for AdaBoost compared to base estimator