In [15]:
# import general libraries & data-preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import different models
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

# import evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# import model tuning methods
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
import hyperopt
from sklearn.model_selection import GridSearchCV

# visualisations
from sklearn.tree import export_graphviz
import pydotplus
from six import StringIO 
from IPython.display import Image

# import warnings
# warnings.filterwarnings("ignore")


## 0. Data preparation

In [16]:
df = pd.read_csv('cardio_train_cleaned.csv')

df.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,72,1,110,80,1,1,0,0,1,0,21.967
1,67,0,140,90,3,1,0,0,1,1,34.928
2,71,0,130,70,3,1,0,0,0,1,23.508
3,74,1,150,100,1,1,0,0,1,1,28.71
4,75,0,100,60,1,1,0,0,0,0,23.011


In [18]:
# split dataframe into variables and label
X = df.drop('cardio', axis=1)
y = df['cardio']

# scaling with StandardScalar
standardScaler = StandardScaler() 
X_norm = standardScaler.fit_transform(X)

# split data into train and validation and test
X_train, X_rem, y_train, y_rem = train_test_split(X_norm,y, train_size=0.8, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, stratify=y_rem, random_state=42)

## 1. Support vector machine

In [19]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    svc = LinearSVC(**search_space) 
    current_score = cross_val_score(svc, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {'C': hp.choice('C', range(1,10)),
                'tol': hp.quniform('tol', 0.0001, 0.1, 0.00001),
                'dual': hp.choice('dual', [False])
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
best_params_base = best_params
print('Best:')
print(best_params)
    

0.7242901417738212                                                                                                     
{'C': 8, 'dual': False, 'tol': 0.03991}                                                                                
0.7242901417738212                                                                                                     
{'C': 2, 'dual': False, 'tol': 0.07275000000000001}                                                                    
0.7242901417738212                                                                                                     
{'C': 3, 'dual': False, 'tol': 0.03943}                                                                                
0.7242901417738212                                                                                                     
{'C': 7, 'dual': False, 'tol': 0.05183}                                                                                
0.7242901417738212                      

In [20]:
# fit hyperparamater(Hyperopt) and train data into the model
svm = LinearSVC(C=best_params_base['C'], tol=best_params_base['tol'], dual=False)
svm.fit(X_train, y_train)
y_pred_train = svm.predict(X_train)
y_pred_test = svm.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

Train Results
Accuracy:  0.7260261280459557
Precision:  0.7547681539807524
Recall:  0.6527446752165853
f1_score: 0.7000588318828231

Test Results
Accuracy:  0.7245775274236584
Precision:  0.7524441340782123
Recall:  0.6522397094430993
f1_score: 0.6987678339818417


## 2. Bagging with SVM

In [21]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    svm = LinearSVC(C=best_params_base['C'], tol=best_params_base['tol'], dual=False)
    clf = BaggingClassifier(**search_space, base_estimator=svm)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(1, 1000))
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)

0.723993625673151                                                                                                      
{'n_estimators': 287}                                                                                                  
0.7241413342125508                                                                                                     
{'n_estimators': 745}                                                                                                  
0.7242901417738212                                                                                                     
{'n_estimators': 62}                                                                                                   
0.7245868776788658                                                                                                     
{'n_estimators': 223}                                                                                                  
0.7245864380701176                      

In [22]:
# fit hyperparamater(Hyperopt) and train data into the model
svm = LinearSVC(C=best_params_base['C'], tol=best_params_base['tol'], dual=False)
clf = BaggingClassifier(base_estimator=svm, n_estimators=best_params['n_estimators'], random_state=42)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

Train Results
Accuracy:  0.7261929028073751
Precision:  0.7552888616354956
Recall:  0.6523663602315287
f1_score: 0.7000649561545955

Test Results
Accuracy:  0.7245775274236584
Precision:  0.7522679692951849
Recall:  0.652542372881356
f1_score: 0.6988654781199353


## 3. AdaBoost with SVM

In [23]:
# hyperparameter tuning with Hyperopt
def objective(search_space):
    svm = LinearSVC(C=best_params_base['C'], tol=best_params_base['tol'], dual=False)
    clf = AdaBoostClassifier(**search_space, base_estimator=svm)
    current_score = cross_val_score(clf, X_valid, y_valid, cv=10).mean()
    print(current_score, search_space)
    return {'loss': -current_score, 'status': STATUS_OK}

search_space = {
            'n_estimators': hp.choice('n_estimators', range(50, 1000)),
            'learning_rate': hp.quniform('learning_rate', 0, 0.05, 0.0001),
            'algorithm': hp.choice('algorithm', ['SAMME'])
        }

best_params = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print('Best:')
print(best_params)

0.723252665128036                                                                                                      
{'algorithm': 'SAMME', 'learning_rate': 0.0461, 'n_estimators': 865}                                                   
0.7229561490273657                                                                                                     
{'algorithm': 'SAMME', 'learning_rate': 0.038900000000000004, 'n_estimators': 169}                                     
0.7229561490273657                                                                                                     
{'algorithm': 'SAMME', 'learning_rate': 0.0252, 'n_estimators': 447}                                                   
0.7231045169798879                                                                                                     
{'algorithm': 'SAMME', 'learning_rate': 0.0358, 'n_estimators': 300}                                                   
0.7231042971755138                      

In [24]:
# Get algorithm string as hp.choice only can display the integer
algo_dic = {0:'SAMME', 1:'SAMME.R'}
algo = algo_dic[best_params['algorithm']]

# fit hyperparamater(Hyperopt) and train data into the model
svm = LinearSVC(C=best_params_base['C'], tol=best_params_base['tol'], dual=False)
clf = AdaBoostClassifier(base_estimator=svm, n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], algorithm=algo)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)


# evaluation metrics
accuracy = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train, average='binary')
recall = recall_score(y_train, y_pred_train, average='binary')
f1 = f1_score(y_train, y_pred_train, average='binary')

print('Train Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)

accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='binary')
recall = recall_score(y_test, y_pred_test, average='binary')
f1 = f1_score(y_test, y_pred_test, average='binary')

print('\nTest Results')
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('f1_score:', f1)


Train Results
Accuracy:  0.7279162420087093
Precision:  0.7422480620155039
Recall:  0.6810048046003102
f1_score: 0.7103087698530138

Test Results
Accuracy:  0.7238363474651646
Precision:  0.7381818181818182
Recall:  0.6758474576271186
f1_score: 0.7056407015326276


Generally...

1. Bagging perform slightly worse than base estimator for all metrics
2. Adaboost perform better than base estimator in accuracy, recall and f1-score