# AdaBoost Classifier

In [16]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

In [2]:
cleaned_dataset = pd.read_csv('../preparation/dataframes/cleaned_dataset.csv', index_col=0)
X = cleaned_dataset.drop('outcome', axis=1)
y = cleaned_dataset['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

## Hyperparameter Tuning

### Hyperparameters to be tuned:
1. `n_estimators`
    - The maximum number of estimators at which boosting is terminated
2. `learning_rate`
    - Weight applied to each classifier at each boosting iteration

In [3]:
estimators_scores = {}
estimators_scores_list = []
for i in range(1, 100):
    model = AdaBoostClassifier(n_estimators=i, random_state=13)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    estimators_scores[i] = accuracy_score(y_test, y_pred)
    estimators_scores_list.append(accuracy_score(y_test, y_pred))

In [None]:
px.line(
    x=range(1, len(estimators_scores_list) + 1),
    y=estimators_scores_list,
    labels={'x': 'No. estimators', 'y': 'Testing accuracy'},
    title='The accuracy score of the AdaBoost Classifier as the no. estimators is varied')

In [33]:
learning_rate_scores = {}
learning_rate_scores_list = []
for i in np.arange(0.1, 5, 0.1):
    model = AdaBoostClassifier(n_estimators=28, learning_rate=i, random_state=13)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    learning_rate_scores[i] = accuracy_score(y_test, y_pred)
    learning_rate_scores_list.append(accuracy_score(y_test, y_pred))

In [None]:
px.line(
    x=np.arange(0.1, 3, 0.1),
    y=learning_rate_scores_list,
    labels={'x': 'Learning rate', 'y': 'Testing accuracy'},
    title='The accuracy score of the AdaBoost Classifier as the learning rate is varied')

In [32]:
n_estimators = list(range(20, 41))
learning_rate = np.arange(1, 3, 0.1)
hyperparameters = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate)

adaboost = AdaBoostClassifier(random_state=13)
kfold = KFold(n_splits=5, shuffle=True, random_state=13)
clf = GridSearchCV(adaboost, hyperparameters, cv=kfold)
best_model = clf.fit(X, y)

print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
print('Best learning_rate:', best_model.best_estimator_.get_params()['learning_rate'])

Best n_estimators: 38
Best learning_rate: 1.2000000000000002


## Most accurate model

In [31]:
model = AdaBoostClassifier(
    n_estimators=38,
    learning_rate=1.2,
    random_state=13)

model.fit(X_train, y_train) 
y_pred = model.predict(X_test)
print(f'Best Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Best Precision: {precision_score(y_test, y_pred, average="macro")}')
print(f'Best Recall: {recall_score(y_test, y_pred, average="macro")}')
print(f'Best F1: {f1_score(y_test, y_pred, average="macro")}')

Best Accuracy: 0.49475473732222086
Best Precision: 0.42443850946531264
Best Recall: 0.3965563826542393
Best F1: 0.3338613690249446
