In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
import numpy as np
import pickle

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/IS4303/notebooks/new_cleaned

Mounted at /content/drive
/content/drive/MyDrive/IS4303/notebooks/new_cleaned


In [None]:
# Read the four pre-split CSV files from the current working directory, in the
# order: train features, test features, train target, test target.
# NOTE(review): the training files carry a "5" prefix while the test files carry a
# "4" prefix — confirm this pairing is intentional.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./5X_train.csv', './4X_test.csv', './5y_train.csv', './4y_test.csv')
)

In [None]:
# Features become plain NumPy arrays; targets are reduced to the 'is_canceled' column.
# The names are rebound in place, so this cell can only be run once per load:
# re-run the CSV-loading cell first if it has to be executed again.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
y_train, y_test = y_train['is_canceled'], y_test['is_canceled']

In [None]:
# Base model whose hyper-parameters are tuned below.
adaboost_classifier = AdaBoostClassifier(random_state=123)

# 4 ensemble sizes x 9 learning rates (0.2 .. 1.0) x 2 boosting algorithms = 72 candidates.
adaboost_params = dict(
    n_estimators=[50, 100, 200, 250],
    learning_rate=[round(0.1 * step, 1) for step in range(2, 11)],
    algorithm=['SAMME', 'SAMME.R'],
)

# Successive-halving search scored on F1 with 3-fold CV; factor=6 keeps roughly one
# sixth of the candidates after every iteration while giving them 6x more samples.
grid_search = HalvingGridSearchCV(
    estimator=adaboost_classifier,
    param_grid=adaboost_params,
    factor=6,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=123,
    verbose=10,
)
grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1981
max_resources_: 71340
aggressive_elimination: False
factor: 6
----------
iter: 0
n_candidates: 72
n_resources: 1981
Fitting 3 folds for each of 72 candidates, totalling 216 fits
----------
iter: 1
n_candidates: 12
n_resources: 11886
Fitting 3 folds for each of 12 candidates, totalling 36 fits
----------
iter: 2
n_candidates: 2
n_resources: 71316
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Persist the fitted search object so the slow search does not have to be repeated.
# The guard turns the cell into a no-op when the search cell has not been run.
if 'grid_search' in locals() or 'grid_search' in globals():
    try:
        # Serialize in memory first: opening the target with "wb" truncates it
        # immediately, so a failure part-way through pickling would otherwise leave an
        # empty/corrupt file and destroy any previously saved good copy.
        pickled_search = pickle.dumps(grid_search)
    except Exception as e:
        # Best-effort save: report the problem but keep the notebook running.
        print(e)
    else:
        with open('AdaBoostHalvingGridSearch_Yeet_2.pkl', mode="wb") as pickle_file:
            try:
                pickle_file.write(pickled_search)
            except Exception as e:
                print(e)

In [None]:
# Display the hyper-parameter combination selected by the halving search.
grid_search.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 0.8, 'n_estimators': 250}

In [None]:
# Display the cross-validated score (scoring='f1') of the selected combination.
grid_search.best_score_

0.7439147049084806

In [None]:
# Build a fresh AdaBoost model from the winning hyper-parameters and train it on the
# full training split.
# NOTE(review): if the search was run with refit enabled, `grid_search.best_estimator_`
# may already hold an equivalent fitted model — confirm whether this extra fit is needed.
classifier = AdaBoostClassifier(**grid_search.best_params_, random_state=123)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, recall_score, precision_score

In [None]:
# Column 1 of predict_proba (the second entry of classifier.classes_) is kept as the
# score used for ROC-AUC; the hard labels feed F1 / precision / recall.
pred_proba = classifier.predict_proba(X_test)[:, 1]
predicted = classifier.predict(X_test)

In [None]:
# Report the held-out metrics one per line: ROC-AUC is computed from the predicted
# probabilities, the remaining metrics from the hard class labels.
for metric_label, metric_fn, y_hat in (
    ("ROC_AUC", roc_auc_score, pred_proba),
    ("F1", f1_score, predicted),
    ("Precision", precision_score, predicted),
    ("Recall", recall_score, predicted),
):
    print(f"{metric_label}: {metric_fn(y_test, y_hat)}")

ROC_AUC: 0.8100248577444668
F1: 0.6172171088251218
Precision: 0.5413105413105413
Recall: 0.7178841309823678


In [None]:
# Persist the fitted model. The guard turns the cell into a no-op when the training
# cell has not been run.
if 'classifier' in locals() or 'classifier' in globals():
    try:
        # Serialize in memory first: opening the target with "wb" truncates it
        # immediately, so a failure part-way through pickling would otherwise leave an
        # empty/corrupt file and destroy any previously saved good copy.
        pickled_model = pickle.dumps(classifier)
    except Exception as e:
        # Best-effort save: report the problem but keep the notebook running.
        print(e)
    else:
        with open('AdaBoostFittedModel_Yeet.pkl', mode="wb") as pickle_file:
            try:
                pickle_file.write(pickled_model)
            except Exception as e:
                print(e)

# Experimental: AdaBoost with alternative base estimators

In [None]:
# Reload the "4"-prefixed splits for the experiments below. Features are converted
# straight to NumPy arrays and targets reduced to the 'is_canceled' column, so this
# cell can be re-run on its own.
X_train = pd.read_csv('./4X_train.csv').to_numpy()
X_test = pd.read_csv('./4X_test.csv').to_numpy()
y_train = pd.read_csv('./4y_train.csv')['is_canceled']
y_test = pd.read_csv('./4y_test.csv')['is_canceled']

In [None]:
from sklearn.tree import ExtraTreeClassifier

In [None]:
# AdaBoost over randomized decision stumps (ExtraTreeClassifier with max_depth=1).
adaboost_classifier = AdaBoostClassifier(random_state=123)

# 3 base estimators x 5 ensemble sizes x 10 learning rates (0.1 .. 1.0) x 2 algorithms.
# NOTE(review): with max_depth=1 the min_samples_split values 2-4 probably produce
# identical stumps on a dataset of this size — confirm the grid is not just
# triplicating candidates.
adaboost_params = dict(
    estimator=[
        ExtraTreeClassifier(max_depth=1, min_samples_split=min_split, random_state=123)
        for min_split in range(2, 5)
    ],
    n_estimators=[25, 50, 75, 100, 200],
    learning_rate=[round(0.1 * step, 1) for step in range(1, 11)],
    algorithm=['SAMME', 'SAMME.R'],
)

# Successive-halving search scored on F1 with 5-fold CV (library-default factor).
experimental_grid_search = HalvingGridSearchCV(
    estimator=adaboost_classifier,
    param_grid=adaboost_params,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=123,
)
experimental_grid_search.fit(X_train, y_train)

# Persist the fitted experimental search object.
try:
    # Serialize in memory first: opening the target with "wb" truncates it immediately,
    # so a failure part-way through pickling would otherwise leave an empty/corrupt
    # file and destroy any previously saved good copy.
    pickled_search = pickle.dumps(experimental_grid_search)
except Exception as e:
    # Best-effort save: report the problem but keep the notebook running.
    print(e)
else:
    with open('ExperimentalAdaBoostHalvingGridSearchCVObj.pkl', mode="wb") as pickle_file:
        try:
            pickle_file.write(pickled_search)
        except Exception as e:
            print(e)

In [None]:
# Winning hyper-parameters on the first line, their cross-validated F1 on the second.
print(
    experimental_grid_search.best_params_,
    experimental_grid_search.best_score_,
    sep="\n",
)

{'algorithm': 'SAMME.R', 'estimator': ExtraTreeClassifier(max_depth=1, random_state=123), 'learning_rate': 0.6, 'n_estimators': 200}
0.6556640795511111


In [None]:
# Load the "4"-prefixed splits again (same files as the previous experimental section),
# presumably so this section can be run independently of the cells above.
X_train = pd.read_csv('./4X_train.csv').to_numpy()
X_test = pd.read_csv('./4X_test.csv').to_numpy()
y_train = pd.read_csv('./4y_train.csv')['is_canceled']
y_test = pd.read_csv('./4y_test.csv')['is_canceled']

In [None]:
# AdaBoost over deeper decision trees: the base estimator itself is part of the grid.
adaboost_classifier = AdaBoostClassifier(random_state=123)
adaboost_params = {
    # 3 depths x 4 split thresholds x 2 class-weight settings = 24 distinct base trees.
    # class_weight must be passed to the tree explicitly; iterating over it without
    # using it only duplicates every estimator and never evaluates "balanced" weighting.
    'estimator': [
        DecisionTreeClassifier(
            max_depth=depth,
            min_samples_split=min_split,
            class_weight=class_weight,
            random_state=123,
        )
        for depth in range(3, 6)
        for min_split in range(2, 6)
        for class_weight in ["balanced", None]
    ],
    'n_estimators': [150, 200, 250],
    # Learning rates 0.2, 0.4, ..., 1.0.
    'learning_rate': [round(0.2*x, 1) for x in range(1, 6)],
    'algorithm': ['SAMME', 'SAMME.R']
}

# If the search does not finish in time, consider reducing cv or increasing "factor"
# (default 3): only 1/factor of the candidates are kept for the next halving step.
extreme_grid_search = HalvingGridSearchCV(adaboost_classifier, adaboost_params, cv=3, n_jobs=-1, scoring='f1', random_state=123, factor=8)
extreme_grid_search.fit(X_train, y_train)

# Persist the fitted "extreme" search object.
try:
    # Serialize in memory first: opening the target with "wb" truncates it immediately,
    # so a failure part-way through pickling would otherwise leave an empty/corrupt
    # file and destroy any previously saved good copy.
    pickled_search = pickle.dumps(extreme_grid_search)
except Exception as e:
    # Best-effort save: report the problem but keep the notebook running.
    print(e)
else:
    with open('ExtremeAdaBoostHalvingGridSearchCVObj.pkl', mode="wb") as pickle_file:
        try:
            pickle_file.write(pickled_search)
        except Exception as e:
            print(e)

# Winning hyper-parameters on the first line, their cross-validated F1 on the second.
print(
    extreme_grid_search.best_params_,
    extreme_grid_search.best_score_,
    sep="\n",
)