In [1]:
from models.decision_tree import DecisionTreeModel
from models.adaBoost import AdaBoostModel
from models.xgboost import XGBoostModel
from models.gradient_boosting import GradientBoostingModel
from models.random_forest import RandomForestModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score
from metrics import cross_validate_with_resampling, Metrics
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pandas as pd
import numpy as np

In [None]:
grid_search_parameters = {
    "DecisionTreeModel":{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)}
}

grid_search_parameters["XGBClassifier"] = {
    'max_depth': range(3, 10),  # Similar to DecisionTree
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Equivalent to eta in XGBoost
    'n_estimators': range(50, 200, 50),  # Number of gradient boosted trees
    'subsample': np.arange(0.5, 1.0, 0.1),  # Subsample ratio of the training instances
    'colsample_bytree': np.arange(0.5, 1.0, 0.1)  # Subsample ratio of columns when constructing each tree
}

grid_search_parameters["GradientBoostingClassifier"] = {
    'loss': ['log_loss', 'exponential'],  # Loss function to be optimized
    'learning_rate': 10.0 ** np.arange(-5, 0),
    'n_estimators': range(50, 200, 50),
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["RandomForestClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["AdaBoostClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of weak learners to train iteratively
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Weight applied to each classifier at each boosting iteration
    'algorithm': ['SAMME', 'SAMME.R']  # Algorithm to use
}



### Decision tree Grid search

### XGBoost with Grid Search

In [None]:
xgb_gs = XGBoostModel("XGBoostModel with grid search")
xgb_best_params, xgb_resempling = xgb_gs.gs_parameter_tune(X, y, grid_search_parameters['XGBClassifier'], max_search=200)

In [None]:
xgb_hyper_model = XGBoostModel("XGBoostModel with hyper parameters")
xgb_hyper_model.hyper_parameter(xgb_best_params)
if(xgb_resempling):
    xgb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    xgb_hyper_model.fit(X_train, y_train)

y_pred_hyper = xgb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

Parameters: { "criterion", "min_impurity_decrease", "min_samples_split" } are not used.



0.7235126320159793

### AdaBoost with Grid Search

In [None]:
AdaBoostModel_gs = AdaBoostModel("AdaBoostModel with grid search")
ab_best_params, ab_resempling = AdaBoostModel_gs.gs_parameter_tune(X, y, grid_search_parameters['AdaBoostClassifier'], max_search=200)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [03:17<00:00,  6.60s/it]

Model: AdaBoostModel with grid search
Best Parameters: {'n_estimators': 150, 'learning_rate': 0.1, 'algorithm': 'SAMME.R'}
Resempling: True
Validation Accuracy: 0.7334537629782438





In [None]:
ab_hyper_model = AdaBoostModel("AdaBoostModel with hyper parameters")
ab_hyper_model.hyper_parameter(ab_best_params)
if(ab_resempling):
    ab_hyper_model.fit_with_resampling(X_train, y_train)
else:
    ab_hyper_model.fit(X_train, y_train)

y_pred_hyper = ab_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()



0.7435896076820467

### Gradient Boosting with Grid Search

In [None]:
GradientBoostingModel_gs = GradientBoostingModel("GradientBoostingModel with grid search")
gb_best_params, gb_resempling = GradientBoostingModel_gs.gs_parameter_tune(X, y, grid_search_parameters['GradientBoostingClassifier'], max_search=100)

100%|██████████| 100/100 [45:45<00:00, 27.46s/it]

Model: GradientBoostingModel with grid search
Best Parameters: {'loss': 'log_loss', 'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 7, 'min_impurity_decrease': 0.01}
Resempling: True
Validation Accuracy: 0.7370372507161347





In [None]:
gb_hyper_model = GradientBoostingModel("GradientBoostingModel with hyper parameters")
gb_hyper_model.hyper_parameter(gb_best_params)
if(gb_resempling):
    gb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    gb_hyper_model.fit(X_train, y_train)

y_pred_hyper = gb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.7553634785234393

In [None]:
# gb_hyper_model.save('gb_hyper_model.pkl')

In [None]:
load_model = GradientBoostingModel()
load_model.load('gb_hyper_model.pkl')

In [None]:
y_pred_load = load_model.predict(X_test)
Metrics(y_pred_load, y_test).f1_score()


0.7553634785234393

### Random Forest Classifier with Grid Search

In [None]:
RandomForestModel_gs = RandomForestModel("RandomForestModel with grid search")
rf_best_params, rf_resempling = RandomForestModel_gs.gs_parameter_tune(X, y, grid_search_parameters['RandomForestClassifier'], max_search=200)

In [None]:
rf_hyper_model = RandomForestModel("RandomForestModel with hyper parameters")
rf_hyper_model.hyper_parameter(rf_best_params)
if(rf_resempling):
    rf_hyper_model.fit_with_resampling(X_train, y_train)
else:
    rf_hyper_model.fit(X_train, y_train)

y_pred_hyper = rf_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

In [None]:
DecisionTreeModel_gs = DecisionTreeModel("DecisionTreeModel with grid search")
DT_best_params, DT_recempling = DecisionTreeModel_gs.gs_parameter_tune(X, y, grid_search_parameters['DecisionTreeModel'], max_search=200)

In [None]:
DT_hyper_model = DecisionTreeModel("DecisionTreeModel with hyper parameters")
DT_hyper_model.hyper_parameter(DT_best_params)
if(DT_recempling):
    DT_hyper_model.fit_with_resampling(X_train, y_train)
else:
    DT_hyper_model.fit(X_train, y_train)

y_pred_hyper = DT_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.7409847115414075