In [37]:
from models.decision_tree import DecisionTreeModel
from models.adaBoost import AdaBoostModel
from models.xgboost import XGBoostModel
from models.gradient_boosting import GradientBoostingModel
from models.random_forest import RandomForestModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score
from metrics import cross_validate_with_resampling, Metrics
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pandas as pd
import numpy as np

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
decision_tree_model = DecisionTreeModel(name='DecisionTree')
print(decision_tree_model)
decision_tree_model._is_train()

DecisionTree


False

In [5]:
df = pd.read_csv('./data/Telco-Customer-Churn-encoded-data-FE.csv')
label = pd.read_csv('./data/Telco-Customer-Churn-encoded-label.csv')
df.shape, label.shape

((7043, 43), (7043, 1))

In [6]:
estimator = DecisionTreeModel().model
cross_val_score(estimator, df, label, cv = 5, scoring='roc_auc').mean()

0.6588721476177273

In [62]:
X = df.to_numpy()
y = label.to_numpy().ravel()

In [10]:
X

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 2.98500000e+01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.00000000e+00, 3.01402487e-02, 5.55735294e+01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.97919556e-01, 5.40750000e+01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 8.54380141e-02, 3.14954545e+01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 2.42661448e-01, 7.66500000e+01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        5.00000000e+00, 1.54357513e-02, 1.03704545e+02]])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5634, 43), (1409, 43), (5634, 1), (1409, 1))

In [15]:
sm = SMOTE(sampling_strategy="all")
Xr_train, yr_train = sm.fit_resample(X_train, y_train)

In [16]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("Xr_train shape:", Xr_train.shape)
print("yr_train shape:", yr_train.shape)

X_train shape: (5634, 43)
X_test shape: (1409, 43)
y_train shape: (5634, 1)
y_test shape: (1409, 1)
Xr_train shape: (8276, 43)
yr_train shape: (8276,)


In [17]:
decision_tree_modelr = DecisionTreeModel(name='DecisionTree')

In [18]:
cross_validate_with_resampling(DecisionTreeModel(), X, y)

(0.6501798702605038, 0.6566606326729503)

In [19]:
decision_tree_model.fit(X_train, y_train)
decision_tree_modelr.fit(Xr_train, yr_train)
y_pred = decision_tree_model.predict(X_test)
yr_pred = decision_tree_modelr.predict(X_test)


roc_auc_score(yr_pred, y_test)

0.6452306060961374

In [20]:
roc_auc_score(y_pred, y_test)


0.6658975938285691

## take all Codes Below to new File named Model Selection

In [66]:
grid_search_parameters = {
    "DecisionTreeModel":{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)}
}

grid_search_parameters["XGBClassifier"] = {
    'max_depth': range(3, 10),  # Similar to DecisionTree
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Equivalent to eta in XGBoost
    'n_estimators': range(50, 200, 50),  # Number of gradient boosted trees
    'subsample': np.arange(0.5, 1.0, 0.1),  # Subsample ratio of the training instances
    'colsample_bytree': np.arange(0.5, 1.0, 0.1)  # Subsample ratio of columns when constructing each tree
}

grid_search_parameters["GradientBoostingClassifier"] = {
    'loss': ['log_loss', 'exponential'],  # Loss function to be optimized
    'learning_rate': 10.0 ** np.arange(-5, 0),
    'n_estimators': range(50, 200, 50),
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["RandomForestClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["AdaBoostClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of weak learners to train iteratively
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Weight applied to each classifier at each boosting iteration
    'algorithm': ['SAMME', 'SAMME.R']  # Algorithm to use
}



### Decision tree Grid search

In [48]:
DecisionTreeModel_gs = DecisionTreeModel("DecisionTreeModel with grid search")
DT_best_params, DT_recempling = DecisionTreeModel_gs.gs_parameter_tune(X, y, grid_search_parameters['DecisionTreeModel'], max_search=200)

100%|██████████| 200/200 [00:53<00:00,  3.72it/s]

Model: DecisionTreeModel with grid search
Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 8, 'min_impurity_decrease': 1e-05}
Resempling: False
Validation Accuracy: 0.7230170107896974





In [49]:
DT_hyper_model = DecisionTreeModel("DecisionTreeModel with hyper parameters")
DT_hyper_model.hyper_parameter(DT_best_params)
if(DT_recempling):
    DT_hyper_model.fit_with_resampling(X_train, y_train)
else:
    DT_hyper_model.fit(X_train, y_train)

y_pred_hyper = DT_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.7409847115414075

### XGBoost with Grid Search

In [None]:
xgb_gs = XGBoostModel("XGBoostModel with grid search")
xgb_best_params, xgb_resempling = xgb_gs.gs_parameter_tune(X, y, grid_search_parameters['XGBClassifier'], max_search=200)

In [None]:
xgb_hyper_model = XGBoostModel("XGBoostModel with hyper parameters")
xgb_hyper_model.hyper_parameter(xgb_best_params)
if(xgb_resempling):
    xgb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    xgb_hyper_model.fit(X_train, y_train)

y_pred_hyper = xgb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

Parameters: { "criterion", "min_impurity_decrease", "min_samples_split" } are not used.



0.7235126320159793

### AdaBoost with Grid Search

In [67]:
AdaBoostModel_gs = AdaBoostModel("AdaBoostModel with grid search")
ab_best_params, ab_resempling = AdaBoostModel_gs.gs_parameter_tune(X, y, grid_search_parameters['AdaBoostClassifier'], max_search=200)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [03:17<00:00,  6.60s/it]

Model: AdaBoostModel with grid search
Best Parameters: {'n_estimators': 150, 'learning_rate': 0.1, 'algorithm': 'SAMME.R'}
Resempling: True
Validation Accuracy: 0.7334537629782438





In [68]:
ab_hyper_model = AdaBoostModel("AdaBoostModel with hyper parameters")
ab_hyper_model.hyper_parameter(ab_best_params)
if(ab_resempling):
    ab_hyper_model.fit_with_resampling(X_train, y_train)
else:
    ab_hyper_model.fit(X_train, y_train)

y_pred_hyper = ab_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()



0.7435896076820467

### Gradient Boosting with Grid Search

In [71]:
GradientBoostingModel_gs = GradientBoostingModel("GradientBoostingModel with grid search")
gb_best_params, gb_resempling = GradientBoostingModel_gs.gs_parameter_tune(X, y, grid_search_parameters['GradientBoostingClassifier'], max_search=100)

 12%|█▏        | 12/100 [05:21<34:03, 23.22s/it] 

In [None]:
gb_hyper_model = AdaBoostModel("AdaBoostModel with hyper parameters")
gb_hyper_model.hyper_parameter(gb_best_params)
if(gb_resempling):
    gb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    gb_hyper_model.fit(X_train, y_train)

y_pred_hyper = gb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

### Random Forest Classifier with Grid Search

In [None]:
RandomForestModel_gs = RandomForestModel("RandomForestModel with grid search")
rf_best_params, rf_resempling = RandomForestModel_gs.gs_parameter_tune(X, y, grid_search_parameters['RandomForestClassifier'], max_search=200)

In [None]:
rf_hyper_model = RandomForestModel("RandomForestModel with hyper parameters")
rf_hyper_model.hyper_parameter(rf_best_params)
if(rf_resempling):
    rf_hyper_model.fit_with_resampling(X_train, y_train)
else:
    rf_hyper_model.fit(X_train, y_train)

y_pred_hyper = rf_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()