In [2]:
from models.decision_tree import DecisionTreeModel
from models.adaBoost import AdaBoostModel
from models.xg_boost import XGBoostModel
from models.gradient_boosting import GradientBoostingModel
from models.random_forest import RandomForestModel
from models.cat_boost import CatBoostModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score
from metrics import cross_validate_with_resampling, Metrics
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE
import pandas as pd
import numpy as np

## Reading Data and Setting up Grid search parameters

In [3]:
data = pd.read_csv('data/Telco-Customer-Churn-encoded-data-FE.csv')
label = pd.read_csv('data/Telco-Customer-Churn-encoded-label.csv')

In [4]:
data.head()

Unnamed: 0,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges,tenure_group,Monthly/Total_Charges,TotalCharges/tenure
0,0,0,1,0,0,0,1,0,1,0,...,0,0,1,0,1,29.85,29.85,0,1.0,29.85
1,1,0,0,0,1,1,0,0,1,0,...,0,0,0,1,34,56.95,1889.5,2,0.03014,55.573529
2,1,0,0,0,1,1,0,0,1,0,...,0,0,0,1,2,53.85,108.15,0,0.49792,54.075
3,1,0,0,0,0,0,1,0,1,0,...,1,0,0,0,45,42.3,1840.75,3,0.02298,40.905556
4,0,0,0,0,1,1,0,0,0,1,...,0,0,1,0,2,70.7,151.65,0,0.466205,75.825


In [5]:
label[:5]

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [6]:
X = data.to_numpy()
y = label.to_numpy().ravel()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
grid_search_parameters = {
    "DecisionTreeModel":{
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)}
}

grid_search_parameters["XGBClassifier"] = {
    'max_depth': range(3, 10),  # Similar to DecisionTree
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Equivalent to eta in XGBoost
    'n_estimators': range(50, 200, 50),  # Number of gradient boosted trees
    'subsample': np.arange(0.5, 1.0, 0.1),  # Subsample ratio of the training instances
    'colsample_bytree': np.arange(0.5, 1.0, 0.1)  # Subsample ratio of columns when constructing each tree
}

grid_search_parameters["GradientBoostingClassifier"] = {
    'loss': ['log_loss', 'exponential'],  # Loss function to be optimized
    'learning_rate': 10.0 ** np.arange(-5, 0),
    'n_estimators': range(50, 200, 50),
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["RandomForestClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 10),
    'min_impurity_decrease': 10.0**np.arange(-5, 0)
}

grid_search_parameters["AdaBoostClassifier"] = {
    'n_estimators': range(50, 200, 50),  # Number of weak learners to train iteratively
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Weight applied to each classifier at each boosting iteration
    'algorithm': ['SAMME', 'SAMME.R']  # Algorithm to use
}

grid_search_parameters["CatBoostClassifier"] = {
    'iterations': range(50, 200, 50),  # Number of boosting iterations
    'learning_rate': 10.0 ** np.arange(-5, 0),  # Step size shrinkage used in updates during training
    'depth': range(3, 10),  # Depth of the trees
    'l2_leaf_reg': 10.0 ** np.arange(-5, 0),  # L2 regularization term on weights
    'border_count': [32, 64, 128],  # The number of splits for numerical features
    'loss_function': ['Logloss', 'CrossEntropy'],  # Loss function to be optimized
    'eval_metric': ['Logloss', 'AUC'],  # Metric used for validation data
    # 'cat_features': ['categorical_feature_index']  # Specify categorical features indices
}



### Decision tree Grid search

In [13]:
DecisionTreeModel_gs = DecisionTreeModel("DecisionTreeModel with grid search")
DT_best_params, DT_recempling = DecisionTreeModel_gs.gs_parameter_tune(X, y, grid_search_parameters['DecisionTreeModel'], max_search=200)


found 0 physical cores < 1
  File "d:\miniconda3\envs\ml-project-env\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
100%|██████████| 200/200 [02:47<00:00,  1.19it/s]

Model: DecisionTreeModel with grid search
Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 9, 'min_impurity_decrease': 0.0001}
Resempling: False
Validation Accuracy: 0.7230170107896974





In [16]:

DT_hyper_model = DecisionTreeModel("DecisionTreeModel with hyper parameters")
DT_hyper_model.hyper_parameter(DT_best_params)
if(DT_recempling):
    DT_hyper_model.fit_with_resampling(X_train, y_train)
else:
    DT_hyper_model.fit(X_train, y_train)

y_pred_hyper = DT_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.741670536873865

### XGBoost with Grid Search

In [None]:
xgb_gs = XGBoostModel("XGBoostModel with grid search")
xgb_best_params, xgb_resempling = xgb_gs.gs_parameter_tune(X, y, grid_search_parameters['XGBClassifier'], max_search=200)

In [None]:
xgb_hyper_model = XGBoostModel("XGBoostModel with hyper parameters")
xgb_hyper_model.hyper_parameter(xgb_best_params)
if(xgb_resempling):
    xgb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    xgb_hyper_model.fit(X_train, y_train)

y_pred_hyper = xgb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

Parameters: { "criterion", "min_impurity_decrease", "min_samples_split" } are not used.



0.7235126320159793

### AdaBoost with Grid Search

In [None]:
AdaBoostModel_gs = AdaBoostModel("AdaBoostModel with grid search")
ab_best_params, ab_resempling = AdaBoostModel_gs.gs_parameter_tune(X, y, grid_search_parameters['AdaBoostClassifier'], max_search=200)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [03:17<00:00,  6.60s/it]

Model: AdaBoostModel with grid search
Best Parameters: {'n_estimators': 150, 'learning_rate': 0.1, 'algorithm': 'SAMME.R'}
Resempling: True
Validation Accuracy: 0.7334537629782438





In [None]:
ab_hyper_model = AdaBoostModel("AdaBoostModel with hyper parameters")
ab_hyper_model.hyper_parameter(ab_best_params)
if(ab_resempling):
    ab_hyper_model.fit_with_resampling(X_train, y_train)
else:
    ab_hyper_model.fit(X_train, y_train)

y_pred_hyper = ab_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()



0.7435896076820467

### Gradient Boosting with Grid Search

In [None]:
GradientBoostingModel_gs = GradientBoostingModel("GradientBoostingModel with grid search")
gb_best_params, gb_resempling = GradientBoostingModel_gs.gs_parameter_tune(X, y, grid_search_parameters['GradientBoostingClassifier'], max_search=100)

100%|██████████| 100/100 [45:45<00:00, 27.46s/it]

Model: GradientBoostingModel with grid search
Best Parameters: {'loss': 'log_loss', 'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 7, 'min_impurity_decrease': 0.01}
Resempling: True
Validation Accuracy: 0.7370372507161347





In [None]:
gb_hyper_model = GradientBoostingModel("GradientBoostingModel with hyper parameters")
gb_hyper_model.hyper_parameter(gb_best_params)
if(gb_resempling):
    gb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    gb_hyper_model.fit(X_train, y_train)

y_pred_hyper = gb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.7553634785234393

In [None]:
# gb_hyper_model.save('gb_hyper_model.pkl')

In [None]:
load_model = GradientBoostingModel()
load_model.load('gb_hyper_model.pkl')

In [None]:
y_pred_load = load_model.predict(X_test)
Metrics(y_pred_load, y_test).f1_score()


0.7553634785234393

### Random Forest Classifier with Grid Search

In [None]:
RandomForestModel_gs = RandomForestModel("RandomForestModel with grid search")
rf_best_params, rf_resempling = RandomForestModel_gs.gs_parameter_tune(X, y, grid_search_parameters['RandomForestClassifier'], max_search=200)

In [None]:
rf_hyper_model = RandomForestModel("RandomForestModel with hyper parameters")
rf_hyper_model.hyper_parameter(rf_best_params)
if(rf_resempling):
    rf_hyper_model.fit_with_resampling(X_train, y_train)
else:
    rf_hyper_model.fit(X_train, y_train)

y_pred_hyper = rf_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

In [None]:
DecisionTreeModel_gs = DecisionTreeModel("DecisionTreeModel with grid search")
DT_best_params, DT_recempling = DecisionTreeModel_gs.gs_parameter_tune(X, y, grid_search_parameters['DecisionTreeModel'], max_search=200)

In [None]:
DT_hyper_model = DecisionTreeModel("DecisionTreeModel with hyper parameters")
DT_hyper_model.hyper_parameter(DT_best_params)
if(DT_recempling):
    DT_hyper_model.fit_with_resampling(X_train, y_train)
else:
    DT_hyper_model.fit(X_train, y_train)

y_pred_hyper = DT_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0.7409847115414075

### CatBoost Classifier with Grid Search

In [None]:
CatBoost_gs = CatBoostModel("CatBoostModel with grid search")
cb_best_params, cb_resempling = CatBoost_gs.gs_parameter_tune(X, y, grid_search_parameters['CatBoostClassifier'], max_search=-1)

In [17]:
cb_hyper_model = CatBoostModel("CatBoostModel with hyper parameters")
cb_hyper_model.hyper_parameter(cb_best_params)
if(cb_resempling):
    cb_hyper_model.fit_with_resampling(X_train, y_train)
else:
    cb_hyper_model.fit(X_train, y_train)

y_pred_hyper = cb_hyper_model.predict(X_test)

Metrics(y_pred_hyper, y_test).f1_score()

0:	learn: 0.6847073	total: 2.15ms	remaining: 320ms
1:	learn: 0.6792827	total: 3.89ms	remaining: 288ms
2:	learn: 0.6717061	total: 5.75ms	remaining: 282ms
3:	learn: 0.6666778	total: 7.43ms	remaining: 271ms
4:	learn: 0.6617661	total: 8.98ms	remaining: 261ms
5:	learn: 0.6565347	total: 10.4ms	remaining: 249ms
6:	learn: 0.6466224	total: 11.9ms	remaining: 243ms
7:	learn: 0.6366963	total: 13.4ms	remaining: 238ms
8:	learn: 0.6321884	total: 14.9ms	remaining: 233ms
9:	learn: 0.6285178	total: 16.3ms	remaining: 228ms
10:	learn: 0.6202254	total: 17.8ms	remaining: 225ms
11:	learn: 0.6135358	total: 19.3ms	remaining: 222ms
12:	learn: 0.6093813	total: 20.8ms	remaining: 219ms
13:	learn: 0.6054479	total: 22.2ms	remaining: 216ms
14:	learn: 0.6015785	total: 23.7ms	remaining: 213ms
15:	learn: 0.5977211	total: 25.2ms	remaining: 211ms
16:	learn: 0.5940839	total: 26.7ms	remaining: 209ms
17:	learn: 0.5874969	total: 28.2ms	remaining: 207ms
18:	learn: 0.5842962	total: 29.7ms	remaining: 205ms
19:	learn: 0.5809552	t

0.751683778495237