In [11]:
import os 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [12]:
# Directory one level above the notebook's current working directory.
parent_dir = os.path.split(os.getcwd())[0]

In [13]:
# Load the training table from the `references` folder under `parent_dir`
# (the directory above the working directory). The path is assembled with
# os.path.join so it reuses the already-computed parent directory and does not
# hard-code a path separator.
train_data = pd.read_csv(os.path.join(parent_dir, 'references', 'model_data_train.csv'))

In [14]:
# Feature subsets of increasing size. Every list keeps the "Churn" target as
# its final entry; the larger subsets extend the smaller ones.
top_6_features = [
    "Tenure",
    "Complain",
    "CashbackAmount",
    "SatisfactionScore",
    "DaySinceLastOrder",
    "WarehouseToHome",
    "Churn",
]

# Six predictors above + two more, target re-appended last.
top_8_features = top_6_features[:-1] + [
    "OrderAmountHikeFromlastYear",
    "NumberOfAddress",
    "Churn",
]

# Eight predictors above + two more, target re-appended last.
top_10_features = top_8_features[:-1] + [
    "CouponUsed",
    "HourSpendOnApp",
    "Churn",
]

In [15]:
# Predictors: the top-6 column selection with the target column removed.
X = train_data[top_6_features].drop(columns='Churn')
# Target: the churn label column.
y = train_data['Churn']

In [17]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Baseline gradient-boosting classifier: library defaults, fixed seed.
gbc = GradientBoostingClassifier(
    random_state=42,
)

In [16]:
# Fit the baseline model on the training split.
gbc.fit(X=X_train, y=y_train)

In [18]:
# Baseline predictions for the held-out rows.
y_pred_gb = gbc.predict(X=X_test)

In [19]:
# Per-class precision / recall / F1 of the baseline model on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred_gb))

              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91       821
         1.0       0.91      0.89      0.90       811

    accuracy                           0.90      1632
   macro avg       0.90      0.90      0.90      1632
weighted avg       0.90      0.90      0.90      1632



In [20]:
# Hyperparameter candidates explored for the gradient-boosting model.
param_grid = dict(
    n_estimators=[5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# Exhaustive search over the grid, 3-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [22]:
# Best estimator found by the grid search, then its predictions for the
# held-out rows.
best_gbc = grid_search.best_estimator_
y_pred_tuned = best_gbc.predict(X=X_test)

In [23]:
# Per-class precision / recall / F1 of the tuned model on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred_tuned))

              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94       821
         1.0       0.95      0.93      0.94       811

    accuracy                           0.94      1632
   macro avg       0.94      0.94      0.94      1632
weighted avg       0.94      0.94      0.94      1632



In [22]:
def _report_predictions(heading, y_true, y_pred):
    """Print ``heading``, then the confusion matrix and classification report."""
    print(heading)
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))


def train_evaluate_tune_save_models():
    """Train, tune and compare two boosting classifiers, then pickle the best one.

    For Gradient Boosting and AdaBoost in turn, this function:

    1. fits the untuned model on ``X_train``/``y_train`` and prints its
       confusion matrix and classification report on ``X_test``/``y_test``;
    2. runs a 3-fold ``GridSearchCV`` (accuracy scoring) over that model's grid;
    3. prints the same metrics for the search's best estimator.

    The estimator with the highest cross-validated score
    (``GridSearchCV.best_score_``) across both searches is written to
    ``best_model.pkl`` in the current working directory.

    Reads the notebook-level variables ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``; the cell that creates the train/test split must have been run
    first.

    Returns:
        None. Results are printed and the winning model is saved to disk.
    """
    # Local import: the notebook's import cell does not provide this name.
    from sklearn.tree import DecisionTreeClassifier

    # AdaBoost's weak-learner argument is called ``estimator`` in newer
    # scikit-learn releases and ``base_estimator`` in older ones; use whichever
    # the installed version exposes.
    if 'estimator' in AdaBoostClassifier().get_params(deep=False):
        weak_learner = 'estimator'
    else:
        weak_learner = 'base_estimator'

    models = {
        'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
        # The weak learner is passed explicitly (a depth-1 tree, matching the
        # library default) so that the nested ``<weak_learner>__max_depth``
        # grid entry below has a real estimator to configure. Without it the
        # nested parameter cannot be set and the search errors out.
        'AdaBoost Classifier': AdaBoostClassifier(
            **{weak_learner: DecisionTreeClassifier(max_depth=1)},
            random_state=42,
        ),
    }

    param_grids = {
        'Gradient Boosting Classifier': {
            'n_estimators': [3, 5, 7, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5, 7, 9, 11, 13, 15, 17, 20],
            'min_samples_split': [2, 4, 6],
            'min_samples_leaf': [1, 2, 3],
            'subsample': [0.8, 0.9, 1.0],
        },
        'AdaBoost Classifier': {
            'n_estimators': [3, 5, 7, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            'learning_rate': [0.01, 0.1],
            f'{weak_learner}__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn
            # releases; candidates using it may warn or fail to fit there.
            # Confirm against the installed version.
            'algorithm': ['SAMME', 'SAMME.R'],
        },
    }

    # Track the best tuned model by cross-validated score. Start below any
    # attainable score so the first finished search always registers.
    best_model = None
    highest_score = float('-inf')

    for name, model in models.items():
        print(f"\nTraining and Evaluating: {name}")

        # Untuned baseline for this model family.
        model.fit(X_train, y_train)
        _report_predictions("\nBefore Tuning:", y_test, model.predict(X_test))

        # Exhaustive hyperparameter search. The grids are large (thousands of
        # candidates), so fits are spread across all available cores.
        grid_search = GridSearchCV(
            model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        best_model_grid = grid_search.best_estimator_
        best_score = grid_search.best_score_

        _report_predictions("\nAfter Tuning:", y_test, best_model_grid.predict(X_test))

        # Keep whichever tuned model has the highest CV score so far.
        if best_score > highest_score:
            best_model = best_model_grid
            highest_score = best_score

    # Compare against None explicitly: truthiness of an ensemble estimator is
    # derived from its length, not from whether a model was selected.
    if best_model is not None:
        with open('best_model.pkl', 'wb') as file:
            pickle.dump(best_model, file)
        print(f"\nBest model saved: {best_model}")

In [23]:
# Run the full train / tune / evaluate loop for both boosting models; the best
# tuned model is pickled to best_model.pkl in the working directory.
train_evaluate_tune_save_models()


Training and Evaluating: Gradient Boosting Classifier

Before Tuning:
Confusion Matrix:
[[504  41]
 [ 68 475]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90       545
         1.0       0.92      0.87      0.90       543

    accuracy                           0.90      1088
   macro avg       0.90      0.90      0.90      1088
weighted avg       0.90      0.90      0.90      1088


After Tuning:
Confusion Matrix:
[[526  19]
 [ 45 498]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       545
         1.0       0.96      0.92      0.94       543

    accuracy                           0.94      1088
   macro avg       0.94      0.94      0.94      1088
weighted avg       0.94      0.94      0.94      1088


Training and Evaluating: AdaBoost Classifier

Before Tuning:
Confusion Matrix:
[[481  64]
 [ 77 466]]
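Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.88      0.87       545
         1.0       0.88      0.86      0.87       543

    accuracy                           0.87      1088
   macro avg       0.87      0.87      0.87      1088
weighted avg       0.87      0.87      0.87      1088

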
After Tuning:
Confusion Matrix:
[[448  97]
 [ 83 460]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.82      0.83       545
         1.0       0.83      0.85      0.84       543

    accuracy                           0.83      1088
   macro avg       0.83      0.83      0.83      1088
weighted avg       0.83      0.83      0.83      1088


Best model saved: GradientBoostingClassifier(max_depth=5, n_estimators=200, random_state=42)
