# Modelling

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from summarytools import dfSummary

from env_setup import *

pd.set_option('display.max_columns', None)

In [2]:
# Load data
X_test = pd.read_csv(fr"{dataout}//{dataset}_us_test.csv")
y_test = X_test[['Churned']]
X_test.drop(columns=['Churned'], inplace=True)

X_train = pd.read_csv(fr"{dataout}//{dataset}_us_train.csv")
y_train = X_train[['Churned']]
X_train.drop(columns=['Churned'], inplace=True)

In [3]:
X_train.columns

Index(['Age', 'Income', 'Tenure', 'NumSupportCalls', 'NumComplaints',
       'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade',
       'txn_mean', 'txn_count', 'Age_norm', 'Income_norm', 'Tenure_norm',
       'NumSupportCalls_norm', 'NumComplaints_norm', 'Purchase_norm',
       'Refund_norm', 'Subscription Renewal_norm', 'Support Fee_norm',
       'Upgrade_norm', 'txn_mean_norm', 'txn_count_norm', 'Gender_Female',
       'Gender_Male', 'Location_Rural', 'Location_Suburban', 'Location_Urban'],
      dtype='object')

In [4]:
# Original vs Binned columns
l_cols_num = [
    'Age', 'Income', 'Tenure', 'NumSupportCalls', 'NumComplaints',
    'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade',
    'txn_mean', 'txn_count'
    ]

l_cols_cat = [
    #'Gender_Female', 
    'Gender_Male', 
    'Location_Rural', 'Location_Suburban', 
    #'Location_Urban',
    ]

l_cols_norm = [
    'Age_norm', 'Income_norm', 'Tenure_norm',
    'NumSupportCalls_norm', 'NumComplaints_norm', 'Purchase_norm',
    'Refund_norm', 'Subscription Renewal_norm', 'Support Fee_norm',
    'Upgrade_norm', 'txn_mean_norm', 'txn_count_norm'
    ]

## 2. Baseline Models

### 2.1. Logistic Regression

In [5]:
import pandas as pd
import statsmodels.api as sm 
from sklearn.metrics import classification_report, f1_score

class logistic_regression:
    def __init__(self, y_train, x_train, cols, name, y=y_test, x=X_test):
        self.name = name
        self.y_train = y_train
        self.x_train = x_train
        self.y = y_test
        self.x = X_test
        self.cols = cols
        self.model = sm.Logit(self.y_train, self.x_train[cols]).fit()

    def summary(self):
        print(self.model.summary())

    def predict(self, confusion=False):
        yhat = self.model.predict(self.x[self.cols]) 
        prediction = list(map(round, yhat))
        self.cm = classification_report(self.y, prediction)
        f1 = f1_score(self.y, prediction)
        self.f1 = f1
        print(f"{self.name}: F1 = {f1}")    
        if confusion:
            print ("Confusion Matrix : \n", self.cm)

    def feature_selection(self):
        # Remove columns with no statistical differences
        try:
            self.cols.remove('Age')
            self.cols.remove('TransactionDate')
            self.cols.remove('NumComplaints')
        except:
            pass
        self.model = sm.Logit(self.y_train, self.x_train[self.cols]).fit()
        self.predict()
        f1 = self.f1

        for i in self.cols:
            self.cols.remove(i)
            self.model = sm.Logit(self.y_train, self.x_train[self.cols]).fit()
            print(self.cols)
            self.predict()
            new_f1 = self.f1

            if new_f1 < f1:
                self.cols.append(i)

In [6]:
ori_baseLogit_num = logistic_regression(y_train, X_train, l_cols_num+l_cols_cat, 'ori_baseLogit_num')
ori_baseLogit_num.predict()
ori_baseLogit_norm = logistic_regression(y_train, X_train, l_cols_norm+l_cols_cat, 'ori_baseLogit_norm')
ori_baseLogit_norm.predict()

Optimization terminated successfully.
         Current function value: 0.643021
         Iterations 5
ori_baseLogit_num: F1 = 0.6291994281629736
Optimization terminated successfully.
         Current function value: 0.642764
         Iterations 5
ori_baseLogit_norm: F1 = 0.6292779071850922


### 2.2. Decision Trees

In [7]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV

class decision_tree:
    def __init__(self, y_train, x_train, cols, name, y=y_test, x=X_test):
        self.name = name
        self.y_train = y_train
        self.x_train = x_train
        self.y = y_test
        self.x = X_test
        self.cols = cols
        self.model = DecisionTreeClassifier().fit(self.x_train[self.cols], self.y_train)

    def summary(self):
        tree.plot_tree(self.model, feature_names=self.cols)

    def predict(self, confusion=False):
        yhat = self.model.predict(self.x[self.cols]) 
        prediction = list(map(round, yhat))
        self.cm = classification_report(self.y, prediction)
        f1 = f1_score(self.y, prediction)
        self.f1 = f1
        print(f"{self.name}: F1 = {f1}")    
        if confusion:
            print ("Confusion Matrix : \n", self.cm)

    def feature_selection(self):
        # Remove columns with no statistical differences
        try:
            self.cols.remove('Age')
            self.cols.remove('TransactionDate')
            self.cols.remove('NumComplaints')
        except:
            pass
        self.model = DecisionTreeClassifier().fit(self.x_train[self.cols], self.y_train)
        self.predict()
        f1 = self.f1

        for i in self.cols:
            self.cols.remove(i)
            self.model = DecisionTreeClassifier().fit(self.x_train[self.cols], self.y_train)
            print(self.cols)
            self.predict()
            new_f1 = self.f1

            if new_f1 < f1:
                self.cols.append(i)

    def fine_tune(self, params):
        random_search = GridSearchCV(
            DecisionTreeClassifier(),
            params, 
            n_jobs=-1, 
            cv=5, 
            scoring='f1_weighted')
        
        random_search.fit(self.x_train[self.cols], self.y_train)
        self.best_params_random = random_search.best_params_
        self.best_score_random = random_search.best_score_
        self.model = random_search.best_estimator_
        self.predict()

In [8]:
# Normalisation not required for decision trees
ori_baseDT_num = decision_tree(y_train, X_train, l_cols_num+l_cols_cat, 'ori_baseDT_num')
ori_baseDT_num.predict()

ori_baseDT_num: F1 = 0.5442552812193601


### 2.3. Naive Bayes

In [10]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, f1_score

class naive_bayes:
    def __init__(self, y_train, x_train, cols, name, y=y_test, x=X_test):
        self.name = name
        self.y_train = y_train.values.ravel()
        self.x_train = x_train
        self.y = y_test.values.ravel()
        self.x = X_test
        self.cols = cols
        self.model = GaussianNB().fit(self.x_train[self.cols].values, self.y_train)

    def predict(self, confusion=False):
        yhat = self.model.predict(self.x[self.cols].values) 
        prediction = list(map(round, yhat))
        self.cm = classification_report(self.y, prediction)
        f1 = f1_score(self.y, prediction)
        self.f1 = f1
        print(f"{self.name}: F1 = {f1}")    
        if confusion:
            print ("Confusion Matrix : \n", self.cm)

    def feature_selection(self):
        # Remove columns with no statistical differences
        try:
            self.cols.remove('Age')
            self.cols.remove('TransactionDate')
            self.cols.remove('NumComplaints')
        except:
            pass
        self.model = GaussianNB().fit(self.x_train[self.cols].values, self.y_train)
        self.predict()
        f1 = self.f1

        for i in self.cols:
            self.cols.remove(i)
            self.model = GaussianNB().fit(self.x_train[self.cols].values, self.y_train)
            print(self.cols)
            self.predict()
            new_f1 = self.f1

            if new_f1 < f1:
                self.cols.append(i)

In [11]:
ori_baseNB_num = naive_bayes(y_train, X_train, l_cols_num+l_cols_cat, 'ori_baseNB_num')
ori_baseNB_num.predict()
ori_baseNB_norm = naive_bayes(y_train, X_train, l_cols_norm+l_cols_cat, 'ori_baseNB_norm')
ori_baseNB_norm.predict()

ori_baseNB_num: F1 = 0.6533789563729684
ori_baseNB_norm: F1 = 0.6538527250982402


### 2.4. Random Forest

In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score

class random_forest:
    def __init__(self, y_train, x_train, cols, name, y=y_test, x=X_test):
        self.name = name
        self.y_train = y_train
        self.x_train = x_train[cols]
        self.y = y_test
        self.x = X_test[cols]
        self.cols = cols
        self.model = RandomForestClassifier(n_estimators = 10).fit(self.x_train[cols], self.y_train)

    def summary(self):
        tree.plot_tree(self.model, feature_names=self.cols)

    def predict(self, confusion=False):
        yhat = self.model.predict(self.x[self.cols]) 
        prediction = list(map(round, yhat))
        self.cm = classification_report(self.y, prediction)
        f1 = f1_score(self.y, prediction)
        self.f1 = f1
        print(f"{self.name}: F1 = {f1}")    
        if confusion:
            print ("Confusion Matrix : \n", self.cm)

In [13]:
# Normalisation and Balancing not required for decision trees
ori_baseRF_num = random_forest(y_train, X_train, l_cols_num+l_cols_cat, 'ori_baseRF_num')
ori_baseRF_num.predict()

  return fit_method(estimator, *args, **kwargs)


ori_baseRF_num: F1 = 0.5632995990070652


## 3. Fine-Tuning

In [14]:
# Feature selection - LR
smenn_FT_Logit_num = logistic_regression(y_train, X_train, l_cols_num+l_cols_cat, 'smenn_FT_Logit_num')
smenn_FT_Logit_num.predict()
smenn_FT_Logit_num.feature_selection()
smenn_FT_Logit_num.feature_selection()

Optimization terminated successfully.
         Current function value: 0.643021
         Iterations 5
smenn_FT_Logit_num: F1 = 0.6291994281629736
Optimization terminated successfully.
         Current function value: 0.645487
         Iterations 5
smenn_FT_Logit_num: F1 = 0.6252819633673193
Optimization terminated successfully.
         Current function value: 0.648894
         Iterations 5
['Tenure', 'NumSupportCalls', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
smenn_FT_Logit_num: F1 = 0.613018401538039
Optimization terminated successfully.
         Current function value: 0.646471
         Iterations 5
['Tenure', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban', 'Income']
smenn_FT_Logit_num: F1 = 0.6246960281005134
Optimization terminated succes

In [15]:
# Feature selection - DT
ori_FS_DT_num = decision_tree(y_train, X_train, l_cols_num+l_cols_cat, 'sm_baseDT_num')
ori_FS_DT_num.predict()
ori_FS_DT_num.feature_selection()
ori_FS_DT_num.feature_selection()

sm_baseDT_num: F1 = 0.5437100213219617
sm_baseDT_num: F1 = 0.5463797875189715
['Tenure', 'NumSupportCalls', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
sm_baseDT_num: F1 = 0.5415989893521025
['Tenure', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban', 'Income']
sm_baseDT_num: F1 = 0.5495049504950495
['Tenure', 'NumComplaints', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban', 'Income']
sm_baseDT_num: F1 = 0.5445885154802858
['Tenure', 'NumComplaints', 'Refund', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban', 'Income', 'Purchase']
sm_baseDT_num: F1 = 0.5527090957779166
['Tenure', 'NumComplaints', 'Refund',

In [16]:
# Feature selection - NB
sm_FT_NB_num = naive_bayes(y_train, X_train, l_cols_num+l_cols_cat, 'sm_FT_NB_num')
sm_FT_NB_num.predict()
sm_FT_NB_num.feature_selection()
sm_FT_NB_num.feature_selection()

sm_FT_NB_num: F1 = 0.6533789563729684
sm_FT_NB_num: F1 = 0.6531519972628518
['Tenure', 'NumSupportCalls', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
sm_FT_NB_num: F1 = 0.6539481563863462
['Tenure', 'NumComplaints', 'Purchase', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
sm_FT_NB_num: F1 = 0.654007356085878
['Tenure', 'NumComplaints', 'Refund', 'Subscription Renewal', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
sm_FT_NB_num: F1 = 0.6533230690274571
['Tenure', 'NumComplaints', 'Refund', 'Support Fee', 'Upgrade', 'txn_mean', 'txn_count', 'Gender_Male', 'Location_Rural', 'Location_Suburban']
sm_FT_NB_num: F1 = 0.6534382483749572
['Tenure', 'NumComplaints', 'Refund', 'Support Fee', 'txn_mean', 'txn_count', 'Gender_

### Best Models

In [18]:
# Logistic Regression
LR_model = logistic_regression(y_train, X_train, ['NumComplaints', 'Gender_Male', 'NumSupportCalls', 'Tenure'], 'LR_model')
LR_model.predict(confusion=True)

Optimization terminated successfully.
         Current function value: 0.656889
         Iterations 5
LR_model: F1 = 0.5846300715990453
Confusion Matrix : 
               precision    recall  f1-score   support

           0       0.60      0.68      0.64      5616
           1       0.63      0.55      0.58      5616

    accuracy                           0.61     11232
   macro avg       0.61      0.61      0.61     11232
weighted avg       0.61      0.61      0.61     11232



In [19]:
LR_model.summary()

                           Logit Regression Results                           
Dep. Variable:                Churned   No. Observations:                44924
Model:                          Logit   Df Residuals:                    44920
Method:                           MLE   Df Model:                            3
Date:                Thu, 29 Aug 2024   Pseudo R-squ.:                 0.05231
Time:                        05:38:37   Log-Likelihood:                -29510.
converged:                       True   LL-Null:                       -31139.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
NumComplaints       0.1271      0.006     20.460      0.000       0.115       0.139
Gender_Male         0.2552      0.019     13.785      0.000       0.219       0.291
NumSupportCalls     0.0717      

In [21]:
# Decision Tree
DT_model = decision_tree(y_train, X_train, ['NumComplaints', 'Gender_Male', 'Tenure', 'Refund'], 'DT_model')
DT_model.predict(confusion=True)

DT_model: F1 = 0.5812876736262776
Confusion Matrix : 
               precision    recall  f1-score   support

           0       0.58      0.55      0.56      5616
           1       0.57      0.59      0.58      5616

    accuracy                           0.57     11232
   macro avg       0.57      0.57      0.57     11232
weighted avg       0.57      0.57      0.57     11232



In [22]:
# Naive Bayes
NB_model = naive_bayes(y_train, X_train, ['NumComplaints', 'Gender_Male', 'Tenure'], 'NB_model')
NB_model.predict(confusion=True)

NB_model: F1 = 0.6390845070422535
Confusion Matrix : 
               precision    recall  f1-score   support

           0       0.64      0.62      0.63      5616
           1       0.63      0.65      0.64      5616

    accuracy                           0.63     11232
   macro avg       0.64      0.63      0.63     11232
weighted avg       0.64      0.63      0.63     11232



In [51]:
# Define the parameter distribution to sample from
param_dist = {'criterion':['gini','entropy'],
              'max_depth':np.arange(1,10).tolist()[0::2],
              'min_samples_split':np.arange(100,200).tolist()[0::2],
              'max_leaf_nodes':np.arange(3,10).tolist()[0::2]}

ori_FT_DT_num = decision_tree(y_train, X_train, ['NumSupportCalls', 'TransactionType_Refund', 'Income', 'Tenure', 'Location_Rural'], 'ori_FT_DT_num')
ori_FT_DT_num.fine_tune(param_dist)

ori_FT_DT_num: F1 = 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
print(ori_FT_DT_num.best_params_random)
print('\n')
print(ori_FT_DT_num.best_score_random)

{'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 17, 'min_samples_split': np.float64(0.018242481428361046)}


0.6036802569001319
