In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#import all models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.model_selection as model_selection
import xgboost as xgb
import lightgbm as lgb

#import all metrics
import sklearn.metrics as metrics

import warnings
warnings.filterwarnings('ignore')
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
#load the train and test splits from the local data folder
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

#(rows, columns) of each split
train.shape, test.shape

((5634, 17), (1409, 17))

In [4]:
#preview the first five rows of the training data
train.head(n=5)

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,0,Yes,Yes,43,No,DSL,No,Yes,No,No,No,No,Month-to-month,No,Mailed check,50.2,0
1,0,Yes,Yes,37,Yes,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,95.15,0
2,0,No,No,35,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Credit card (automatic),85.95,1
3,0,Yes,No,69,No phone service,DSL,Yes,No,Yes,Yes,Yes,Yes,Two year,No,Bank transfer (automatic),60.25,0
4,0,Yes,Yes,69,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),20.2,0


In [5]:
#print each column's non-null count and dtype to spot missing values and categorical (object) columns
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     5634 non-null   int64  
 1   Partner           5634 non-null   object 
 2   Dependents        5634 non-null   object 
 3   tenure            5634 non-null   int64  
 4   MultipleLines     5634 non-null   object 
 5   InternetService   5634 non-null   object 
 6   OnlineSecurity    5634 non-null   object 
 7   OnlineBackup      5634 non-null   object 
 8   DeviceProtection  5634 non-null   object 
 9   TechSupport       5634 non-null   object 
 10  StreamingTV       5634 non-null   object 
 11  StreamingMovies   5634 non-null   object 
 12  Contract          5634 non-null   object 
 13  PaperlessBilling  5634 non-null   object 
 14  PaymentMethod     5634 non-null   object 
 15  MonthlyCharges    5634 non-null   float64
 16  Churn             5634 non-null   int64  


In [6]:
#flag every column once: True when its dtype is 'object' (the categorical columns)
is_object = {col: train[col].dtype == 'object' for col in train.columns}

#object-typed columns, in their original order
get_obj_cols = [col for col, flag in is_object.items() if flag]
#every remaining column, in its original order
get_int_cols = [col for col, flag in is_object.items() if not flag]

# Response Encoding

[For more information on response coding, see this article](https://medium.com/@thewingedwolf.winterfell/response-coding-for-categorical-data-7bb8916c6dc1)

In [7]:
#create a class response_encoding which fit and transform the categorical columns

class response_encoding:
  """
  Response (target) encoder for categorical columns of a binary classification problem.

  Every category of every column in `cols` is replaced by the smoothed fraction
  of training rows in that category whose target equals `target_value`.
  Call `fit` on the training data first, then `transform` on any dataframe
  that contains the same columns.
  """
  def __init__(self,cols,target = 'Churn',alpha = 0, target_value = 1):
    """
    Parameters:
    -----------
    cols: list of categorical columns
    target: the target column
    alpha: the smoothing parameter
    target_value: the value of the target column treated as the positive response
    """
    self.cols = cols
    self.master_dict = {} #column -> {category: encoded score}, filled by fit
    self.alpha = alpha #smoothing parameter
    self.target = target
    self.target_value = target_value #honour the caller's choice instead of hard-coding 1

  def fit(self,df):
    """
    Learn the encoding of every column in `cols` from `df`.

    For each category the stored score is
    round((positives + alpha) / (total + alpha * number_of_categories), 2).
    A fallback score of 0.5 is stored under the key 'UNK' for categories that
    were not seen during fit.

    Parameters:
    -----------
    df: dataframe containing the columns in `cols` and the target column

    Returns:
    --------
    self, so that calls can be chained
    """
    alpha = self.alpha
    is_positive = df[self.target] == self.target_value
    for column in self.cols:
      #rows per category; missing values are not counted as a category
      totals = df[column].value_counts()
      totals = totals[totals > 0] #drop unused levels of categorical dtypes to avoid 0/0
      #rows per category with the positive response, aligned to `totals`
      positives = df.loc[is_positive, column].value_counts().reindex(totals.index, fill_value=0)
      scores = np.round((positives + alpha)/(totals + alpha*len(totals)), 2)
      dict_values = scores.to_dict()
      dict_values['UNK']=0.5 #unknown categories that are not seen in train will be assigned a score of 0.5
      self.master_dict[column] = dict_values

    return self

  def transform(self,df):
    """
    Replace the categories in `cols` with the scores learned by `fit`.

    Categories that were not seen during fit receive the 'UNK' score; missing
    values stay missing. The input dataframe is left untouched.

    Parameters:
    -----------
    df: dataframe containing the columns in `cols`

    Returns:
    --------
    a new dataframe with the columns in `cols` encoded
    """
    df = df.copy() #never modify the caller's dataframe
    for column in self.cols:
      mapping = self.master_dict[column]
      encoded = df[column].map(mapping) #unseen categories come back as NaN here
      unseen = encoded.isna() & df[column].notna()
      df[column] = encoded.where(~unseen, mapping['UNK'])
    return df

In [8]:
#learn the per-category churn rates from the training data only
resp_enc = response_encoding(cols=get_obj_cols,target='Churn',alpha=0.1)
resp_enc.fit(train)

#encode both splits with the mapping learned on train
train, test = (resp_enc.transform(split) for split in (train, test))

train.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,0,0.2,0.15,43,0.25,0.19,0.41,0.22,0.39,0.41,0.34,0.33,0.43,0.17,0.19,50.2,0
1,0,0.2,0.15,37,0.29,0.42,0.41,0.22,0.23,0.41,0.34,0.31,0.43,0.33,0.46,95.15,0
2,0,0.33,0.31,35,0.29,0.19,0.15,0.4,0.23,0.16,0.3,0.31,0.11,0.33,0.15,85.95,1
3,0,0.2,0.31,69,0.26,0.19,0.15,0.4,0.23,0.16,0.3,0.31,0.03,0.17,0.17,60.25,0
4,0,0.2,0.15,69,0.25,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.03,0.17,0.17,20.2,0


# Modelling

In [9]:
#separate the features from the 'Churn' label for each split
x_train, y_train = train.drop(columns=['Churn']), train['Churn']
x_test, y_test = test.drop(columns=['Churn']), test['Churn']

In [11]:
#class-imbalance weight, passed to xgboost as scale_pos_weight
#weight = no. of negative classes/no. of positive classes
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
weight = n_negative/n_positive

In [19]:
#candidate classifiers, each configured to compensate for the class imbalance
models = {
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=420),
    'Random Forest': RandomForestClassifier(class_weight='balanced_subsample',n_jobs = -1,
                                            n_estimators=50, max_depth=10, random_state=420),  
    'XGBoost': xgb.XGBClassifier(scale_pos_weight = weight,use_label_encoder=False,
                                n_jobs = -1, random_state=420),
    'LightGBM': lgb.LGBMClassifier(class_weight='balanced', n_jobs = -1, random_state=420)
}

#3 stratified folds repeated twice -> 6 fits per model
cv = model_selection.RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=420)
scoring = ["roc_auc", "f1", "precision", "recall"]

#NOTE(review): the categorical features were response-encoded using the labels of the
#full training set before this cross-validation, so each validation fold has already
#influenced its own features; the scores below may be optimistic.
for model_name, model in models.items():
    print(model_name)
    #a single cross_validate call scores every metric on the same fitted folds,
    #instead of refitting the model once per metric
    cv_results = model_selection.cross_validate(model, x_train, y_train, scoring=scoring, cv=cv, n_jobs=-1)
    for score in scoring:
        print(f"Mean {score}: {np.mean(cv_results[f'test_{score}']):.2f}")
    print('\n')

Decision Tree
Mean roc_auc: 0.65
Mean f1: 0.48
Mean precision: 0.48
Mean recall: 0.48


Random Forest
Mean roc_auc: 0.83
Mean f1: 0.61
Mean precision: 0.57
Mean recall: 0.67


XGBoost
Mean roc_auc: 0.84
Mean f1: 0.63
Mean precision: 0.52
Mean recall: 0.80


LightGBM
Mean roc_auc: 0.82
Mean f1: 0.60
Mean precision: 0.52
Mean recall: 0.71




XGBoost is the best model overall: it has the highest ROC-AUC (0.84), F1 (0.63) and recall (0.80). Only its precision (0.52) is lower than Random Forest's (0.57).

# Next Steps

* Hyperparameter tune the XGBoost model and validate it on test data
* Study feature importance using Shapley values
* Build python scripts to ensure future training can be done smoothly and can be reproduced