In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Load data

In [5]:
data = pd.read_csv('data/telecom_churn.csv')
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
X = data.drop('Churn', axis=1)
y = data[['Churn']]

X.shape, y.shape

((3333, 19), (3333, 1))

# Trian | Test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2499, 19) (2499, 1)
(834, 19) (834, 1)


# Pipeline for preprocessing

In [8]:
categorical_features = list(X.columns[X.dtypes == object])
numeric_features = list(X.columns[X.dtypes != object])
print("Categorical features:\n  ", categorical_features, '\n')
print('Numerical features:\n  ', numeric_features)

Categorical features:
   ['State', 'International plan', 'Voice mail plan'] 

Numerical features:
   ['Account length', 'Area code', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls']


In [9]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline(steps=[
    ('polynomial', PolynomialFeatures()),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_features), 
    ('numerical', numeric_transformer, numeric_features)
])

In [10]:
def get_clf(model):
    return Pipeline(steps=[
        ('preprocessor', preprocessor), 
        ('model', model)
    ])

In [11]:
print(f"min accuracy score: {round(data['Churn'].value_counts(normalize=True).max(), 3)}")

min accuracy score: 0.855


### List to store all the scores

In [12]:
all_scores = []
def add_score(model_name, grid_search, desc, add_to_list=True):
    data = {'name': model_name}
    
    data['test_accuracy'] = accuracy_score(y_test, grid_search.best_estimator_.predict(X_test))
    data['test_roc_auc'] = roc_auc_score(y_test, grid_search.best_estimator_.predict_proba(X_test)[:, 1])
    
    data['train_accuracy'] = accuracy_score(y_train, grid_search.best_estimator_.predict(X_train))
    data['train_roc_auc'] = roc_auc_score(y_train, grid_search.best_estimator_.predict_proba(X_train)[:, 1])
    
    data['best_params'] = grid_search.best_params_
    
    if add_to_list:
        all_scores.append(data)
    return data


# Logistic regression

In [13]:
lr_clf = get_clf(LogisticRegression())

In [14]:
lr_clf.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('categorical',
                                    Pipeline(memory=None,
                                             steps=[('onehot',
                                                     OneHotEncoder(categorical_features=None,
                                                                   categories=None,
                                                                   drop=None,
                                                                   dtype=<class 'numpy.float64'>,
                                                                   handle_unknown='ignore',
                                                                   n_values=None,
                                                                   sparse=True))],
                                             verbos

In [15]:
param_grid = {
    'preprocessor__numerical__polynomial__degree': [1, 2, 3], 
    'model__class_weight': ['balanced', None], 
    'model__penalty': ['l1', 'l2'], 
    'model__C': [0.2, 0.5, 0.9, 1]
}

lr_grid_search = GridSearchCV(estimator=lr_clf, param_grid=param_grid, n_jobs=-3, cv=5, scoring='roc_auc')
lr_grid_search.fit(X_train, y_train)

add_score(model_name='logistic regression', 
         grid_search=lr_grid_search, 
         desc='scorer: roc_auc')

{'name': 'logistic regression',
 'test_accuracy': 0.9328537170263789,
 'test_roc_auc': 0.9187926697808121,
 'train_accuracy': 0.9327731092436975,
 'train_roc_auc': 0.939473418873466,
 'best_params': {'model__C': 1,
  'model__class_weight': None,
  'model__penalty': 'l1',
  'preprocessor__numerical__polynomial__degree': 3}}

# Random Forest

In [16]:
rf_clf = get_clf(RandomForestClassifier())

rf_clf.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('categorical',
                                    Pipeline(memory=None,
                                             steps=[('onehot',
                                                     OneHotEncoder(categorical_features=None,
                                                                   categories=None,
                                                                   drop=None,
                                                                   dtype=<class 'numpy.float64'>,
                                                                   handle_unknown='ignore',
                                                                   n_values=None,
                                                                   sparse=True))],
                                             verbos

In [17]:
param_grid = {
#     'preprocessor__numerical__polynomial__degree': [1, 2, 3], 
    'model__class_weight': ['balanced', None], 
    'model__criterion': ['gini', 'entropy'], 
    'model__min_samples_leaf': [1, 2], 
    'model__min_samples_split': [2, 3, 4], 
    'model__n_estimators': [500, 1000]
}

In [18]:
rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, n_jobs=-3, cv=3, scoring='roc_auc', verbose=True)
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done  38 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 144 out of 144 | elapsed:  6.4min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('categorical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('onehot',
                                                                                          OneHotEncoder(categorical_features=None,
                                                                                                        categories=None,
 

In [19]:
add_score(model_name='Random Forest', 
         grid_search=rf_grid_search, 
         desc='scorer: roc_auc')

{'name': 'Random Forest',
 'test_accuracy': 0.9592326139088729,
 'test_roc_auc': 0.9469416851158532,
 'train_accuracy': 1.0,
 'train_roc_auc': 1.0,
 'best_params': {'model__class_weight': 'balanced',
  'model__criterion': 'entropy',
  'model__min_samples_leaf': 1,
  'model__min_samples_split': 3,
  'model__n_estimators': 1000}}

In [20]:
all_scores

[{'name': 'logistic regression',
  'test_accuracy': 0.9328537170263789,
  'test_roc_auc': 0.9187926697808121,
  'train_accuracy': 0.9327731092436975,
  'train_roc_auc': 0.939473418873466,
  'best_params': {'model__C': 1,
   'model__class_weight': None,
   'model__penalty': 'l1',
   'preprocessor__numerical__polynomial__degree': 3}},
 {'name': 'Random Forest',
  'test_accuracy': 0.9592326139088729,
  'test_roc_auc': 0.9469416851158532,
  'train_accuracy': 1.0,
  'train_roc_auc': 1.0,
  'best_params': {'model__class_weight': 'balanced',
   'model__criterion': 'entropy',
   'model__min_samples_leaf': 1,
   'model__min_samples_split': 3,
   'model__n_estimators': 1000}}]