# US Churn &mdash; Baseline Model

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and the
# old names were removed afterwards, so pick whichever darkgrid style this
# installation actually provides (falling back to the default style).
_darkgrid_styles = ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
plt.style.use(next((s for s in _darkgrid_styles if s in plt.style.available), "default"))
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

import sys, os, yaml

# Name of the dataset; also used to build the Google Drive folder name.
DATASET = "US_Churn"

# True when the notebook is executing inside Google Colab.
COLAB = "google.colab" in sys.modules

# Base folder of the project: the Drive dataset folder on Colab, the current
# working directory otherwise. Both variants end with a trailing slash.
ROOT = (
    f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ', '_')}/"
    if COLAB
    else "./"
)

DEBUG = False
# Seed passed as random_state wherever randomness is involved.
SEED = 1612

In [None]:
# On Colab the dataset lives on Google Drive, so Drive has to be mounted first.
if COLAB:
    from google.colab import drive
    # Skip mounting when the mount point already exists (e.g. the cell is re-run).
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Imports

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

## Datasets

In [None]:
# ROOT already ends with a path separator, so join the parts instead of
# concatenating them (the f-string version produced ".//data/churn.pkl").
# CAUTION: unpickling executes arbitrary code - only load pickle files that
# come from a trusted source.
df = pd.read_pickle(os.path.join(ROOT, "data", "churn.pkl"))
print(df.shape)
df.head()

(3333, 20)


Unnamed: 0,State,Account_Length,Area_Code,Intl_Plan,VMail_Plan,VMail_Message,Day_Mins,Day_Calls,Day_Charge,Eve_Mins,Eve_Calls,Eve_Charge,Night_Mins,Night_Calls,Night_Charge,Intl_Mins,Intl_Calls,Intl_Charge,CustServ_Calls,Churn
0,KS,128,0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,0,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,0,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,2,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,0,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [None]:
# drop feature that I have not yet considered properly in EDA
# errors="ignore" keeps the cell safe to re-run once the column is already gone,
# and rebinding df (instead of inplace=True) avoids mutating the loaded frame.
df = df.drop(columns=["State"], errors="ignore")

## Pre-process Data

In [None]:
# 60/40 train/test split, stratified on the churn label and seeded for
# reproducibility.
df_train, df_test = train_test_split(
    df,
    train_size=0.60,
    stratify=df["Churn"],
    random_state=SEED,
)
df_train.shape, df_test.shape

((1999, 19), (1334, 19))

In [None]:
target = "Churn"

# Compare column names with `!=`: because target is a string, `c not in target`
# is a substring test and would also discard any column whose name happens to
# be contained in "Churn".
cat_features = [c for c in df.select_dtypes("category").columns if c != target]
num_features = [c for c in df.select_dtypes(["int","float"]).columns if c != target]
# Model input columns: categorical ones first, then numerical ones.
features = cat_features + num_features

print(f"Traget: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")

Traget: Churn
Categorical Features: ['Area_Code', 'Intl_Plan', 'VMail_Plan']
Numerical Features: ['Account_Length', 'VMail_Message', 'Day_Mins', 'Day_Calls', 'Day_Charge', 'Eve_Mins', 'Eve_Calls', 'Eve_Charge', 'Night_Mins', 'Night_Calls', 'Night_Charge', 'Intl_Mins', 'Intl_Calls', 'Intl_Charge', 'CustServ_Calls']


In [None]:
# Labels for both splits.
y_train = df_train[target].values
y_test = df_test[target].values

# Standardise the features; the scaler is fitted on the training split only and
# then re-used unchanged for the test split.
ss = StandardScaler()
X_train = ss.fit_transform(df_train[features])
X_test = ss.transform(df_test[features])

### Dummy model - predict no churn

 * Since about 85% of customers do not churn, a model that always predicts target=0 will have an accuracy of about 85%.

In [None]:
# Share of each class in the full dataset.
df["Churn"].value_counts(normalize=True)

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [None]:
# always predict 0 (no churn)
y_dummy_pred = np.zeros_like(y_train)

# accuracy is a bad metric to use - as dataset is imbalanced
print(confusion_matrix(y_train, y_dummy_pred))
# Class 1 is never predicted, so its precision is undefined; zero_division=0
# reports it as 0 without emitting a warning.
print(classification_report(y_train, y_dummy_pred, zero_division=0))

[[1709    0]
 [ 290    0]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1709
           1       0.00      0.00      0.00       290

    accuracy                           0.85      1999
   macro avg       0.43      0.50      0.46      1999
weighted avg       0.73      0.85      0.79      1999



In [None]:
# First real model: logistic regression with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Score the model on the very data it was fitted on. This overestimates how
# good the model is, so treat the numbers as an optimistic upper bound.
y_pred = model.fit(X_train, y_train).predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[1663   46]
 [ 225   65]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1709
           1       0.59      0.22      0.32       290

    accuracy                           0.86      1999
   macro avg       0.73      0.60      0.62      1999
weighted avg       0.84      0.86      0.84      1999



## Model Selection

In [None]:
# Candidate classifiers, keyed by the label used in the results table.
# The decision trees get random_state=SEED: they break ties between equally
# good splits at random, so without a seed their scores change from run to run.
models = {
    "NB": GaussianNB(),
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=SEED),
    "DT(max_depth=3)": DecisionTreeClassifier(max_depth=3, random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
}

In [None]:
# 10-fold cross-validation of every candidate on the training split.
# The loop variable is deliberately not called `model`, so the fitted
# LogisticRegression bound to `model` earlier is not silently replaced.
# NOTE: X_train was scaled with statistics of the whole training split, so the
# validation part of each fold has influenced the scaler; wrap scaler + model in
# a Pipeline if a stricter estimate is needed.
for name, candidate in models.items():
    scores = cross_val_score(candidate, X_train, y_train, cv=10, scoring="accuracy")
    print(f"{name:20s} accuracy = {scores.mean():.3%} ± {scores.std():.3%}")

NB                   accuracy = 85.944% ± 2.165%
LR                   accuracy = 86.194% ± 1.498%
DT                   accuracy = 90.945% ± 1.738%
DT(max_depth=3)      accuracy = 89.793% ± 1.750%
KNN                  accuracy = 89.344% ± 1.521%
SVC                  accuracy = 91.345% ± 1.291%
