## Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Churn"
COLAB = 'google.colab' in sys.modules
DEBUG = False
SEED = 666

In [4]:
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Dataset

In [5]:
df = pd.read_pickle(f"{ROOT}/data/data.pkl")
print(df.shape)
df.head(2)

(7032, 21)


Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,Female,No,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,2,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   CustomerID        7032 non-null   int64   
 1   Gender            7032 non-null   category
 2   SeniorCitizen     7032 non-null   category
 3   Partner           7032 non-null   category
 4   Dependents        7032 non-null   category
 5   Tenure            7032 non-null   int64   
 6   PhoneService      7032 non-null   category
 7   MultipleLines     7032 non-null   category
 8   InternetService   7032 non-null   category
 9   OnlineSecurity    7032 non-null   category
 10  OnlineBackup      7032 non-null   category
 11  DeviceProtection  7032 non-null   category
 12  TechSupport       7032 non-null   category
 13  StreamingTV       7032 non-null   category
 14  StreamingMovies   7032 non-null   category
 15  Contract          7032 non-null   category
 16  PaperlessBilling  7032 n

## Preprocessing Dataset

### Identify target and features

In [7]:
target = "Churn"
print(f"target = {target}")

cat_features = [c for c in df.select_dtypes("category").columns if c!= target]
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")

num_features = [c for c in df.select_dtypes(["int","float"]).columns if c!= target]
print(f"\nNumerical features ({len(num_features)}): {num_features}")



target = Churn

Categorical features (16): ['Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

Numerical features (4): ['CustomerID', 'Tenure', 'MonthlyCharges', 'TotalCharges']


### Train/Test split

* Split data train/test -> this time (60% train/40% test)

In [8]:
df.Churn.value_counts(normalize=True)

No     0.734215
Yes    0.265785
Name: Churn, dtype: float64

* `train_size` -> % must use float / num_of_rows must use int
* `randon_state` -> 
* `stratify` -> divide most equally the rate of yes/no between train and test

In [9]:
from sklearn.model_selection import train_test_split

df_train,df_test = train_test_split(df, train_size=0.60, stratify=df[target], random_state=SEED)

print(df_train.shape,df_test.shape)

(4219, 21) (2813, 21)


In [10]:
df_train.Churn.value_counts(normalize=True)

No     0.734297
Yes    0.265703
Name: Churn, dtype: float64

In [11]:
df_test.Churn.value_counts(normalize=True)

No     0.734092
Yes    0.265908
Name: Churn, dtype: float64

### Encode target

* `LabelEncoder` takes a column with strings (categorical) and change to numerical by a Map

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(df_train[target])

y_train = le.transform(df_train[target])
y_test = le.transform(df_test[target])

In [13]:
df_train[target].head()

193      No
6108     No
3580    Yes
2195     No
3809     No
Name: Churn, dtype: category
Categories (2, object): ['No', 'Yes']

In [14]:
y_train[:5]

array([0, 0, 1, 0, 0])

In [15]:
le.inverse_transform(y_train[:5])

array(['No', 'No', 'Yes', 'No', 'No'], dtype=object)

### Encode Categorical features

In [16]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

ohe.fit(df_train[cat_features])

X_cat_train = ohe.transform(df_train[cat_features])
X_cat_test = ohe.transform(df_test[cat_features])

df_cat_train = pd.DataFrame(X_cat_train.toarray(), columns=ohe.get_feature_names_out())
df_cat_test = pd.DataFrame(X_cat_test.toarray(), columns=ohe.get_feature_names_out())
print(df_cat_train.shape, df_cat_test.shape)

df_cat_train.head(2)

(4219, 36) (2813, 36)


Unnamed: 0,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Scale / Transform numerical features

In [17]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit(df_train[num_features])

X_num_train = ss.transform(df_train[num_features])
X_num_test = ss.transform(df_test[num_features])

df_num_train = pd.DataFrame(X_num_train, columns=ss.get_feature_names_out())
df_num_test = pd.DataFrame(X_num_test, columns=ss.get_feature_names_out())
print(df_num_train.shape, df_num_test.shape)

df_num_train.head(2)



(4219, 4) (2813, 4)


Unnamed: 0,CustomerID,Tenure,MonthlyCharges,TotalCharges
0,-1.640247,1.597275,1.487874,2.557942
1,1.267948,-0.596193,-1.32382,-0.804777


### Construct dataframe for model features

In [18]:
df_model_train = pd.concat([df_cat_train, df_num_train], axis=1)
df_model_test = pd.concat([df_cat_test, df_num_test], axis=1)
print(df_model_train.shape, df_model_test.shape)

(4219, 40) (2813, 40)


## Model Selection

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN(3)" : KNeighborsClassifier(3),
    "DT" : DecisionTreeClassifier(),
    "DT(max_depth=5)" : DecisionTreeClassifier(max_depth=5),
    "LR" : LogisticRegression(max_iter=1000),
    "RF" : RandomForestClassifier(),
    "AdaBoost" : AdaBoostClassifier(),
}

In [20]:
from sklearn.metrics import accuracy_score

for name, model in classifiers.items():
    
    model.fit(df_model_train, y_train)
    
    # Scoring on SEEN data - effectively "useless"
    y_pred = model.predict(df_model_train)
    train_accuracy = accuracy_score(y_train, y_pred)
    
    # Scoring on UNSEEN data - important
    y_pred = model.predict(df_model_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    print(f"{name:20s} accuracy\ttrain = {train_accuracy:.2%} \ttest = {test_accuracy:.2%}")

KNN                  accuracy	train = 83.50% 	test = 75.68%
KNN(3)               accuracy	train = 86.32% 	test = 74.65%
DT                   accuracy	train = 100.00% 	test = 73.09%
DT(max_depth=5)      accuracy	train = 80.28% 	test = 78.74%
LR                   accuracy	train = 80.37% 	test = 80.41%
RF                   accuracy	train = 100.00% 	test = 78.95%
AdaBoost             accuracy	train = 80.90% 	test = 80.02%


In [21]:
from sklearn.model_selection import cross_val_score

for name, model in classifiers.items():
    
    scores = cross_val_score(model, df_model_train, y_train, cv=10)
    
    print(f"{name:20s} accuracy\tCV= {scores.mean():.2%} \tstd = {scores.std():.2%}")

KNN                  accuracy	CV= 76.16% 	std = 2.03%
KNN(3)               accuracy	CV= 74.61% 	std = 1.70%
DT                   accuracy	CV= 71.86% 	std = 2.14%
DT(max_depth=5)      accuracy	CV= 78.22% 	std = 1.17%
LR                   accuracy	CV= 80.07% 	std = 1.69%
RF                   accuracy	CV= 78.69% 	std = 1.38%
AdaBoost             accuracy	CV= 79.66% 	std = 1.66%
