### Customer churn model using the "Telco-Customer-Churn" Kaggle dataset

In [301]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# first, import the dataset
df = pd.read_csv(r"./data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

#### Data exploration

In [302]:
# explore the data a bit
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [303]:
# although the costumerID column should have unique IDs, double-check
print(len(df.customerID.unique()) == len(df))

True


In [304]:
# the customerID column will not serve any purpose for the ML model, since every ID is unique
# therefore, this column is dropped
df.drop('customerID', axis='columns', inplace=True )


In [305]:
# check if all of the columns are in the correct data format
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [306]:
# the TotalCharges column is of type string, although it expreses a number
print(f"first 5 elements in the TotalCharges column: {df.TotalCharges.tolist()[:5]}")

first 5 elements in the TotalCharges column: ['29.85', '1889.5', '108.15', '1840.75', '151.65']


In [307]:
# this generally happens when there are empty cells or non-numeric cells
# first make a copy of the original dataframe
df_copy = df.copy(deep = True)

# use pd.to_numeric() to convert all values to numeric. The assumption is that this will not be able to convert
# all the cells to numeric.

# get the index positions of the presumably wrong entries in the dataframe 
incorrect_values_idx = df_copy[pd.to_numeric(df_copy.TotalCharges, errors='coerce').isna()].index.to_numpy()

# if the assumption is wrong, this will print an empty array
print(incorrect_values_idx)

[ 488  753  936 1082 1340 3331 3826 4380 5218 6670 6754]


Only 11 entries (out of more than 7000) have issues. Normally these entries can just be dropped, however, if they do indeed contain numerical values, but have a typo, they can be changed.

In [308]:
df.loc[incorrect_values_idx, 'TotalCharges']

488      
753      
936      
1082     
1340     
3331     
3826     
4380     
5218     
6670     
6754     
Name: TotalCharges, dtype: object

All of the wrong entries are in fact empty cells. Nothing further to be done here, except drop the respective rows.

In [309]:
# delete the copy dataframe from memory
del df_copy

df = df[df.TotalCharges != ' ']
df.reset_index(inplace=True)
df.loc[:,'TotalCharges'] = pd.to_numeric(df.TotalCharges)

#### Encode the categorical features

In [310]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from seaborn import load_dataset
import pandas as pd
from sklearn import preprocessing

y = df.pop('Churn')
X = df

# OHE the categorical features
categorical_columns = X.select_dtypes(include=object).columns.to_numpy()

transformer = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough')

transformed = transformer.fit_transform(X)
X = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())

# encode target values from 'Yes'/'No' to 1 and 0
label = preprocessing.LabelEncoder()
label.fit(y.values.tolist())

y = label.transform(y.values.tolist())

#### Train/test split

In [312]:
# now that the categorical features are encoded, split the dataframe into train-test partitions
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Implement XGB Classifier and Random Forest

In [313]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# simple XGB classifier
clf_xgb = XGBClassifier(random_state=15)
clf_xgb.fit(X_train, y_train)
preds_xgb = clf_xgb.predict_proba(X_test)[:, 1]

# simple Random Forest Classifier
clf_rf = RandomForestClassifier(random_state=15)
clf_rf.fit(X_train, y_train)
preds_rf = clf_rf.predict_proba(X_test)[:, 1]

print(f'XGB classifier score: {metrics.roc_auc_score(y_test, preds_xgb)}')
print(f'Random Forest classifier score: {metrics.roc_auc_score(y_test, preds_rf)}')

XGB classifier score: 0.801018268787758
Random Forest classifier score: 0.8159286331799287


In [314]:
# Take a look at the input-output shapes
print(f"input shape: {X_train.shape}")
print(f"output shape: {y_train.shape}")

input shape: (5625, 46)
output shape: (5625,)


#### Implement neural network with pytorch

In [316]:
import torch
from torch import nn
from torcheval.metrics.functional import binary_accuracy

X_nn = torch.from_numpy(X.to_numpy()).type(torch.float)
y_nn = torch.from_numpy(y).type(torch.float)

In [319]:
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_nn, y_nn, test_size=0.2, random_state=42)

In [354]:
model = nn.Sequential(
    nn.Linear(in_features=X_train.shape[1], out_features=36),
    nn.ReLU(),
    nn.Linear(in_features=36, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=8),
    nn.ReLU(),
    nn.Linear(in_features=8, out_features=1)
)

# binary cross-entropy loss
loss_fn = nn.BCEWithLogitsLoss()

# ADAM optimizer
optimizer = torch.optim.Adam(params=model.parameters(), 
                            lr=0.01)

In [355]:
# Build training and evaluation loop
for epoch in range(2000):
    ### Training
    model.train()

    # Forward pass (model outputs raw logits)
    y_logits = model(X_train_nn).squeeze() 
    y_pred = torch.round(torch.sigmoid(y_logits)) 
  
    # Calculate loss/accuracy
    loss = loss_fn(y_logits, y_train_nn) 
    acc = binary_accuracy(y_pred, y_train_nn) 

    # Optimizer zero grad
    optimizer.zero_grad()

    # Loss backwards
    loss.backward()

    # Optimizer step
    optimizer.step()

    ### Testing
    model.eval()
    with torch.inference_mode():
        # Forward pass
        test_logits = model(X_test_nn).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))
        # Caculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test_nn)
        test_acc = binary_accuracy(test_pred, y_test_nn)

    # Print out what's happening every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 4.28758, Accuracy: 0.73% | Test loss: 10.49431, Test acc: 0.27%
Epoch: 100 | Loss: 0.55137, Accuracy: 0.73% | Test loss: 0.55335, Test acc: 0.73%
Epoch: 200 | Loss: 0.47933, Accuracy: 0.78% | Test loss: 0.49014, Test acc: 0.77%
Epoch: 300 | Loss: 0.42549, Accuracy: 0.80% | Test loss: 0.44277, Test acc: 0.78%
Epoch: 400 | Loss: 0.41052, Accuracy: 0.81% | Test loss: 0.43667, Test acc: 0.79%
Epoch: 500 | Loss: 0.40518, Accuracy: 0.81% | Test loss: 0.43285, Test acc: 0.79%
Epoch: 600 | Loss: 0.40246, Accuracy: 0.81% | Test loss: 0.43577, Test acc: 0.79%
Epoch: 700 | Loss: 0.40030, Accuracy: 0.81% | Test loss: 0.43453, Test acc: 0.79%
Epoch: 800 | Loss: 0.39664, Accuracy: 0.81% | Test loss: 0.43507, Test acc: 0.79%
Epoch: 900 | Loss: 0.41188, Accuracy: 0.81% | Test loss: 0.44937, Test acc: 0.78%
Epoch: 1000 | Loss: 0.39418, Accuracy: 0.82% | Test loss: 0.43814, Test acc: 0.79%
Epoch: 1100 | Loss: 0.39776, Accuracy: 0.81% | Test loss: 0.44952, Test acc: 0.79%
Epoch: 1200 | L