In [20]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from typing import Any
import time

In [21]:
# ANSI escape sequences for coloured terminal output.
# 91-94 select bright red/green/yellow/blue; 0 resets the formatting.
RED, GREEN, YELLOW, BLUE, END = (
    f"\033[{sgr_code}m" for sgr_code in (91, 92, 93, 94, 0)
)

# Data Analysis

In [22]:
# Read the training and test CSV files into separate dataframes
train_df = pd.read_csv("churn_train_2.csv")
test_df = pd.read_csv("churn_test_2.csv")

In [23]:
def data_overview(data:pd.DataFrame, show_churners:bool=False, head:bool=True) -> None:
    """Print an overview of a dataframe.

    Reports the number of rows, fully duplicated rows and missing values,
    optionally the first rows, the ``DataFrame.info()`` summary and, on
    request, the distribution of the ``Churn`` target column.

    Args:
        data: The dataframe to summarise.
        show_churners: If True, also print how many rows have ``Churn`` equal
            to "Yes" and "No". Requires a ``Churn`` column (KeyError otherwise).
        head: If True, print the first rows of the dataframe.
    """
    separator = "------------------------------------------------------------------------------------"

    print(separator)
    print("Datapoints: ", len(data))
    print("Duplicates: ", data.duplicated().sum())
    print("Missing: ", data.isna().sum().sum())

    if head:
        print(separator)
        print( data.head() )

    print(separator)
    # info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    data.info()
    print(separator)

    if show_churners:
        churn_val_count = data["Churn"].value_counts()
        # .get() keeps this working when one of the classes is absent from the data
        churn = churn_val_count.get("Yes", 0)
        nochurn = churn_val_count.get("No", 0)
        print( f"Target distribution: {churn} churned, {nochurn} didn't." )
    

In [24]:
# Print an overview of the raw train and test sets, each in its own colour
print(f"{YELLOW}VV TRAIN DATA VV")
data_overview(train_df, show_churners=True)
print(END)

print(f"{GREEN}VV TEST DATA VV")
data_overview(test_df, show_churners=True)
print(END)

[93mVV TRAIN DATA VV
------------------------------------------------------------------------------------
Datapoints:  5634
Duplicates:  0
Missing:  0
------------------------------------------------------------------------------------
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  1768-ZAIFU  Female              1      No         No       1           No   
1  2159-TURXX    Male              0      No         No       5          Yes   
2  8261-GWDBQ  Female              1     Yes         No      60          Yes   
3  2732-ISEZX  Female              0      No         No       5          Yes   
4  7607-QKKTJ    Male              0     Yes        Yes      45          Yes   

      MultipleLines InternetService       OnlineSecurity  ...  \
0  No phone service             DSL                   No  ...   
1                No              No  No internet service  ...   
2               Yes     Fiber optic                  Yes  ...   
3                No   

Many columns hold strings rather than numbers. The customer ID is only an identifier and carries no predictive information, so it will be removed.

# Preprocess the Data

In [25]:
# Replace every string-valued column with one-hot (nominal) indicator columns.
# This suits an ANN and the kind of string data we are working with.
def str_to_nominal(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with its object-dtype columns one-hot encoded.

    The remaining columns are kept first, followed by the indicator columns.
    The result has a fresh 0..n-1 index.
    """
    text_columns = data.select_dtypes(include=['object']).columns

    # drop="first" omits one indicator per feature: the full set would be
    # redundant (perfectly collinear), since the last one follows from the others.
    one_hot = OneHotEncoder(sparse_output=False, drop="first")
    indicator_df = pd.DataFrame(
        one_hot.fit_transform(data[text_columns]),
        columns=one_hot.get_feature_names_out(text_columns),
    )

    # Reset the index so both parts line up row by row when concatenated
    remaining_df = data.drop(columns=text_columns).reset_index(drop=True)

    return pd.concat([remaining_df, indicator_df], axis=1)

In [26]:
# New refined train and test sets, built from the raw frames without modifying them.
# The ID column is dropped first.
train_final = train_df.drop(columns="customerID")
test_final = test_df.drop(columns="customerID")

# Convert TotalCharges to numbers; it is stored as strings in the raw data.
# errors="coerce" turns strings that cannot be converted into NaN (removed below).
train_final["TotalCharges"] = pd.to_numeric(train_final["TotalCharges"], errors="coerce")
test_final["TotalCharges"] = pd.to_numeric(test_final["TotalCharges"], errors="coerce")

# One-hot encode the string columns, then drop duplicates and rows with NaN values
train_final = str_to_nominal(train_final).drop_duplicates().dropna()
test_final = str_to_nominal(test_final).drop_duplicates().dropna()

# Each frame is encoded with its own encoder, so a category missing from one
# of them would silently produce different feature columns. Fail fast instead.
if list(train_final.columns) != list(test_final.columns):
    raise ValueError(
        "Train and test sets have different columns after encoding: "
        f"{sorted(set(train_final.columns) ^ set(test_final.columns))}"
    )

# Show data again
print(YELLOW+"VV TRAIN DATA VV")
data_overview(train_final, head=False)
print(END)
print(GREEN+"VV TEST DATA VV")
data_overview(test_final, head=False)
print(END)

[93mVV TRAIN DATA VV
------------------------------------------------------------------------------------
Datapoints:  5614
Duplicates:  0
Missing:  0
------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 5614 entries, 0 to 5633
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          5614 non-null   int64  
 1   tenure                                 5614 non-null   int64  
 2   MonthlyCharges                         5614 non-null   float64
 3   TotalCharges                           5614 non-null   float64
 4   gender_Male                            5614 non-null   float64
 5   Partner_Yes                            5614 non-null   float64
 6   Dependents_Yes                         5614 non-null   float64
 7   PhoneService_Yes                       5614 

Every column is now numeric, so the data is ready to be fed to our ANN.

# ANN Training
We'll use scikit-learn's multi-layer perceptron (MLP), which is a feedforward neural network (FFNN). It is a fast and simple solution for the kind of simple data we are working with.

In [48]:
def train_and_eval_ann(hlayers:int, neurons:int, lrate:float, **extra_params:Any) -> None:
    """Train an MLP classifier on ``train_final`` and report how it does on ``test_final``.

    Both frames are read from the notebook namespace. ``Churn_Yes`` is the
    target and every other column is a feature. Prints the configuration,
    the test accuracy, the training time in seconds and the confusion matrix.

    Args:
        hlayers: Number of hidden layers.
        neurons: Number of neurons in each hidden layer.
        lrate: Initial learning rate (``learning_rate_init``).
        **extra_params: Further keyword arguments forwarded to ``MLPClassifier``
            (e.g. ``max_iter``). ``random_state`` defaults to 42 but may be
            overridden here.
    """
    # Standardize our data, this is usually preferred when training an ANN as opposed to normalizing
    scaler = StandardScaler()
    X_train, y_train = train_final.drop(columns=["Churn_Yes"]), train_final["Churn_Yes"]
    X_test, y_test = test_final.drop(columns=["Churn_Yes"]), test_final["Churn_Yes"]
    # Fit the scaler on the training features only, then reuse it for the test features
    X_train_scaled:np.ndarray = scaler.fit_transform( X_train )
    X_test_scaled:np.ndarray = scaler.transform( X_test )

    # Initialize the FFNN classifier
    hlayer_sizes = (neurons,) * hlayers
    # Merging the default seed here avoids a duplicate-keyword TypeError
    # when the caller passes its own random_state.
    classifier_params = {"random_state": 42, **extra_params}
    mlp = MLPClassifier(hidden_layer_sizes=hlayer_sizes, learning_rate_init=lrate, **classifier_params)

    # Train the classifier; perf_counter is monotonic, unlike the wall clock
    start_time = time.perf_counter()
    mlp.fit(X_train_scaled, y_train)
    train_elapsed = time.perf_counter() - start_time

    print(f"{BLUE}>> {hlayers} HIDDEN LAYERS WITH {neurons} NEURONS ({lrate} LEARNING RATE) <<{END}")

    y_pred = mlp.predict(X_test_scaled)

    # Evaluate with accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    print(GREEN+"Accuracy:", accuracy, END)

    # Evaluate train time
    print(YELLOW+"Train time: ", train_elapsed, END)

    # Evaluate with confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"{RED}{cm}{END}")

In [28]:
# Compare a few network shapes at the same learning rate ...
train_and_eval_ann(hlayers=4, neurons=5, lrate=0.01)
train_and_eval_ann(hlayers=2, neurons=100, lrate=0.01)
train_and_eval_ann(hlayers=10, neurons=100, lrate=0.01)
# ... and the 2x100 network at smaller learning rates
train_and_eval_ann(hlayers=2, neurons=100, lrate=0.0001)
train_and_eval_ann(hlayers=2, neurons=100, lrate=0.00001)

[94m>> 4 HIDDEN LAYERS WITH 5 NEURONS (0.01 LEARNING RATE) <<[0m
[92mAccuracy: 0.7934472934472935 [0m
[93mTrain time:  0.587498664855957 [0m
[91m[[903 119]
 [171 211]][0m
[94m>> 2 HIDDEN LAYERS WITH 100 NEURONS (0.01 LEARNING RATE) <<[0m
[92mAccuracy: 0.7535612535612536 [0m
[93mTrain time:  4.533908367156982 [0m
[91m[[895 127]
 [219 163]][0m
[94m>> 10 HIDDEN LAYERS WITH 100 NEURONS (0.01 LEARNING RATE) <<[0m
[92mAccuracy: 0.7621082621082621 [0m
[93mTrain time:  35.09653329849243 [0m
[91m[[876 146]
 [188 194]][0m




[94m>> 2 HIDDEN LAYERS WITH 100 NEURONS (0.0001 LEARNING RATE) <<[0m
[92mAccuracy: 0.7870370370370371 [0m
[93mTrain time:  12.342050313949585 [0m
[91m[[891 131]
 [168 214]][0m
[94m>> 2 HIDDEN LAYERS WITH 100 NEURONS (1e-05 LEARNING RATE) <<[0m
[92mAccuracy: 0.7970085470085471 [0m
[93mTrain time:  15.78143048286438 [0m
[91m[[906 116]
 [169 213]][0m




There is slight variation in accuracy across the different parameters, and the mistakes the models make vary. The most time-consuming model (10 hidden layers) was also one of the worst performers; only the 2x100 network with a 0.01 learning rate scored lower. Some models reached their maximum number of iterations before converging. The last model did so, but interestingly still achieved the highest accuracy.

In [29]:
# Out of curiosity, evaluate the winning parameters again,
# this time with a higher iteration limit so training can actually converge
train_and_eval_ann(hlayers=2, neurons=100, lrate=0.00001, max_iter=400)

[94m>> 2 HIDDEN LAYERS WITH 100 NEURONS (1e-05 LEARNING RATE) <<[0m
[92mAccuracy: 0.7948717948717948 [0m
[93mTrain time:  16.019712448120117 [0m
[91m[[904 118]
 [170 212]][0m


It performs slightly worse (0.7949 vs. 0.7970 accuracy).

In [47]:
# The 4x5 network was nearly as accurate as the best run while training far faster,
# so continue from it and try a different learning rate
train_and_eval_ann(hlayers=4, neurons=5, lrate=0.13)

[94m>> 4 HIDDEN LAYERS WITH 5 NEURONS (0.13 LEARNING RATE) <<[0m
[92mAccuracy: 0.8012820512820513 [0m
[93mTrain time:  0.3999309539794922 [0m
[91m[[931  91]
 [188 194]][0m


I toyed around with higher learning rates, and 0.13 gives an accuracy just above 80%.

In [74]:
# Starting point for tweaking additional parameters.
# NOTE(review): no extra parameters are passed here, so this is the same call as in
# the previous cell, yet the recorded output differs. That suggests the notebook
# state changed between runs; confirm with Restart Kernel -> Run All.
train_and_eval_ann(hlayers=4, neurons=5, lrate=0.13)

[94m>> 4 HIDDEN LAYERS WITH 5 NEURONS (0.13 LEARNING RATE) <<[0m
[92mAccuracy: 0.7984330484330484 [0m
[93mTrain time:  0.2757112979888916 [0m
[91m[[934  88]
 [195 187]][0m
