In [484]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from typing import Any
# from matplotlib import pyplot as plt
# from matplotlib.colors import LinearSegmentedColormap

In [485]:
# Colors
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
END = "\033[0m"

# Handle Data

In [486]:
# Load data into dataframes
train_df, test_df = pd.read_csv("churn_train_2.csv"), pd.read_csv("churn_test_2.csv")

In [487]:
def data_overview(data:pd.DataFrame, show_churners:bool=False) -> None:
    """ Training data overview """
    datapoints_n = len(data)
    duplicates_n = data.duplicated().sum()
    missing_n = data.isna().sum().sum()
    print("------------------------------------------------------------------------------------")
    print("Datapoints: ", datapoints_n)
    print("Duplicates: ", duplicates_n)
    print("Missing: ", missing_n)
    print("------------------------------------------------------------------------------------")
    print( data.head() )
    print("------------------------------------------------------------------------------------")
    print( data.info() )
    print("------------------------------------------------------------------------------------")
    if show_churners:
        churn_val_count = data["Churn"].value_counts()
        churn = churn_val_count["Yes"]
        nochurn = churn_val_count["No"]
        print( f"Target distribution: {churn} churned, {nochurn} didn't." )
    

In [488]:
print(YELLOW+"VV TRAIN DATA VV")
data_overview(train_df, True)
print(END)
print(GREEN+"VV TEST DATA VV")
data_overview(test_df, True)
print(END)

[93mVV TRAIN DATA VV
------------------------------------------------------------------------------------
Datapoints:  5634
Duplicates:  0
Missing:  0
------------------------------------------------------------------------------------
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  1768-ZAIFU  Female              1      No         No       1           No   
1  2159-TURXX    Male              0      No         No       5          Yes   
2  8261-GWDBQ  Female              1     Yes         No      60          Yes   
3  2732-ISEZX  Female              0      No         No       5          Yes   
4  7607-QKKTJ    Male              0     Yes        Yes      45          Yes   

      MultipleLines InternetService       OnlineSecurity  ...  \
0  No phone service             DSL                   No  ...   
1                No              No  No internet service  ...   
2               Yes     Fiber optic                  Yes  ...   
3                No   

The data features strings as opposed to numbers for many columns. Customer ID is redundant and will be removed.

In [489]:
train_final = train_df.copy() # New refined train set
test_final = test_df.copy() # New refined test set

# Drop ID columns
train_final.drop(columns="customerID", inplace=True)
test_final.drop(columns="customerID", inplace=True)

In [490]:
# Define a function that turns gets rid of all string values and uses nominal data instead
# This is ideal for ANN and the kind of string data we are working with
def str_to_nominal(data: pd.DataFrame) -> pd.DataFrame:
    """Turn all string values for the dataframe into nominal data"""

    # Get all non-numeric columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    # Drop first column so that we don't get "multicollinearity" by doing drop="first"
    # Otherwise we would end up with redundant columns
    encoder = OneHotEncoder(sparse_output=False, drop="first")
    encoded_data = encoder.fit_transform(data[categorical_cols])

    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

    # Combine the new encoded columns with number columns
    number_cols = data.drop(columns=categorical_cols)
    number_cols.reset_index(drop=True, inplace=True)
    encoded_df.reset_index(drop=True, inplace=True)

    result_df = pd.concat([number_cols, encoded_df], axis=1)

    return result_df

In [491]:
# Convert string values for our dataframes

train_final = str_to_nominal(train_final)
test_final = str_to_nominal(test_final)

print(YELLOW+"VV TRAIN DATA VV")
data_overview(train_final)
print(END)
print(GREEN+"VV TEST DATA VV")
data_overview(test_final)
print(END)

gender excluded from non_numeric_cols
Partner excluded from non_numeric_cols
Dependents excluded from non_numeric_cols
PhoneService excluded from non_numeric_cols
MultipleLines excluded from non_numeric_cols
InternetService excluded from non_numeric_cols
OnlineSecurity excluded from non_numeric_cols
OnlineBackup excluded from non_numeric_cols
DeviceProtection excluded from non_numeric_cols
TechSupport excluded from non_numeric_cols
StreamingTV excluded from non_numeric_cols
StreamingMovies excluded from non_numeric_cols
Contract excluded from non_numeric_cols
PaperlessBilling excluded from non_numeric_cols
PaymentMethod excluded from non_numeric_cols
TotalCharges excluded from non_numeric_cols
Churn excluded from non_numeric_cols
gender excluded from non_numeric_cols
Partner excluded from non_numeric_cols
Dependents excluded from non_numeric_cols
PhoneService excluded from non_numeric_cols
MultipleLines excluded from non_numeric_cols
InternetService excluded from non_numeric_cols
Onlin

Isn't it lovely?

# ANN Training
We'll use scikit-learn's multi-layer perceptron (MLP) which is a feedforward neural network (FFNN). It is the fastest and simplest solution for the kind of simple data we are working with.

In [492]:
def train_and_eval_ann(hlayers:int, neurons:int, learning_rate:float, **extra_params:dict[str:Any]) -> None:
    # Standardize our data, this is usually prefered when training ANN as opposed to normalizing
    scaler = StandardScaler()
    X_train, y_train = train_final.drop(columns=["Churn_Yes"]), train_final["Churn_Yes"]
    X_test, y_test = test_final.drop(columns=["Churn_Yes"]), test_final["Churn_Yes"]
    X_train_scaled:np.ndarray = scaler.fit_transform( X_train )
    X_test_scaled:np.ndarray = scaler.transform( X_test )

    # Initialize the FFNN classifier
    hlayer_sizes = tuple(neurons for _ in range(hlayers))
    print(hlayer_sizes)
    mpl = MLPClassifier(hidden_layer_sizes=hlayer_sizes, random_state=42, learning_rate_init=learning_rate, **extra_params)

    # Train the classifier
    mpl.fit(X_train_scaled, y_train)

    # Evaluate
    y_pred = mpl.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

In [493]:
train_and_eval_ann(4, 5, 0.01)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- TotalCharges_100.25
- TotalCharges_1001.5
- TotalCharges_1005.7
- TotalCharges_1009.25
- TotalCharges_102.45
- ...
Feature names seen at fit time, yet now missing:
- TotalCharges_100.2
- TotalCharges_100.35
- TotalCharges_100.4
- TotalCharges_100.8
- TotalCharges_100.9
- ...
