In [119]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import accuracy_score, confusion_matrix
# from matplotlib import pyplot as plt
# from matplotlib.colors import LinearSegmentedColormap

In [120]:
# ANSI escape sequences used to tint printed output.
# Print a color code before the text and END afterwards to restore the default style.
RED, GREEN, YELLOW, BLUE = "\033[91m", "\033[92m", "\033[93m", "\033[94m"
END = "\033[0m"  # reset sequence

# Handle Data

In [121]:
# Read the train and test splits from CSV files in the working directory
train_df = pd.read_csv("churn_train_2.csv")
test_df = pd.read_csv("churn_test_2.csv")

In [122]:
def data_overview(data:pd.DataFrame, show_churners:bool=False) -> None:
    """ Print a quick overview of a dataset.

    Prints the number of rows, the number of fully duplicated rows, the total
    number of missing cells and the first five rows. Optionally also prints
    how the "Churn" target column is distributed.

    Args:
        data: The dataframe to summarize.
        show_churners: If True, also print how many rows have "Yes" and how
            many have "No" in the "Churn" column.

    Raises:
        KeyError: If show_churners is True and `data` has no "Churn" column.
    """
    separator = "------------------------------------------------------------------------------------"
    datapoints_n = len(data)
    duplicates_n = data.duplicated().sum()
    missing_n = data.isna().sum().sum()  # per-column NaN counts summed over all columns
    print(separator)
    print("Datapoints: ", datapoints_n)
    print("Duplicates: ", duplicates_n)
    print("Missing: ", missing_n)
    print(separator)
    print( data.head() )
    print(separator)
    if show_churners:
        churn_val_count = data["Churn"].value_counts()
        # value_counts() only lists labels that occur, so default to 0 for a
        # class that is absent instead of failing with a KeyError
        churn = churn_val_count.get("Yes", 0)
        nochurn = churn_val_count.get("No", 0)
        print( f"Target distribution: {churn} churned, {nochurn} didn't." )
        print(separator)

In [123]:
# Print the overview (with target distribution) for each split, tinting each
# report with its own color and resetting the color afterwards
print(f"{YELLOW}VV TRAIN DATA VV")
data_overview(train_df, show_churners=True)
print(END)
print(f"{GREEN}VV TEST DATA VV")
data_overview(test_df, show_churners=True)
print(END)

[93mVV TRAIN DATA VV
------------------------------------------------------------------------------------
Datapoints:  5634
Duplicates:  0
Missing:  0
------------------------------------------------------------------------------------
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  1768-ZAIFU  Female              1      No         No       1           No   
1  2159-TURXX    Male              0      No         No       5          Yes   
2  8261-GWDBQ  Female              1     Yes         No      60          Yes   
3  2732-ISEZX  Female              0      No         No       5          Yes   
4  7607-QKKTJ    Male              0     Yes        Yes      45          Yes   

      MultipleLines InternetService       OnlineSecurity  ...  \
0  No phone service             DSL                   No  ...   
1                No              No  No internet service  ...   
2               Yes     Fiber optic                  Yes  ...   
3                No   

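Many columns hold strings rather than numbers, so they need to be encoded numerically before they can be fed to a model. The customer ID column is only an identifier and carries no predictive information, so it will be removed.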

In [124]:
# Refined train/test sets: the raw frames without the customer ID column.
# drop() returns a new dataframe, so train_df / test_df stay untouched.
train_final = train_df.drop(columns="customerID")
test_final = test_df.drop(columns="customerID")

In [125]:
# Define a function that one-hot encodes a dataframe of string (categorical) columns
# This is ideal for ANN and the kind of data we are working with
def str_to_nominal(data:pd.DataFrame) -> pd.DataFrame:
    """ One-hot encode every column of the dataframe.

    All columns of `data` are passed to the encoder and treated as
    categorical, so pass only the string/categorical columns; a numeric
    column would be expanded into indicator columns per distinct value too.
    The first category of each column is dropped.

    Args:
        data: Dataframe whose columns should be one-hot encoded.

    Returns:
        A new dataframe of indicator columns, named by the encoder's
        get_feature_names_out(). It gets a fresh default index; the index
        of `data` is not carried over.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
    # try the current keyword first and fall back for older installations.
    try:
        onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity
    except TypeError:
        onehot_encoder = OneHotEncoder(sparse=False, drop='first')

    # Fit and transform the data (dense array, one column per kept category)
    encoded_data = onehot_encoder.fit_transform(data)

    # Create a DataFrame with the encoded data
    encoded_df = pd.DataFrame( encoded_data, columns=onehot_encoder.get_feature_names_out(data.columns) )
    return encoded_df