In [177]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import accuracy_score, confusion_matrix
# from matplotlib import pyplot as plt
# from matplotlib.colors import LinearSegmentedColormap

In [178]:
# Colors
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
END = "\033[0m"

# Handle Data

In [179]:
# Load data into dataframes
train_df, test_df = pd.read_csv("churn_train_2.csv"), pd.read_csv("churn_test_2.csv")

In [180]:
def data_overview(data:pd.DataFrame, show_churners:bool=False) -> None:
    """ Training data overview """
    datapoints_n = len(data)
    duplicates_n = data.duplicated().sum()
    missing_n = data.isna().sum().sum()
    print("------------------------------------------------------------------------------------")
    print("Datapoints: ", datapoints_n)
    print("Duplicates: ", duplicates_n)
    print("Missing: ", missing_n)
    print("------------------------------------------------------------------------------------")
    print( data.head() )
    print("------------------------------------------------------------------------------------")
    if show_churners:
        churn_val_count = data["Churn"].value_counts()
        churn = churn_val_count["Yes"]
        nochurn = churn_val_count["No"]
        print( f"Target distribution: {churn} churned, {nochurn} didn't." )
        print("------------------------------------------------------------------------------------")

In [181]:
print(YELLOW+"VV TRAIN DATA VV")
data_overview(train_df, True)
print(END)
print(GREEN+"VV TEST DATA VV")
data_overview(test_df, True)
print(END)

[93mVV TRAIN DATA VV
------------------------------------------------------------------------------------
Datapoints:  5634
Duplicates:  0
Missing:  0
------------------------------------------------------------------------------------
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  1768-ZAIFU  Female              1      No         No       1           No   
1  2159-TURXX    Male              0      No         No       5          Yes   
2  8261-GWDBQ  Female              1     Yes         No      60          Yes   
3  2732-ISEZX  Female              0      No         No       5          Yes   
4  7607-QKKTJ    Male              0     Yes        Yes      45          Yes   

      MultipleLines InternetService       OnlineSecurity  ...  \
0  No phone service             DSL                   No  ...   
1                No              No  No internet service  ...   
2               Yes     Fiber optic                  Yes  ...   
3                No   

The data features strings as opposed to numbers for many columns. Customer ID is redundant and will be removed.

In [182]:
train_final = train_df.copy() # New refined train set
test_final = test_df.copy() # New refined test set

# Drop ID columns
train_final.drop(columns="customerID", inplace=True)
test_final.drop(columns="customerID", inplace=True)

In [190]:
# Define a function that turns all string values for a dataframe into nominal data
# This is ideal for ANN and the kind of string data we are working with
def str_to_nominal(data:pd.DataFrame) -> pd.DataFrame:
    """ Turn all string values for the dataframe into nominal data """
    categorical_cols = data.select_dtypes(include=['object']).columns # Identify all string columns
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(data[categorical_cols])

    print(encoded_data)
    # encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))
    # return encoded_df
    
    return data

In [191]:
train_final = str_to_nominal(train_final)
test_final = str_to_nominal(test_final)

print(YELLOW+"VV TRAIN DATA VV")
data_overview(train_final)
print(END)
print(GREEN+"VV TEST DATA VV")
data_overview(test_final)
print(END)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 4)	1.0
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 11)	1.0
  (0, 14)	1.0
  (0, 17)	1.0
  (0, 20)	1.0
  (0, 23)	1.0
  (0, 26)	1.0
  (0, 29)	1.0
  (0, 32)	1.0
  (0, 36)	1.0
  (0, 39)	1.0
  (0, 1777)	1.0
  (0, 5332)	1.0
  (1, 1)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 7)	1.0
  (1, 8)	1.0
  (1, 13)	1.0
  (1, 15)	1.0
  (1, 18)	1.0
  :	:
  (5632, 23)	1.0
  (5632, 28)	1.0
  (5632, 31)	1.0
  (5632, 32)	1.0
  (5632, 36)	1.0
  (5632, 39)	1.0
  (5632, 1637)	1.0
  (5632, 5332)	1.0
  (5633, 1)	1.0
  (5633, 3)	1.0
  (5633, 4)	1.0
  (5633, 7)	1.0
  (5633, 10)	1.0
  (5633, 12)	1.0
  (5633, 14)	1.0
  (5633, 19)	1.0
  (5633, 20)	1.0
  (5633, 23)	1.0
  (5633, 26)	1.0
  (5633, 29)	1.0
  (5633, 32)	1.0
  (5633, 36)	1.0
  (5633, 37)	1.0
  (5633, 520)	1.0
  (5633, 5331)	1.0
  (0, 0)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 11)	1.0
  (0, 14)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 23)	1.0
  (0, 26)	1.0
  (0, 31)	1.0
  (0, 32)	1.0
  (0, 35)	1.0
  (0, 40)	1.0
  (0, 1388)	1.0
 