# Import necessary libraries

In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression 

# Data Transformation

    Normalizing column headers and columns with object datatypes

    Handling null values for both numerical and categorical columns

    Applying one-hot encoding on the dataset

In [106]:
df = pd.read_csv('../dataset/telco_customer_churn.csv')

df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

categorical_cols = list(df.dtypes[df.dtypes == 'object'].index)

numerical_cols = list(df.dtypes[df.dtypes != 'object'].index)

dependent_var = 'churn'

In [107]:
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.str.strip().str.lower().str.replace(' ', '_'))

df[dependent_var] = df[dependent_var].apply(lambda x: 1 if x == 'yes' else 0).astype(int)

categorical_cols.remove(dependent_var)

df[categorical_cols] = df[categorical_cols].fillna('NA')

df[numerical_cols] = df[numerical_cols].fillna(0)

In [108]:
df_one_hot =  pd.get_dummies(df)

In [109]:
df_one_hot

Unnamed: 0,seniorcitizen,tenure,monthlycharges,churn,customerid_0002-orfbo,customerid_0003-mknfe,customerid_0004-tlhlj,customerid_0011-igkff,customerid_0013-exchz,customerid_0013-mhzwf,...,totalcharges_995.35,totalcharges_996.45,totalcharges_996.85,totalcharges_996.95,totalcharges_997.65,totalcharges_997.75,totalcharges_998.1,totalcharges_999.45,totalcharges_999.8,totalcharges_999.9
0,0,1,29.85,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2,53.85,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,45,42.30,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2,70.70,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7039,0,72,103.20,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7040,0,11,29.60,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7041,1,4,74.40,1,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Prepare for Training

    Split the data into Training, Validation, and Testing Datasets

    Prepare Xs (independent vars) and Y (dependent var)


In [110]:
df_full_train, df_test, y_full_train, y_test = train_test_split(
    df_one_hot,
    df[dependent_var], 
    test_size=0.2, 
    random_state=1, 
    stratify=df[dependent_var]
)

df_train, df_val, y_train, y_val = train_test_split(
    df_full_train,
    y_full_train,
    test_size=0.25,
    random_state=1,
    stratify=y_full_train
)

In [111]:
X_full_train = df_full_train.drop(columns=dependent_var)

X_train = df_train.drop(columns=dependent_var)

X_val = df_val.drop(columns=dependent_var)

X_test = df_test.drop(columns=dependent_var)

# Create a training and validation framework

    Declare the model to be used

    Declare metrics to be shown

In [112]:
def train_logistic_regression(X, y, C = 1):
    
    model = LogisticRegression(max_iter=1000, C=C)

    model.fit(X, y)

    return model

In [113]:
def eval_logistic_regression(model, X, y, threshold = 0.5):

    y_proba = model.predict_proba(X)[:, 1]

    y_pred = (y_proba >= threshold).astype(int)

    return roc_auc_score(y, y_pred), precision_recall_fscore_support(y, y_pred), confusion_matrix(y, y_pred)

In [114]:
trained_model = train_logistic_regression(X_train, y_train)

In [115]:
roc_score, prfs, conf_matrix = eval_logistic_regression(trained_model, X_val, y_val)

In [116]:
print(roc_score, prfs, conf_matrix)

0.7290578418455657 (array([0.84981685, 0.66246057]), array([0.89661836, 0.56149733]), array([0.8725905 , 0.60781476]), array([1035,  374], dtype=int64)) [[928 107]
 [164 210]]
