# Import necessary libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_fscore_support, classification_report, roc_curve, accuracy_score
from sklearn.linear_model import LogisticRegression 
import pickle

import json

# Data Transformation

    Normalizing column headers and columns with object datatypes

    Handling null values for both numerical and categorical columns

    Applying one-hot encoding on the dataset

In [19]:
# Load the Telco churn CSV.
df = pd.read_csv('../dataset/telco_customer_churn.csv')

# Headers: trim whitespace, lower-case, and replace spaces with underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")


def _normalize_text(column):
    """Return object-dtype columns trimmed, lower-cased and snake_cased; pass others through."""
    if column.dtype != 'object':
        return column
    return column.str.strip().str.lower().str.replace(" ", "_")


# Apply the same normalisation to the values of every text column.
df = df.apply(_normalize_text)

In [20]:
# Recode the 0/1 senior-citizen flag as 'yes'/'no'.
# Values that are already 'yes' are preserved, so this cell can be re-run
# safely: with a plain `x == 1` test a second execution would turn every
# (already string) value into 'no'.
df['seniorcitizen'] = df['seniorcitizen'].apply(lambda x: 'yes' if x in (1, 'yes') else 'no')

In [21]:
# Categorical features, grouped by theme; concatenation keeps the original order.
demographic_cols = ['gender', 'seniorcitizen', 'partner', 'dependents']

service_cols = [
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
]

account_cols = ['contract', 'paperlessbilling', 'paymentmethod']

categorical_cols = demographic_cols + service_cols + account_cols

# Features that are parsed as numbers.
numerical_cols = ['tenure', 'monthlycharges', 'totalcharges']

# Name of the target column.
dependent_var = 'churn'

In [22]:
# Encode the target as 0/1. Both the raw label 'yes' and an already-encoded 1
# map to 1, so re-running this cell leaves the column unchanged; testing only
# `x == 'yes'` would zero out the whole target on a second execution.
df[dependent_var] = df[dependent_var].apply(lambda x: 1 if x in ('yes', 1) else 0).astype(int)

# Parse numeric features; anything unparseable becomes NaN (errors='coerce')
# and is zero-filled below.
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

# Missing categorical values get an explicit 'NA' level.
df[categorical_cols] = df[categorical_cols].fillna('NA')

# Missing / unparseable numeric values are treated as 0.
df[numerical_cols] = df[numerical_cols].fillna(0)

In [23]:
# Feature matrix: numeric columns as-is, categorical columns one-hot encoded.
feature_cols = numerical_cols + categorical_cols
X = pd.get_dummies(df[feature_cols])

# Target vector.
y = df[dependent_var]

# Prepare for Training

    Split the data into Training, Validation, and Testing Datasets

    Prepare Xs (independent vars) and Y (dependent var)


In [24]:
# Both splits share one seed and are stratified on the target.
split_seed = 1

# Stage 1: hold out 20% of all rows as the test set.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed, stratify=y
)

# Stage 2: carve 25% of the remainder off as validation,
# giving roughly a 60/20/20 train/validation/test partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=split_seed, stratify=y_full_train
)

# Create a training and validation framework

    Declare the model to be used

    Declare metrics to be shown

In [25]:
def train_logistic_regression(X, y, C = 1):
    """Fit a logistic-regression classifier and return it.

    Parameters
    ----------
    X : feature matrix passed to ``LogisticRegression.fit``.
    y : target labels passed to ``LogisticRegression.fit``.
    C : value forwarded to ``LogisticRegression(C=...)``; defaults to 1.

    Returns
    -------
    The fitted ``LogisticRegression`` instance (built with ``max_iter=5000``).
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return LogisticRegression(C=C, max_iter=5000).fit(X, y)

In [26]:
def predict(model, X, threshold = 0.5):
    """Score ``X`` with ``model`` and binarise the scores at ``threshold``.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``; column 1 of its result is used.
    X : inputs forwarded to ``model.predict_proba``.
    threshold : scores greater than or equal to this value are labelled 1.

    Returns
    -------
    (labels, scores) : the 0/1 integer labels and the column-1 scores they
    were derived from.
    """
    positive_proba = model.predict_proba(X)[:, 1]

    is_positive = positive_proba >= threshold

    return is_positive.astype(int), positive_proba

# Cross Validation

    Check model performance across a range of decision thresholds

    Check model performance with 5-fold cross-validation

In [27]:
# Empty list, presumably meant to collect per-threshold scores.
t_scores = list()

# Fit once on the training split; the threshold sweep reuses this model.
trained_model = train_logistic_regression(X_train, y_train, C=1)

# Candidate cut-offs 0.0, 0.1, ..., 0.9 (the stop value 1.0 is excluded).
thresholds = np.arange(0.0, 1.0, 0.1)

In [28]:
# Sweep the decision threshold and record validation accuracy at each value.
# Note: accuracy_score is plain accuracy, not ROC AUC, so it is named and
# printed as such.
t_scores = []  # rebuilt here so re-running the cell does not duplicate entries

for t in thresholds:

    y_pred, y_proba = predict(trained_model, X_val, threshold=t)

    acc = accuracy_score(y_val, y_pred)

    t_scores.append((t, acc))

    # .1f hides float artefacts such as 0.30000000000000004 in the threshold
    print(f't = {t:.1f}, accuracy = {acc}')

t = 0.0, auc = 0.2654364797728886
t = 0.1, auc = 0.6295244854506742
t = 0.2, auc = 0.7104329311568488
t = 0.30000000000000004, auc = 0.7636621717530163
t = 0.4, auc = 0.7877927608232789
t = 0.5, auc = 0.8069552874378992
t = 0.6000000000000001, auc = 0.7970191625266146
t = 0.7000000000000001, auc = 0.7735982966643009
t = 0.8, auc = 0.7388218594748048
t = 0.9, auc = 0.7345635202271115


In [29]:
# Per-fold scores are appended to this list.
scores = list()

# Number of cross-validation folds.
n_splits = 5

In [30]:
# K-fold cross-validation over the full training set (train + validation),
# scoring each fold by accuracy at a 0.5 threshold.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

# Reset in this cell so a re-run starts from an empty list instead of
# appending another n_splits entries to the previous run's scores.
scores = []

for train_idx, val_idx in kfold.split(X_full_train, y_full_train):

    # Fold-local names: X_train / X_val / y_train / y_val keep referring to the
    # hold-out split created earlier instead of being overwritten by the last fold.
    X_fold_train, X_fold_val = X_full_train.iloc[train_idx], X_full_train.iloc[val_idx]

    y_fold_train, y_fold_val = y_full_train.iloc[train_idx].values, y_full_train.iloc[val_idx].values

    fold_model = train_logistic_regression(X_fold_train, y_fold_train)

    fold_pred, _ = predict(fold_model, X_fold_val, threshold=0.5)

    # accuracy_score is plain accuracy (not ROC AUC)
    fold_accuracy = accuracy_score(y_fold_val, fold_pred)

    scores.append(fold_accuracy)

In [31]:
# `scores` holds per-fold accuracy_score values, so label them as accuracy
# rather than AUC.
print(f'mean_accuracy: {np.mean(scores)}')
print(f'std_accuracy: {np.std(scores)}')

mean_auc: 0.8028056693370065
std_auc: 0.011635031992423944


# Training for Final Model

    Train on the full training set (train + validation) and evaluate on the test set

    Use threshold = 0.5 

In [32]:
# Final model: fit on the full training set with C spelled out (1 is the default).
model = train_logistic_regression(X_full_train, y_full_train, C=1)

In [33]:
# Score the held-out test set at the 0.5 decision threshold.
y_pred, y_proba = predict(model, X_test, threshold = 0.5)

# ROC AUC is computed from the predicted probabilities; accuracy from the
# thresholded labels. Named accordingly (accuracy was previously stored in a
# variable called `auc`).
roc_auc = roc_auc_score(y_test, y_proba)
accuracy = accuracy_score(y_test, y_pred)

# Displayed as (ROC AUC, accuracy)
roc_auc, accuracy

(np.float64(0.8350280296571857), 0.8097941802696949)

# Exporting Model

In [34]:
# Destination for the pickled final model.
output_file = 'model_C=1.bin'

# Binary write mode; the context manager closes the file after the dump.
with open(output_file, 'wb') as model_file:
    pickle.dump(model, model_file)