In [None]:
""" 
In this tutorial, you will learn how to analyse and predict customer churn using Scikit-learn, 
telecommunication data, and SMOTE. Specifically, you will learn how to: 

- Follow a project-based approach
- Automate the exploratory analysis of churn data
- Use Scikit-learn to predict customer churn from telecommunication data
- Write detailed, explicit helper functions
- Deal with imbalanced data using SMOTE
- Choose metrics appropriate for predictions on imbalanced data

"""

In [None]:
""" Import packages """
import os
import pandas as pd
import numpy as np
import missingno as msno   # To assess missing values pattern
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc

from collections import Counter
from imblearn.over_sampling import SMOTE


In [None]:
""" Load the data """
# The raw data lives in a "data" folder one level above the current working directory.
csv_file = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
dataf = os.path.join(os.getcwd(), os.pardir, "data")

# Build the full path once, then read the CSV into a DataFrame.
csv_path = os.path.join(dataf, csv_file)
churndf = pd.read_csv(csv_path)

# Display the loaded frame as the cell output.
churndf

In [None]:
""" Data cleaning 
Some numeric columns are stored with a different data type, so let's use a helper function that converts 
the specified columns of a given data frame to a numeric data type.
"""

# Convert the charge columns to a numeric dtype.
# NOTE(review): presumably the helper turns unparseable entries into NaN, which is
# why missing values are checked right after — confirm in convert_col_to_numeric.
from convert_col_to_numeric import convert_col_to_numeric
churndf = convert_col_to_numeric( df = churndf, columns = ["TotalCharges", "MonthlyCharges" ] )

# Report the number of missing values per column, then drop the incomplete rows.
print( churndf.isnull().sum() )
churndf = churndf.dropna()

# Actually verify the removal: a bare expression in the middle of a cell is
# evaluated and then discarded, so it would neither display nor check anything.
assert not churndf.isnull().values.any(), "Missing values remain after dropna()"

# Drop the customerID as it is not needed in the analysis.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
churndf = churndf.drop(columns=['customerID'], errors='ignore')

In [None]:
""" Data Characteristics """

from data_characteristics import data_characteristics

# Compute the summary of the cleaned frame first, then print it.
characteristics = data_characteristics(df=churndf)
print(characteristics)


In [None]:
""" EDA : Exploratory data analysis 
"""

# SeniorCitizen is a categorical variable, but it is encoded as 0/1, which can
# mislead Python into treating it as numerical. Recode it: 0 as "No", 1 as "Yes".
# replace() is used instead of map() because map() turns every value missing from
# the dictionary into NaN — re-running this cell would wipe the already recoded
# "No"/"Yes" values. replace() leaves unmatched values untouched.
churndf["SeniorCitizen"] = churndf["SeniorCitizen"].replace({0: "No", 1: "Yes"})

# Run the automated exploratory analysis against the Churn target.
from automated_eda_plotly import automated_eda_plotly
automated_eda_plotly(data=churndf, target_var="Churn")



In [None]:
""" Data preprocessing

Note that for this prediction task we will use the random forest classifier from the Scikit-learn package.

In this analysis we are dealing with categorical data: some variables are binary and others are nominal. Because we are
using an ML algorithm that relies on numerical computations, encoding the categorical variables is our next step. 
"""

from label_encode_columns import label_encode_columns
from convert_to_binary import convert_to_binary

# Nominal variables (more than two categories) are label encoded.
cols_to_label_encode = [ 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
# Two-level variables, including the Churn target, are converted to binary.
cols_to_convert_to_binary = [ 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn' ]

# Work on a real copy: a plain assignment only creates a second name for the same
# object, so any in-place change made by the encoders would also alter churndf.
churndf_copy = churndf.copy()
churndf_copy = label_encode_columns( churndf_copy, cols_to_label_encode)
churndf_copy = convert_to_binary( churndf_copy, cols_to_convert_to_binary)

# Check if the conversion worked as expected.
churndf_copy


In [None]:
''' 
Churn data prediction using random forest
'''

# Split the data into a training set (used to learn the relationship between the
# features and the Churn variable) and a test set (used to evaluate the performance
# of the model) with the helper split_dataset.
from split_dataset import split_dataset

SEED = 50
X_train, X_test, Y_train, Y_test = split_dataset(
    df=churndf_copy,
    target_column='Churn',
    test_size=0.2,
    random_state=SEED,
)

# Build the random forest model with specific hyperparameter values using the
# helper build_rf_model.
from build_rf_model import build_rf_model

rf_hyperparams = dict(
    n_estimators=500,
    oob_score=True,
    n_jobs=4,
    random_state=SEED,
    max_features="sqrt",
    max_leaf_nodes=30,
)
churn_rf_model = build_rf_model(X_train, Y_train, **rf_hyperparams)

# Evaluate the fitted model on the test set to find out whether it predicts accurately.
from assess_model_accuracy import assess_model_accuracy
accuracy = assess_model_accuracy(churn_rf_model, X_test, Y_test)
test_predictions = accuracy['predictions']

print(f"The accuracy of the fitted model: {accuracy['accuracy']:.2f}")

# Plot the confusion matrix to see which labels were predicted better or worse.
from plot_confusion_matrix import plot_confusion_matrix
plot_confusion_matrix(Y_test, test_predictions, title="RF confusion matrix")

# Print the classification report.
print(classification_report(Y_test, test_predictions))


In [None]:
""" Let's evaluate the model using alternative metrics like ROC-AUC and PR-AUC  """
    
from evaluate_model_probabilities import evaluate_model_probabilities

# Threshold-independent view of the baseline model: per the helper's description it
# computes ROC-AUC and PR-AUC and plots the precision-recall curve.
roc_pr_auc = evaluate_model_probabilities(
    churn_rf_model,
    X_test,
    Y_test,
)

In [None]:
""" Synthesize new training samples using SMOTE """

from apply_smote import apply_smote

# Oversample the training split only; the test split is not passed to SMOTE.
X_train_new, Y_train_new = apply_smote(
    X_train,
    Y_train,
    seed=SEED,
    k_neighbors=5,
)

# Put the resampled features and target side by side in a single DataFrame.
resampled_parts = [X_train_new, Y_train_new]
smote_churndf = pd.concat(resampled_parts, axis=1)

# Repeat the EDA to see how Churn is distributed in relation to the features
# after resampling.
automated_eda_plotly(data=smote_churndf, target_var="Churn")


In [None]:
""" 
Churn data prediction using random forest, refitted on the SMOTE-resampled training data

"""
# Refit the random forest on the SMOTE-resampled training data. The hyperparameters
# are kept identical to the baseline model (n_estimators=500) so that a change in
# the metrics reflects the resampling and not a smaller forest.
churn_rf_mod_new = build_rf_model( X_train_new, Y_train_new, n_estimators=500, oob_score=True, n_jobs=4, 
              random_state=SEED, max_features="sqrt", max_leaf_nodes=30 )

# Evaluate the refitted model on the same test split as the baseline.
accuracy_new = assess_model_accuracy(churn_rf_mod_new, X_test, Y_test)

print(f"The accuracy of the fitted model: {accuracy_new['accuracy']:.2f}")

# Plot the confusion matrix to see which labels were predicted better or worse.
plot_confusion_matrix(Y_test, accuracy_new['predictions'], 
                      title="RF confusion matrix")

# Print the classification report.
print(classification_report( Y_test, accuracy_new['predictions'] ))

# Compute ROC-AUC & PR-AUC and plot the precision-recall curve.
# evaluate_model_probabilities is already imported in the baseline evaluation cell.
# The result gets its own name (matching the other *_new variables) so the
# baseline metrics held in roc_pr_auc are not overwritten and stay comparable.
roc_pr_auc_new = evaluate_model_probabilities(churn_rf_mod_new, X_test, Y_test)


""" 
Thank you for following through till the end. I look forward to receiving your feedback and hearing more about how this blog was helpful to you. 
Do reach out if you have any questions. 

"""