## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from Kaggle, train a machine learning model to predict customer churn.

In [116]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from statistics import mean

import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [117]:
# load the training split; customerID is only an identifier, so it is
# discarded here rather than being used as a feature
train = pd.read_csv("./data/training_data.csv").drop(columns=["customerID"])

# load the validation split (left as read from disk, customerID included)
val = pd.read_csv("./data/validation_data.csv")

In [118]:
# preview the first rows of the training frame (customerID already dropped)
train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,5,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),75.15,392.65,No
1,Male,0,Yes,No,66,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,63.85,4264.6,No
2,Male,0,Yes,Yes,42,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,73.15,3088.25,No
3,Male,0,No,No,19,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,69.6,1394.55,No
4,Male,0,No,No,59,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),20.2,1192.3,No


In [119]:
# preview the raw validation frame; it still carries the customerID column
val.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0420-HLGXF,Female,1,No,No,39,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.75,4036.0,No
1,5193-QLVZB,Male,0,No,No,63,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),104.75,6536.5,No
2,5598-IKHQQ,Female,0,No,No,72,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),25.45,1866.45,No
3,8749-CLJXC,Male,0,No,No,1,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.05,20.05,No
4,2252-NKNSI,Male,0,No,Yes,52,Yes,Yes,DSL,Yes,...,No,Yes,Yes,Yes,Two year,Yes,Mailed check,85.15,4461.85,No


In [120]:
# another look at the training frame, limited to its first four rows
train.head(4)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,5,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),75.15,392.65,No
1,Male,0,Yes,No,66,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,63.85,4264.6,No
2,Male,0,Yes,Yes,42,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,73.15,3088.25,No
3,Male,0,No,No,19,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,69.6,1394.55,No


In [121]:
def OHE(dataframe, column):
    """One-hot encode a single column of a DataFrame.

    The indicator columns produced by ``pd.get_dummies`` (named
    ``<column>_<value>``) are appended on the right and the original
    column is removed. The frame passed in is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``column`` replaced by its indicator columns.
    """
    indicators = pd.get_dummies(dataframe[column], prefix=column)
    remaining = dataframe.drop(columns=[column])
    return pd.concat([remaining, indicators], axis=1)

In [122]:
# per-column check for missing values.
# NOTE(review): isnull() only flags NaN/None -- blank strings such as " " are
# not counted, so a text-typed column may still hold empty entries
# (worth confirming for TotalCharges).
train.isnull().any()  # no nulls

gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

In [123]:

#columns_cat=["gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod" ]			

In [124]:
#for column in columns_cat:
#    train[column] = train[column].astype('category')

In [125]:
#train["Churn"] = train["Churn"].astype('category')

In [126]:
#train["TotalCharges"].replace(to_replace=" ", value=0, inplace=True)

#train["TotalCharges"]=pd.to_numeric(train["TotalCharges"],downcast='float')

In [127]:
#for column in columns_cat:
#    train=OHE(train, column)

In [128]:
# Columns holding categorical values (the target "Churn" included); each one
# is converted to integer codes by its own LabelEncoder.
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'Churn']

# Guard against running this cell twice on the same frame: a second pass would
# re-fit every encoder on integer codes instead of the original labels, and
# the encoders saved in col_mapper could no longer transform raw data.
if pd.api.types.is_numeric_dtype(train["Churn"]):
    raise RuntimeError(
        "train is already encoded; re-run the data-loading cell before this one"
    )

# converting all the categorical columns to numeric
# column name -> fitted LabelEncoder, kept to be able to apply the same
# mapping to other data and to inverse-transform later
col_mapper = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the integer
    # dtype of the codes. On pandas >= 2.0 `train.loc[:, col] = ...` writes
    # into the existing object-dtype column instead, leaving the codes stored
    # as objects, which scikit-learn can reject as an unknown label type.
    train[col] = le.fit_transform(train[col])
    col_mapper[col] = le

# converting "Total Charges" to numeric; blank entries (" ") are treated as 0
train["TotalCharges"] = pd.to_numeric(train["TotalCharges"].replace(" ", "0"))

In [129]:
def pre_process_data(df, label_encoder_dict):
    """Apply the training-time preprocessing to a raw data frame.

    Steps, mirroring what the notebook does to the training data:

    1. drop the ``customerID`` identifier column (if present),
    2. encode every column that has an entry in ``label_encoder_dict``
       with that column's fitted encoder,
    3. convert ``TotalCharges`` (if present) to numeric, treating blank
       entries (``" "``) as 0.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to pre-process.
    label_encoder_dict : dict
        Maps column name to a fitted encoder exposing ``transform``.
        Entries for columns missing from ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new, pre-processed frame.

    Raises
    ------
    Whatever the encoder's ``transform`` raises, e.g. for labels it has
    not seen during fitting.
    """
    # Work on an independent copy; errors="ignore" keeps the call valid for
    # frames whose identifier column has already been removed.
    processed = df.drop(columns=["customerID"], errors="ignore").copy()

    for col, encoder in label_encoder_dict.items():
        if col in processed.columns:
            # plain assignment replaces the column so it takes the dtype of
            # the integer codes instead of staying object-typed
            processed[col] = encoder.transform(processed[col])

    # same blank-to-zero numeric conversion that the training data receives
    if "TotalCharges" in processed.columns:
        processed["TotalCharges"] = pd.to_numeric(
            processed["TotalCharges"].replace(" ", "0")
        )

    return processed

In [130]:
#X = train.drop("Churn", axis = "columns")
#y = train["Churn"]

In [131]:
# separate the target column from the feature matrix
y_train = train["Churn"]
x_train = train.drop(columns=["Churn"])

# logistic regression classifier; max_iter is set to 1000, presumably to give
# the solver enough iterations to converge on these features
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [132]:
#rfc_gg = RandomForestClassifier(criterion = "gini", min_samples_split=20, n_estimators=200)

#rfc_scores_gg = cross_val_score(rfc_gg, X, y, cv = 5, scoring = "f1_micro")

#print("random forest gini f1 score is on average", mean(rfc_scores_gg))

In [133]:
# pre-processing validation data
# A copy is handed to pre_process_data and the result gets its own name, so
# the raw `val` frame stays available and this cell can be re-run without
# first reloading the CSV.
val_processed = pre_process_data(val.copy(), col_mapper)

# split validation set
x_val = val_processed.drop("Churn", axis=1)
y_val = val_processed.loc[:, "Churn"]

# predicting on validation
predictions = model.predict(x_val)
# precision/recall/fscore/support are computed per class and kept for
# inspection; only the overall accuracy is printed
precision, recall, fscore, support = precision_recall_fscore_support(y_val, predictions)
accuracy = accuracy_score(y_val, predictions)
print(f"Validation accuracy is: {round(accuracy, 3)}")

Validation accuracy is: 0.837


In [134]:
pwd

'c:\\Users\\alfre\\DSR_practical_DS'

In [136]:
# pickling mdl

# the context manager closes the file even if pickle.dump raises
with open("churn_prediction_model.pkl", "wb") as pickler:
    pickle.dump(model, pickler)

In [138]:
# pickling label encoder dict
# the context manager closes the file even if pickle.dump raises
with open("churn_prediction_label_encoders.pkl", "wb") as pickler:
    pickle.dump(col_mapper, pickler)