Install and Import Needed Libraries

In [0]:
%pip install xgboost

In [0]:
%pip install mlflow

In [0]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
 
from xgboost import XGBClassifier
 
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier
 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, log_loss, precision_recall_curve, auc, average_precision_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
 
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import numpy as np

Set up mlflow experiment in the user's personal workspace folder

In [0]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

Set up mlflow experiment in the user's personal workspace folder

In [0]:
useremail = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
experiment_name = f"/Users/{useremail}/churn"
mlflow.set_experiment(experiment_name) 

Data Preparation based on EDA


In [0]:
def datapreparation(df):
    # Define the new column names
    new_column_names = [
        "customerID", "gender", "SeniorCitizen", "Partner", "Dependents",
        "tenure", "PhoneService", "MultipleLines", "InternetService", 
        "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
        "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling",
        "PaymentMethod", "MonthlyCharges", "TotalCharges", "Churn"
    ]

    # Rename the columns
    df.columns = new_column_names

    # Drop the first row
    df = df.drop(index=0)
    df.drop(["customerID"], inplace=True, axis=1)
    
    df.TotalCharges = df.TotalCharges.replace(" ", np.nan)
    df.TotalCharges.fillna(0, inplace=True)
    df.TotalCharges = df.TotalCharges.astype(float)
    
    cols1 = ['Partner', 'Dependents', 'PaperlessBilling', 'Churn', 'PhoneService']
    for col in cols1:
        df[col] = df[col].apply(lambda x: 0 if x == "No" else 1)
    
    df.gender = df.gender.apply(lambda x: 0 if x == "Male" else 1)
    df.MultipleLines = df.MultipleLines.map({'No phone service': 0, 'No': 0, 'Yes': 1})
    
    cols2 = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    for col in cols2:
        df[col] = df[col].map({'No internet service': 0, 'No': 0, 'Yes': 1})
    
    df = pd.get_dummies(df, columns=['InternetService', 'Contract', 'PaymentMethod'], drop_first=True)
    
    return df


Import Dataframe

In [0]:
# Create a Spark DataFrame from the table
df_spark = spark.read.table("WA_Fn_UseC__Telco_Customer_Churn_csv")

# Convert the Spark DataFrame to a Pandas DataFrame
df = df_spark.toPandas()

df = datapreparation(df)

# Display the DataFrame
df

Some Feature engineering

In [0]:
# Ensure that SeniorCitizen, tenure, and MonthlyCharges are numeric
df['SeniorCitizen'] = pd.to_numeric(df['SeniorCitizen'], errors='coerce')
df['tenure'] = pd.to_numeric(df['tenure'], errors='coerce')
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'], errors='coerce')

X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure all columns in X_train and X_test are numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Check the data types
print(X_train.dtypes)
print(X_test.dtypes)

Logistic Regression

In [0]:
# train the model
lreg_model = LogisticRegression(max_iter=10000, class_weight='balanced')
lreg_model.fit(X_train, y_train)
 
# predict
lreg_y_prob = lreg_model.predict_proba(X_test)
 
# evaluate
lreg_ap = average_precision_score(y_test, lreg_y_prob[:,1])

Extreme Gradient Boosted Tree (XGBoost)

In [0]:
# Ensure that SeniorCitizen, tenure, and MonthlyCharges are numeric
df['SeniorCitizen'] = pd.to_numeric(df['SeniorCitizen'], errors='coerce')
df['tenure'] = pd.to_numeric(df['tenure'], errors='coerce')
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'], errors='coerce')

X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure all columns in X_train and X_test are numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Check the data types
print(X_train.dtypes)
print(X_test.dtypes)



Define weights for the XGBoost model 

In [0]:
weights = compute_class_weight(
  'balanced', 
  classes=np.unique(y_train), 
  y=y_train
  )
 
weights

Extreme Gradient Boosted Tree (XGBoost)

In [0]:
# normalize class weights so that positive class reflects a 1.0 weight on negative class
scale = weights[1]/weights[0]
 
# train the model
xgb_model = XGBClassifier(scale_pos_weight=scale) # similar to class_weights arg but applies to positive class only
xgb_model.fit(X_train, y_train)
 
# predict
xgb_y_prob = xgb_model.predict_proba(X_test)
 
# evaluate
xgb_ap = average_precision_score(y_test, xgb_y_prob[:,1])

Neural Network

In [0]:
# train the model
mlp_model = MLPClassifier(activation='relu', max_iter=1000)  # does not support class weighting
mlp_model.fit(X_train, y_train)
 
# predict
mlp_y_prob = mlp_model.predict_proba(X_test)
 
# evaluate
mlp_ap = average_precision_score(y_test, mlp_y_prob[:,1])

Compare Model Results

In [0]:
print('Logistic Regression AP:\t\t{0:.6f}'.format(lreg_ap))
# print('RandomForest Classifier AP:\t{0:.6f}'.format(rfc_ap))
print('XGBoost Classifier AP:\t\t{0:.6f}'.format(xgb_ap))
print('MLP (Neural Network) AP:\t{0:.6f}'.format(mlp_ap))

Hist Gradient Boost Classifier

In [0]:
# compute sample weights (functionally equivalent to class weights when done in this manner)
sample_weights = compute_sample_weight(
  'balanced', 
  y=y_train
  )
 
# train the model
hgb_model = HistGradientBoostingClassifier(loss='binary_crossentropy', max_iter=1000)
hgb_model.fit(X_train, y_train, sample_weight=sample_weights)  # weighting applied to individual samples
 
# predict
hgb_y_prob = hgb_model.predict_proba(X_test)
 
# evaluate
hgb_ap = average_precision_score(y_test, hgb_y_prob[:,1])
print('HistGB Classifier AP:\t{0:.6f}'.format(hgb_ap))

Voting Ensemble

In [0]:
# train the model
vote_model = VotingClassifier(
  estimators=[
    ('hgb', HistGradientBoostingClassifier(loss='binary_crossentropy', max_iter=1000)), 
    ('xgb', XGBClassifier()),
    ('mlp', MLPClassifier(activation='relu', max_iter=1000))
    ],
  voting='soft'
  )
vote_model.fit(X_train, y_train)
 
# predict
vote_y_prob = vote_model.predict_proba(X_test)

# evaluate
vote_ap = average_precision_score(y_test, vote_y_prob[:,1])
print('Voting AP:\t{0:.6f}'.format(vote_ap))

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-2884970239900466>:2[0m
[1;32m      1[0m [38;5;66;03m# train the model[39;00m
[0;32m----> 2[0m vote_model [38;5;241m=[39m VotingClassifier(
[1;32m      3[0m   estimators[38;5;241m=[39m[
[1;32m      4[0m     ([38;5;124m'[39m[38;5;124mhgb[39m[38;5;124m'[39m, HistGradientBoostingClassifier(loss[38;5;241m=[39m[38;5;124m'[39m[38;5;124mbinary_crossentropy[39m[38;5;124m'[39m, max_iter[38;5;241m=[39m[38;5;241m1000[39m)), 
[1;32m      5[0m     ([38;5;124m'[39m[38;5;124mxgb[39m[38;5;124m'[39m, XGBClassifier()),
[1;32m      6[0m     ([38;5;124m'[39m[38;5;124mmlp[39m[38;5;124m'[39m, MLPClassifier(activation[38;5;241m=[39m[38;5;124m'[39m[38;5;124mrelu[39m[38;5;124m'[39m, max_iter[38;5;241m=[39m[38;5;241m1000[39m))
[1;32m      7[0m     ],
[1;32m 