# Predict Customer Churn

Setting up the environment...

In [None]:
# Install the packages listed in requirements_py3.8_local.txt by shelling out
# to pip (the leading "!" is the notebook shell escape).
!pip install -r requirements_py3.8_local.txt

In [None]:
# Ask pip to verify that the installed packages have compatible dependencies.
!pip check

## Importing libraries and configurations

In [None]:
import shap
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import plot_roc_curve, classification_report

In [None]:
import os
# Set the QT_QPA_PLATFORM environment variable to 'offscreen' for this process.
# NOTE(review): presumably this lets Qt-based plotting backends run without a
# display (headless) - confirm against the target environment.
os.environ['QT_QPA_PLATFORM']='offscreen'

## Reading data

In [None]:
# Load the bank dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    r"./data/bank_data.csv",
    index_col=0,
)
# Preview the first five rows as the cell output.
df.head(5)

## EDA

In [None]:
# Report the (rows, columns) shape, then show the column names as the cell output.
print(f'Shape of data: {df.shape}', 'Columns of the data:', sep='\n')
df.columns.values

In [None]:
# Count missing values in every column (isna is the alias of isnull).
print('Null values count per column:')
df.isna().sum()

In [None]:
# Display summary statistics of the dataframe as the cell output.
# NOTE(review): describe() is called with its defaults; per the heading printed
# below this is expected to cover only the quantitative columns - confirm.
print('Stats for quantitative columns:')
df.describe()

In [None]:
# Names of the categorical columns (includes the raw target flag).
cat_columns = [
    'Attrition_Flag', 'Gender', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category',
]

# Names of the quantitative (numeric) columns used for EDA and as model features.
quant_columns = [
    'Customer_Age', 'Dependent_count', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
]

In [None]:
# Bar plots of the relative frequency of each category, one panel per
# categorical variable, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(30, 10))
# axes.flat walks the grid row by row, which matches the order of cat_columns.
for ax, col in zip(axes.flat, cat_columns):
    # normalize=True yields proportions instead of raw counts (the previous
    # positional string argument only worked because it was truthy).
    df[col].value_counts(normalize=True).plot.bar(figure=fig, ax=ax)
    ax.set_title(col, fontdict={'fontsize': 'x-large'})
    ax.tick_params(axis='x', rotation=0)

fig.suptitle('Categorical variables plot', fontsize='xx-large')
# To save the plot, uncomment the next line (save before showing the figure):
# fig.savefig('categorical_variables_plot.png')
plt.show()

In [None]:
# Histograms (40 bins) of every quantitative variable, one panel per column,
# laid out on a 5x3 grid.
fig, axes = plt.subplots(5, 3, figsize=(30, 25))
panels = axes.ravel()  # row-major order, same as indexing with i // 3, i % 3
for ax, col in zip(panels, quant_columns):
    df[col].hist(figure=fig, bins=40, ax=ax)
    ax.set_title(col, fontdict={'fontsize': 'x-large'})

# The grid has more panels than there are variables; hide the leftover
# axes so no empty frame is drawn.
for ax in panels[len(quant_columns):]:
    ax.set_visible(False)

fig.suptitle('Quantitative variables plot', fontsize='xx-large')
# To save the plot, uncomment the next line (save before showing the figure):
# fig.savefig('quantitative_variables_plot.png')
plt.show()

In [None]:
# Density histogram of 'Total_Trans_Ct' overlaid with a kernel density estimate curve.
ax = plt.figure(figsize=(20, 10)).gca()
ax.set_title('KDE plot of total transactions')
sns.histplot(df['Total_Trans_Ct'], stat='density', kde=True, ax=ax)
plt.show()
# To save the plot
# plt.savefig('total_transactions_plot.png')

In [None]:
# Correlation heatmap of the numeric variables.
plt.figure(figsize=(20, 10))
# Select numeric/boolean columns explicitly: calling corr() on a frame that
# still contains string columns raises on recent pandas releases, while older
# releases silently dropped those columns. This keeps the result identical on
# old versions and working on new ones.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=False, cmap='Blues', linewidths=2)
plt.title('Correlation map')
# To save the plot, uncomment the next line (save before showing the figure):
# plt.savefig('correlation_map.png')
plt.show()

## ETL

In [None]:
# Build the binary target: 0 where Attrition_Flag is "Existing Customer",
# 1 for every other value.
df['Churn'] = df['Attrition_Flag'].ne("Existing Customer").astype('int64')

### Choose appropriate encoding method

#### 1) Encoding categorical variables by the mean of the target variable (target mean encoding)


In [None]:
# Categorical feature columns that must be turned into numbers before modelling.
to_encode_variables = [
    'Gender', 'Education_Level', 'Marital_Status',
    'Income_Category', 'Card_Category',
]

In [None]:
# Target mean encoding: for every categorical column add a '<col>_Churn'
# column holding the mean of 'Churn' over all rows sharing that category.
#
# NOTE(review): the means are computed over the whole dataframe, i.e. before
# the train/test split done later in this notebook, so the encoded features
# carry target information from rows that end up in the test set (target
# leakage). Consider fitting the encoding on the training rows only.
mean_encoded_cols = []
for col in to_encode_variables:
    col_name = col + '_Churn'
    # Series.map accepts the per-category means directly (index -> value lookup).
    df[col_name] = df[col].map(df.groupby(col)['Churn'].mean())
    mean_encoded_cols.append(col_name)

print(f'Shape of new data: {df.shape}')
print(f'Encoded columns: {mean_encoded_cols}')

#### 2) Encoding categorical variables by one-hot encoding

In [None]:
# One-hot encoding: build the dummy columns for every categorical variable
# (dropping the first level of each), then append them all to df.
dummy_frames = [
    pd.get_dummies(df[col], prefix=col, drop_first=True)
    for col in to_encode_variables
]
# Flat list of the generated column names, in creation order.
one_hot_encoded_cols = [name for frame in dummy_frames for name in frame.columns]
df = pd.concat([df, *dummy_frames], axis=1)

print(f'Shape of new data: {df.shape}')
print(f'Encoded columns: {one_hot_encoded_cols}')

In [None]:
# Target vector.
y = df.loc[:, 'Churn']

# Feature sets: the quantitative columns followed by one of the two encodings.
keep_mean_cols = [*quant_columns, *mean_encoded_cols]
keep_ohe_cols = [*quant_columns, *one_hot_encoded_cols]

## Model and feature selection

### 1) Model training and prediction with target-mean-encoded categorical variables

In [None]:
# Feature matrix for the mean-encoded variant (an independent copy of the selected columns).
X_mean = df[keep_mean_cols].copy()
X_mean.head()

In [None]:
# Train a grid-searched random forest and a logistic regression on the
# mean-encoded features, then predict on both the train and the test split.
# This cell can take several minutes to run (grid search with 5-fold CV).

# train test split: 70% train / 30% test, fixed seed for reproducibility
X_train_mean, X_test_mean, y_train, y_test = train_test_split(
    X_mean, y, test_size=0.3, random_state=42
)

# estimators
rfc_mean = RandomForestClassifier(random_state=42)
# Use a different solver if the default 'lbfgs' fails to converge
# Reference: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lrc_mean = LogisticRegression(solver='lbfgs', max_iter=3000, verbose=1)

# grid search
# For RandomForestClassifier 'auto' is just an alias of 'sqrt' (and is
# deprecated in newer scikit-learn releases), so listing both only fitted
# every candidate twice. Searching 'sqrt' alone selects the same model in
# half the time.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt'],
    'max_depth': [4, 5, 100],
    'criterion': ['gini', 'entropy'],
}

cv_rfc_mean = GridSearchCV(
    estimator=rfc_mean,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
cv_rfc_mean.fit(X_train_mean, y_train)

lrc_mean.fit(X_train_mean, y_train)

# predictions of the best random forest found by the grid search
y_train_preds_rf_mean = cv_rfc_mean.best_estimator_.predict(X_train_mean)
y_test_preds_rf_mean = cv_rfc_mean.best_estimator_.predict(X_test_mean)

# predictions of the logistic regression
y_train_preds_lr_mean = lrc_mean.predict(X_train_mean)
y_test_preds_lr_mean = lrc_mean.predict(X_test_mean)

### 2) Model training and prediction with one-hot encoded categorical variables

In [None]:
# Feature matrix for the one-hot-encoded variant (an independent copy of the selected columns).
X_ohe = df[keep_ohe_cols].copy()
X_ohe.head()

In [None]:
# Train a grid-searched random forest and a logistic regression on the
# one-hot-encoded features, then predict on both the train and the test split.
# This cell can take several minutes to run (grid search with 5-fold CV).

# train test split: 70% train / 30% test, fixed seed for reproducibility
X_train_ohe, X_test_ohe, y_train, y_test = train_test_split(
    X_ohe, y, test_size=0.3, random_state=42
)

# estimators
rfc_ohe = RandomForestClassifier(random_state=42)
# Use a different solver if the default 'lbfgs' fails to converge
# Reference: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lrc_ohe = LogisticRegression(solver='lbfgs', max_iter=3000, verbose=1)

# grid search
# For RandomForestClassifier 'auto' is just an alias of 'sqrt' (and is
# deprecated in newer scikit-learn releases), so listing both only fitted
# every candidate twice. Searching 'sqrt' alone selects the same model in
# half the time.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt'],
    'max_depth': [4, 5, 100],
    'criterion': ['gini', 'entropy'],
}

cv_rfc_ohe = GridSearchCV(
    estimator=rfc_ohe,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,  # use all available cores
)
cv_rfc_ohe.fit(X_train_ohe, y_train)

lrc_ohe.fit(X_train_ohe, y_train)

# predictions of the best random forest found by the grid search
y_train_preds_rf_ohe = cv_rfc_ohe.best_estimator_.predict(X_train_ohe)
y_test_preds_rf_ohe = cv_rfc_ohe.best_estimator_.predict(X_test_ohe)

# predictions of the logistic regression
y_train_preds_lr_ohe = lrc_ohe.predict(X_train_ohe)
y_test_preds_lr_ohe = lrc_ohe.predict(X_test_ohe)

### Model and feature selection scores

In [None]:
# Print the test and train classification reports for every
# (model, encoding) combination, in a fixed order.
report_sections = [
    ('Random Forest results for mean encoding',
     y_test_preds_rf_mean, y_train_preds_rf_mean),
    ('Logistic Regression results for mean encoding',
     y_test_preds_lr_mean, y_train_preds_lr_mean),
    ('Random Forest results for one-hot encoding',
     y_test_preds_rf_ohe, y_train_preds_rf_ohe),
    ('Logistic Regression results for one-hot encoding',
     y_test_preds_lr_ohe, y_train_preds_lr_ohe),
]

for heading, test_preds, train_preds in report_sections:
    print(heading)
    print('Test results')
    print(classification_report(y_test, test_preds))
    print('Train results')
    print(classification_report(y_train, train_preds))

In [None]:
# Continue with the mean-encoded variant: alias its data, predictions and
# fitted models under the generic names used by the evaluation cells.
X = X_mean
X_train = X_train_mean
X_test = X_test_mean

y_train_preds_rf = y_train_preds_rf_mean
y_test_preds_rf = y_test_preds_rf_mean

y_train_preds_lr = y_train_preds_lr_mean
y_test_preds_lr = y_test_preds_lr_mean

cv_rfc = cv_rfc_mean
lrc = lrc_mean

print(f'Best random forest classifier parameters:\n{cv_rfc.best_params_}')

## Model evaluation

In [None]:
# ROC curve of the logistic regression on the test split; the returned display
# object is kept so the curve can be re-drawn on another axes later.
lrc_plot = plot_roc_curve(estimator=lrc, X=X_test, y=y_test)

In [None]:
# Draw the random forest and logistic regression ROC curves on one shared axes.
ax = plt.figure(figsize=(15, 8)).gca()
rfc_disp = plot_roc_curve(
    cv_rfc.best_estimator_, X_test, y_test, ax=ax, alpha=0.8
)
lrc_plot.plot(ax=ax, alpha=0.8)
plt.show()
# To save the plot
# plt.savefig('roc_curve.png')

In [None]:
# Persist the best random forest and the logistic regression to ./models.
import os

# joblib.dump does not create missing directories; make sure the target
# folder exists so saving does not fail on a fresh checkout.
os.makedirs('./models', exist_ok=True)

joblib.dump(cv_rfc.best_estimator_, './models/rfc_model.pkl')
joblib.dump(lrc, './models/lrc_model.pkl')

In [None]:
# Reload both persisted models from disk.
# NOTE: these are pickle-based files - only load model files you trust.
rfc_model, lr_model = (
    joblib.load(model_path)
    for model_path in ('./models/rfc_model.pkl', './models/lrc_model.pkl')
)

In [None]:
# ROC curve of the reloaded logistic regression on the test split; the display
# object is kept so the curve can be re-drawn on another axes later.
lrc_plot = plot_roc_curve(estimator=lr_model, X=X_test, y=y_test)

In [None]:
# Draw the ROC curves of the reloaded random forest and logistic regression
# on one shared axes.
ax = plt.figure(figsize=(15, 8)).gca()
rfc_disp = plot_roc_curve(rfc_model, X_test, y_test, ax=ax, alpha=0.8)
lrc_plot.plot(ax=ax, alpha=0.8)
plt.show()
# To save the plot
# plt.savefig('roc_curve.png')

In [None]:
# Compute SHAP values of the best random forest on the test features and show
# them as a bar-type summary plot.
explainer = shap.TreeExplainer(cv_rfc.best_estimator_)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type = "bar")
# To save the plot, draw it with show=False and then save the current figure:
# shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
# plt.savefig('feature_impacts.png')

In [None]:
# Bar chart of the random forest feature importances, largest first.
importances = cv_rfc.best_estimator_.feature_importances_
# Positions of the features ordered from most to least important
indices = np.argsort(importances)[::-1]
# Feature names in that same order
names = X.columns[indices].tolist()

bar_positions = range(X.shape[1])

ax = plt.figure(figsize=(20, 5)).gca()
ax.set_title("Feature Importance - random forest classifier")
ax.set_ylabel('Importance')
ax.bar(bar_positions, importances[indices])

# Label every bar with its feature name, written vertically
ax.set_xticks(bar_positions)
ax.set_xticklabels(names, rotation=90)
plt.show()
# To save the plot
# plt.savefig('feature_importance.png')

In [None]:
# Bar chart of the logistic regression coefficients, used as a proxy for
# feature importance.
# NOTE(review): the features are not scaled before fitting, so coefficient
# magnitudes are only a rough indication of importance.
importances = lr_model.coef_[0]
# Rank by absolute value: a strongly negative coefficient is as influential as
# a strongly positive one, and sorting the signed values would push it to the
# "least important" end of the chart.
indices = np.argsort(np.abs(importances))[::-1]

# Rearrange feature names so they match the sorted coefficients
names = [X.columns[i] for i in indices]

# Create plot
plt.figure(figsize=(20, 5))

# Create plot title
plt.title("Feature Importance - logistic regression classifier")
plt.ylabel('Importance')

# Add bars (signed values, so the direction of each effect stays visible)
plt.bar(range(X.shape[1]), importances[indices])

# Add feature names as x-axis labels
plt.xticks(range(X.shape[1]), names, rotation=90)
# To save the plot, uncomment the next line (save before showing the figure;
# a distinct file name avoids overwriting the random forest chart):
# plt.savefig('logistic_regression_feature_importance.png')
plt.show()

In [None]:
# Render the random forest classification reports as a text-only figure.
plt.rc('figure', figsize=(5, 5))

text_style = {'fontsize': 10}
# (y position, text) pairs listed in reading order, top to bottom.
# Monospace keeps the report columns aligned.
text_blocks = [
    (1.25, str('Random Forest Train')),
    (0.7, str(classification_report(y_train, y_train_preds_rf))),
    (0.6, str('Random Forest Test')),
    (0.05, str(classification_report(y_test, y_test_preds_rf))),
]
for y_pos, content in text_blocks:
    plt.text(0.01, y_pos, content, text_style, fontproperties='monospace')

plt.axis('off')
plt.show()
# To save the plot
# plt.savefig('random_forest_scores.png')

In [None]:
# Render the logistic regression classification reports as a text-only figure.
# Statements are listed in reading order, top to bottom. With matplotlib's
# default alignment a multi-line text block extends upward from its y
# coordinate, so the report at y=0.7 sits under the heading at y=1.25 and the
# report at y=0.05 sits under the heading at y=0.6.
plt.rc('figure', figsize=(5, 5))
plt.text(0.01, 1.25, str('Logistic Regression Train'), {'fontsize': 10}, fontproperties='monospace')
# Train report goes under the "Train" heading (it was previously drawn under "Test").
plt.text(0.01, 0.7, str(classification_report(y_train, y_train_preds_lr)), {'fontsize': 10}, fontproperties='monospace')
plt.text(0.01, 0.6, str('Logistic Regression Test'), {'fontsize': 10}, fontproperties='monospace')
# Test report goes under the "Test" heading (it was previously drawn under "Train").
plt.text(0.01, 0.05, str(classification_report(y_test, y_test_preds_lr)), {'fontsize': 10}, fontproperties='monospace')
plt.axis('off')
# To save the plot, uncomment the next line (save before showing the figure):
# plt.savefig('logistic_regression_scores.png')
plt.show()