In [1]:
import pandas as pd

In [2]:
# Quick look at the raw Telco churn export: read it and list the column names.
# NOTE(review): absolute, machine-specific path; the same literal is repeated
# in the configuration cell further down.
telco_csv = "C:/ML_Projects/customer_churn_predictor/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=telco_csv)
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
import logging

# --- Run configuration ------------------------------------------------------
# Name of the label column in the churn data set.
target = 'Churn'
# Location of the raw CSV and of the directory for processed artefacts.
raw = "C:/ML_Projects/customer_churn_predictor/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
processed = "C:/ML_Projects/customer_churn_predictor/data/processed"

# --- Logging ----------------------------------------------------------------
# INFO level on the root logger so that log records emitted by the project
# modules are shown in the notebook output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
from src.data_preprocessing import load_data,basic_clean,feature_engineer,split_save,get_num_and_cat_cols

# Preprocessing chain: raw CSV -> cleaned frame -> feature-engineered frame.
# `df` is rebound at every stage; the final value is what later cells use.
df = load_data(raw)
df = basic_clean(df)
df = feature_engineer(df)

# The former mid-cell `df.head()` was removed: only the last expression of a
# cell is displayed, so it never produced any output.

# NOTE(review): split_save is called with its defaults; the `processed` and
# `target` values from the configuration cell are not passed in. Presumably
# the helper also writes the splits to disk and fixes its own random seed;
# confirm in src/data_preprocessing.py.
x_train,x_test,y_train,y_test = split_save(df)

INFO:src.data_preprocessing:loaded raw data: (7043, 21)


In [5]:
# Ask the project helper for the column names to treat as numeric and as
# categorical; it returns the two groups in that order.
# NOTE(review): `df` is the full engineered frame here; confirm the helper
# leaves the target column out of both groups.
column_groups = get_num_and_cat_cols(df)
num_cols, cat_cols = column_groups


In [6]:
from src.pipeline import build_preprocessor

# Build the preprocessing step from the numeric and categorical column groups.
# It is handed to every model candidate in the next cell.
preprocessor = build_preprocessor(
    num_cols,
    cat_cols,
)

In [7]:
from src.trainer import build_model_candidates

# Mapping of candidate name -> estimator built around the shared preprocessor.
# The names shown below are the keys that `param_grids` must use.
models = build_model_candidates(
    preprocessor,
)
models.keys()

dict_keys(['logisticregresion', 'RandomForest', 'xgb'])

In [8]:
# Hyper-parameter search space per candidate, keyed by the candidate's name in
# `models`. The `clf__` prefix addresses the step named 'clf' of each pipeline.
param_grids = {}

# Tree ensembles: number of trees and maximum tree depth.
param_grids['RandomForest'] = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10],
}
param_grids['xgb'] = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 6],
}

# Logistic regression: values tried for the `C` parameter.
param_grids['logisticregresion'] = {
    'clf__C': [0.01, 0.1, 1],
}

In [9]:
from sklearn.model_selection import GridSearchCV

# Tune every candidate that has a search grid, ranking parameter combinations
# by cross-validated average precision; candidates without a grid are fitted
# as they are. `best_models` maps candidate name -> fitted estimator.
best_models = {}
for name,model in models.items():
    # .get() instead of [name]: a candidate with no entry in `param_grids`
    # used to raise KeyError here, which made the plain-fit branch below
    # unreachable for exactly the case it was written for.
    grid = param_grids.get(name)
    if grid:
        gs = GridSearchCV(model,grid,scoring='average_precision')
        gs.fit(x_train,y_train)
        best_models[name] = gs.best_estimator_
        # Record what the search selected so the run can be audited later.
        logger.info("%s: best params %s (cv average precision %.4f)",
                    name,gs.best_params_,gs.best_score_)
    else:
        model.fit(x_train,y_train)
        best_models[name] = model

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [10]:
from sklearn.metrics import average_precision_score,classification_report,confusion_matrix


def _positive_class_scores(estimator, features):
    """Return a ranking score per row for the last (positive) class.

    Preference order: the last column of ``predict_proba``, then
    ``decision_function``, then hard ``predict`` output as a last resort.
    The choice is made by checking which method the estimator exposes rather
    than by catching every exception, so a real failure inside
    ``predict_proba`` (for example mismatched columns) is raised instead of
    silently degrading the metric to one computed on hard labels.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(features)[:,-1]
    if hasattr(estimator, 'decision_function'):
        return estimator.decision_function(features)
    return estimator.predict(features)


# Evaluate every tuned candidate on the held-out split.
# results[name]['ap'] is the value the model-selection cell reads; the report
# and confusion matrix make use of `y_pred`, which was previously computed and
# then thrown away.
# NOTE(review): the same split is used below to pick the winner, so the
# winner's score on it is optimistic; consider selecting on the CV scores.
results = {}
for name,m in best_models.items():
    y_score = _positive_class_scores(m, x_test)
    ap = average_precision_score(y_test,y_score)
    y_pred = m.predict(x_test)
    results[name] = {
        'ap':ap,
        'report':classification_report(y_test,y_pred),
        'confusion_matrix':confusion_matrix(y_test,y_pred),
    }
    

In [11]:
import joblib

# Pick the candidate with the highest average precision in `results` and
# persist its fitted estimator.
# NOTE(review): absolute, machine-specific output path.
best_name = max(results, key=lambda candidate: results[candidate]['ap'])
best_model = best_models[best_name]
joblib.dump(
    best_model,
    "C:/ML_Projects/customer_churn_predictor/models/best_model.joblib",
)

['C:/ML_Projects/customer_churn_predictor/models/best_model.joblib']

In [12]:
from src.evaluation import plot_confusion, plot_presicion_recall, save_top_features_perm

# Diagnostics for the selected model on the held-out split: the two plotting
# helpers run first, in their original order.
for draw_plot in (plot_confusion, plot_presicion_recall):
    draw_plot(best_model, x_test, y_test)

# Kept as the last expression so that its return value (the feature table
# shown below) is displayed as the cell output.
save_top_features_perm(best_model, x_test, y_test)

Unnamed: 0,features,importances
1,num__tenure,0.034493
18,cat__OnlineSecurity_No,0.03286
7,cat__Partner_Yes,0.029383
23,cat__OnlineBackup_Yes,0.026544
39,cat__PaperlessBilling_No,0.025479
4,cat__gender_Female,0.025266
21,cat__OnlineBackup_No,0.024982
20,cat__OnlineSecurity_Yes,0.023776
6,cat__Partner_No,0.023634
5,cat__gender_Male,0.022782


In [15]:
import shap
import matplotlib.pyplot as plt

# Split the fitted pipeline into its preprocessing step and its classifier.
pre = best_model.named_steps['preprocessor']
clf = best_model.named_steps['clf']
# If the 'clf' step is itself a pipeline, explain its last step.
# NOTE(review): in that case `x_trans` below only reflects `pre`, not any
# earlier steps inside `clf`; confirm that no candidate nests transformers.
if hasattr (clf,'named_steps'):
    final_estimator = list(clf.named_steps.values())[-1]
else:
    final_estimator = clf

# SHAP is computed on the matrix the estimator actually sees.
x_trans = pre.transform(x_test)
try:
    feature_names = pre.get_feature_names_out()
except Exception as exc:
    # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
    # and hid the reason for the fallback.
    logger.warning("could not read transformed feature names (%s); using fallback names",exc)
    raw_names = x_test.columns.to_list()
    n_features = x_trans.shape[1]
    # The raw column names are only valid labels if the preprocessor kept the
    # column count; after one-hot encoding it usually did not, so fall back to
    # positional names instead of handing SHAP a list of the wrong length.
    if len(raw_names) == n_features:
        feature_names = raw_names
    else:
        feature_names = [f"feature_{i}" for i in range(n_features)]

# Tree ensembles get the dedicated TreeExplainer; everything else goes through
# the generic Explainer with the transformed test matrix as background data.
estimator_type = str(type(final_estimator)).lower()
if any(s in estimator_type for s in ['xgb','gradientboost','randomforest']):
    explainer = shap.TreeExplainer(final_estimator)
    shap_vals = explainer.shap_values(x_trans)
    # NOTE(review): for some classifiers/shap versions this holds one set of
    # values per class; confirm the saved plot shows the churn class.
else:
    explainer = shap.Explainer(final_estimator,x_trans)
    shap_vals = explainer(x_trans)

# Plotting and saving were duplicated in both branches; done once here.
# show=False keeps the figure open so that it can be written to disk.
shap.summary_plot(shap_vals,x_trans,feature_names=feature_names,show=False)
plt.savefig("C:/ML_Projects/customer_churn_predictor/reports/shap_summary.png")
plt.close()

  from .autonotebook import tqdm as notebook_tqdm
