In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sklearn
from imblearn.over_sampling import SMOTE 
import json
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linearModels
import sklearn.ensemble as ensembleModels
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

In [None]:
# Load the churn CSV with customerID as the row index, then preview the first
# rows transposed so each column reads as a line instead of scrolling sideways.
# NOTE(review): the path starts with '.data/' (a dot-prefixed directory), not
# './data/' — confirm this is the intended location.
churn_csv_path = '.data/telco_customer_churn.csv'
df = pd.read_csv(churn_csv_path, index_col='customerID')
df.head().transpose()

In [None]:
# Feature groups referenced by the EDA and preprocessing cells.
# List order is significant: the categorical list also determines the order of
# the subplots in the EDA grid and of the one-hot encoded output columns.
columns = {
    'numeric': ['tenure', 'MonthlyCharges', 'TotalCharges'],
    'categoricals': [
        'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'PhoneService', 'PaperlessBilling', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod',
    ],
}

# EDA

## Numerics

In [None]:
# Horizontal box plot: spread of tenure for each Churn value.
fig = px.box(data_frame=df, x='tenure', y='Churn')
fig.show()

In [None]:
# Horizontal box plot: spread of MonthlyCharges for each Churn value.
fig = px.box(data_frame=df, x='MonthlyCharges', y='Churn')
fig.show()

In [None]:
# TotalCharges needs converting before it can be plotted: single-space entries
# are mapped to 0 and the column is cast to float. The conversion is applied to
# a temporary copy only; `df` itself is left unchanged.
total_charges_numeric = df['TotalCharges'].replace(' ', 0).astype(float)
fig = px.box(
    data_frame=df.assign(TotalCharges=total_charges_numeric),
    x='TotalCharges',
    y='Churn',
)
fig.show()

## Categoricals

In [None]:
# Row counts for every (PaymentMethod, Churn) combination, ordered by label.
df.value_counts(subset=['PaymentMethod', 'Churn']).sort_index()

In [None]:
# Grid of horizontal stacked bars: one subplot per categorical feature, showing
# how many customers fall in each category, split by churn outcome.
categorical_cols = columns['categoricals']
cols = 2
# Ceiling division so the grid always has room for every feature, instead of
# relying on the list containing exactly rows * cols entries.
rows = max(1, -(-len(categorical_cols) // cols))

fig = make_subplots(rows=rows, cols=cols, subplot_titles=categorical_cols)

# Insertion order fixes the stacking order: 'No' first, then 'Yes'.
churn_colors = {'No': 'green', 'Yes': 'red'}

for position, colname in enumerate(categorical_cols):
    # Fill the grid left-to-right, top-to-bottom (0-based here, 1-based for plotly).
    row, col = divmod(position, cols)
    # Unique customers per (category, churn) pair, pivoted so each churn value
    # becomes its own column.
    plot_df = (df.reset_index()
        .groupby([colname, 'Churn'])
        ['customerID'].nunique()
        .unstack('Churn')
        .reset_index()
        .rename(columns={colname: colname.title()})
    )

    for churn_group, color in churn_colors.items():
        fig.add_trace(
            go.Bar(y=plot_df[colname.title()], x=plot_df[churn_group],
                   name=churn_group, legendgroup=churn_group,
                   marker_color=color,
                   orientation='h',
                   # All traces of a churn group share one legendgroup, so a
                   # single legend entry (from the first subplot) is enough;
                   # otherwise the legend repeats once per subplot.
                   showlegend=(position == 0)),
            row=row+1, col=col+1
        )

fig.update_layout(
    title_text='Customers per category, split by churn',
    height=1000,
    width=1500,
    barmode='stack'
)
fig.update_traces(insidetextanchor='middle')

fig.show()

# Preprocessing

In [None]:
# Separate the label from the features. `y` is deliberately kept as a
# one-column DataFrame (2-D) rather than a Series.
target_col_name = 'Churn'
X = df.drop(columns=[target_col_name])
y = df[[target_col_name]]

## Features

In [None]:
# Numeric features: treat single-space strings as missing and replace them with
# 0 (the same substitution the TotalCharges EDA plot applies), then standardise
# with StandardScaler.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=' ', strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Numeric columns go through the pipeline above, categorical columns are
# one-hot encoded, and any column in neither list is passed through untouched.
# OneHotEncoder emits sparse output, and by default ColumnTransformer returns a
# sparse matrix whenever the stacked result is sparse enough. The feature-check
# cell feeds X_trans straight into pd.DataFrame, which needs a dense array, so
# sparse_threshold=0 makes the dense result explicit instead of data-dependent.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, columns['numeric']),
        ('categoricals', OneHotEncoder(), columns['categoricals']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE(review): the preprocessor is fitted on the full feature matrix, before
# the train/test split, so the scaler statistics include rows that later end up
# in the test set. Consider fitting on the training rows only.
X_trans = preprocessor.fit_transform(X)

### Feature Checks

In [None]:
# Sanity check of the encoding: put the transformed features next to the raw
# frame and show the raw PaymentMethod column beside its encoded columns.
encoded_feature_names = preprocessor.get_feature_names_out()
df_X_trans = pd.DataFrame(data=X_trans, index=df.index, columns=encoded_feature_names)
df_joined = df.join(df_X_trans)
cols = list(filter(lambda name: 'PaymentMethod' in name, df_joined.columns))
df_joined.loc[:, cols]

## Target

In [None]:
# Encode the label as a single 0/1 column: with the category order fixed to
# ['No', 'Yes'] and the first category dropped, 'Yes' becomes 1.0 and 'No' 0.0.
# The result is flattened to 1-D.
target_encoder = OneHotEncoder(
    categories=[['No', 'Yes']],
    drop='first',
    sparse_output=False,
)
y_encoded_2d = target_encoder.fit_transform(y)
y_trans = y_encoded_2d.ravel()

## Split

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# Stratifying on the target keeps the churn / non-churn ratio the same in the
# train and test partitions, so the reported precision and recall do not depend
# on how a purely random split happened to distribute the positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans, y_trans,
    train_size=0.8,
    random_state=444,
    stratify=y_trans,
)

## Target

# Baseline

In [None]:
# Baseline: a random forest with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained;
# the bare `model` on the last line keeps the estimator repr as the cell output.
model = ensembleModels.RandomForestClassifier(random_state=11).fit(X_train, y_train)
model

In [None]:
# Score the fitted model on the held-out rows with three standard metrics.
predictions = model.predict(X_test)
accuracy, precision, recall = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score)
)

# Percent-formatted summary, one metric per line.
summary = f"""\
Accuracy:  {accuracy:0.1%}
Precision: {precision:0.1%}
Recall:    {recall:0.1%}
"""
print(summary)

In [None]:
# Pair each transformed feature name with the forest's importance score and
# list the features from most to least important.
importances = pd.DataFrame({
    'feature': preprocessor.get_feature_names_out(),
    'importance': model.feature_importances_,
})
importances.sort_values(by='importance', ascending=False)