In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sklearn
from imblearn.over_sampling import SMOTE 
import json
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import sklearn.linear_model as linearModels
import sklearn.ensemble as ensembleModels
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import mlflow
from mlflow.models import infer_signature

# Select the MLflow experiment that every run started in this notebook is recorded under.
mlflow.set_experiment(experiment_name='Telco')

In [None]:
# Load the churn table, using the `customerID` column as the row index.
# NOTE(review): the path is relative to the working directory and begins with
# `.data/` (a dot-directory), not `./data/` - confirm that is intended.
df = pd.read_csv(
    '.data/telco_customer_churn.csv',
    index_col='customerID',
)

# Transposed preview of the first five rows: one line per column.
df.head(n=5).T

In [None]:
# Feature columns grouped by the kind of preprocessing they receive.
columns = {
    # Scaled after imputation.
    'numeric': ['tenure', 'MonthlyCharges', 'TotalCharges'],
    # One-hot encoded.
    'categoricals': [
        'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'PhoneService', 'PaperlessBilling', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod',
    ],
}

# Preprocessing

In [None]:
# Separate the label column from the features.
target_col_name = 'Churn'

# Features: every column except the target, in their original order.
X = df.loc[:, df.columns != target_col_name]

# Target: kept as a one-column DataFrame (2-D) rather than a Series.
y = df[[target_col_name]]

## Features

In [None]:
# Numeric features: entries equal to a single space (' ') are treated as
# missing and replaced with 0, then the columns go through StandardScaler.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=' ', strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Route each column group to its transformer; columns listed in neither group
# are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipeline, columns['numeric']),
        ('categoricals', OneHotEncoder(), columns['categoricals'])
    ],
    remainder='passthrough',
    # OneHotEncoder produces sparse output by default, so with the default
    # sparse_threshold the combined result is sparse or dense depending on its
    # density. Later cells build a DataFrame from X_trans and need a dense
    # array, so always return dense output.
    sparse_threshold=0,
)

# NOTE(review): the preprocessor is fitted on every row of X, i.e. before the
# train/test split further down, so the fitted scaler statistics and category
# lists also reflect rows that end up in the test set.
X_trans = preprocessor.fit_transform(X)

### Feature Checks

In [None]:
# Wrap the transformed matrix in a DataFrame that shares the raw table's index
# and is labelled with the transformer-generated feature names.
df_X_trans = pd.DataFrame(
    X_trans,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

# Put raw and transformed columns side by side for a visual sanity check.
df_joined = df.join(df_X_trans)

# Substring match: selects every joined column whose name contains 'PaymentMethod'.
cols = [name for name in df_joined.columns if 'PaymentMethod' in name]
df_joined.loc[:, cols]

## Target

In [None]:
# Encode the label with a fixed category order ['No', 'Yes']; dropping the
# first category leaves a single 0/1 column in which 'Yes' maps to 1.
target_encoder = OneHotEncoder(
    categories=[['No', 'Yes']],
    drop='first',
    sparse_output=False,
)

# Flatten the (n_samples, 1) encoder output into a 1-D label vector.
y_trans = target_encoder.fit(y).transform(y).ravel()

## Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on the label - consider
# `stratify=y_trans` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_trans,
    y_trans,
    train_size=0.8,
    random_state=444,
)

## Target

# Baseline

In [None]:
# Baseline: a shallow random forest. Its hyper-parameters, test-set metrics and
# the fitted model are all logged to a single MLflow run.
with mlflow.start_run(run_name='baseline__depth_and_estimators'):
    model = ensembleModels.RandomForestClassifier(random_state=11, max_depth=3, n_estimators=1000)
    # Record the estimator's full parameter set (logged before fitting).
    mlflow.log_params(model.get_params(deep=True))
    model.fit(X_train, y_train)

    # Score on the held-out rows only.
    y_pred = model.predict(X_test)

    metrics = dict(
        accuracy = accuracy_score(y_test, y_pred),
        precision = precision_score(y_test, y_pred),
        recall = recall_score(y_test, y_pred),
        f1 = f1_score(y_test, y_pred)
    )

    print(f"""\
    Accuracy:  {metrics['accuracy']:0.1%}
    Precision: {metrics['precision']:0.1%}
    Recall:    {metrics['recall']:0.1%}
    F1:        {metrics['f1']:0.1%}
    """)

    mlflow.log_metrics(metrics)

    # Derive the model's input/output schema from the training data and its predictions.
    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        name='baseline',
        signature=signature,
        input_example=X_train[:5]
    )

    # Pair each transformed feature name with its importance, most important
    # first. `sort_values` returns a new frame rather than sorting in place, so
    # the sorted result has to be assigned to be kept.
    importances = pd.DataFrame(
        list(zip(preprocessor.get_feature_names_out(), model.feature_importances_)),
        columns=['feature', 'importance'],
    ).sort_values('importance', ascending=False)

In [None]:
# Feature-importance table for the fitted model: one row per transformed
# feature, indexed by feature name, ordered from most to least important.
importances = pd.DataFrame(
    {
        'feature': preprocessor.get_feature_names_out(),
        'importance': model.feature_importances_,
    }
).set_index('feature').sort_values('importance', ascending=False)

importances

In [None]:
# `importances` is indexed by feature name and has a single 'importance'
# column, so a feature has to be looked up by row label with `.loc`; plain
# `[...]` indexing selects columns and raises KeyError for a feature name.
importances.loc['categoricals__TechSupport_Yes']