# Download do Dataset

In [1]:
#https://www.kaggle.com/datasets/blastchar/telco-customer-churn/data

In [2]:
#%%sh
#pip install kaggle

In [3]:
#!mkdir -p ~/.kaggle
#!mv kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json

In [4]:
#!kaggle datasets download -d blastchar/telco-customer-churn

# Importação de Bilbiotecas

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, roc_auc_score
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
from sklearn import tree

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = pd.errors.SettingWithCopyWarning)

# Split Conjunto Treino/Teste

In [6]:
alvo = 'Churn'
df_tot = pd.read_csv('telco-customer-churn.zip', dtype = {'SeniorCitizen': str})
df_tot[alvo] = df_tot[alvo].map({'Yes': 1, 'No': 0})
df_tot.loc[df_tot['TotalCharges'] == ' ', 'TotalCharges'] = -1
df_tot['TotalCharges'] = df_tot['TotalCharges'].astype(float)

In [7]:
df_teste = df_tot.sample(frac = 0.25, random_state = 42)
df = df_tot[~df_tot.index.isin(df_teste.index)]

In [8]:
display(display(df_tot[['Partner', alvo]].groupby('Partner').agg(['count', 'mean'])))

Unnamed: 0_level_0,Churn,Churn
Unnamed: 0_level_1,count,mean
Partner,Unnamed: 1_level_2,Unnamed: 2_level_2
No,3641,0.32958
Yes,3402,0.196649


None

# Tratamento de Features

In [9]:
colunas_id = ['customerID']
features_categoricas = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
                        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
features_numericas = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [10]:
dict_dummies = {}
for f in features_categoricas:
    lista_dummies = list(df[f].value_counts().index[:-1])
    dict_dummies[f] = lista_dummies

for k,v in dict_dummies.items():
    for value in v:
        df[k + '_' + value] = (df[k] == value).astype(int)
        df_teste[k + '_' + value] = (df_teste[k] == value).astype(int)
    df.drop(k, axis = 1, inplace = True)
    df_teste.drop(k, axis = 1, inplace = True)
df.columns = [c.replace(' ', '_') for c in df.columns]
df_teste.columns = [c.replace(' ', '_') for c in df_teste.columns]

In [11]:
features = ['tenure', 'MonthlyCharges', 'TotalCharges',
            'gender_Male', 'SeniorCitizen_0', 'Partner_No', 'Dependents_No',
            'PhoneService_Yes', 'PaperlessBilling_Yes', 'MultipleLines_No',
            'MultipleLines_Yes', 'InternetService_Fiber_optic',
            'InternetService_DSL', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
            'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
            'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
            'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
            'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_Two_year',
            'PaymentMethod_Electronic_check', 'PaymentMethod_Mailed_check',
            'PaymentMethod_Bank_transfer_(automatic)']

# Treinamento do Modelo

In [12]:
#Simula treinar com uma amostra da base
df_train = df.sample(frac = 1.0, random_state = 42)

In [13]:
X = df_train[features]
y = df_train[alvo].astype(int)

treinar_arvore = False

In [14]:
if treinar_arvore:
    clf = DecisionTreeClassifier(random_state = 42)
    param_grid = {
        "max_depth": [1],
    }
else:
    clf = LGBMClassifier(
                            objective = 'binary',
                            boosting_type = 'gbdt',
                            force_row_wise = True,
                            random_state = 42,
                            n_jobs = -1,
                            verbose = -1
                            )
    #Espaço de busca
    param_grid = {
        "n_estimators": [10, 20],
        "learning_rate": [0.03, 0.1],
        "max_depth": [2, 4, 6],
    }

cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

#Métricas: usamos roc_auc como métrica principal para refit
grid = GridSearchCV(
        estimator = clf,
        param_grid = param_grid,
        scoring = {'roc_auc': 'roc_auc'},
        refit = 'roc_auc',
        cv = cv,
        n_jobs = -1,
        verbose = 1
    )

grid.fit(X, y)

#Modelo final já vem refit com os melhores hiperparâmetros
best_model = grid.best_estimator_

df_results = pd.DataFrame(grid.cv_results_)
if treinar_arvore:
    df_results = df_results[['param_max_depth', 'mean_test_roc_auc', 'std_test_roc_auc', 'rank_test_roc_auc']]
    plt.figure(figsize=(16,8))
    tree.plot_tree(best_model,
                feature_names = features,
                #class_names = [str(c) for c in clf.classes_],
                filled = True,
                rounded = True)
    plt.show()
else:
    df_results = df_results[['param_learning_rate', 'param_max_depth', 'param_n_estimators', 'mean_test_roc_auc', 'std_test_roc_auc', 'rank_test_roc_auc']]
df_results = df_results.sort_values('rank_test_roc_auc')
display(df_results)

print("\n=== MELHORES PARÂMETROS (refit=roc_auc) ===")
print(grid.best_params_)

probs = best_model.predict_proba(X)[:, 1]
print('AUC:')
print(roc_auc_score(y, probs))
print('KS:')
print(ks_2samp(probs[y == 0], probs[y == 1]).statistic)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
9,0.1,4,20,0.843079,0.008318,1
8,0.1,4,10,0.84047,0.007677,2
11,0.1,6,20,0.839615,0.00705,3
10,0.1,6,10,0.839561,0.007002,4
7,0.1,2,20,0.839059,0.010293,5
5,0.03,6,20,0.838968,0.006543,6
3,0.03,4,20,0.838567,0.007217,7
2,0.03,4,10,0.836052,0.006127,8
4,0.03,6,10,0.835817,0.005855,9
6,0.1,2,10,0.831529,0.010632,10



=== MELHORES PARÂMETROS (refit=roc_auc) ===
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 20}
AUC:
0.8620803973470761
KS:
0.5648509763617677


# Avaliação no Conjunto de Teste

In [15]:
X_teste = df_teste[features]
y_teste = df_teste[alvo]

In [16]:
probs_teste = best_model.predict_proba(X_teste)[:, 1]
print('AUC:')
print(roc_auc_score(y_teste, probs_teste))
print('KS:')
print(ks_2samp(probs_teste[y_teste == 0], probs_teste[y_teste == 1]).statistic)

AUC:
0.8573178000188902
KS:
0.5790632460371484


# Análise em Subconjuntos

In [17]:
def analise_subconjunto(probs, y, mask):
    print('AUC:')
    print(roc_auc_score(y[mask], probs[mask]))
    print('KS:')
    print(ks_2samp(probs[(y == 0) & mask], probs[(y == 1) & mask]).statistic)

In [18]:
print(50*'=')
print('Avaliação Solteiros:')
print()
mask_partner_no = (df['Partner_No'] == 1).values
analise_subconjunto(probs, y, mask_partner_no)

print('')
print(50*'=')
print('Avaliação Casados:')
print()
mask_partner_yes = (df['Partner_No'] == 0).values
analise_subconjunto(probs, y, mask_partner_yes)

Avaliação Solteiros:

AUC:
0.861794011722336
KS:
0.5616312407315868

Avaliação Casados:

AUC:
0.8622982917316088
KS:
0.5711563961197571


In [19]:
print(50*'=')
print('Avaliação Solteiros:')
print()
mask_partner_no = (df_teste['Partner_No'] == 1).values
analise_subconjunto(probs_teste, y_teste, mask_partner_no)

print('')
print(50*'=')
print('Avaliação Casados:')
print()
mask_partner_yes = (df_teste['Partner_No'] == 0).values
analise_subconjunto(probs_teste, y_teste, mask_partner_yes)

Avaliação Solteiros:

AUC:
0.8374756218746846
KS:
0.5281776587434438

Avaliação Casados:

AUC:
0.8676907612970894
KS:
0.6350056813215628
