# Download do Dataset

In [1]:
#https://www.kaggle.com/datasets/blastchar/telco-customer-churn/data

In [2]:
#%%sh
#pip install kaggle

In [3]:
#!mkdir -p ~/.kaggle
#!mv kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json

In [4]:
#!kaggle datasets download -d blastchar/telco-customer-churn

# Importação de Bibliotecas

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, roc_auc_score
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
from sklearn import tree

import warnings
# Silence every FutureWarning for the rest of the notebook.
warnings.simplefilter(action = 'ignore', category = FutureWarning)
# Silence pandas' SettingWithCopyWarning as well.
# NOTE(review): presumably needed because later cells add/drop columns in place on
# frames derived from df_tot -- confirm; taking an explicit .copy() where those frames
# are created would address the cause instead of hiding the warning.
warnings.simplefilter(action = 'ignore', category = pd.errors.SettingWithCopyWarning)

# Split Conjunto Treino/Teste

In [6]:
# Name of the target column, reused by every later cell.
alvo = 'Churn'

# SeniorCitizen is read as text so it can be treated like the other categorical columns.
df_tot = pd.read_csv('telco-customer-churn.zip', dtype = {'SeniorCitizen': str})

# Encode the target as 1 (churned) / 0 (stayed).
rotulos_binarios = {'Yes': 1, 'No': 0}
df_tot[alvo] = df_tot[alvo].map(rotulos_binarios)

# TotalCharges contains single-space strings for some rows; replace them with the
# sentinel -1 so the column can be converted to float.
sem_total = df_tot['TotalCharges'] == ' '
df_tot.loc[sem_total, 'TotalCharges'] = -1
df_tot['TotalCharges'] = df_tot['TotalCharges'].astype(float)

In [7]:
# Hold out a random 25% of the rows as the test set; the remaining rows form the
# training base. Both frames are explicit copies because later cells add and drop
# columns on them in place -- they must own their data rather than be derived
# selections of df_tot.
df_teste = df_tot.sample(frac = 0.25, random_state = 42).copy()
df = df_tot.loc[~df_tot.index.isin(df_teste.index)].copy()

# Sanity check: the two splits must partition df_tot (requires a unique index).
assert len(df) + len(df_teste) == len(df_tot), 'train/test split does not partition df_tot'

In [8]:
# Row count and churn rate (mean of the 0/1 target) per Partner value.
# A single display() call: wrapping it in a second display() rendered the inner
# call's return value (None) as an extra output.
display(df_tot[['Partner', alvo]].groupby('Partner').agg(['count', 'mean']))

Unnamed: 0_level_0,Churn,Churn
Unnamed: 0_level_1,count,mean
Partner,Unnamed: 1_level_2,Unnamed: 2_level_2
No,3641,0.32958
Yes,3402,0.196649


None

# Tratamento de Features

In [9]:
# Identifier column(s): not used as model input.
colunas_id = ['customerID']

# Categorical columns, expanded into 0/1 indicator columns further below.
features_categoricas = [
    # customer profile
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    # account flags
    'PhoneService', 'PaperlessBilling',
    # contracted services
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    # contract and billing
    'Contract', 'PaymentMethod',
]

# Numeric columns, used as they are.
features_numericas = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [10]:
# For each categorical column keep every level except the last one returned by
# value_counts() on the training frame (the least frequent level), which acts as the
# reference level. The levels are learned from df only and then applied to df_teste.
dict_dummies = {
    coluna: list(df[coluna].value_counts().index[:-1])
    for coluna in features_categoricas
}

for coluna, niveis in dict_dummies.items():
    for nivel in niveis:
        nome_dummy = coluna + '_' + nivel
        df[nome_dummy] = (df[coluna] == nivel).astype(int)
        df_teste[nome_dummy] = (df_teste[coluna] == nivel).astype(int)
    # The raw categorical column is dropped once its indicator columns exist.
    df.drop(columns = coluna, inplace = True)
    df_teste.drop(columns = coluna, inplace = True)

# Column names must not contain spaces (e.g. 'Fiber optic' -> 'Fiber_optic').
for tabela in (df, df_teste):
    tabela.columns = [nome.replace(' ', '_') for nome in tabela.columns]

In [11]:
# Model inputs, in the exact order the columns are fed to the estimators.
features = [
    # numeric
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    # customer profile indicators
    'gender_Male',
    'SeniorCitizen_0',
    'Partner_No',
    'Dependents_No',
    # account flags
    'PhoneService_Yes',
    'PaperlessBilling_Yes',
    # contracted services
    'MultipleLines_No',
    'MultipleLines_Yes',
    'InternetService_Fiber_optic',
    'InternetService_DSL',
    'OnlineSecurity_No',
    'OnlineSecurity_Yes',
    'OnlineBackup_No',
    'OnlineBackup_Yes',
    'DeviceProtection_No',
    'DeviceProtection_Yes',
    'TechSupport_No',
    'TechSupport_Yes',
    'StreamingTV_No',
    'StreamingTV_Yes',
    'StreamingMovies_No',
    'StreamingMovies_Yes',
    # contract and payment
    'Contract_Month-to-month',
    'Contract_Two_year',
    'PaymentMethod_Electronic_check',
    'PaymentMethod_Mailed_check',
    'PaymentMethod_Bank_transfer_(automatic)',
]

# Treinamento do Modelo

In [12]:
# Simulates training on a sample of the base. With frac = 1.0 every row of df is kept
# (in shuffled order); lower frac to train on a subset.
df_train = df.sample(frac = 1.0, random_state = 42)

In [13]:
# Estimator switch read by the training cells: True fits a DecisionTreeClassifier,
# False fits an LGBMClassifier.
treinar_arvore = False

# Design matrix and integer target, both in df_train row order.
X = df_train.loc[:, features]
y = df_train[alvo].astype(int)

### Só Solteiros

In [14]:
# Segment model for the rows with Partner_No == 1.
mask = (df_train['Partner_No'] == 1).values
X_train, y_train = X[mask], y[mask]

# Estimator and search space. treinar_arvore selects a depth-1 decision tree;
# otherwise a small LightGBM grid is searched.
if not treinar_arvore:
    clf = LGBMClassifier(
        objective = 'binary',
        boosting_type = 'gbdt',
        force_row_wise = True,
        random_state = 42,
        n_jobs = -1,
        verbose = -1,
    )
    param_grid = dict(
        n_estimators = [10, 20],
        learning_rate = [0.03, 0.1],
        max_depth = [2, 4, 6],
    )
else:
    clf = DecisionTreeClassifier(random_state = 42)
    param_grid = dict(max_depth = [1])

cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

# ROC AUC is the only metric scored and also the criterion used for the refit.
grid = GridSearchCV(
    estimator = clf,
    param_grid = param_grid,
    scoring = {'roc_auc': 'roc_auc'},
    refit = 'roc_auc',
    cv = cv,
    n_jobs = -1,
    verbose = 1,
)
grid.fit(X_train, y_train)

# best_estimator_ is already refit on X_train with the best hyper-parameters.
best_model_solteiros = grid.best_estimator_

# Columns of cv_results_ to report: the searched parameters plus the ROC AUC summary.
colunas_metricas = ['mean_test_roc_auc', 'std_test_roc_auc', 'rank_test_roc_auc']
if treinar_arvore:
    colunas_parametros = ['param_max_depth']
    plt.figure(figsize = (16, 8))
    tree.plot_tree(
        best_model_solteiros,
        feature_names = features,
        filled = True,
        rounded = True,
    )
    plt.show()
else:
    colunas_parametros = ['param_learning_rate', 'param_max_depth', 'param_n_estimators']

df_results = (
    pd.DataFrame(grid.cv_results_)[colunas_parametros + colunas_metricas]
    .sort_values('rank_test_roc_auc')
)
display(df_results)

print("\n=== MELHORES PARÂMETROS (refit=roc_auc) ===")
print(grid.best_params_)

# AUC and KS below are computed on X_train, the same rows the model was refit on,
# so they are in-sample figures.
probs_solteiros = best_model_solteiros.predict_proba(X_train)[:, 1]
scores_classe_0 = probs_solteiros[y_train == 0]
scores_classe_1 = probs_solteiros[y_train == 1]
print('AUC:', roc_auc_score(y_train, probs_solteiros), sep = '\n')
print('KS:', ks_2samp(scores_classe_0, scores_classe_1).statistic, sep = '\n')

Fitting 4 folds for each of 12 candidates, totalling 48 fits


Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
9,0.1,4,20,0.817882,0.012982,1
11,0.1,6,20,0.817642,0.015232,2
10,0.1,6,10,0.815295,0.015265,3
7,0.1,2,20,0.81312,0.015973,4
8,0.1,4,10,0.813034,0.014646,5
5,0.03,6,20,0.811758,0.01455,6
4,0.03,6,10,0.80985,0.01516,7
3,0.03,4,20,0.809443,0.01752,8
6,0.1,2,10,0.805782,0.017273,9
2,0.03,4,10,0.803794,0.016361,10



=== MELHORES PARÂMETROS (refit=roc_auc) ===
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 20}
AUC:
0.8567507314778057
KS:
0.5595345742725656


### Só Casados

In [15]:
# Segment model for the rows with Partner_No == 0. Same procedure as the
# Partner_No == 1 cell, applied to the complementary segment.
mask = (df_train['Partner_No'] == 0).values
X_train, y_train = X[mask], y[mask]

# Estimator and search space. treinar_arvore selects a depth-1 decision tree;
# otherwise a small LightGBM grid is searched.
if not treinar_arvore:
    clf = LGBMClassifier(
        objective = 'binary',
        boosting_type = 'gbdt',
        force_row_wise = True,
        random_state = 42,
        n_jobs = -1,
        verbose = -1,
    )
    param_grid = dict(
        n_estimators = [10, 20],
        learning_rate = [0.03, 0.1],
        max_depth = [2, 4, 6],
    )
else:
    clf = DecisionTreeClassifier(random_state = 42)
    param_grid = dict(max_depth = [1])

cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

# ROC AUC is the only metric scored and also the criterion used for the refit.
grid = GridSearchCV(
    estimator = clf,
    param_grid = param_grid,
    scoring = {'roc_auc': 'roc_auc'},
    refit = 'roc_auc',
    cv = cv,
    n_jobs = -1,
    verbose = 1,
)
grid.fit(X_train, y_train)

# best_estimator_ is already refit on X_train with the best hyper-parameters.
best_model_casados = grid.best_estimator_

# Columns of cv_results_ to report: the searched parameters plus the ROC AUC summary.
colunas_metricas = ['mean_test_roc_auc', 'std_test_roc_auc', 'rank_test_roc_auc']
if treinar_arvore:
    colunas_parametros = ['param_max_depth']
    plt.figure(figsize = (16, 8))
    tree.plot_tree(
        best_model_casados,
        feature_names = features,
        filled = True,
        rounded = True,
    )
    plt.show()
else:
    colunas_parametros = ['param_learning_rate', 'param_max_depth', 'param_n_estimators']

df_results = (
    pd.DataFrame(grid.cv_results_)[colunas_parametros + colunas_metricas]
    .sort_values('rank_test_roc_auc')
)
display(df_results)

print("\n=== MELHORES PARÂMETROS (refit=roc_auc) ===")
print(grid.best_params_)

# AUC and KS below are computed on X_train, the same rows the model was refit on,
# so they are in-sample figures.
probs_casados = best_model_casados.predict_proba(X_train)[:, 1]
scores_classe_0 = probs_casados[y_train == 0]
scores_classe_1 = probs_casados[y_train == 1]
print('AUC:', roc_auc_score(y_train, probs_casados), sep = '\n')
print('KS:', ks_2samp(scores_classe_0, scores_classe_1).statistic, sep = '\n')

Fitting 4 folds for each of 12 candidates, totalling 48 fits


Unnamed: 0,param_learning_rate,param_max_depth,param_n_estimators,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc
9,0.1,4,20,0.849125,0.007208,1
8,0.1,4,10,0.84453,0.007633,2
3,0.03,4,20,0.842818,0.004945,3
2,0.03,4,10,0.842624,0.003289,4
11,0.1,6,20,0.841515,0.007028,5
7,0.1,2,20,0.84144,0.007569,6
10,0.1,6,10,0.840792,0.007219,7
5,0.03,6,20,0.839423,0.007495,8
4,0.03,6,10,0.838568,0.006846,9
6,0.1,2,10,0.833517,0.006466,10



=== MELHORES PARÂMETROS (refit=roc_auc) ===
{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 20}
AUC:
0.8861358639609316
KS:
0.6159552113936612


### Avaliação Base Geral

In [16]:
# Score the whole training base by routing each row to the model of its segment.
df_temp = df[['Partner_No', alvo]].copy()
df_temp['prob'] = np.nan
for valor_partner_no, modelo_segmento in ((1, best_model_solteiros), (0, best_model_casados)):
    linhas_segmento = df_temp['Partner_No'] == valor_partner_no
    df_temp.loc[linhas_segmento, 'prob'] = (
        modelo_segmento.predict_proba(df.loc[linhas_segmento, features])[:, 1]
    )

probs = df_temp['prob']
# NOTE: this rebinds y (previously the df_train target, in df_train row order) to the
# target in df row order. The segment training cells index y with masks built from
# df_train, so re-run the cell that defines X and y before re-running them.
y = df_temp[alvo]

print('AUC:', roc_auc_score(y, probs), sep = '\n')
print('KS:', ks_2samp(probs[y == 0], probs[y == 1]).statistic, sep = '\n')

AUC:
0.8742249365974846
KS:
0.5886433710174718


# Avaliação no Conjunto de Teste

In [17]:
# Score the held-out test set by routing each row to the model of its segment.
df_temp_teste = df_teste[['Partner_No', alvo]].copy()
df_temp_teste['prob'] = np.nan
for valor_partner_no, modelo_segmento in ((1, best_model_solteiros), (0, best_model_casados)):
    linhas_segmento = df_temp_teste['Partner_No'] == valor_partner_no
    df_temp_teste.loc[linhas_segmento, 'prob'] = (
        modelo_segmento.predict_proba(df_teste.loc[linhas_segmento, features])[:, 1]
    )

probs_teste = df_temp_teste['prob']
y_teste = df_temp_teste[alvo]

In [18]:
# Out-of-sample AUC and KS of the combined (two-segment) scores on the test set.
auc_teste = roc_auc_score(y_teste, probs_teste)
ks_teste = ks_2samp(probs_teste[y_teste == 0], probs_teste[y_teste == 1]).statistic
print('AUC:', auc_teste, sep = '\n')
print('KS:', ks_teste, sep = '\n')

AUC:
0.8510539377733773
KS:
0.5571523487244291


# Análise em Subconjuntos

In [19]:
def analise_subconjunto(probs, y, mask):
    """Print AUC and KS of `probs` against `y`, restricted to the rows selected by `mask`.

    Parameters
    ----------
    probs : predicted scores, indexable by a boolean mask.
    y : 0/1 target aligned position-by-position with `probs`.
    mask : boolean selector of the rows to evaluate, aligned with `probs` and `y`.

    Returns
    -------
    None. The two metrics are only printed.
    """
    auc_subconjunto = roc_auc_score(y[mask], probs[mask])

    # KS compares the score distribution of class 0 with that of class 1 inside the subset.
    scores_classe_0 = probs[(y == 0) & mask]
    scores_classe_1 = probs[(y == 1) & mask]
    ks_subconjunto = ks_2samp(scores_classe_0, scores_classe_1).statistic

    print('AUC:', auc_subconjunto, sep = '\n')
    print('KS:', ks_subconjunto, sep = '\n')

In [20]:
# Per-segment metrics on the training base (probs / y come from the overall
# training-base evaluation cell and are in df row order).
separador = '=' * 50
mask_partner_no = (df['Partner_No'] == 1).values
mask_partner_yes = (df['Partner_No'] == 0).values

print(separador, 'Avaliação Solteiros:', '', sep = '\n')
analise_subconjunto(probs, y, mask_partner_no)

print('', separador, 'Avaliação Casados:', '', sep = '\n')
analise_subconjunto(probs, y, mask_partner_yes)

Avaliação Solteiros:

AUC:
0.8567507314778057
KS:
0.5595345742725656

Avaliação Casados:

AUC:
0.8861358639609316
KS:
0.6159552113936612


In [21]:
# Per-segment metrics on the held-out test set.
separador = '=' * 50
mask_partner_no = (df_teste['Partner_No'] == 1).values
mask_partner_yes = (df_teste['Partner_No'] == 0).values

print(separador, 'Avaliação Solteiros:', '', sep = '\n')
analise_subconjunto(probs_teste, y_teste, mask_partner_no)

print('', separador, 'Avaliação Casados:', '', sep = '\n')
analise_subconjunto(probs_teste, y_teste, mask_partner_yes)

Avaliação Solteiros:

AUC:
0.8342500039855245
KS:
0.4938702632040429

Avaliação Casados:

AUC:
0.8590114500480727
KS:
0.5964950616204877
