In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Carregar o dataset
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Corrigir valores faltantes e tipos de dados
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', pd.NA), errors='coerce')

# Separar X (entrada) e y (alvo)
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

# Identificar colunas por tipo
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
binary_features = [col for col in X.columns if X[col].nunique() == 2 and X[col].dtype == 'object']
categorical_features = [col for col in X.select_dtypes(include='object').columns if col not in binary_features]

# Pipelines de transformação
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

binary_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary', dtype=int))
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # use sparse=False se sua versão for <1.2
])

# Transformer geral
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numerical_features),
    ('bin', binary_pipeline, binary_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Aplicar transformação
X_processed_array = preprocessor.fit_transform(X)

# Recuperar os nomes das colunas transformadas
bin_feature_names = preprocessor.named_transformers_['bin']['encoder'].get_feature_names_out(binary_features)
cat_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_features)
final_feature_names = numerical_features + list(bin_feature_names) + list(cat_feature_names)

# Converter para DataFrame
X_processed = pd.DataFrame(X_processed_array, columns=final_feature_names)

# Comparar entrada e saída
print(f"Shape original: {X.shape}")
print(f"Shape pós-processamento: {X_processed.shape}\n")
print("Amostra dos dados finais:")
display(X_processed.head())


Shape original: (7043, 19)
Shape pós-processamento: (7043, 39)

Amostra dos dados finais:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,PaperlessBilling_Yes,MultipleLines_No,MultipleLines_No phone service,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.994242,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.173244,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.959674,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.194766,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.94047,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Usar os dados processados: X_processed, y
# Se você ainda não tiver rodado o pré-processamento, rode o código anterior antes

# Dicionário de modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Métricas customizadas
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# Avaliar cada modelo
results = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_validate(model, X_processed, y, cv=cv, scoring=scoring, n_jobs=-1)
    result = {
        'Model': name,
        'Accuracy': scores['test_accuracy'].mean(),
        'Precision': scores['test_precision'].mean(),
        'Recall': scores['test_recall'].mean(),
        'F1-Score': scores['test_f1'].mean(),
        'ROC AUC': scores['test_roc_auc'].mean()
    }
    results.append(result)

# Mostrar os resultados como tabela ordenada por F1-score
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='F1-Score', ascending=False).reset_index(drop=True)

print("Resultados da avaliação de modelos:")
display(results_df)


Resultados da avaliação de modelos:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC AUC
0,Logistic Regression,0.80349,0.653757,0.550535,0.597567,0.722697
1,Naive Bayes,0.692175,0.456766,0.84535,0.593035,0.741094
2,Gradient Boosting,0.805053,0.668245,0.527005,0.589091,0.716247
3,SVM,0.801501,0.672774,0.489537,0.566595,0.701861
4,Random Forest,0.785458,0.622211,0.485799,0.545396,0.68975
5,KNN,0.762882,0.556828,0.520043,0.537688,0.685321
6,Decision Tree,0.729801,0.491398,0.518975,0.504696,0.662464
