In [13]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from google.colab import files
import io

# 1. Upload do arquivo CSV
uploaded = files.upload()
for fn in uploaded.keys():
    df = pd.read_csv(io.StringIO(uploaded[fn].decode('utf-8')))
    print(f'Arquivo "{fn}" carregado com sucesso.')

display(df.head())

# 2. Transformação e Limpeza dos Dados
# Converter 'TotalCharges' para numérico
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Preencher valores ausentes em 'TotalCharges' com a mediana
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Preencher valores ausentes em 'MonthlyCharges' com a mediana
df['MonthlyCharges'] = df['MonthlyCharges'].fillna(df['MonthlyCharges'].median())

# Lidar com valores ausentes na coluna 'Churn' (assumindo que já são 0 ou 1 e apenas preenchendo NaNs)
df['Churn'] = df['Churn'].fillna(0) # Preencher NaNs com 0 (representando 'No')


# Converter 'SeniorCitizen' para categórico
df['SeniorCitizen'] = df['SeniorCitizen'].astype('category')

# Seleção de variáveis categóricas e One-Hot Encoding
categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod'
]
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Definir variáveis preditoras e alvo
X = df_encoded.drop(columns=['customerID', 'Churn'])
y = df_encoded['Churn']

# Check class distribution in y
print("\n--- Class distribution in y ---")
print(y.value_counts())


# Separar treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check class distribution in y_train and y_test
print("\n--- Class distribution in y_train ---")
print(y_train.value_counts())

print("\n--- Class distribution in y_test ---")
print(y_test.value_counts())


# Normalização de variáveis numéricas
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# 3. Treinamento dos modelos
# Modelo 1: Regressão Logística
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Modelo 2: Random Forest
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# 4. Avaliação
print("\n--- Regressão Logística ---")
print(classification_report(y_test, y_pred_lr))
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_lr))

print("\n--- Random Forest ---")
print(classification_report(y_test, y_pred_rf))
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_rf))

# Importância das variáveis
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model_rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\n--- Importância das Variáveis (Random Forest) ---")
print(importance_df.head(20))

Saving dados_tratados.csv to dados_tratados (10).csv
Arquivo "dados_tratados (10).csv" carregado com sucesso.


Unnamed: 0,customerID,Churn,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0002-ORFBO,0,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,0,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,1,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,1,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,1,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4



--- Class distribution in y ---
Churn
0    5398
1    1869
Name: count, dtype: int64

--- Class distribution in y_train ---
Churn
0    4316
1    1497
Name: count, dtype: int64

--- Class distribution in y_test ---
Churn
0    1082
1     372
Name: count, dtype: int64

--- Regressão Logística ---
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1082
           1       0.67      0.54      0.60       372

    accuracy                           0.82      1454
   macro avg       0.76      0.73      0.74      1454
weighted avg       0.81      0.82      0.81      1454

Matriz de Confusão:
 [[984  98]
 [170 202]]

--- Random Forest ---
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1082
           1       0.63      0.49      0.55       372

    accuracy                           0.80      1454
   macro avg       0.73      0.70      0.71      1454
weighted avg       0.78      0.80      