## Importação dos Dados

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the prepared dataset and discard every row that contains a missing value.
df = pd.read_csv('resources/output/prepared_data.csv').dropna()

# 'Churn' is the target column; every other column is used as a feature.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [2]:
# Sanity check: display the shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((352665, 12), (88167, 12))

## Treinamento

In [3]:
%%time
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the training split; the solver is allowed
# up to 1000 iterations.
lr = LogisticRegression(
    max_iter=1000,
)
lr.fit(X=X_train, y=y_train)

CPU times: total: 266 ms
Wall time: 687 ms


In [4]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees trained on the training split; the seed is
# fixed so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf.fit(X=X_train, y=y_train)

CPU times: total: 16.6 s
Wall time: 27.2 s


In [None]:
%%time
from xgboost import XGBClassifier

# Gradient-boosted trees (100 boosting rounds, learning rate 0.1) trained on
# the training split with a fixed seed.
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb.fit(X=X_train, y=y_train)

## Avaliação

In [11]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score

In [22]:
# Evaluate the logistic regression on the held-out split.
# Accuracy, F1 and the confusion matrix use the hard label predictions;
# AUC-ROC uses column 1 of predict_proba (the second class in lr.classes_).
y_pred_log_reg = lr.predict(X_test)

accuracy_log_reg = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)
f1_log_reg = f1_score(y_true=y_test, y_pred=y_pred_log_reg)
confusion_log_reg = confusion_matrix(y_true=y_test, y_pred=y_pred_log_reg)
roc_log_reg = roc_auc_score(y_true=y_test, y_score=lr.predict_proba(X_test)[:, 1])

# Report the metrics in a fixed order: accuracy, F1, confusion matrix, AUC-ROC.
print(f'Acurácia Logistic Regression: {accuracy_log_reg}')
print(f'F1-Score Logistic Regression: {f1_log_reg}')
print(f'Matriz de Confusão Logistic Regression:\n{confusion_log_reg}')
print(f'AUC-ROC Logistic Regression: {roc_log_reg}')

Acurácia Logistic Regression: 0.8963898057096192
F1-Score Logistic Regression: 0.906864594272198
Matriz de Confusão Logistic Regression:
[[34558  3505]
 [ 5630 44474]]
AUC-ROC Logistic Regression: 0.9597071137249034


In [13]:
# Evaluate the random forest on the held-out split.
# Accuracy, F1 and the confusion matrix use the hard label predictions;
# AUC-ROC uses column 1 of predict_proba (the second class in rf.classes_).
# NOTE(review): the recorded run shows ~0.9996 accuracy; scores this close to
# perfect may indicate a feature that leaks the target - confirm before relying on them.
y_pred_rf = rf.predict(X_test)

accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
f1_rf = f1_score(y_true=y_test, y_pred=y_pred_rf)
confusion_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
roc_rf = roc_auc_score(y_true=y_test, y_score=rf.predict_proba(X_test)[:, 1])

# Report the metrics in a fixed order: accuracy, F1, confusion matrix, AUC-ROC.
print(f'Acurácia Random Forest: {accuracy_rf}')
print(f'F1-Score Random Forest: {f1_rf}')
print(f'Matriz de Confusão Random Forest:\n{confusion_rf}')
print(f'AUC-ROC Random Forest: {roc_rf}')

Acurácia Random Forest: 0.9996143681876439
F1-Score Random Forest: 0.9996606041246581
Matriz de Confusão Random Forest:
[[38061     2]
 [   32 50072]]
AUC-ROC Random Forest: 0.9999994481174137


In [16]:
# Evaluate the XGBoost model on the held-out split.
# Accuracy, F1 and the confusion matrix use the hard label predictions;
# AUC-ROC uses column 1 of predict_proba (the second class in xgb.classes_).
y_pred_xgb = xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
f1_xgb = f1_score(y_true=y_test, y_pred=y_pred_xgb)
confusion_xgb = confusion_matrix(y_true=y_test, y_pred=y_pred_xgb)
roc_xgb = roc_auc_score(y_true=y_test, y_score=xgb.predict_proba(X_test)[:, 1])

# Report the metrics in a fixed order: accuracy, F1, confusion matrix, AUC-ROC.
print(f'Acurácia XGBoost: {accuracy_xgb}')
print(f'F1-Score XGBoost: {f1_xgb}')
print(f'Matriz de Confusão XGBoost:\n{confusion_xgb}')
print(f'AUC-ROC XGBoost: {roc_xgb}')

Acurácia XGBoost: 0.9993534996086971
F1-Score XGBoost: 0.9994308594023025
Matriz de Confusão XGBoost:
[[38063     0]
 [   57 50047]]
AUC-ROC XGBoost: 0.9999992648556902


## Validação Cruzada

In [26]:
from sklearn.model_selection import cross_val_score

# Number of folds, passed as `cv` to each cross_val_score call in this section.
k = 7

In [27]:
# Cross-validate a logistic regression with the same hyperparameters as the
# trained model, scoring accuracy over k folds of the full dataset.
#
# A dedicated name is used on purpose: cross_val_score fits clones of the
# estimator it receives and leaves the passed object unfitted, so rebinding
# `lr` here would replace the trained model with one that cannot predict.
lr_cv = LogisticRegression(max_iter=1000)

cv_scores_log_reg = cross_val_score(lr_cv, X, y, cv=k, scoring='accuracy')

print(f'Logistic Regression Cross-Validation Scores: {cv_scores_log_reg}')
print(f'Média Acurácia (Logistic Regression): {cv_scores_log_reg.mean()}')

Logistic Regression Cross-Validation Scores: [0.89376905 0.89546811 0.89478532 0.89611916 0.89583333 0.89465828
 0.89683371]
Média Acurácia (Logistic Regression): 0.8953524245063879


In [28]:
# Cross-validate a random forest with the same hyperparameters as the trained
# model, scoring accuracy over k folds of the full dataset.
#
# A dedicated name is used on purpose: cross_val_score fits clones of the
# estimator it receives and leaves the passed object unfitted. Rebinding `rf`
# here would make the model-saving cell pickle an untrained forest when the
# notebook is run top to bottom.
rf_cv = RandomForestClassifier(n_estimators=100, random_state=42)

cv_scores_rf = cross_val_score(rf_cv, X, y, cv=k, scoring='accuracy')

print(f'Random Forest Cross-Validation Scores: {cv_scores_rf}')
print(f'Média Acurácia (Random Forest): {cv_scores_rf.mean()}')

Random Forest Cross-Validation Scores: [0.99963478 0.99965066 0.99966654 0.99973006 0.99968242 0.99949187
 0.99965066]
Média Acurácia (Random Forest): 0.9996438552555169


In [20]:
# Cross-validate an XGBoost classifier with the same hyperparameters as the
# trained model, scoring accuracy over k folds of the full dataset.
#
# A dedicated name is used on purpose: cross_val_score fits clones of the
# estimator it receives and leaves the passed object unfitted, so rebinding
# `xgb` here would replace the trained model with one that cannot predict.
#
# NOTE(review): the recorded output of this cell lists 5 fold scores although
# k is 7 - it appears to come from an earlier run; re-execute to refresh it.
xgb_cv = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

cv_scores_xgb = cross_val_score(xgb_cv, X, y, cv=k, scoring='accuracy')

print(f'XGBoost Cross-Validation Scores: {cv_scores_xgb}')
print(f'Média Acurácia (XGBoost): {cv_scores_xgb.mean()}')

XGBoost Cross-Validation Scores: [0.99946692 0.99946692 0.99959168 0.9994896  0.99943289]
Média Acurácia (XGBoost): 0.9994896015837366


## Salvando o Modelo

In [5]:
import pickle

from pathlib import Path

from sklearn.utils.validation import check_is_fitted

# Destination of the serialized random forest.
model_path = Path('resources/models/random_forest.pkl')

# Fail fast (NotFittedError) if `rf` is not a fitted model, e.g. because the
# name was rebound to a fresh estimator after training. Without this guard an
# untrained model would be written to disk silently.
check_is_fitted(rf)

# Create the output directory if it does not exist yet so the write cannot
# fail with FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(rf, f)