In [6]:
import pandas as pd

# Exemplo: se o arquivo estiver na mesma pasta do notebook
df = pd.read_csv("base.csv")

# 🤖 MODELOS DE MACHINE LEARNING PARA PREVISÃO DE ATTRITION

In [7]:
# 1. Importando bibliotecas
%pip install imbalanced-learn

import pandas as pd
import numpy as np
import sklearn

print("✅ scikit-learn:", sklearn.__version__)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


Note: you may need to restart the kernel to use updated packages.
✅ scikit-learn: 1.6.1


In [9]:
import pandas as pd

# Exemplo: se o arquivo estiver na mesma pasta do notebook
df = pd.read_csv("base.csv")

In [10]:
display(df.head())

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [11]:
# 3. Criação das features engenheiradas (baseado no notebook anterior)
df['YearsInCurrentRoleRatio'] = df['YearsInCurrentRole'] / (df['YearsAtCompany'] + 1)
df['ExperiencePerAge'] = (df['TotalWorkingYears'] + 1) / (df['Age'] + 1)
df['OverallSatisfaction'] = df[['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction']].mean(axis=1)
df['SatisfactionPerformanceGap'] = df['OverallSatisfaction'] - df['PerformanceRating']
df['YearsInCompanyToAge'] = df['YearsAtCompany'] / (df['Age'] + 1)
df['YearsSincePromotionRatio'] = df['YearsSinceLastPromotion'] / (df['YearsAtCompany'] + 1)
df['IsManager'] = df['JobRole'].isin(['Manager', 'Director']).astype(int)
df['TravelHighFreq'] = (df['BusinessTravel'] == 'Travel_Frequently').astype(int)
df['IsSingleOverTime'] = ((df['MaritalStatus'] == 'Single') & (df['OverTime'] == 'Yes')).astype(int)

In [12]:
# 4. Codificação da variável alvo
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

In [13]:
# 5. Codificação de variáveis categóricas com One-Hot Encoding
cat_cols = ['Gender', 'OverTime', 'MaritalStatus', 'BusinessTravel', 'Department', 'JobRole', 'EducationField']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [14]:
# 6. Seleção de variáveis (remoção de colunas irrelevantes)
X = df.drop(columns=['Attrition', 'EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours'])
y = df['Attrition']


In [15]:
# 7. Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [16]:
# normalizando dados
scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
%pip install --upgrade scikit-learn imbalanced-learn

from imblearn.over_sampling import SMOTE

# 8. Tratamento do desbalanceamento com SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_res, y_train)

Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn, imbalanced-learn

  Attempting uninstall: scikit-learn

    Found existing installation: scikit-learn 1.3.2

    Uninstalling scikit-learn-1.3.2:

      Successfully uninstalled scikit-learn-1.3.2

   ---------------------------------------- 0/2 [scikit-learn]
   ---------------------------------------- 0/2 [scikit-learn]
   ---------------------------------------- 0/2 [scikit-learn]
   ---------------------------------------- 0/2 [scikit-learn]
   ---------------------------------------- 0/2 [scikit-learn]
   ----

##### ✔️ Justificativa: A base tem ~16% de casos positivos (Attrition = Yes), por isso aplicamos SMOTE apenas no treino para evitar vazamento de dados e simular a distribuição natural no teste.

In [20]:
# 9. Treinamento de 4 modelos distintos
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42), # 100 numero de arvores
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "CatBoost": CatBoostClassifier(verbose=0)
}

results = {}

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    # Armazenar métricas
    results[name] = {
        "F1-score": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred)
    }


In [21]:
# 10. Comparação final dos modelos
# Trocando o F1-score por "Recall" na ordenação
resultados_df = pd.DataFrame(results).T.sort_values(by="Recall", ascending=False)
print("🏆 Comparativo de desempenho dos modelos:")
print(resultados_df)

🏆 Comparativo de desempenho dos modelos:
                     F1-score  Precision    Recall
Logistic Regression  0.478723   0.384615  0.633803
XGBoost              0.385321   0.552632  0.295775
CatBoost             0.403846   0.636364  0.295775
Random Forest        0.336449   0.500000  0.253521


🏆 Melhor modelo: Logistic Regression

**Justificativas:** <br>
<b>1 - Maior F1-score (0.4787):</b>

* O F1-score é a média harmônica entre Precisão e Recall, e é ideal quando você quer equilibrar ambos, especialmente em problemas com classes desbalanceadas. Isso indica que o modelo está conseguindo um bom equilíbrio entre acertar positivos e evitar falsos positivos.

**2 - Maior Recall (0.6318):**

* Isso significa que o modelo está detectando uma grande proporção dos verdadeiros positivos — importante em contextos onde não perder casos positivos é crítico (ex: fraudes, diagnósticos, etc.).

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Defina o grid de parâmetros que deseja testar
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Verifica se X_train_res está definido
try:
    X_train_res
    y_train_res
except NameError:
    raise RuntimeError("Execute a célula 11 antes para definir X_train_res e y_train_res.")

# Faz a busca em grade
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid.fit(X_train_res, y_train_res)
print("Melhores parâmetros:", grid.best_params_)

Melhores parâmetros: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}


In [23]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LogisticRegression(max_iter=1000), X_train_res, y_train_res, cv=5, scoring='f1')
print("F1-score médio (CV):", scores.mean())

F1-score médio (CV): 0.8179514466152481


In [24]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss'))
], voting='soft')

ensemble.fit(X_train_res, y_train_res)
y_pred_ensemble = ensemble.predict(X_test)
print("F1-score Ensemble:", f1_score(y_test, y_pred_ensemble))

F1-score Ensemble: 0.46153846153846156
