In [7]:
%pip install optuna
import optuna
from optuna.pruners import HyperbandPruner
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Note: you may need to restart the kernel to use updated packages.


In [4]:
df = pd.read_csv('df_tratado.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,...,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,dia,mes,ano,grave
0,2,1032898.0,2007-08-13,Segunda,14:25:00,MG,40,585.5,ITABIRITO,Outras,...,0,1,2,0,1,1,13,8,2007,0
1,3,1051130.0,2007-02-12,Segunda,02:10:00,MA,135,11.0,SAO LUIS,Animais na Pista,...,2,1,0,0,3,1,12,2,2007,1
2,4,1066824.0,2007-11-20,terça,05:30:00,CE,222,30.8,CAUCAIA,Defeito mecânico em veículo,...,1,0,0,0,1,1,20,11,2007,0
3,5,1069918.0,2007-12-16,Domingo,17:40:00,MA,230,14.0,BARAO DE GRAJAU,Outras,...,0,1,0,0,1,1,16,12,2007,0
4,6,1070971.0,2007-03-05,Segunda,08:10:00,PR,277,584.4,CASCAVEL,Outras,...,0,0,2,0,0,2,5,3,2007,0


In [5]:
features = ['dia_semana','fase_dia','condicao_metereologica','tipo_pista','veiculos']
X = df[features]
y = df['grave']

In [6]:
print(y.value_counts())
print(y.value_counts(normalize=True) * 100)

grave
0    1725424
1      74741
Name: count, dtype: int64
grave
0    95.848103
1     4.151897
Name: proportion, dtype: float64


In [8]:
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.


In [9]:
# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Previsão com threshold customizado
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > 0.3).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Combinação ponderada: 50% acc + 50% f1
    return (acc + f1) / 2

In [11]:
# ================== OTIMIZAÇÃO COM HYPERBAND ==================
pruner = HyperbandPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=50, timeout=600)

[I 2025-07-12 17:22:48,596] A new study created in memory with name: no-name-bbabc15d-155e-486f-b554-e4f28e3df085
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-12 17:23:01,838] Trial 0 finished with value: 0.20236969078391323 and parameters: {'n_estimators': 308, 'max_depth': 9, 'learning_rate': 0.05457966433867701, 'subsample': 0.7168021133444159, 'colsample_bytree': 0.6267305834232194}. Best is trial 0 with value: 0.20236969078391323.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-12 17:23:05,980] Trial 1 finished with value: 0.09491832785919063 and parameters: {'n_estimators': 121, 'max_depth': 5, 'learning_rate': 0.013684535691036686, 'subsample': 0.9378779015007539, 'colsample_bytree': 0.7259120968085235}. Best is trial 0 with value: 0.20236969078391323.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-12 17

In [12]:
# ================== MELHOR MODELO ==================
print("Melhores hiperparâmetros encontrados:")
print(study.best_params)

# Avaliação final com melhor modelo
best_model = XGBClassifier(**study.best_params)
best_model.fit(X_train, y_train)
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.3).astype(int)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"\n✅ Resultado final com melhores parâmetros:")
print(f"Acurácia:  {acc:.4f}")
print(f"F1-Score:  {f1:.4f}")

Melhores hiperparâmetros encontrados:
{'n_estimators': 977, 'max_depth': 14, 'learning_rate': 0.18421688954874094, 'subsample': 0.601440144721393, 'colsample_bytree': 0.8547460474881254}

✅ Resultado final com melhores parâmetros:
Acurácia:  0.9581
F1-Score:  0.0047
