In [1]:
%pip install optuna
import optuna
from optuna.pruners import HyperbandPruner
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.read_csv('df_tratado.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,...,latitude,longitude,regional,delegacia,uop,origem_arquivo,dia,mes,ano,grave
0,1,405158.0,2022-01-01,Sábado,02:40:00,PR,116,33.0,CAMPINA GRANDE DO SUL,Ingestão de álcool pelo condutor,...,-25114403,-488467554,SPRF-PR,DEL01-PR,UOP02-DEL01-PR,datatran2022.csv,1,1,2022,1
1,28,405442.0,2022-02-01,Domingo,12:00:00,SC,116,54.0,PAPANDUVA,Velocidade Incompatível,...,-2639448417,-5016480003,SPRF-SC,DEL06-SC,UOP02-DEL06-SC,datatran2022.csv,1,2,2022,0
2,34,405542.0,2022-02-01,Domingo,18:40:00,RS,290,112.0,ELDORADO DO SUL,Acessar a via sem observar a presença dos outr...,...,-30040455,-51329193,SPRF-RS,DEL02-RS,UOP01-DEL02-RS,datatran2022.csv,1,2,2022,0
3,36,405558.0,2022-02-01,Domingo,23:00:00,SC,101,118.0,ITAJAI,Ingestão de álcool pelo condutor,...,-26895859,-4871903,SPRF-SC,DEL04-SC,UOP04-DEL04-SC,datatran2022.csv,1,2,2022,0
4,38,405589.0,2022-03-01,Segunda,03:30:00,SC,101,329.0,CAPIVARI DE BAIXO,Ingestão de álcool pelo condutor,...,-2845039264,-4897180974,SPRF-SC,DEL02-SC,UOP01-DEL02-SC,datatran2022.csv,1,3,2022,0


In [3]:
features = ['dia_semana','fase_dia','condicao_metereologica','tipo_pista','veiculos']
X = df[features]
y = df['grave']

In [4]:
print(y.value_counts())
print(y.value_counts(normalize=True) * 100)

grave
0    35517
1     1121
Name: count, dtype: int64
grave
0    96.940335
1     3.059665
Name: proportion, dtype: float64


In [5]:
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.


In [6]:
# Dividir em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Previsão com threshold customizado
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > 0.3).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Combinação ponderada: 50% acc + 50% f1
    return (acc + f1) / 2

In [8]:
# ================== OTIMIZAÇÃO COM HYPERBAND ==================
pruner = HyperbandPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=50, timeout=600)

[I 2025-08-03 15:56:33,186] A new study created in memory with name: no-name-c02b4b20-ec21-4ac9-ac82-eb37cec4b6df


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-03 15:56:33,885] Trial 0 finished with value: 0.23816372582789797 and parameters: {'n_estimators': 293, 'max_depth': 4, 'learning_rate': 0.24670291156681673, 'subsample': 0.7249191174783913, 'colsample_bytree': 0.9076836850775086}. Best is trial 0 with value: 0.23816372582789797.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-03 15:56:34,681] Trial 1 finished with value: 0.23618973762291443 and parameters: {'n_estimators': 479, 'max_depth': 8, 'learning_rate': 0.012561894690007125, 'subsample': 0.6245242350220694, 'colsample_bytree': 0.9026620124596854}. Best is trial 0 with value: 0.23816372582789797.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-03 15:56:35,713] Trial 2 finished with value: 0.2609817860341056 and parameters: {'n_estimators': 523, 'max_depth': 12,

In [9]:
# ================== MELHOR MODELO ==================
print("Melhores hiperparâmetros encontrados:")
print(study.best_params)

# Avaliação final com melhor modelo
best_model = XGBClassifier(**study.best_params)
best_model.fit(X_train, y_train)
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.3).astype(int)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"\n✅ Resultado final com melhores parâmetros:")
print(f"Acurácia:  {acc:.4f}")
print(f"F1-Score:  {f1:.4f}")

Melhores hiperparâmetros encontrados:
{'n_estimators': 366, 'max_depth': 12, 'learning_rate': 0.1888534132836027, 'subsample': 0.7441905120685195, 'colsample_bytree': 0.873900272648435}

✅ Resultado final com melhores parâmetros:
Acurácia:  0.9664
F1-Score:  0.0160
