<a href="https://colab.research.google.com/github/arthurvale/TCC-2026/blob/main/predicao_velocidade_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predição de Velocidade Veicular em Redes 5G - Machine Learning e Deep Learning

**Autor:** Arthur Vale Fonseca

**Data:** Novembro 2025

# 1. Importação de Bibliotecas

In [None]:
# Bibliotecas básicas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost as xgb
import lightgbm as lgb

# Explicação dos modelos
!pip install shap
import shap

# Fine Tunning
!pip install optuna -q
from optuna.samplers import TPESampler
import optuna

np.random.seed(42)

# 2. Carregamento dos Dados

In [None]:
# Adjust the path to where the processed dataset lives.
data_path = '/content/processed_df.parquet'
try:
    df = pd.read_parquet(data_path)
except FileNotFoundError as exc:
    # Only a missing file is reported as such; any other read error propagates unchanged.
    # Re-raising stops the notebook here instead of failing later with a NameError on `df`.
    raise FileNotFoundError(
        f"Arquivo não encontrado, ajustar o caminho: {data_path}"
    ) from exc

In [None]:
# Banner, shape, a preview of the rows and summary statistics of the target.
print("=" * 80, "INFORMAÇÕES DO DATASET", "=" * 80, sep="\n")
print(f"Dimensões: {df.shape}")

print("\nPrimeiras linhas:")
display(df.head())

print("\nEstatísticas da variável alvo (Speed):")
display(df['Speed'].describe())


In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## 2.1. FILTRAGEM DO MELHOR SINAL


In [None]:
print("\n" + "=" * 80)
print("2.1. FILTRAGEM DO MELHOR SINAL (MAIOR SS_PBCH-RSRP)")
print("=" * 80)
print("Seleciona o melhor RSRP a cada instante. Mas, se houver algum empate o RSRQ é analisado.")

# Combine the 'dd.mm.yyyy' date and the time-of-day strings into one datetime column.
df['timestamp'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d.%m.%Y %H:%M:%S.%f')

# Keep, for each timestamp, the single row with the highest RSRP (ties broken by RSRQ).
# drop_duplicates keeps that row intact; GroupBy.first() would instead take the first
# non-null value of every column independently and could stitch together values that
# come from different measurements.
df_best = (
    df.dropna(subset=['timestamp'])  # groupby used to drop missing keys; keep that behavior
      .sort_values(['SS_PBCH-RSRP', 'SS_PBCH-RSRQ'], ascending=[False, False])
      .drop_duplicates(subset='timestamp', keep='first')
      .sort_values('timestamp')
      .reset_index(drop=True)
)
# Same column layout as before: 'timestamp' first, remaining columns in their original order.
df_best = df_best[['timestamp'] + [c for c in df_best.columns if c != 'timestamp']]
print(f"Após filtragem: {df_best.shape}")


In [None]:
# (rows, columns) after keeping one row per timestamp.
df_best.shape

# 3. Criação de novas features


In [None]:
df_best['timestamp'] = pd.to_datetime(df_best['timestamp'])
# Derive the calendar day from the parsed timestamp rather than re-parsing the
# 'dd.mm.yyyy' strings without a format, which can be read month-first
# (e.g. '02.01.2021' as 1 February). This also keeps the cell safe to re-run.
df_best['Date'] = df_best['timestamp'].dt.normalize()

df_best = df_best.sort_values('timestamp')

# Gap, in seconds, between consecutive measurements.
deltas = df_best['timestamp'].diff().dt.total_seconds()
print(deltas.describe())

In [None]:
# Identify continuous segments within each day: a gap longer than 60 seconds between
# two measurements starts a new segment, so rolling windows never span the gap.
df_best = df_best.sort_values('timestamp')

# Seconds elapsed since the previous measurement of the same day (NaN for the first one).
df_best['delta_s'] = (
    df_best.groupby('Date')['timestamp']
           .transform(lambda ts: ts.diff().dt.total_seconds())
)
# Running count of >60 s gaps inside each day, used as the segment id.
df_best['segment'] = (
    df_best.groupby('Date')['delta_s']
           .transform(lambda gaps: gaps.gt(60).cumsum())
)

In [None]:
# 10-second rolling mean of SS_PBCH-RSRP, computed separately for each (Date, segment).
s_df = (
    df_best.groupby(['Date', 'segment'])
           .rolling('10s', on='timestamp', min_periods=1, closed='right')['SS_PBCH-RSRP']
           .mean()
           .rename('SS_PBCH-RSRP_janela_10_s')
           .reset_index()
)

# Attach the smoothed series back onto the measurements.
df_best = df_best.merge(s_df, on=['Date', 'segment', 'timestamp'], how='left')
df_best.head()

In [None]:
df_best = df_best.sort_values('timestamp')

df_idx = df_best.set_index('timestamp')

# Number of measurements falling in each 10-second bin, per (Date, segment).
per_10s = (
    df_idx
      .groupby(['Date', 'segment'])
      .resample('10s')  # lowercase alias, as in the rolling cells; uppercase 'S' is deprecated
      .size()
      .rename('samples_in_bin_10s')
      .reset_index()
)

per_10s['samples_in_bin_10s'].describe()


In [None]:
from google.colab import files

unique_days = sorted(df_best['timestamp'].dt.date.unique())

# One figure per measurement day: raw RSRP points with the 10 s rolling mean on top.
for day in unique_days:
    df_day = df_best.loc[df_best['timestamp'].dt.date == day].sort_values('timestamp')

    fig, ax = plt.subplots(figsize=(15, 6))

    ax.scatter(
        df_day['timestamp'], df_day['SS_PBCH-RSRP'],
        s=6, alpha=0.6, label='RSRP (medido)',
    )
    ax.plot(
        df_day['timestamp'], df_day['SS_PBCH-RSRP_janela_10_s'],
        linewidth=0.5, color='green', label='RSRP média móvel 10s',
    )

    # Clamp the x axis to the span actually measured on this day.
    ax.set_xlim([df_day['timestamp'].min(), df_day['timestamp'].max()])
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

    ax.set_xlabel("Timestamp")
    ax.set_ylabel("RSRP (dBm)")
    ax.set_title(f"RSRP do Canal SS_PBCH e média móvel de 10s — {day}")
    plt.xticks(rotation=45)
    ax.legend()
    fig.tight_layout()

    # Save the figure and hand it to the browser (Colab only).
    filename = f"{day}_rsrp_media_movel_10s.png"
    fig.savefig(filename, dpi=300)
    files.download(filename)

    plt.show()
    plt.close(fig)


In [None]:
# 5-second rolling mean of SS_PBCH-RSRP, computed separately for each (Date, segment).
s_df = (
    df_best.groupby(['Date', 'segment'])
           .rolling('5s', on='timestamp', min_periods=1, closed='right')['SS_PBCH-RSRP']
           .mean()
           .rename('SS_PBCH-RSRP_janela_5_s')
           .reset_index()
)

# Attach the smoothed series back onto the measurements.
df_best = df_best.merge(s_df, on=['Date', 'segment', 'timestamp'], how='left')
df_best.head()

In [None]:
unique_days = sorted(df_best['timestamp'].dt.date.unique())

# One figure per measurement day: raw RSRP points with the 5 s rolling mean on top.
for day in unique_days:
    df_day = df_best.loc[df_best['timestamp'].dt.date == day].sort_values('timestamp')

    fig, ax = plt.subplots(figsize=(15, 6))

    ax.scatter(
        df_day['timestamp'], df_day['SS_PBCH-RSRP'],
        s=6, alpha=0.6, label='RSRP (medido)',
    )
    ax.plot(
        df_day['timestamp'], df_day['SS_PBCH-RSRP_janela_5_s'],
        linewidth=0.5, color='green', label='RSRP média móvel 5s',
    )

    # Clamp the x axis to the span actually measured on this day.
    ax.set_xlim([df_day['timestamp'].min(), df_day['timestamp'].max()])
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

    ax.set_xlabel("Timestamp")
    ax.set_ylabel("RSRP (dBm)")
    ax.set_title(f"RSRP do Canal SS_PBCH e média móvel de 5s — {day}")
    plt.xticks(rotation=45)
    ax.legend()
    fig.tight_layout()

    # `files` is the google.colab helper imported in the 10 s plotting cell.
    filename = f"{day}_rsrp_media_movel_5s.png"
    fig.savefig(filename, dpi=300)
    files.download(filename)

    plt.show()
    plt.close(fig)


In [None]:
# Number of measurements falling in each 5-second bin, per (Date, segment).
# `df_idx` is the timestamp-indexed frame built in the 10-second bin-count cell.
per_5s = (
    df_idx
      .groupby(['Date', 'segment'])
      .resample('5s')  # lowercase alias, as in the rolling cells; uppercase 'S' is deprecated
      .size()
      .rename('samples_in_bin_5s')
      .reset_index()
)

display(per_5s['samples_in_bin_5s'].describe())
per_5s.head(20)

In [None]:
pre_colunas = ['SSS', 'DM_RS', 'PBCH', 'PSS']
pos_colunas = ['-RSRP', '-RSRQ', '-SINR', '-RePower']

# For every <channel><metric> column, add its 5 s and 10 s rolling means
# (windows restricted to each (Date, segment) group).
for pre in pre_colunas:
    for pos in pos_colunas:
        nome_coluna = f'{pre}{pos}'
        for window in (5, 10):
            nome_colunaFinal = f'{pre}{pos}__janela_{window}_s'

            s_df = (
                df_best.groupby(['Date', 'segment'])
                       .rolling(f'{window}s', on='timestamp', min_periods=1, closed='right')[nome_coluna]
                       .mean()
                       .rename(nome_colunaFinal)
                       .reset_index()
            )

            df_best = df_best.merge(s_df, on=['Date', 'segment', 'timestamp'], how='left')

In [None]:
pre1_col = ['SS_PBCH']
pos1_col = ['-RSRQ', '-SINR', '-RePower']

df_best = df_best.sort_values('timestamp')

# Same rolling-mean features as for the other channels, for the remaining SS_PBCH metrics.
for pre in pre1_col:
    for pos in pos1_col:
        nome_coluna = f'{pre}{pos}'
        for window in [5, 10]:
            tempo = f'{window}s'
            nome_colunaFinal = f'{pre}{pos}__janela_{window}_s'

            g = df_best.groupby(['Date', 'segment'])
            r = g.rolling(tempo, on='timestamp', min_periods=1, closed='right')
            s = r[nome_coluna].mean()
            s_df = s.rename(nome_colunaFinal).reset_index()

            df_best = df_best.merge(s_df, on=['Date', 'segment', 'timestamp'], how='left')

In [None]:
pre_colunas = ['SSS', 'DM_RS', 'PBCH', 'PSS']
pos_colunas = ['-RSRP', '-RSRQ', '-SINR', '-RePower']
pre1_col = ['SS_PBCH']
pos1_col = ['-RSRQ', '-SINR', '-RePower']

# The raw instantaneous columns are removed; only their rolling means stay as features.
colunas_excluidas = [f'{pre}{pos}' for pre in pre_colunas for pos in pos_colunas]
colunas_excluidas += [f'{pre}{pos}' for pre in pre1_col for pos in pos1_col]

df_best = df_best.drop(columns=colunas_excluidas)



In [None]:
# Temporal features (hour of day, weekday flag)
print("\n" + "=" * 80, "6. FEATURES TEMPORAIS", "=" * 80, sep="\n")

# Despite its name, 'day_of_week' is a binary flag: 1 for Monday-Friday, 0 for the weekend.
df_best['day_of_week'] = df_best['timestamp'].dt.weekday.lt(5).astype(int)

# Cyclical encoding of the hour so that 23h and 0h end up close to each other.
hour = df_best['timestamp'].dt.hour
hour_angle = 2 * np.pi * hour / 24
df_best['hour_sin'] = np.sin(hour_angle)
df_best['hour_cos'] = np.cos(hour_angle)

# Bookkeeping columns and the raw RSRP are not used as model features.
columns_to_drop = ['timestamp','Date','Time','UTC','SS_PBCH-RSRP','campaign','delta_s','segment']

df_best1 = df_best.drop(columns=columns_to_drop)

df_best1.info()




# 4. Preparação dos Dados

In [None]:
# Every column except the target is a feature.
col = list(df_best1.columns)
col.remove('Speed')

X = df_best1.loc[:, col].copy()
y = df_best1.loc[:, 'Speed'].copy()

print(f"Features utilizadas: {len(col)}")
print(f"Amostras: {len(X)}")

# Hold out 10% of the rows as the test set.
Xx_train, X_test, yy_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(f"\nTreino: {Xx_train.shape[0]}, Teste: {X_test.shape[0]}")

Xx_train

In [None]:
# Split the non-test rows into training (80%) and validation (20%).
X_train, X_val, y_train, y_val = train_test_split(
    Xx_train, yy_train, test_size=0.2, random_state=42
)

# The second figure is the validation split, so it is labelled as such (it used to say "Teste").
print(f"\nTreino: {X_train.shape[0]}, Validação: {X_val.shape[0]}")

# Preview only the first rows instead of displaying the whole frame.
X_train.head()

In [None]:
# Columns passed through unscaled (binary flag and identifiers).
excluded_columns = ['day_of_week', 'PCI', 'SSBIdx']
columns_to_scale = [name for name in col if name not in excluded_columns]

# The scaler is fitted on the training rows only and then applied to every split.
scaler = MinMaxScaler()
scaler.fit(X_train[columns_to_scale])


def _scale_split_v1(split):
    """Return `split` with `columns_to_scale` min-max scaled and `excluded_columns` appended as-is."""
    scaled = pd.DataFrame(
        scaler.transform(split[columns_to_scale]),
        columns=columns_to_scale,
        index=split.index,
    )
    return pd.concat([scaled, split[excluded_columns]], axis=1)


X_train_scaled = _scale_split_v1(X_train)
X_val_scaled = _scale_split_v1(X_val)
X_test_scaled = _scale_split_v1(X_test)

# 5. Modelo 1: Random Forest

In [None]:
print("=" * 80, "MODELO 1: RANDOM FOREST", "=" * 80, sep="\n")

# Random forest with fixed (untuned) hyperparameters.
rf2_model = RandomForestRegressor(
    n_estimators=100, max_depth=15, min_samples_split=5, min_samples_leaf=2,
    random_state=42, n_jobs=-1, verbose=1,
)

print("\nTreinando Random Forest...")
rf2_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_rf = rf2_model.predict(X_val_scaled)
rf_r2, rf_rmse, rf_mae = (
    r2_score(y_val, y_pred_rf),
    np.sqrt(mean_squared_error(y_val, y_pred_rf)),
    mean_absolute_error(y_val, y_pred_rf),
)

print(f"\nR²: {rf_r2:.4f}\nRMSE: {rf_rmse:.4f} km/h\nMAE: {rf_mae:.4f} km/h")

# 6. Modelo 2: XGBoost

In [None]:
print("=" * 80, "MODELO 2: XGBOOST", "=" * 80, sep="\n")

# Gradient-boosted trees with fixed (untuned) hyperparameters.
xgb_model = xgb.XGBRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1,
)

print("\nTreinando XGBoost...")
xgb_model.fit(X_train_scaled, y_train, verbose=False)

# Score on the validation split.
y_pred_xgb = xgb_model.predict(X_val_scaled)
xgb_r2, xgb_rmse, xgb_mae = (
    r2_score(y_val, y_pred_xgb),
    np.sqrt(mean_squared_error(y_val, y_pred_xgb)),
    mean_absolute_error(y_val, y_pred_xgb),
)

print(f"\nR²: {xgb_r2:.4f}\nRMSE: {xgb_rmse:.4f} km/h\nMAE: {xgb_mae:.4f} km/h")

# 7. Modelo 3: LightGBM

In [None]:
print("=" * 80, "MODELO 3: LIGHTGBM", "=" * 80, sep="\n")

# LightGBM with fixed (untuned) hyperparameters.
lgb_model = lgb.LGBMRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbose=-1,
)

print("\nTreinando LightGBM...")
lgb_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_lgb = lgb_model.predict(X_val_scaled)
lgb_r2, lgb_rmse, lgb_mae = (
    r2_score(y_val, y_pred_lgb),
    np.sqrt(mean_squared_error(y_val, y_pred_lgb)),
    mean_absolute_error(y_val, y_pred_lgb),
)

print(f"\nR²: {lgb_r2:.4f}\nRMSE: {lgb_rmse:.4f} km/h\nMAE: {lgb_mae:.4f} km/h")

# 8. Comparação de Modelos

In [None]:
# Summary table: validation metrics of the three baseline models.
results = pd.DataFrame({
    'Modelo': ['Random Forest', 'XGBoost', 'LightGBM'],
    'R²': [rf_r2, xgb_r2, lgb_r2],
    'RMSE (km/h)': [rf_rmse, xgb_rmse, lgb_rmse],
    'MAE (km/h)': [rf_mae, xgb_mae, lgb_mae]
})

print("=" * 80, "COMPARAÇÃO DE MODELOS", "=" * 80, sep="\n")
# Highlight the best value of each metric (highest R², lowest errors).
styled_results = (
    results.style
           .highlight_max(axis=0, subset=['R²'])
           .highlight_min(axis=0, subset=['RMSE (km/h)', 'MAE (km/h)'])
)
display(styled_results)

# Side-by-side bar charts of R² and RMSE.
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_r2, ax_rmse = axes

ax_r2.bar(results['Modelo'], results['R²'], color=bar_colors)
ax_r2.set_title('Coeficiente de Determinação (R²)', fontsize=14, fontweight='bold')
ax_r2.set_ylabel('R²')
ax_r2.set_ylim(0, 1)
ax_r2.grid(True, alpha=0.3)
for position, score in enumerate(results['R²']):
    ax_r2.text(position, score + 0.02, f"{score:.4f}", ha='center', fontweight='bold')

ax_rmse.bar(results['Modelo'], results['RMSE (km/h)'], color=bar_colors)
ax_rmse.set_title('Erro Quadrático Médio da Raiz (RMSE)', fontsize=14, fontweight='bold')
ax_rmse.set_ylabel('RMSE (km/h)')
ax_rmse.grid(True, alpha=0.3)
for position, error in enumerate(results['RMSE (km/h)']):
    ax_rmse.text(position, error + 0.5, f"{error:.2f}", ha='center', fontweight='bold')

plt.tight_layout()

plt.show()

# 9. Explicação da tomada de decisão do modelo XGBoost (melhor modelo)

In [None]:
# Compute SHAP values of the XGBoost model for every training row and
# draw the beeswarm summary of per-feature contributions.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_train_scaled)

shap.plots.beeswarm(shap_values)

In [None]:
# Bar-chart version of the SHAP summary for the same values.
plt.figure(figsize=(10,8))
shap.summary_plot(shap_values, X_train_scaled, plot_type="bar")

In [None]:
# Global importance of a feature = mean absolute SHAP value over all explained rows.
shap_array = np.abs(shap_values.values)
shap_importance = shap_array.mean(axis=0)

# Table of features ranked from most to least important.
df_shap_importance = (
    pd.DataFrame({
        "Feature": X_train_scaled.columns,
        "Mean |SHAP value|": shap_importance,
    })
    .sort_values("Mean |SHAP value|", ascending=False)
    .reset_index(drop=True)
)

df_shap_importance

A análise de explicabilidade via SHAP revelou que as variáveis Latitude e Longitude são, de forma destacada, os dois descritores com maior impacto sobre as previsões de velocidade. Isso indica que o modelo está capturando fortemente padrões geoespaciais específicos do conjunto de treino. Embora isso aumente o desempenho no conjunto atual de dados, também revela um risco importante:

O modelo pode estar se apoiando em coordenadas geográficas absolutas, pouco generalizáveis para novas regiões.

Em outras palavras, o algoritmo aprende características muito particulares de cada ponto no mapa (por exemplo, ruas, quadras, morros, prédios), tornando-se dependente do local exato onde as medições foram realizadas. Esse comportamento reduz a capacidade do modelo de generalizar para áreas diferentes ou campanhas futuras.

Para mitigar esse efeito e aumentar a robustez do modelo, optei por transformar as variáveis de latitude e longitude em um descritor categórico de contexto geográfico, classificando cada ponto como:




*   Urbano Denso
*   Suburbano
*   Via Expressa
*   Outras

Essa abordagem substitui coordenadas exatas (altamente específicas e sensíveis ao dataset) por tipos de região, que são muito mais generalizáveis. Assim, o modelo passa a aprender características associadas ao tipo de ambiente — como densidade de edificações, nível de obstrução, morfologia urbana e comportamento esperado do sinal — em vez de depender de coordenadas específicas.


Essa transformação melhora a capacidade do algoritmo de generalizar para novas cidades, novas campanhas e diferentes contextos espaciais, mantendo a utilidade da informação geográfica de forma mais abstrata e robusta.





# 10. Transformação das variáveis adicionais

Engenharia de atributos determinada heuristicamente

In [None]:
def _ts_window(start, end=None):
    """Return a (start, end) pair of pd.Timestamp; a single instant has end == start."""
    start_ts = pd.Timestamp(start)
    return (start_ts, start_ts if end is None else pd.Timestamp(end))


def _in_any_window(ts, windows):
    """True when `ts` lies inside (bounds included) at least one (start, end) window."""
    return any(start <= ts <= end for start, end in windows)


# The lookup tables below are built once, when the cell runs, instead of on every row.

# Calendar days labelled as historic centre unless a more specific window matches first.
_CENTRO_DAYS = frozenset(
    pd.Timestamp(day).date() for day in ("2020-12-14", "2020-12-17", "2020-12-22")
)

_AREA_VERDE_WINDOWS = (
    _ts_window("2021-01-11 16:09:46.868000", "2021-01-11 16:24:41.269000"),
    _ts_window("2020-12-22 15:27:53.268000", "2020-12-22 15:28:29.398000"),
    _ts_window("2021-01-10 18:20:26.241000", "2021-01-10 18:21:31.720000"),
    _ts_window("2021-01-10 18:09:03.839000", "2021-01-10 18:09:51.712000"),
)

_VIA_RAPIDA_WINDOWS = (
    _ts_window("2021-01-11 15:45:57.169000", "2021-01-11 16:08:38.159000"),
    _ts_window("2021-01-11 16:06:15.349000", "2021-01-11 16:06:36.159000"),
    # Same instant as the last residential entry, which is checked first and therefore wins.
    _ts_window("2021-01-02 12:07:07.329000"),
    # Falls inside the second residential window, which is checked first and therefore wins.
    _ts_window("2021-01-11 15:57:35.431000"),
    _ts_window("2021-01-02 11:57:08.862000", "2021-01-02 11:57:30.871000"),
    _ts_window("2021-01-08 16:18:12.344000", "2021-01-08 16:18:27.863000"),
    _ts_window("2021-01-08 17:59:17.008000"),
    _ts_window("2021-01-02 12:08:27.609000"),
    _ts_window("2021-01-11 15:37:44.363000", "2021-01-11 15:42:43.909000"),
    _ts_window("2021-01-02 11:49:03.646000", "2021-01-02 11:51:12.959000"),
    _ts_window("2021-01-08 18:06:46.780000", "2021-01-08 18:09:06.653000"),
    _ts_window("2021-01-08 16:02:49.133000", "2021-01-08 16:10:49.128000"),
    _ts_window("2021-01-08 16:18:09.766000"),
    _ts_window("2021-01-02 12:15:01.582000", "2021-01-02 12:17:10.200000"),
)

_RESIDENCIAL_WINDOWS = (
    _ts_window("2021-01-16 16:56:20.352000", "2021-01-16 16:58:16.520000"),
    _ts_window("2021-01-11 15:50:11.988000", "2021-01-11 15:59:01.937000"),
    _ts_window("2020-12-14 19:18:04.782000", "2020-12-14 19:21:27.236000"),
    _ts_window("2020-12-17 18:38:16.028000", "2020-12-17 18:43:38.342000"),
    _ts_window("2020-12-22 15:40:30.182000", "2020-12-22 15:50:28.872000"),
    _ts_window("2021-01-02 12:07:07.329000"),
)


def classify_environment(row):
    """Label one measurement with a hand-curated environment category.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Latitude', 'Longitude' and 'timestamp' (a pd.Timestamp).
        The target column 'Speed' is no longer read, so it may be absent.

    Returns
    -------
    str
        One of "Residencial", "Area_Verde", "Centro_historico", "via_rapida".

    Rules are evaluated in this order and the first match wins:
    residential time windows, green-area time windows, historic-centre days,
    expressway (bounding box or time windows), historic-centre bounding box,
    and finally "Residencial" as the default.
    """
    lat = row['Latitude']
    lon = row['Longitude']
    ts = row['timestamp']

    if _in_any_window(ts, _RESIDENCIAL_WINDOWS):
        return "Residencial"

    if _in_any_window(ts, _AREA_VERDE_WINDOWS):
        return "Area_Verde"

    if ts.date() in _CENTRO_DAYS:
        return "Centro_historico"

    # Expressway: inside its bounding box or within one of its time windows.
    in_via_rapida_box = (
        (41.8240953 <= lat <= 41.859721) and
        (12.4661876 <= lon <= 12.4953271)
    )
    if in_via_rapida_box or _in_any_window(ts, _VIA_RAPIDA_WINDOWS):
        return "via_rapida"

    # Historic-centre bounding box, for days not listed in _CENTRO_DAYS.
    if (41.888 <= lat <= 41.901) and (12.470 <= lon <= 12.495):
        return "Centro_historico"

    return "Residencial"

In [None]:
# Label every measurement row with its environment category.
df_best['environment'] = df_best.apply(classify_environment, axis=1)

In [None]:
# Number of rows per environment category.
df_best['environment'].value_counts()

In [None]:
import folium
from folium import Tooltip

# Marker colour per environment label; unknown labels fall back to gray.
color_map = {
    'via_rapida': 'red',
    'Centro_historico': 'blue',
    'Area_Verde': 'green',
    'Residencial': 'purple'
}

# Centre the map on the mean position of all measurements.
map_center = [df_best['Latitude'].mean(), df_best['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# One circle marker per measurement, with a tooltip describing the point.
for _, row in df_best.iterrows():
    marker_color = color_map.get(row['environment'], 'gray')

    html = f"""
        <div style="font-size:14px; font-weight:bold; width:250px;">
            <b>Latitude:</b> {row['Latitude']}<br>
            <b>Longitude:</b> {row['Longitude']}<br>
            <b>Timestamp:</b> {row['timestamp']}<br>
            <b>Ambiente:</b> {row['environment']}
        </div>
    """

    marker = folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=4,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        tooltip=Tooltip(html, sticky=True, style="background-color:white;"),
    )
    marker.add_to(m)

m

# 11. Modelagem 2: com a feature environment

Modelagem sem latitude e longitude, apenas com os rótulos referentes aos ambientes

In [None]:
# Same exclusions as in the first modelling round, plus the raw coordinates.
columns_to_drop = [
    'Latitude', 'Longitude',
    'timestamp', 'Date', 'Time', 'UTC',
    'SS_PBCH-RSRP', 'campaign', 'delta_s', 'segment',
]

df_best2 = df_best.drop(columns=columns_to_drop)

df_best2.info()

In [None]:
# Integer code for each environment label.
codigos = {
    'Residencial': 1,
    'Centro_historico': 2,
    'via_rapida': 3,
    'Area_Verde': 4
}

# Map from the string labels kept in df_best (df_best2 shares its index), so re-running
# this cell does not try to map already-encoded integers and turn the column into NaN.
df_best2["environment"] = df_best["environment"].map(codigos)

In [None]:
# Show dtype and class counts before and after casting 'environment' to category.
print("Antes da transformação:")
display(df_best2['environment'].dtype, df_best2['environment'].value_counts())

df_best2['environment'] = df_best2['environment'].astype('category')

print("\nDepois da transformação:")
display(df_best2['environment'].dtype, df_best2['environment'].value_counts())

In [None]:
# Every column except the target is a feature.
col = list(df_best2.columns)
col.remove('Speed')

X = df_best2.loc[:, col].copy()
y = df_best2.loc[:, 'Speed'].copy()

print(f"Features utilizadas: {len(col)}")
print(f"Amostras: {len(X)}")

# Hold out 10% of the rows as the test set.
Xx_train, X_test, yy_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(f"\nTreino (total): {Xx_train.shape[0]}, Teste: {X_test.shape[0]}")

# Split the remaining rows into training (80%) and validation (20%).
X_train, X_val, y_train, y_val = train_test_split(Xx_train, yy_train, test_size=0.2, random_state=42)

print(f"\nTreino: {X_train.shape[0]}, Validação: {X_val.shape[0]}")



In [None]:

# Column roles: passed through untouched, one-hot encoded, or min-max scaled.
excluded_non_transformed_columns = ['day_of_week', 'PCI', 'SSBIdx']
one_hot_encode_columns = ['environment']
columns_to_scale = [
    name for name in col
    if name not in excluded_non_transformed_columns + one_hot_encode_columns
]

# Both transformers are fitted on the training rows only.
scaler = MinMaxScaler()
scaler.fit(X_train[columns_to_scale])

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[one_hot_encode_columns])
ohe_feature_names = ohe.get_feature_names_out(one_hot_encode_columns)


def _transform_split_v2(split):
    """Return `split` as [scaled numeric | one-hot environment | untouched columns]."""
    scaled = pd.DataFrame(
        scaler.transform(split[columns_to_scale]),
        columns=columns_to_scale,
        index=split.index,
    )
    encoded = pd.DataFrame(
        ohe.transform(split[one_hot_encode_columns]),
        columns=ohe_feature_names,
        index=split.index,
    )
    return pd.concat([scaled, encoded, split[excluded_non_transformed_columns]], axis=1)


X_train_scaled = _transform_split_v2(X_train)
X_val_scaled = _transform_split_v2(X_val)
X_test_scaled = _transform_split_v2(X_test)

## 11.2. RF

In [None]:
print("=" * 80, "MODELO 1: RANDOM FOREST", "=" * 80, sep="\n")

# Same hyperparameters as the first round; features now include the environment one-hots.
rf3_model = RandomForestRegressor(
    n_estimators=100, max_depth=15, min_samples_split=5, min_samples_leaf=2,
    random_state=42, n_jobs=-1, verbose=1,
)

print("\nTreinando Random Forest...")
rf3_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_rf = rf3_model.predict(X_val_scaled)
rf2_r2, rf2_rmse, rf2_mae = (
    r2_score(y_val, y_pred_rf),
    np.sqrt(mean_squared_error(y_val, y_pred_rf)),
    mean_absolute_error(y_val, y_pred_rf),
)

print(f"\nR²: {rf2_r2:.4f}\nRMSE: {rf2_rmse:.4f} km/h\nMAE: {rf2_mae:.4f} km/h")

## 11.3 XGB

In [None]:
print("=" * 80, "MODELO 2: XGBOOST", "=" * 80, sep="\n")

# Same hyperparameters as the first round; features now include the environment one-hots.
xgb2_model = xgb.XGBRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1,
)

print("\nTreinando XGBoost...")
xgb2_model.fit(X_train_scaled, y_train, verbose=False)

# Score on the validation split.
y_pred_xgb = xgb2_model.predict(X_val_scaled)
xgb2_r2, xgb2_rmse, xgb2_mae = (
    r2_score(y_val, y_pred_xgb),
    np.sqrt(mean_squared_error(y_val, y_pred_xgb)),
    mean_absolute_error(y_val, y_pred_xgb),
)

print(f"\nR²: {xgb2_r2:.4f}\nRMSE: {xgb2_rmse:.4f} km/h\nMAE: {xgb2_mae:.4f} km/h")

## 11.4. LGBM

In [None]:
print("=" * 80, "MODELO 3: LIGHTGBM", "=" * 80, sep="\n")

# Same hyperparameters as the first round; features now include the environment one-hots.
lgb2_model = lgb.LGBMRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbose=-1,
)

print("\nTreinando LightGBM...")
lgb2_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_lgb = lgb2_model.predict(X_val_scaled)
lgb2_r2, lgb2_rmse, lgb2_mae = (
    r2_score(y_val, y_pred_lgb),
    np.sqrt(mean_squared_error(y_val, y_pred_lgb)),
    mean_absolute_error(y_val, y_pred_lgb),
)

print(f"\nR²: {lgb2_r2:.4f}\nRMSE: {lgb2_rmse:.4f} km/h\nMAE: {lgb2_mae:.4f} km/h")

# 12. Modelagem 3: com a feature environment e sem altitude, latitude e longitude

In [None]:
# Remove 'Altitude' from the train/validation matrices used by the models below.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
X_val_scaled = X_val_scaled.drop(columns=['Altitude'], errors='ignore')
X_train_scaled = X_train_scaled.drop(columns=['Altitude'], errors='ignore')
# NOTE(review): X_test_scaled is left untouched and still carries 'Altitude';
# drop it as well before scoring any model of this section on the test split.

## 12.2: RF

In [None]:
print("=" * 80, "MODELO 1: RANDOM FOREST", "=" * 80, sep="\n")

# Same hyperparameters as before; 'Altitude' is no longer among the features.
rf4_model = RandomForestRegressor(
    n_estimators=100, max_depth=15, min_samples_split=5, min_samples_leaf=2,
    random_state=42, n_jobs=-1, verbose=1,
)

print("\nTreinando Random Forest...")
rf4_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_rf = rf4_model.predict(X_val_scaled)
rf4_r2, rf4_rmse, rf4_mae = (
    r2_score(y_val, y_pred_rf),
    np.sqrt(mean_squared_error(y_val, y_pred_rf)),
    mean_absolute_error(y_val, y_pred_rf),
)

print(f"\nR²: {rf4_r2:.4f}\nRMSE: {rf4_rmse:.4f} km/h\nMAE: {rf4_mae:.4f} km/h")

## 12.3: XGBoost

In [None]:
print("=" * 80, "MODELO 2: XGBOOST", "=" * 80, sep="\n")

# Same hyperparameters as before; 'Altitude' is no longer among the features.
xgb3_model = xgb.XGBRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1,
)

print("\nTreinando XGBoost...")
xgb3_model.fit(X_train_scaled, y_train, verbose=False)

# Score on the validation split.
y_pred_xgb = xgb3_model.predict(X_val_scaled)
xgb3_r2, xgb3_rmse, xgb3_mae = (
    r2_score(y_val, y_pred_xgb),
    np.sqrt(mean_squared_error(y_val, y_pred_xgb)),
    mean_absolute_error(y_val, y_pred_xgb),
)

print(f"\nR²: {xgb3_r2:.4f}\nRMSE: {xgb3_rmse:.4f} km/h\nMAE: {xgb3_mae:.4f} km/h")

## 12.4: LGBM

In [None]:
print("=" * 80, "MODELO 3: LIGHTGBM", "=" * 80, sep="\n")

# Same hyperparameters as before; 'Altitude' is no longer among the features.
lgb3_model = lgb.LGBMRegressor(
    n_estimators=100, max_depth=15, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, n_jobs=-1, verbose=-1,
)

print("\nTreinando LightGBM...")
lgb3_model.fit(X_train_scaled, y_train)

# Score on the validation split.
y_pred_lgb = lgb3_model.predict(X_val_scaled)
lgb3_r2, lgb3_rmse, lgb3_mae = (
    r2_score(y_val, y_pred_lgb),
    np.sqrt(mean_squared_error(y_val, y_pred_lgb)),
    mean_absolute_error(y_val, y_pred_lgb),
)

print(f"\nR²: {lgb3_r2:.4f}\nRMSE: {lgb3_rmse:.4f} km/h\nMAE: {lgb3_mae:.4f} km/h")

# 13. Seleção de features

## 13.1: Explicação do modelo XGB treinado a partir da tomada de decisão das features

In [None]:
explainer = shap.TreeExplainer(xgb3_model)
# Explain a random sample of at most 2000 training rows; capping at the number of
# available rows keeps `sample` from raising on smaller training sets.
n_shap = min(2000, len(X_train_scaled))
X_shap = X_train_scaled.sample(n_shap, random_state=42)
shap_values = explainer(X_shap)
shap_array = np.abs(shap_values.values)

# Mean absolute SHAP value per feature.
shap_importance = shap_array.mean(axis=0)

# Table of features ranked from most to least important.
df_shap_importance = (
    pd.DataFrame({
        "Feature": X_shap.columns,
        "Mean |SHAP value|": shap_importance
    })
    .sort_values("Mean |SHAP value|", ascending=False)
    .reset_index(drop=True)
)

df_shap_importance

## 13.2: Avaliação dos Modelos XGB com a remoção das features contextuais

In [None]:
# Start from the feature matrices without the one-hot environment columns.
col_enviroments = ['environment_3', 'environment_2', 'environment_1', 'environment_4']
X_train_scaled1 = X_train_scaled.drop(columns=col_enviroments)
X_val_scaled1 = X_val_scaled.drop(columns=col_enviroments)

# Contextual features removed cumulatively, in this order.
col = ['PCI', 'hour_sin', 'hour_cos', 'day_of_week']

modelos_xgb, predicoes_xgb, resultados = {}, {}, []

print("=" * 80, "EXPERIMENTO: REMOÇÃO PROGRESSIVA DE FEATURES CONTEXTUAIS", "=" * 80, sep="\n")

for k in range(len(col) + 1):
    # First k contextual features are dropped; k == 0 drops none of them.
    drop_cols = col[:k]

    if drop_cols:
        header = f"Removendo colunas Enviroments e: {drop_cols}"
        model_name = f"xgb_sem_Enviroments{'_'.join(drop_cols)}"
    else:
        header = "Com a remoção da feature Enviroments"
        model_name = "xgb_sem_Enviroments"

    print("\n" + "-" * 80, header, "-" * 80, sep="\n")

    # drop() with an empty list simply returns a copy of the frame.
    X_train_curr = X_train_scaled1.drop(columns=drop_cols)
    X_val_curr = X_val_scaled1.drop(columns=drop_cols)

    # Note: `xgb_model` is rebound on every iteration and ends up as the last fitted model.
    xgb_model = xgb.XGBRegressor(
        n_estimators=100, max_depth=15, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1,
    )

    print("\nTreinando XGBoost...")
    xgb_model.fit(X_train_curr, y_train, verbose=False)

    # Score on the validation split.
    y_pred = xgb_model.predict(X_val_curr)
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)

    print(f"R²:   {r2:.4f}")
    print(f"RMSE: {rmse:.4f} km/h")
    print(f"MAE:  {mae:.4f} km/h")

    # Keep the fitted model, its predictions and its metrics under the run name.
    modelos_xgb[model_name] = xgb_model
    predicoes_xgb[model_name] = y_pred
    resultados.append({
        "Modelo": model_name,
        "Features_removidas": drop_cols if drop_cols else "Enviroments",
        "R2": r2,
        "RMSE": rmse,
        "MAE": mae
    })


df_resultados = pd.DataFrame(resultados)
df_resultados

# 14. Fine-tuning dos modelos XGB para diferentes configurações de features

In [None]:
# Candidate values, one list per XGBoost hyperparameter.
# NOTE(review): presumably the search space for the tuning that follows — their use is not visible in this cell.
n_estimators_list = [80, 100, 120, 150]
max_depth_list = [6, 8, 10, 12, 15]
learning_rate_list = [0.03, 0.05, 0.07, 0.1]
subsample_list = [0.6, 0.7, 0.8, 0.9]
colsample_bytree_list = [0.6, 0.7, 0.8, 0.9]
min_child_weight_list = [1, 2, 3, 5]
gamma_list = [0.0, 0.1, 0.2, 0.3]
reg_alpha_list = [0.0, 0.001, 0.01, 0.05, 0.1]
reg_lambda_list = [1.0, 1.5, 2.0, 3.0]

In [None]:
def evaluate_params(params, X, y, n_splits=5, random_state=42):
    """Cross-validate an XGBoost regressor for a single hyperparameter set.

    For every K-fold split the preprocessing is fitted on the training fold
    only and then applied to the validation fold:

    * min-max scaling of ``columns_to_scale``;
    * one-hot encoding of ``environment`` (only when ``X`` has that column);
    * pass-through of the remaining ``excluded_columns``.

    The function reads the notebook-level globals ``columns_to_scale`` and
    ``excluded_columns``; they must describe the columns of the ``X`` passed in.

    Args:
        params: dict with the keys ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample``, ``colsample_bytree``,
            ``min_child_weight``, ``gamma``, ``reg_alpha`` and ``reg_lambda``.
        X: pandas DataFrame with the features.
        y: pandas Series with the target, positionally aligned with ``X``.
        n_splits: number of K-fold splits.
        random_state: seed used to shuffle the folds.

    Returns:
        Tuple ``(rmse_mean, rmse_std, r2_mean, r2_std)`` computed over the folds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rmses, r2s = [], []

    # Encode 'environment' only when it is actually present, so the same
    # function also works for feature sets where that column was dropped.
    one_hot_encode_columns = [c for c in ['environment'] if c in X.columns]
    # Columns forwarded to the model untouched (neither scaled nor encoded).
    passthrough_columns = [c for c in excluded_columns if c not in one_hot_encode_columns]

    for train_idx, val_idx in kf.split(X, y):
        X_tr_raw  = X.iloc[train_idx].copy()
        X_val_raw = X.iloc[val_idx].copy()
        y_tr  = y.iloc[train_idx].copy()
        y_val = y.iloc[val_idx].copy()

        # Fit the scaler on the training fold only to avoid leaking
        # validation statistics into the preprocessing.
        scaler = MinMaxScaler()
        X_tr_scaled_num  = scaler.fit_transform(X_tr_raw[columns_to_scale])
        X_val_scaled_num = scaler.transform(X_val_raw[columns_to_scale])

        X_tr_scaled_df = pd.DataFrame(X_tr_scaled_num,
                                      columns=columns_to_scale,
                                      index=X_tr_raw.index)
        X_val_scaled_df = pd.DataFrame(X_val_scaled_num,
                                       columns=columns_to_scale,
                                       index=X_val_raw.index)

        train_parts = [X_tr_scaled_df]
        val_parts = [X_val_scaled_df]

        if one_hot_encode_columns:
            # `sparse_output` replaced `sparse` in newer scikit-learn
            # releases; fall back to the old keyword when it is rejected.
            try:
                ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
            except TypeError:
                ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

            X_tr_ohe_array  = ohe.fit_transform(X_tr_raw[one_hot_encode_columns])
            X_val_ohe_array = ohe.transform(X_val_raw[one_hot_encode_columns])

            ohe_feature_names = ohe.get_feature_names_out(one_hot_encode_columns)

            train_parts.append(pd.DataFrame(X_tr_ohe_array,
                                            columns=ohe_feature_names,
                                            index=X_tr_raw.index))
            val_parts.append(pd.DataFrame(X_val_ohe_array,
                                          columns=ohe_feature_names,
                                          index=X_val_raw.index))

        train_parts.append(X_tr_raw[passthrough_columns])
        val_parts.append(X_val_raw[passthrough_columns])

        X_tr_final  = pd.concat(train_parts, axis=1)
        X_val_final = pd.concat(val_parts, axis=1)

        # Guarantee the validation frame has exactly the training columns,
        # in the same order.
        X_val_final = X_val_final.reindex(columns=X_tr_final.columns, fill_value=0)

        xgb_model = xgb.XGBRegressor(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            min_child_weight=params['min_child_weight'],
            gamma=params['gamma'],
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        )

        xgb_model.fit(X_tr_final, y_tr)
        y_pred = xgb_model.predict(X_val_final)

        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
        r2s.append(r2_score(y_val, y_pred))

    rmse_mean = np.mean(rmses)
    rmse_std  = np.std(rmses)
    r2_mean   = np.mean(r2s)
    r2_std    = np.std(r2s)

    return rmse_mean, rmse_std, r2_mean, r2_std

In [None]:
def objective(trial):
    """Optuna objective: cross-validated RMSE with a soft R² >= 0.93 constraint.

    One value per hyperparameter is sampled from the notebook-level candidate
    lists, the configuration is scored with ``evaluate_params`` on
    ``Xx_train`` / ``yy_train``, and the four CV statistics are stored on the
    trial as user attributes.

    Returns:
        The mean RMSE, increased by ``(0.93 - r2_mean) * 50.0`` when the mean
        R² is below 0.93.
    """
    search_space = {
        'n_estimators': n_estimators_list,
        'max_depth': max_depth_list,
        'learning_rate': learning_rate_list,
        'subsample': subsample_list,
        'colsample_bytree': colsample_bytree_list,
        'min_child_weight': min_child_weight_list,
        'gamma': gamma_list,
        'reg_alpha': reg_alpha_list,
        'reg_lambda': reg_lambda_list,
    }
    # Dict order is preserved, so parameters are suggested in the order above.
    params = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space.items()
    }

    cv_stats = evaluate_params(params, Xx_train, yy_train)
    rmse_mean, rmse_std, r2_mean, r2_std = cv_stats

    # Keep the full CV summary on the trial for later inspection.
    stat_names = ("rmse_mean", "rmse_std", "r2_mean", "r2_std")
    for stat_name, stat_value in zip(stat_names, cv_stats):
        trial.set_user_attr(stat_name, stat_value)

    # Minimize RMSE; configurations whose mean R² falls below 0.93 are
    # penalized proportionally to the shortfall (the factor 50.0 is tunable).
    if r2_mean < 0.93:
        penalty = (0.93 - r2_mean) * 50.0
        return rmse_mean + penalty
    return rmse_mean

## 14.1: Avaliando com todas as features contextuais

In [None]:
# Columns that are NOT min-max scaled inside `evaluate_params`.
excluded_columns = ['day_of_week', 'PCI', 'SSBIdx','environment']

# Every other training column goes through the scaler.
all_features = Xx_train.columns.tolist()
columns_to_scale = [
    feature for feature in all_features
    if feature not in excluded_columns
]

In [None]:
# New minimization study driven by a seeded TPE sampler.
study = optuna.create_study(sampler=TPESampler(seed=42), direction='minimize')

# Queue a first trial with the same values as the fixed XGBoost configuration
# used earlier in the notebook; only these five parameters are pinned.
study.enqueue_trial(dict(
    n_estimators=100,
    max_depth=15,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
))

# Search budget: one hour (timeout is given in seconds).
study.optimize(objective, timeout=3600)

best_params = study.best_params
print("Melhores parâmetros encontrados:")
print(best_params)

# Re-score the winning configuration with the same K-fold procedure.
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_params(best_params, Xx_train, yy_train)

print("\nDesempenho com melhores parâmetros (CV KFold):")
print(best_params)
print(
    f"RMSE médio: {rmse_mean:.4f} km/h",
    f"RMSE std:   {rmse_std:.4f} km/h",
    f"R² médio:   {r2_mean:.4f}",
    f"R² std:     {r2_std:.4f}",
    sep="\n",
)

## 14.2: Avaliando sem a feature de Altitude

In [None]:
# Remove Altitude for this experiment. `errors='ignore'` keeps the cell
# idempotent: re-running it after the column is already gone no longer raises.
Xx_train = Xx_train.drop(columns='Altitude', errors='ignore')

# Columns that are NOT min-max scaled inside `evaluate_params`.
excluded_columns = ['day_of_week', 'PCI', 'SSBIdx','environment']

# Recompute the feature lists from the reduced training frame.
all_features = Xx_train.columns.tolist()
columns_to_scale = [c for c in all_features if c not in excluded_columns]

In [None]:
# New minimization study (seeded TPE sampler) for the feature set without Altitude.
study = optuna.create_study(sampler=TPESampler(seed=42), direction='minimize')

# Queue a first trial with the same values as the fixed XGBoost configuration
# used earlier in the notebook; only these five parameters are pinned.
baseline_trial_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
study.enqueue_trial(baseline_trial_params)

# Search budget: one hour (timeout is given in seconds).
study.optimize(objective, timeout=3600)

best_params = study.best_params
print("Melhores parâmetros encontrados:")
print(best_params)

# Re-score the winning configuration with the same K-fold procedure.
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_params(best_params, Xx_train, yy_train)

print("\nDesempenho com melhores parâmetros (CV KFold):")
print(best_params)
print(
    f"RMSE médio: {rmse_mean:.4f} km/h",
    f"RMSE std:   {rmse_std:.4f} km/h",
    f"R² médio:   {r2_mean:.4f}",
    f"R² std:     {r2_std:.4f}",
    sep="\n",
)

## 14.3: Avaliando sem as features contextuais

In [None]:
def evaluate_params(params, X, y, n_splits=5, random_state=42):
    """Cross-validate an XGBoost regressor without any one-hot encoding.

    This definition replaces the earlier ``evaluate_params`` in the notebook
    namespace. Per fold it min-max scales ``columns_to_scale`` (fitted on the
    training fold only), appends ``excluded_columns`` unchanged and reorders
    the result to ``all_features``. All three names are notebook-level globals.

    Args:
        params: dict with the keys ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample``, ``colsample_bytree``,
            ``min_child_weight``, ``gamma``, ``reg_alpha`` and ``reg_lambda``.
        X: pandas DataFrame with the features.
        y: pandas Series with the target, positionally aligned with ``X``.
        n_splits: number of K-fold splits.
        random_state: seed used to shuffle the folds.

    Returns:
        Tuple ``(rmse_mean, rmse_std, r2_mean, r2_std)`` computed over the folds.
    """
    tuned_keys = (
        'n_estimators', 'max_depth', 'learning_rate',
        'subsample', 'colsample_bytree', 'min_child_weight',
        'gamma', 'reg_alpha', 'reg_lambda',
    )
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rmses = []
    r2s = []

    for train_idx, val_idx in splitter.split(X, y):
        X_tr_raw = X.iloc[train_idx].copy()
        X_val_raw = X.iloc[val_idx].copy()
        y_tr = y.iloc[train_idx].copy()
        y_val = y.iloc[val_idx].copy()

        scaler = MinMaxScaler()
        scaled_tr = pd.DataFrame(
            scaler.fit_transform(X_tr_raw[columns_to_scale]),
            columns=columns_to_scale,
            index=X_tr_raw.index,
        )
        scaled_val = pd.DataFrame(
            scaler.transform(X_val_raw[columns_to_scale]),
            columns=columns_to_scale,
            index=X_val_raw.index,
        )

        # Re-attach the unscaled columns and restore the original column order.
        X_tr_scaled = pd.concat([scaled_tr, X_tr_raw[excluded_columns]], axis=1)[all_features]
        X_val_scaled = pd.concat([scaled_val, X_val_raw[excluded_columns]], axis=1)[all_features]

        xgb_model = xgb.XGBRegressor(
            **{key: params[key] for key in tuned_keys},
            random_state=42,
            n_jobs=-1,
            tree_method="hist",
        )
        xgb_model.fit(X_tr_scaled, y_tr)
        y_pred = xgb_model.predict(X_val_scaled)

        rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
        r2s.append(r2_score(y_val, y_pred))

    return np.mean(rmses), np.std(rmses), np.mean(r2s), np.std(r2s)


In [None]:
# Contextual features removed for this experiment.
cols=['PCI', 'hour_sin', 'hour_cos', 'day_of_week','environment', 'Altitude', 'distance_w']

# 'Altitude' has already been dropped from Xx_train by the section 14.2 cell,
# so a strict drop raises KeyError when the notebook runs top to bottom.
# `errors='ignore'` drops whatever is still present and keeps the cell re-runnable.
Xx_train = Xx_train.drop(columns=cols, errors='ignore')

# Only SSBIdx is kept out of min-max scaling in this configuration.
excluded_columns = ['SSBIdx']

# Recompute the feature lists from the reduced training frame.
all_features = Xx_train.columns.tolist()
columns_to_scale = [c for c in all_features if c not in excluded_columns]


In [None]:
# New minimization study (seeded TPE sampler) for the feature set without
# contextual features.
tpe_sampler = TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)

# Queue a first trial with the same values as the fixed XGBoost configuration
# used earlier in the notebook; only these five parameters are pinned.
study.enqueue_trial(
    {'n_estimators': 100, 'max_depth': 15, 'learning_rate': 0.05,
     'subsample': 0.8, 'colsample_bytree': 0.8}
)

# Search budget: one hour (timeout is given in seconds).
study.optimize(objective, timeout=3600)

best_params = study.best_params
print("Melhores parâmetros encontrados:")
print(best_params)

# Re-score the winning configuration with the same K-fold procedure.
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_params(best_params, Xx_train, yy_train)

print("\nDesempenho com melhores parâmetros (CV KFold):")
print(best_params)
print(
    f"RMSE médio: {rmse_mean:.4f} km/h",
    f"RMSE std:   {rmse_std:.4f} km/h",
    f"R² médio:   {r2_mean:.4f}",
    f"R² std:     {r2_std:.4f}",
    sep="\n",
)

# 15. Teste Final dos melhores modelos

## 15.1: Teste final com todas as features contextuais

In [None]:
# Final hold-out evaluation using every feature of df_best2 (target: Speed).
col=df_best2.columns.to_list()
col.remove('Speed')
X = df_best2[col].copy()
y = df_best2['Speed'].copy()

# 90/10 train/test split with a fixed seed.
Xx_train, X_test, yy_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Column groups: passed through as-is, one-hot encoded, and min-max scaled.
excluded_non_transformed_columns = ['day_of_week', 'PCI', 'SSBIdx']
one_hot_encode_columns = ['environment']

columns_to_scale = [c for c in col if c not in excluded_non_transformed_columns + one_hot_encode_columns]


X_train_to_scale = Xx_train[columns_to_scale]
X_train_to_onehot = Xx_train[one_hot_encode_columns]
X_train_non_transformed = Xx_train[excluded_non_transformed_columns]

X_test_to_scale_final = X_test[columns_to_scale]
X_test_to_onehot_final = X_test[one_hot_encode_columns]
X_test_non_transformed_final = X_test[excluded_non_transformed_columns]


# The scaler is fitted on the training split only and reused on the test split.
scaler = MinMaxScaler()
X_train_scaled_array = scaler.fit_transform(X_train_to_scale)
X_test_scaled_array = scaler.transform(X_test_to_scale_final)

# Back to DataFrames so the pieces can be concatenated on the original index.
X_train_scaled_df = pd.DataFrame(X_train_scaled_array, columns=columns_to_scale, index=Xx_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled_array, columns=columns_to_scale, index=X_test.index)



# Encoder fitted on the training split; categories unseen in training are
# encoded as all zeros on the test split (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_ohe_array = ohe.fit_transform(X_train_to_onehot)
X_test_ohe_array = ohe.transform(X_test_to_onehot_final)

ohe_feature_names = ohe.get_feature_names_out(one_hot_encode_columns)

X_train_ohe_df = pd.DataFrame(X_train_ohe_array, columns=ohe_feature_names, index=Xx_train.index)
X_test_ohe_df = pd.DataFrame(X_test_ohe_array, columns=ohe_feature_names, index=X_test.index)


# Final matrices: scaled numeric + one-hot + untouched columns, in that order.
X_train_scaled = pd.concat([X_train_scaled_df, X_train_ohe_df, X_train_non_transformed], axis=1)
X_test_scaled = pd.concat([X_test_scaled_df, X_test_ohe_df, X_test_non_transformed_final], axis=1)

print("="*80)
print("MODELO: XGBOOST")
print("="*80)

# NOTE(review): reg_lambda=0.1 is not one of the values in reg_lambda_list
# (1.0, 1.5, 2.0, 3.0) explored by the Optuna search - confirm it is intended.
# NOTE(review): the cross-validation models were built with tree_method="hist";
# it is not set here, so the library default applies - confirm.
xgb_model_final1 = xgb.XGBRegressor(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.7,
    min_child_weight=1,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1
)

print("\nTreinando XGBoost...")
xgb_model_final1.fit(X_train_scaled, yy_train, verbose=False) # Train on yy_train

# Predictions and metrics on the held-out test split.
y_pred_xgb_fnal1 = xgb_model_final1.predict(X_test_scaled)

xgb_final1_r2 = r2_score(y_test, y_pred_xgb_fnal1)
xgb_final1_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_fnal1))
xgb_final1_mae = mean_absolute_error(y_test, y_pred_xgb_fnal1)

print(f"\nR²: {xgb_final1_r2:.4f}")
print(f"RMSE: {xgb_final1_rmse:.4f} km/h")
print(f"MAE: {xgb_final1_mae:.4f} km/h")

## 15.2: Teste final eliminando a feature de Altitude

In [None]:
# Final hold-out evaluation without the Altitude feature.
# `errors='ignore'` makes the in-place reassignment idempotent: re-running the
# cell after Altitude is already gone no longer raises KeyError.
df_best2 = df_best2.drop(columns='Altitude', errors='ignore')
col=df_best2.columns.to_list()
col.remove('Speed')
X = df_best2[col].copy()
y = df_best2['Speed'].copy()

# 90/10 train/test split with a fixed seed.
Xx_train, X_test, yy_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Column groups: passed through as-is, one-hot encoded, and min-max scaled.
excluded_non_transformed_columns = ['day_of_week', 'PCI', 'SSBIdx']
one_hot_encode_columns = ['environment']

columns_to_scale = [c for c in col if c not in excluded_non_transformed_columns + one_hot_encode_columns]


X_train_to_scale = Xx_train[columns_to_scale]
X_train_to_onehot = Xx_train[one_hot_encode_columns]
X_train_non_transformed = Xx_train[excluded_non_transformed_columns]

X_test_to_scale_final = X_test[columns_to_scale]
X_test_to_onehot_final = X_test[one_hot_encode_columns]
X_test_non_transformed_final = X_test[excluded_non_transformed_columns]


# The scaler is fitted on the training split only and reused on the test split.
scaler = MinMaxScaler()
X_train_scaled_array = scaler.fit_transform(X_train_to_scale)
X_test_scaled_array = scaler.transform(X_test_to_scale_final)

X_train_scaled_df = pd.DataFrame(X_train_scaled_array, columns=columns_to_scale, index=Xx_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled_array, columns=columns_to_scale, index=X_test.index)



# Encoder fitted on the training split; categories unseen in training are
# encoded as all zeros on the test split (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_ohe_array = ohe.fit_transform(X_train_to_onehot)
X_test_ohe_array = ohe.transform(X_test_to_onehot_final)

ohe_feature_names = ohe.get_feature_names_out(one_hot_encode_columns)

X_train_ohe_df = pd.DataFrame(X_train_ohe_array, columns=ohe_feature_names, index=Xx_train.index)
X_test_ohe_df = pd.DataFrame(X_test_ohe_array, columns=ohe_feature_names, index=X_test.index)


# Final matrices: scaled numeric + one-hot + untouched columns, in that order.
X_train_scaled = pd.concat([X_train_scaled_df, X_train_ohe_df, X_train_non_transformed], axis=1)
X_test_scaled = pd.concat([X_test_scaled_df, X_test_ohe_df, X_test_non_transformed_final], axis=1)

print("="*80)
print("MODELO: XGBOOST")
print("="*80)

# NOTE(review): this cell reuses the `*_final1` names of the section 15.1 cell,
# so that model, its predictions and its metrics are overwritten here.
# Confirm whether dedicated `*_final2` names were intended.
xgb_model_final1 = xgb.XGBRegressor(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.6,
    min_child_weight=1,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.5,
    random_state=42,
    n_jobs=-1
)

print("\nTreinando XGBoost...")
xgb_model_final1.fit(X_train_scaled, yy_train, verbose=False) # Train on yy_train

# Predictions and metrics on the held-out test split.
y_pred_xgb_fnal1 = xgb_model_final1.predict(X_test_scaled)

xgb_final1_r2 = r2_score(y_test, y_pred_xgb_fnal1)
xgb_final1_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_fnal1))
xgb_final1_mae = mean_absolute_error(y_test, y_pred_xgb_fnal1)

print(f"\nR²: {xgb_final1_r2:.4f}")
print(f"RMSE: {xgb_final1_rmse:.4f} km/h")
print(f"MAE: {xgb_final1_mae:.4f} km/h")

## 15.3: Teste final eliminando as features contextuais

In [None]:
# Final hold-out evaluation without the contextual features.
col1=['PCI', 'hour_sin', 'hour_cos', 'day_of_week','environment',  'distance_w']

# 'Altitude' is dropped explicitly as well, so this cell no longer depends on
# the section 15.2 cell having removed it from df_best2 beforehand, and
# `errors='ignore'` keeps the in-place reassignment safe to re-run.
df_best2 = df_best2.drop(columns=col1 + ['Altitude'], errors='ignore')
col=df_best2.columns.to_list()
col.remove('Speed')
X = df_best2[col].copy()
y = df_best2['Speed'].copy()

# 90/10 train/test split with a fixed seed.
Xx_train, X_test, yy_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Only SSBIdx is kept out of min-max scaling in this configuration.
excluded_columns_for_scaling = ['SSBIdx']

columns_to_scale = [c for c in col if c not in excluded_columns_for_scaling]

X_train_final_to_scale = Xx_train[columns_to_scale]
X_train_final_excluded = Xx_train[excluded_columns_for_scaling]

X_test_to_scale_final = X_test[columns_to_scale]
X_test_excluded_final = X_test[excluded_columns_for_scaling]

# The scaler is fitted on the training split only and reused on the test split.
scaler = MinMaxScaler()

X_train_scaled_array = scaler.fit_transform(X_train_final_to_scale)
X_train_scaled = pd.DataFrame(X_train_scaled_array, columns=columns_to_scale, index=Xx_train.index)
X_train_scaled = pd.concat([X_train_scaled, X_train_final_excluded], axis=1)

X_test_scaled_array = scaler.transform(X_test_to_scale_final)
X_test_scaled = pd.DataFrame(X_test_scaled_array, columns=columns_to_scale, index=X_test.index)
X_test_scaled = pd.concat([X_test_scaled, X_test_excluded_final], axis=1)

print("="*80)
print("MODELO: XGBOOST (SEM FEATURES CONTEXTUAIS)")
print("="*80)


xgb_model_final3 = xgb.XGBRegressor(
    n_estimators=150,
    max_depth=15,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.7,
    min_child_weight=1,
    gamma=0.1,
    reg_alpha=0.01,
    reg_lambda=2,
    random_state=42,
    n_jobs=-1
)

print("\nTreinando XGBoost...")
xgb_model_final3.fit(X_train_scaled, yy_train, verbose=False)

# Predictions and metrics on the held-out test split.
y_pred_xgb_fnal3 = xgb_model_final3.predict(X_test_scaled)

xgb_final3_r2 = r2_score(y_test, y_pred_xgb_fnal3)
xgb_final3_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_fnal3))
xgb_final3_mae = mean_absolute_error(y_test, y_pred_xgb_fnal3)

print(f"\nR²: {xgb_final3_r2:.4f}")
print(f"RMSE: {xgb_final3_rmse:.4f} km/h")
print(f"MAE: {xgb_final3_mae:.4f} km/h")
