In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# --- 1. DATA LOADING ---
df = pd.read_csv('../data/datos_historicos.csv')

# Feature engineering (lag features) - addresses point 5 of the tutor's feedback.
# Rows are ordered by series (com_nom, titular) and then by period, so that
# shift(k) inside each group returns the 'total' found k rows earlier in the
# same series.
# NOTE(review): this treats consecutive rows as consecutive periods; if a
# series can have missing periods the lags will span the gap - confirm.
series_keys = ['com_nom', 'titular']
df = df.sort_values(series_keys + ['periodo'])

# Group once and reuse the grouped target for every lag feature.
totals_by_series = df.groupby(series_keys)['total']
df['lag_1'] = totals_by_series.shift(1)
df['lag_2'] = totals_by_series.shift(2)
# Mean of the two previous totals; the inner shift(1) keeps the current row's
# own target out of its feature.
df['rolling_mean_2'] = totals_by_series.transform(lambda x: x.shift(1).rolling(2).mean())

feature_cols = ['periodo', 'lag_1', 'lag_2', 'rolling_mean_2', 'com_nom', 'titular']
# Drop only the rows that lack a model input or the target (the first two rows
# of every series have no lags). A bare dropna() would also discard rows
# because of NaNs in columns the model never reads, and an empty extra column
# in the CSV would wipe out the whole dataset.
df = df.dropna(subset=feature_cols + ['total'])

X = df[feature_cols]
y = df['total']

# --- 2. PIPELINE AND JUSTIFICATION (Point 2) ---
# The two identifier columns are one-hot encoded; handle_unknown='ignore' lets
# the encoder accept categories that were not present when it was fitted.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
# StandardScaler normalises 'periodo' and the lag-based features.
numeric_scaler = StandardScaler()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, ['com_nom', 'titular']),
        ('num', numeric_scaler, ['periodo', 'lag_1', 'lag_2', 'rolling_mean_2']),
    ]
)

# Fixed seed so repeated runs build the same forest.
forest = RandomForestRegressor(random_state=42)

# Step names matter: hyper-parameters are addressed as 'model__<param>'.
pipeline = Pipeline(steps=[('prep', preprocess), ('model', forest)])

# --- 3. GRIDSEARCH OPTIMISATION (Point 3) ---
param_grid = {
    'model__n_estimators': [50, 100],
    'model__max_depth': [10, 20]
}

# Hold out the most recent periods so the reported metrics come from rows the
# model never saw while being fitted or tuned.
HOLDOUT_FRACTION = 0.2
period_values = X['periodo'].to_numpy()
periods = np.sort(X['periodo'].unique())
if len(periods) < 2:
    raise ValueError(
        "At least 2 distinct 'periodo' values are required to hold out a test period"
    )
n_holdout = max(1, int(round(len(periods) * HOLDOUT_FRACTION)))
n_holdout = min(n_holdout, len(periods) - 1)  # always keep at least one training period
cutoff = periods[-n_holdout]
is_test = period_values >= cutoff

# TimeSeriesSplit cuts folds by row position, so the training rows must be in
# chronological order for every validation fold to lie after its training
# fold. Order them explicitly instead of relying on the incoming row order; a
# stable sort keeps the existing order within each period.
train_pos = np.flatnonzero(~is_test)
train_pos = train_pos[np.argsort(period_values[train_pos], kind='stable')]
test_pos = np.flatnonzero(is_test)
X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
X_test, y_test = X.iloc[test_pos], y.iloc[test_pos]

tscv = TimeSeriesSplit(n_splits=3)
grid = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

# --- 4. ACADEMIC METRICS (Point 4) ---
# y_pred stays aligned with X/y row by row (training rows are in-sample
# predictions); only the held-out rows are scored.
y_pred = grid.predict(X)
y_test_pred = y_pred[is_test]
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2 = r2_score(y_test, y_test_pred)

print(f"Mejores Hiperparámetros: {grid.best_params_}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.4f}")