<a href="https://colab.research.google.com/github/auzaluis/upsa_mod_202502/blob/master/forecasting/script_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Carga de datos

In [None]:
import pandas as pd
import plotly.express as px
from scipy.stats import shapiro
from statsmodels.formula.api import ols
from statsmodels.stats.diagnostic import acorr_ljungbox, het_breuschpagan
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Read the monthly sales CSV and preview the first rows.
# NOTE(review): the path points into a mounted Google Drive; no mount call is
# visible in this notebook, so a fresh runtime may need one first -- confirm.
path = '/content/drive/MyDrive/Teaching/UPSA/2025.02/modelacion/forecasting/sales.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head()

### 2. Preprocesado

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# Parse the `period` strings (month/day/year) into datetimes, then re-check dtypes
df['period'] = df['period'].pipe(pd.to_datetime, format='%m/%d/%Y')
df.info()

In [None]:
# Promotion calendar: the month the promotion ran and the month right after it
promo_date = pd.Timestamp(year=2015, month=7, day=1)
post_promo_date = pd.Timestamp(year=2015, month=8, day=1)

In [None]:
# Training set: every observation up to and including December 2015, with
# 0/1 dummies flagging the promotion month and the month that follows it.
df_training = (
    df.loc[df['period'] <= '2015-12-01']
    .assign(
        promo=lambda d: d['period'].eq(promo_date).astype(int),
        post_promo=lambda d: d['period'].eq(post_promo_date).astype(int),
    )
)
df_training

In [None]:
# Additive decomposition of the training sales series (indexed by period)
# using a 12-observation seasonal cycle.
decomp = seasonal_decompose(
    x=df_training.set_index("period")["sales"],
    period=12,
    model="additive",
)

In [None]:
# Plot the decomposition; the trailing semicolon suppresses the figure repr
decomp.plot();

### 3. Entrenamiento del modelo

In [None]:
# Work on a copy so the feature columns added below do not touch df_training
df_model = df_training.copy()

Features

In [None]:
# Model features:
#   trend - running counter 1..n over the training rows
#   month - month name, used as the seasonal factor
#   lag1  - sales of the previous row (NaN on the first row)
df_model = df_model.assign(
    trend=range(1, len(df_model) + 1),
    month=lambda d: d['period'].dt.month_name(),
    lag1=lambda d: d['sales'].shift(1),
)

In [None]:
# Preview the engineered features (the first row has NaN in lag1)
df_model.head()

In [None]:
# Drop the first row, whose lag1 is NaN because it has no previous observation.
# NOTE(review): this removes rows with a NaN in *any* column, not only lag1.
df_model = df_model.dropna(how='any')
df_model.head()

In [None]:
# OLS regression of sales on the trend counter, the month factor, last
# period's sales and the two promotion dummies.
model = ols('sales ~ trend + month + lag1 + promo + post_promo', data=df_model).fit()

In [None]:
# Print the regression summary table for the fitted model
print(model.summary())

### 4. Diagnóstico del modelo

In [None]:
# In-sample predictions: predict() with no arguments scores the training data
df_model['predicted'] = model.predict()

In [None]:
# Preview actual sales next to the fitted values
df_model.head()

In [None]:
# Actual vs fitted sales. The added line runs along y = x between the smallest
# and largest observed sales, so points on it are perfect predictions.
px.scatter(
    df_model,
    title="Sales vs Predicted",
    x="sales",
    y="predicted",
).add_shape(
    type="line",
    x0=df_model["sales"].min(),
    x1=df_model["sales"].max(),
    y0=df_model["sales"].min(),
    y1=df_model["sales"].max(),
)

In [None]:
# Time series of actual sales overlaid with the in-sample predictions
px.line(
    df_model,
    title="Sales vs Predicted",
    x="period",
    y=["sales", "predicted"],
)

### 5. Validación de los supuestos

In [None]:
# Residuals of the fitted model, reused by the assumption checks below
residuals = model.resid

In [None]:
# Normality of the residuals: Shapiro-Wilk test (null hypothesis: the sample
# comes from a normal distribution). `shapiro` is imported in the notebook's
# top import cell so every dependency is declared in one place.
shapiro_stat, shapiro_p = shapiro(residuals)
print(f"Shapiro-Wilk: {shapiro_stat}, p-value: {shapiro_p}")

In [None]:
# Histogram of the residuals (visual companion to the normality test)
px.histogram(residuals)

In [None]:
# Homoscedasticity: Breusch-Pagan test of the residuals against the model's
# design matrix. The call returns (LM statistic, LM p-value, F statistic,
# F p-value); only the LM p-value is kept. Indexing the result instead of
# unpacking into `_` avoids rebinding `_`, which IPython uses for the last
# cell output. `het_breuschpagan` is imported in the top import cell.
p_value = het_breuschpagan(residuals, model.model.exog)[1]
print(f"p-value: {p_value}")

In [None]:
# Autocorrelation: Ljung-Box test on the residuals with lags=12.
# `acorr_ljungbox` is imported in the notebook's top import cell.
acorr_ljungbox(residuals, lags=12)

### 6. Validación del modelo

In [53]:
# Hold-out observations: every row whose period is not in the training set
df_futuro_real = df.loc[~df['period'].isin(df_training['period'])]

In [54]:
# Forecast horizon: month starts from January through May 2016
horizonte = pd.date_range(start='2016-01-01', end='2016-05-01', freq='MS')

In [63]:
# Data frame used to predict over the forecast horizon
df_futuro = pd.DataFrame({'period': horizonte})
# Continue the trend counter right after the last training value. Reading it
# from the data, rather than using a fixed `len(df_model) + 2` offset, keeps
# the counter contiguous regardless of how many rows dropna() removed.
last_train_trend = int(df_model['trend'].max())
df_futuro['trend'] = range(last_train_trend + 1, last_train_trend + 1 + len(df_futuro))
df_futuro['month'] = df_futuro['period'].dt.month_name()
# NOTE(review): the fitted formula also uses `lag1`; it is not built in this
# cell and must be present in df_futuro before calling model.predict on it.

In [64]:
# The historical promo was encoded as 1; the new promo enters as 0.7.
# Start from a float column of zeros and flag March 2016.
df_futuro['promo'] = 0.0
df_futuro.loc[df_futuro['period'].eq(pd.Timestamp('2016-03-01')), 'promo'] = 0.7

In [None]:
# Post-promo effect for the month after the new promo (April 2016), set to 0.7
# on a float column of zeros.
df_futuro['post_promo'] = 0.0
df_futuro.loc[df_futuro['period'].eq(pd.Timestamp('2016-04-01')), 'post_promo'] = 0.7