In [2]:
import glob
import numpy as np
import pandas as pd
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [3]:
file_pattern = '../Data_Grouped/tiques_*.parquet'
file_list = glob.glob(file_pattern)

if not file_list:
    raise FileNotFoundError(f'No se encontraron archivos que coincidan con el patrón {file_pattern}')

aux = []
for file in file_list:
    aux_df = pd.read_parquet(file)
    aux.append(aux_df)

df_ser = pd.concat(aux, ignore_index=True)

In [4]:
df_ser = df_ser[df_ser['barrio'] == 'GOYA'] 

In [5]:
replace_dict = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U'}

remove_accents = lambda x: ''.join(replace_dict.get(c, c) for c in x) if isinstance(x, str) else x

df_ser = df_ser.replace('None', np.nan)
df_ser['tipo_zona'] = df_ser['tipo_zona'].apply(remove_accents)
df_ser = df_ser.groupby(['hora', 'barrio', 'distrito', 'tipo_zona']).agg({'cantidad_tickets': 'mean'}).reset_index()
df_ser.set_index(['hora'], inplace=True)

In [6]:
aux_df_ser = df_ser[df_ser.index >= df_ser.index.max() - pd.Timedelta(weeks=4)]
aux_df_ser = aux_df_ser[aux_df_ser['tipo_zona'] == 'VERDE']
aux_df_ser = aux_df_ser.asfreq('H')
aux_df_ser['cantidad_tickets'] = aux_df_ser['cantidad_tickets'].interpolate(method='linear')

model = auto_arima(aux_df_ser['cantidad_tickets'],
                   start_p=0, start_q=0,
                   max_p=0, max_q=0,
                   d=1,
                   start_P=0, start_Q=0,
                   max_P=3, max_Q=3,
                   D=1,
                   seasonal=True, m=24*7,
                   stepwise=True,
                   trace=True,
                   error_action='ignore',
                   suppress_warnings=True)

print(model.summary())

Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,1,0)[168]             : AIC=4830.964, Time=185.69 sec
 ARIMA(0,1,0)(1,1,0)[168]             : AIC=inf, Time=143.82 sec
 ARIMA(0,1,0)(0,1,1)[168]             : AIC=inf, Time=579.09 sec
 ARIMA(0,1,0)(1,1,1)[168]             : AIC=inf, Time=293.03 sec
 ARIMA(0,1,0)(0,1,0)[168] intercept   : AIC=4832.809, Time=49.13 sec

Best model:  ARIMA(0,1,0)(0,1,0)[168]          
Total fit time: 1251.476 seconds
                                      SARIMAX Results                                      
Dep. Variable:                                   y   No. Observations:                  671
Model:             SARIMAX(0, 1, 0)x(0, 1, 0, 168)   Log Likelihood               -2414.482
Date:                             Mon, 02 Sep 2024   AIC                           4830.964
Time:                                     16:35:00   BIC                           4835.183
Sample:                                 03-04-2024   HQIC                        

In [7]:
r2_scores = []
mse_scores = []
mae_scores = []

for zone_type in df_ser['tipo_zona'].unique():
    df_filtered = df_ser[df_ser.index >= df_ser.index.max() - pd.Timedelta(weeks=21)]
    df_filtered = df_filtered[df_filtered['tipo_zona'] == zone_type]
    df_filtered = df_filtered.asfreq('H')
    df_filtered['cantidad_tickets'] = df_filtered['cantidad_tickets'].interpolate(method='linear')

    train = df_filtered[df_filtered.index < df_filtered.index.max() - pd.Timedelta(weeks=1)]
    test = df_filtered[df_filtered.index >= df_filtered.index.max() - pd.Timedelta(weeks=1)]
    model = SARIMAX(train['cantidad_tickets'], order=(0, 1, 0), seasonal_order=(0, 1, 0, 168))
    model_fit = model.fit(disp=False)
    predictions = model_fit.forecast(steps=len(test))
    
    r2_scores.append(r2_score(test['cantidad_tickets'], predictions))
    mse_scores.append(mean_squared_error(test['cantidad_tickets'], predictions))
    mae_scores.append(mean_absolute_error(test['cantidad_tickets'], predictions))

print(f"Mean R2 Score: {np.mean(r2_scores)}")
print(f"Mean Squared Error (MSE): {np.mean(mse_scores)}")
print(f"Mean Absolute Error (MAE): {np.mean(mae_scores)}")


Mean R2 Score: -1.4749545152786103
Mean Squared Error (MSE): 4279.95392982966
Mean Absolute Error (MAE): 42.0394477317594


In [8]:
results = []

for zone_type in df_ser['tipo_zona'].unique():
    df_filtered = df_ser[df_ser.index >= df_ser.index.max() - pd.Timedelta(weeks=20)]
    df_filtered = df_filtered[df_filtered['tipo_zona'] == zone_type]
    df_filtered = df_filtered.asfreq('H')

    sarima_model = SARIMAX(df_filtered['cantidad_tickets'], order=(0, 1, 0), seasonal_order=(0, 1, 0, 168))
    sarima_fit = sarima_model.fit(disp=False)

    n_periods = 24*28  
    forecast = sarima_fit.get_forecast(steps=n_periods).predicted_mean

    barrio = df_filtered['barrio'].iloc[0]
    distrito = df_filtered['distrito'].iloc[0]

    forecast_df = pd.DataFrame({
        'barrio': barrio,
        'distrito': distrito,
        'tipo_zona': zone_type,
        'cantidad_tickets': forecast
    })

    results.append(forecast_df)

future_df = pd.concat(results, ignore_index=True)
future_df['cantidad_tickets'] = future_df['cantidad_tickets'].round().astype('int64')

In [9]:
future_df

Unnamed: 0,barrio,distrito,tipo_zona,cantidad_tickets
0,GOYA,SALAMANCA,AZUL,-6
1,GOYA,SALAMANCA,AZUL,-3
2,GOYA,SALAMANCA,AZUL,5
3,GOYA,SALAMANCA,AZUL,11
4,GOYA,SALAMANCA,AZUL,0
...,...,...,...,...
2011,GOYA,SALAMANCA,VERDE,-762
2012,GOYA,SALAMANCA,VERDE,-762
2013,GOYA,SALAMANCA,VERDE,-762
2014,GOYA,SALAMANCA,VERDE,-700
