In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

In [2]:
file_pattern = '../Data_Grouped/tiques_*.parquet'
file_list = glob.glob(file_pattern)

if not file_list:
    raise FileNotFoundError(f'No se encontraron archivos que coincidan con el patrón {file_pattern}')

aux = []
for file in file_list:
    aux_df = pd.read_parquet(file)
    aux.append(aux_df)

df_ser = pd.concat(aux, ignore_index=True)

In [3]:
df_ser = df_ser[df_ser['barrio'] == 'GOYA'] 

In [4]:
replace_dict = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u', 'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U'}

remove_accents = lambda x: ''.join(replace_dict.get(c, c) for c in x) if isinstance(x, str) else x

df_ser = df_ser.replace('None', np.nan)
df_ser['tipo_zona'] = df_ser['tipo_zona'].apply(remove_accents)
df_ser = df_ser.groupby(['hora', 'barrio', 'distrito', 'tipo_zona']).agg({'cantidad_tickets': 'mean'}).reset_index()

In [5]:
df_filtered = df_ser[df_ser['hora'] >= df_ser['hora'].max() - pd.Timedelta(weeks=20)]
df_filtered = df_filtered[df_filtered['tipo_zona'] == 'VERDE']

filtered_X = df_filtered.drop(columns=['cantidad_tickets'])
filtered_y = df_filtered['cantidad_tickets']

categorical_features = filtered_X.select_dtypes(include=['object']).columns
numeric_features = filtered_X.select_dtypes(include=[np.number]).columns

def extract_datetime(X):
    X = X.copy() 
    cols = X.columns

    for col in cols:
        X[col] = pd.to_datetime(X[col])
        X[f'{col}_hour'] = X[col].dt.hour
        X[f'{col}_dayofweek'] = X[col].dt.dayofweek
        X[f'{col}_day'] = X[col].dt.day
        
    return X.drop(columns=cols)

datetime_transformer = FunctionTransformer(extract_datetime)

preprocessor = ColumnTransformer(
    transformers=[
        ('datetime', datetime_transformer, ['hora']),
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=42))])

search_space = {"model__n_estimators": [5, 10, 15], "model__max_depth": [5, 10, 15]}
grid_search = GridSearchCV(estimator=pipeline, param_grid=search_space, cv=5)

predictions = cross_val_predict(grid_search, filtered_X, filtered_y, cv=5)

print(f"Mean R2 Score: {cross_val_score(grid_search, filtered_X, filtered_y, cv=5).mean()}")
print(f"Mean Squared Error: {mean_squared_error(filtered_y, predictions)}")
print(f"Mean Absolute Error: {mean_absolute_error(filtered_y, predictions)}")

Mean R2 Score: 0.996287669510802
Mean Squared Error: 9.181830164949563
Mean Absolute Error: 0.32464089720984163


In [6]:
df_filtered = df_ser[df_ser['hora'] >= df_ser['hora'].max() - pd.Timedelta(weeks=20)]

filtered_X = df_filtered.drop(columns=['cantidad_tickets'])
filtered_y = df_filtered['cantidad_tickets'] 

grid_search.fit(filtered_X, filtered_y)

print(f"Best model found: {grid_search.best_estimator_}")

Best model found: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('datetime',
                                                  FunctionTransformer(func=<function extract_datetime at 0x0000022CB6418CC0>),
                                                  ['hora']),
                                                 ('num', 'passthrough',
                                                  Index([], dtype='object')),
                                                 ('cat', OneHotEncoder(),
                                                  Index(['barrio', 'distrito', 'tipo_zona'], dtype='object'))])),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=5,
                                       random_state=42))])


In [7]:
time_range = pd.date_range(start=df_ser['hora'].max(), periods=24*28, freq='H')
combinations = df_ser[['barrio', 'distrito', 'tipo_zona']].drop_duplicates()
combinations_repeated = combinations.loc[combinations.index.repeat(len(time_range))].reset_index(drop=True)
time_range_repeated = pd.DataFrame(np.tile(time_range, len(combinations)), columns=['hora'])
future_df = pd.concat([combinations_repeated, time_range_repeated], axis=1).sort_values(by='hora')

predictions = grid_search.predict(future_df)
future_df.insert(3, 'cantidad_tickets', predictions)
future_df['cantidad_tickets'] = future_df['cantidad_tickets'].round().astype('int64')

In [8]:
future_df

Unnamed: 0,barrio,distrito,tipo_zona,cantidad_tickets,hora
0,GOYA,SALAMANCA,AZUL,47,2024-09-30 10:00:00
672,GOYA,SALAMANCA,COMERCIALES,96,2024-09-30 10:00:00
1344,GOYA,SALAMANCA,VERDE,242,2024-09-30 10:00:00
1,GOYA,SALAMANCA,AZUL,47,2024-09-30 11:00:00
673,GOYA,SALAMANCA,COMERCIALES,96,2024-09-30 11:00:00
...,...,...,...,...,...
670,GOYA,SALAMANCA,AZUL,33,2024-10-28 08:00:00
1342,GOYA,SALAMANCA,COMERCIALES,39,2024-10-28 08:00:00
1343,GOYA,SALAMANCA,COMERCIALES,40,2024-10-28 09:00:00
671,GOYA,SALAMANCA,AZUL,50,2024-10-28 09:00:00
