In [6]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

In [7]:
# Load every grouped ticket file and stack them into a single frame.
file_pattern = '../Data_Grouped/tiques_*.parquet'

# This notebook later writes its forecast to
# '../Data_Grouped/tiques_2024_2trimestre.parquet', which also matches
# `file_pattern`. Skip that file so a re-run does not ingest the model's own
# predictions as if they were observed data.
forecast_filename = 'tiques_2024_2trimestre.parquet'

# glob returns paths in arbitrary order; sort them so the load is deterministic.
file_list = sorted(
    path for path in glob.glob(file_pattern)
    if Path(path).name != forecast_filename
)

if not file_list:
    raise FileNotFoundError(f'No se encontraron archivos que coincidan con el patrón {file_pattern}')

frames = [pd.read_parquet(path) for path in file_list]
df_ser = pd.concat(frames, ignore_index=True)

In [8]:
# Keep only the rows whose 'barrio' column equals 'GOYA'.
df_ser = df_ser.loc[df_ser['barrio'].eq('GOYA')]

In [9]:
# Accented vowel -> plain vowel lookup (lower case first, then upper case).
replace_dict = dict(zip('áéíóúÁÉÍÓÚ', 'aeiouAEIOU'))


def remove_accents(x):
    """Return `x` with accented vowels replaced via `replace_dict`.

    Values that are not `str` (for example NaN) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    return ''.join(replace_dict.get(char, char) for char in x)


group_keys = ['hora', 'barrio', 'distrito', 'tipo_zona']

# Turn the literal string 'None' into a real missing value before grouping.
df_ser = df_ser.replace('None', np.nan)
df_ser['tipo_zona'] = df_ser['tipo_zona'].map(remove_accents)

# Average ticket count per (hora, barrio, distrito, tipo_zona) combination.
df_ser = df_ser.groupby(group_keys)['cantidad_tickets'].mean().reset_index()

In [10]:
# Training window: the 20 weeks leading up to the latest timestamp in the data.
latest_hora = df_ser['hora'].max()
window_start = latest_hora - pd.Timedelta(weeks=20)
df_filtered = df_ser.loc[df_ser['hora'] >= window_start]

# Split the target from the predictors.
filtered_y = df_filtered['cantidad_tickets']
filtered_X = df_filtered.drop(columns=['cantidad_tickets'])

# Column groups consumed by the preprocessing step, selected by dtype.
categorical_features = filtered_X.select_dtypes(include=['object']).columns
numeric_features = filtered_X.select_dtypes(include=[np.number]).columns

def extract_datetime(X):
    """Expand every column of `X` into hour, day-of-week and day-of-month features.

    Each column is parsed with `pd.to_datetime`. The result holds, for every
    input column `c` and in input order, the columns `c_hour`, `c_dayofweek`
    and `c_day`; the original columns are not included. `X` is not modified.
    """
    features = pd.DataFrame(index=X.index)

    for name in X.columns:
        stamps = pd.to_datetime(X[name])
        features[f'{name}_hour'] = stamps.dt.hour
        features[f'{name}_dayofweek'] = stamps.dt.dayofweek
        features[f'{name}_day'] = stamps.dt.day

    return features

# Wrap extract_datetime so it can run as a step inside the ColumnTransformer.
datetime_transformer = FunctionTransformer(extract_datetime)

preprocessor = ColumnTransformer(
    transformers=[
        # 'hora' is expanded into hour / day-of-week / day-of-month features.
        ('datetime', datetime_transformer, ['hora']),
        ('num', 'passthrough', numeric_features),
        # The encoder is fitted on the training window only, while the forecast
        # grid is built from every category combination present in df_ser.
        # handle_unknown='ignore' encodes a category unseen during fit as
        # all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing followed by a small, seeded random forest regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(max_depth=5, n_estimators=15, random_state=42))
])

pipeline.fit(filtered_X, filtered_y)

In [11]:
# Hourly forecast horizon: 13 weeks * 7 days * 24 hours = 2184 timestamps,
# starting at the latest timestamp present in df_ser.
# The lowercase 'h' alias replaces 'H', which is deprecated since pandas 2.2.
# NOTE(review): the first forecast hour coincides with the last observed hour;
# confirm whether the horizon should instead start one hour later.
time_range = pd.date_range(start=df_ser['hora'].max(), periods=13*7*24, freq='h')

# Build the prediction grid: every distinct (barrio, distrito, tipo_zona)
# combination paired with every timestamp of the horizon. Each combination is
# repeated once per timestamp and the full horizon is tiled once per
# combination, so the two frames line up row by row.
combinations = df_ser[['barrio', 'distrito', 'tipo_zona']].drop_duplicates()
combinations_repeated = combinations.loc[combinations.index.repeat(len(time_range))].reset_index(drop=True)
time_range_repeated = pd.DataFrame(np.tile(time_range, len(combinations)), columns=['hora'])
future_df = pd.concat([combinations_repeated, time_range_repeated], axis=1).sort_values(by='hora')

# Predict on the sorted grid and store the counts, rounded to whole tickets,
# as the fourth column (right after the three category columns).
predictions = pipeline.predict(future_df)
future_df.insert(3, 'cantidad_tickets', np.rint(predictions).astype('int64'))

# NOTE(review): this path matches the 'tiques_*.parquet' pattern used when
# loading the input files, so a re-run would read this forecast back as input
# unless the loader excludes it.
future_df.to_parquet('../Data_Grouped/tiques_2024_2trimestre.parquet', index=False)