In [83]:
from datetime import datetime
import pandas as pd
import os
from tqdm import tqdm


# Definir la subcarpeta
subfolder = '../../data_csv_bicing/'
#subfolder = 'data/'

# Obtener la lista de archivos CSV en la subcarpeta
files = os.listdir(subfolder)
files = [f for f in files if f.endswith('ESTACIONS.csv')]


# Lista para almacenar los DataFrames de cada archivo CSV
df_list = []

# Leer cada archivo CSV y añadirlo a la lista
for file in tqdm(files):

    # Obtener fechas limite del fichero
    start_date = datetime(2020,1,1) 
    end_date = datetime(2024,12,31)

    # Leemos el csv y creamos un DataFrame
    file_path = os.path.join(subfolder, file)
    df_temp = pd.read_csv(file_path, low_memory=False)

    # seleccionar columnas relevantes
    cols_to_keep = ['station_id', 'num_bikes_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike','num_docks_available', 'last_reported']
    df_temp = df_temp[cols_to_keep]

    # Convertir el campo 'last_reported' del DataFrame a objetos datetime
    df_temp['last_reported'] = pd.to_datetime(df_temp['last_reported'], unit='s')

    # Filtrar el DataFrame para mantener solo las filas dentro del rango de fechas
    df_filtered = df_temp[(df_temp['last_reported'] >= start_date) & (df_temp['last_reported'] <= end_date)].copy()

    # Ahora df_filtered contiene solo las filas con 'last_reported' dentro del rango de fechas especificado
    df_list.append(df_filtered)

# Concatenar todos los DataFrames en uno solo
df_raw = pd.concat(df_list, ignore_index=True)

# Guardar en formato parquet
df_raw.to_parquet('data/1_all_data_raw.heavy.parquet', index=False)


100%|██████████| 48/48 [01:57<00:00,  2.46s/it]


In [43]:
import pandas as pd
import numpy as np

# Leer el archivo .parquet 
df_raw = pd.read_parquet('data/1_all_data_raw.heavy.parquet')

# Cargar datos de las estaciones
df_station_info = pd.read_csv('data/Informacio_Estacions_Bicing.csv')

In [84]:
# limpiar los datos
 
# Eliminar filas donde 'station_id' es nulo y pasarlos a int
df_raw = df_raw.dropna(subset=['station_id'])
df_raw['station_id'] = df_raw['station_id'].astype(int)


# Se añade la informacion de la capacidad de las estaciones, eliminando filas con station_id inexistentes
df_merge = df_raw.merge(df_station_info[["station_id", "capacity"]],
                             on="station_id",
                             how="inner")

# Aplica el filtro para excluir las filas donde 'num_docks_available' > 'capacity'
df_merge = df_merge[df_merge['num_docks_available'] <= df_merge['capacity']]

# Timestamp to hour, day, month, year
df_merge['hour'] = df_merge['last_reported'].dt.hour
df_merge['day'] = df_merge['last_reported'].dt.day
df_merge['month'] = df_merge['last_reported'].dt.month
df_merge['year'] = df_merge['last_reported'].dt.year

# Eliminamos la columna 'last_reported' que ya no es necesaria.
df_merge = df_merge.drop('last_reported', axis=1)

# Guardar en formato parquet
df_merge.to_parquet("data/2_all_data_mean_hour.heavy.parquet", index=False)

In [85]:
import pandas as pd
import numpy as np

# Leer el archivo .parquet 
df_merge = pd.read_parquet('data/2_all_data_mean_hour.heavy.parquet')

In [86]:
df_merge

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,capacity,hour,day,month,year
0,1,9.0,9.0,0.0,35.0,45,21,31,5,2020
1,2,22.0,22.0,0.0,3.0,29,21,31,5,2020
2,3,12.0,12.0,0.0,15.0,27,22,31,5,2020
3,4,9.0,9.0,0.0,12.0,21,22,31,5,2020
4,5,31.0,31.0,0.0,7.0,39,21,31,5,2020
...,...,...,...,...,...,...,...,...,...,...
197544587,515,5.0,5.0,0.0,19.0,24,21,30,9,2020
197544588,516,18.0,16.0,2.0,3.0,21,21,30,9,2020
197544589,517,3.0,3.0,0.0,20.0,20,21,30,9,2020
197544590,518,2.0,0.0,2.0,24.0,27,21,30,9,2020


In [87]:
# Verificar si hay alguna columna relevaante con valores nulos
df_merge.isnull().any()



station_id                              False
num_bikes_available                     False
num_bikes_available_types.mechanical    False
num_bikes_available_types.ebike         False
num_docks_available                     False
capacity                                False
hour                                    False
day                                     False
month                                   False
year                                    False
dtype: bool

In [88]:
# Merge by taking the mean of the values
df_merge = df_merge.groupby(['station_id', 'hour', 'day', 'month', 'year']).mean().reset_index()


# Creamos el porcentaje de slots disponibles en cada estacion en cada momento
df_merge["percentage_docks_available"] = df_merge["num_docks_available"] / df_merge["capacity"]
df_merge.to_parquet("data/3_all_data_ctx.heavy.parquet", index=False)


In [89]:
import pandas as pd

# Leer el archivo .parquet 
df_ctx = pd.read_parquet('data/3_all_data_ctx.heavy.parquet')

In [90]:
df_ctx

Unnamed: 0,station_id,hour,day,month,year,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,capacity,percentage_docks_available
0,1,0,1,1,2020,23.333333,23.166667,0.166667,20.666667,45.0,0.459259
1,1,0,1,1,2021,40.500000,40.500000,0.000000,5.500000,45.0,0.122222
2,1,0,1,1,2022,14.583333,12.583333,2.000000,31.416667,45.0,0.698148
3,1,0,1,1,2023,39.333333,36.666667,2.666667,6.666667,45.0,0.148148
4,1,0,1,2,2020,31.500000,30.083333,1.416667,12.500000,45.0,0.277778
...,...,...,...,...,...,...,...,...,...,...,...
16276398,519,23,31,10,2023,4.375000,1.000000,3.375000,19.562500,24.0,0.815104
16276399,519,23,31,12,2020,1.833333,0.000000,1.833333,22.166667,24.0,0.923611
16276400,519,23,31,12,2021,0.866667,0.000000,0.866667,23.133333,24.0,0.963889
16276401,519,23,31,12,2022,12.533333,0.000000,12.533333,11.466667,24.0,0.477778


In [91]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime, timedelta

## Este codigo garantiza que estan todas las fechas.

# seleccionar columnas relevantes
cols_to_keep = ['station_id', 'year', 'month', 'day', 'hour', 'percentage_docks_available']
df_ctx = df_ctx[cols_to_keep]

# Lista de rows finales
rows = []

# para cada station_id
for st_id in tqdm(df_ctx.station_id.unique()):
    # creamos df ordenado por fecha.        
    df_station = df_ctx[df_ctx['station_id'] == st_id].sort_values(by=["year", "month", "day", "hour"])
    
    # Obtenemos la fecha mayor y menor
    min_row = df_station.iloc[0]
    min_data = datetime(int(min_row['year']), int(min_row['month']), int(min_row['day']), int(min_row['hour']))
    max_row = df_station.iloc[-1]
    max_data = datetime(int(max_row['year']), int(max_row['month']), int(max_row['day']), int(max_row['hour']))

    # empezamos con la fecha mas antigua
    current_date = min_data
    
    # Para cada registro
    for current_row in df_station.itertuples(index=False):
        current_row_date = datetime(current_row.year, current_row.month, current_row.day, current_row.hour)
        
        # mientras no concida la fecha, añadimos registro faltante
        while current_date < current_row_date:

            new_row = {
                'station_id': st_id,
                'year': current_date.year,
                'month': current_date.month,
                'day': current_date.day,
                'hour': current_date.hour,
                'percentage_docks_available': np.nan
            }
            rows.append(new_row)
            # Sumamos una hora 
            current_date += timedelta(hours=1)
        
        # Llegar aqui implica que la fecha coincide
        rows.append(current_row._asdict())

        # Sumamos una hora 
        current_date += timedelta(hours=1)

# Convertimos la lista de diccionarios en un DataFrame
df_complet = pd.DataFrame(rows)

df_complet.to_parquet("data/4_all_dates_null_ctx.parquet", index=False)


100%|██████████| 506/506 [00:30<00:00, 16.68it/s]


In [95]:
import pandas as pd

# Leer el archivo .parquet 
df_dates = pd.read_parquet('data/4_all_dates_null_ctx.parquet')

In [93]:
df_dates

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available
0,1,2020,1,1,0,0.459259
1,1,2020,1,1,1,0.394444
2,1,2020,1,1,2,0.346296
3,1,2020,1,1,3,0.283333
4,1,2020,1,1,4,0.364815
...,...,...,...,...,...,...
17371882,519,2023,12,31,19,0.767361
17371883,519,2023,12,31,20,0.715278
17371884,519,2023,12,31,21,0.684028
17371885,519,2023,12,31,22,0.666667


In [94]:
from tqdm import tqdm
rows = []
# para cada station_id
for st_id in tqdm(df_dates.station_id.unique()):

    # creamos df ordenado por fecha.
    df_station = df_dates[df_dates['station_id'] == st_id].sort_values(by=["year", "month", "day", "hour"])

    # Para cada registro
    # Iterar sobre las filas a partir de la quinta fila y saltando de 5 en 5
    for i in range(4, len(df_station), 5):

        tmp_row = df_station.iloc[i].copy()
        # retrocedemos las 4 horas anteriores para guardar sus valores si los tiene
        for j in range(1,5):
                # Buscamos la hora y su registro
                tmp_row[f'ctx-{j}'] = df_station.iloc[i-j].percentage_docks_available
        # añadimos el registro con los 4 ctx anteriores.
        rows.append(tmp_row)

col_types = {
     'station_id': int, 
     'year': int,
     'month': int,
     'day': int,
     'hour': int,
     }
# Convertimos la lista de diccionarios en un DataFrame
prediction_data = pd.DataFrame(rows)
for col in col_types:
     prediction_data[col] = prediction_data[col].astype(int)
#prediction_data = pd.concat(rows, ignore_index=True)
# Concatenar todos los DataFrames en uno solo

#prediction_data = pd.DataFrame(rows)


prediction_data.to_parquet("data/5_compress_dates_null_ctx_4h.parquet", index=False)

100%|██████████| 506/506 [32:41<00:00,  3.88s/it]


In [96]:
prediction_data

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
4,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
9,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
14,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
19,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
24,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
17371864,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
17371869,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
17371874,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
17371879,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [97]:
import pandas as pd

# Leer el archivo .parquet 
df_ctx4 = pd.read_parquet('data/5_compress_dates_null_ctx_4h.parquet')

In [98]:
df_ctx4

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3474012,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
3474013,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
3474014,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
3474015,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [99]:
# Verificar % de nulos por columna
df_ctx4.isnull().sum()/len(df_ctx4)*100

station_id                    0.000000
year                          0.000000
month                         0.000000
day                           0.000000
hour                          0.000000
percentage_docks_available    6.359986
ctx-1                         6.320148
ctx-2                         6.292312
ctx-3                         6.257338
ctx-4                         6.303855
dtype: float64

In [100]:
# Eliminar filas con valores nulos
df_ctx4_clean = df_ctx4.dropna()
df_ctx4_clean.to_parquet("data/6_compress_dates_clean_ctx_4h.parquet", index=False)

In [1]:
import pandas as pd

# Leer el archivo .parquet 
df_ctx4_clean = pd.read_parquet('data/6_compress_dates_clean_ctx_4h.parquet')

In [2]:
# Verificar si hay alguna columna con nulos
df_ctx4_clean.isnull().any()

station_id                    False
year                          False
month                         False
day                           False
hour                          False
percentage_docks_available    False
ctx-1                         False
ctx-2                         False
ctx-3                         False
ctx-4                         False
dtype: bool

In [3]:
df_ctx4_clean

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3223874,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
3223875,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
3223876,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
3223877,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [4]:
train = df_ctx4_clean[df_ctx4_clean['year']<2023]
validation = df_ctx4_clean[df_ctx4_clean['year']>=2023]

In [5]:
train

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3222263,519,2022,12,31,1,0.819444,0.798611,0.881944,0.965278,0.916667
3222264,519,2022,12,31,6,0.802083,0.791667,0.791667,0.791667,0.791667
3222265,519,2022,12,31,11,1.000000,0.965278,0.902778,0.871528,0.833333
3222266,519,2022,12,31,16,0.875000,0.944444,0.944444,0.965278,1.000000


In [7]:
x_train = train.drop(columns=['percentage_docks_available', 'year']).values
y_train = train['percentage_docks_available'].values


x_val = validation.drop(columns=['percentage_docks_available', 'year']).values
y_val = validation['percentage_docks_available'].values

In [120]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp310-cp310-macosx_12_0_arm64.whl (11.0 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl (30.3 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.0 scipy-1.13.1 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> 

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

#model = RandomForestRegressor()
model = linear_model.Ridge()
model.fit(y=y_train,X=x_train)

In [15]:
model.score(x_train, y_train)

0.845467153943007

In [16]:
model.score(x_val, y_val)

0.841064330391211

In [17]:
file_path = 'data/metadata_sample_submission_2024.csv'
df_temp = pd.read_csv(file_path, low_memory=False, index_col=0)


In [18]:
df_temp

Unnamed: 0_level_0,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1,1,5,0.781481,0.677778,0.696296,0.750000
1,1,1,1,10,0.737374,0.711111,0.711111,0.731624
2,1,1,1,15,0.827778,0.896296,0.901852,0.883333
3,1,1,1,20,0.825926,0.874074,0.927778,0.918519
4,2,1,1,3,0.592593,0.341954,0.275862,0.540230
...,...,...,...,...,...,...,...,...
171897,496,3,31,1,0.828704,0.787037,0.777778,0.853535
171898,496,3,31,6,0.930556,0.944444,0.935185,0.856481
171899,496,3,31,11,0.912037,0.884259,0.518519,0.157407
171900,496,3,31,16,0.245370,0.319444,0.277778,0.305556


In [20]:
x_test = df_temp.values
x_test

array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        6.77777778e-01, 6.96296296e-01, 7.50000000e-01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        7.11111111e-01, 7.11111111e-01, 7.31623932e-01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        8.96296296e-01, 9.01851852e-01, 8.83333333e-01],
       ...,
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        8.84259259e-01, 5.18518519e-01, 1.57407407e-01],
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        3.19444444e-01, 2.77777778e-01, 3.05555556e-01],
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        4.35185185e-01, 4.90740741e-01, 5.04629630e-01]])

In [21]:
y_test = model.predict(x_test)

In [26]:
df_submit = pd.DataFrame(y_test, columns=['percentage_docks_available'])

In [32]:
df_submit['index'] = df_submit.index
df_submit = df_submit[['index', 'percentage_docks_available']]
df_submit.to_csv('data/submition_ridge_2305.csv', index=False)
