In [8]:
from datetime import datetime
import polars as pl
import os
from tqdm import tqdm


# Definir la subcarpeta
subfolder = '../../data_csv_bicing/'
#subfolder = 'data/'

# Obtener la lista de archivos CSV en la subcarpeta
files = os.listdir(subfolder)
files = [f for f in files if f.endswith('ESTACIONS.csv')]


# Lista para almacenar los DataFrames de cada archivo CSV
df_list = []

# Leer cada archivo CSV y añadirlo a la lista
for file in tqdm(files):

    # Obtener fechas limite del fichero
    start_date = datetime(2020,1,1) 
    end_date = datetime(2024,12,31)

    # Leemos el csv y creamos un DataFrame
    file_path = os.path.join(subfolder, file)
    # Read the CSV file and specify the dtype for the 'last_reported' column
    df_temp = pl.read_csv(
        file_path,
        dtypes={
            "station_id": pl.Int64,
            "num_bikes_available": pl.Int64,
            "num_bikes_available_types.mechanical": pl.Int64,
            "num_bikes_available_types.ebike": pl.Int64,
            "num_docks_available": pl.Int64,            
            "last_reported": pl.Float64
        },
        infer_schema_length=10000,
            null_values="NA"  # Agrega 'NA' a la lista de valores nulos
    )


    # seleccionar columnas relevantes
    cols_to_keep = ['station_id', 'num_bikes_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike','num_docks_available', 'last_reported']
    df_temp = df_temp.select(cols_to_keep)

    # Convertir el campo 'last_reported' del DataFrame a objetos datetime
    #df_temp['last_reported'] = pl.to_datetime(df_temp['last_reported'], unit='s')

    # Convierte la columna 'last_reported' de timestamp a datetime
    df_temp = df_temp.with_columns([
        (pl.col("last_reported").cast(pl.Int64) * 1_000_000).alias("last_reported").cast(pl.Datetime("us"))
    ])

    # Filtrar el DataFrame para mantener solo las filas dentro del rango de fechas
    df_filtered = df_temp.filter(
        (pl.col("last_reported") >= start_date) & (pl.col("last_reported") <= end_date)
    )
    # Ahora df_filtered contiene solo las filas con 'last_reported' dentro del rango de fechas especificado
    df_list.append(df_filtered)

# Concatenar todos los DataFrames en uno solo
df_raw = pl.concat(df_list)

# Guardar en formato parquet
df_raw.write_parquet('data/pl/1_all_data_raw_pl.heavy.parquet')


100%|██████████| 48/48 [00:18<00:00,  2.65it/s]


In [2]:
import polars as pl

# Leer el archivo .parquet 
df_raw = pl.read_parquet('data/pl/1_all_data_raw_pl.heavy.parquet')

# Cargar datos de las estaciones
df_station_info = pl.read_csv('data/Informacio_Estacions_Bicing.csv')

In [9]:

# limpiar los datos

# Eliminar filas con valores nulos en la columna 'station_id' y 'last_reported'
df_raw = df_raw.drop_nulls("station_id")
df_raw = df_raw.drop_nulls("last_reported")

# Se añade la información de la capacidad de las estaciones, eliminando filas con station_id inexistentes
df_merge = df_raw.join(df_station_info.select(["station_id", "capacity"]),
                       on="station_id",
                       how="inner")

# Aplica el filtro para excluir las filas donde 'num_docks_available' > 'capacity'
df_merge = df_merge.filter(pl.col('num_docks_available') <= pl.col('capacity'))

# Extraer hora, día, mes y año de la columna 'last_reported'
hour_expr = pl.col('last_reported').dt.hour().alias('hour')
day_expr = pl.col('last_reported').dt.day().alias('day')
month_expr = pl.col('last_reported').dt.month().alias('month')
year_expr = pl.col('last_reported').dt.year().alias('year')

# Añadir las nuevas columnas al DataFrame
df_merge = df_merge.with_columns([hour_expr, day_expr, month_expr, year_expr])

# Eliminamos la columna 'last_reported' que ya no es necesaria.
df_merge = df_merge.drop('last_reported')

# Guardar en formato parquet
df_merge.write_parquet("data/pl/2_all_data_mean_hour_pl.heavy.parquet")


In [19]:
import polars as pl

# Leer el archivo .parquet 
df_merge = pl.read_parquet('data/pl/2_all_data_mean_hour_pl.heavy.parquet')

In [20]:
df_merge

station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,capacity,hour,day,month,year
i64,i64,i64,i64,i64,i64,i8,i8,i8,i32
1,9,9,0,35,45,21,31,5,2020
2,22,22,0,3,29,21,31,5,2020
3,12,12,0,15,27,22,31,5,2020
4,9,9,0,12,21,22,31,5,2020
5,31,31,0,7,39,21,31,5,2020
…,…,…,…,…,…,…,…,…,…
515,5,5,0,19,24,21,30,9,2020
516,18,16,2,3,21,21,30,9,2020
517,3,3,0,20,20,21,30,9,2020
518,2,0,2,24,27,21,30,9,2020


In [21]:
# Check for null values

df_merge.null_count()


station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,capacity,hour,day,month,year
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


In [22]:
# Realizar el groupby y la agregación
df_grouped = df_merge.group_by(['station_id', 'hour', 'day', 'month', 'year']).agg(
    [pl.mean(col) for col in df_merge.columns if col not in ['station_id', 'hour', 'day', 'month', 'year']]
)
# Creamos el porcentaje de slots disponibles en cada estacion en cada momento
df_grouped = df_grouped.with_columns(
    (pl.col('num_docks_available') / pl.col('capacity')).alias('percentage_docks_available')
)

# Guardar en formato parquet
df_grouped.write_parquet("data/pl/3_all_data_ctx_pl.heavy.parquet")


In [23]:
import polars as pl

# Leer el archivo .parquet 
df_ctx = pl.read_parquet('data/pl/3_all_data_ctx_pl.heavy.parquet')

In [24]:
df_ctx

station_id,hour,day,month,year,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,capacity,percentage_docks_available
i64,i8,i8,i8,i32,f64,f64,f64,f64,f64,f64
228,23,30,7,2020,8.416667,3.0,5.416667,10.583333,21.0,0.503968
459,13,23,5,2023,13.166667,0.5,12.666667,25.083333,39.0,0.643162
355,19,13,10,2023,12.0,9.083333,2.916667,14.0,27.0,0.518519
377,10,29,9,2023,25.0,11.083333,13.916667,4.0,29.0,0.137931
10,14,21,10,2022,13.833333,12.0,1.833333,29.166667,43.0,0.678295
…,…,…,…,…,…,…,…,…,…,…
439,10,27,6,2022,2.416667,1.833333,0.583333,15.583333,18.0,0.865741
501,21,25,6,2023,7.5,0.0,7.5,6.25,16.0,0.390625
152,7,24,6,2020,18.583333,18.583333,0.0,4.416667,31.0,0.142473
511,3,12,9,2020,0.0,0.0,0.0,17.0,20.0,0.85


In [25]:
import polars as pl
from datetime import datetime, timedelta
from tqdm import tqdm


# seleccionar columnas relevantes
cols_to_keep = ['station_id', 'year', 'month', 'day', 'hour', 'percentage_docks_available']
df_ctx = df_ctx.select(cols_to_keep)

# Lista de rows finales
rows = []

# para cada station_id
unique_station_ids = df_ctx.select('station_id').unique().to_series().to_list()
for st_id in tqdm(unique_station_ids):
    # creamos df ordenado por fecha.
    df_station = df_ctx.filter(pl.col('station_id') == st_id).sort(['year', 'month', 'day', 'hour'])
    
    # Obtenemos la fecha mayor y menor
    min_row = df_station.row(0)
    min_data = datetime(min_row[1], min_row[2], min_row[3], min_row[4])
    max_row = df_station.row(-1)
    max_data = datetime(max_row[1], max_row[2], max_row[3], max_row[4])

    # empezamos con la fecha mas antigua
    current_date = min_data
    
    # Para cada registro
    for current_row in df_station.iter_rows(named=True):
        current_row_date = datetime(current_row['year'], current_row['month'], current_row['day'], current_row['hour'])
        
        # mientras no concida la fecha, añadimos registro faltante
        while current_date < current_row_date:
            new_row = {
                'station_id': st_id,
                'year': current_date.year,
                'month': current_date.month,
                'day': current_date.day,
                'hour': current_date.hour,
                'percentage_docks_available': None
            }
            rows.append(new_row)
            # Sumamos una hora 
            current_date += timedelta(hours=1)
        
        # Llegar aqui implica que la fecha coincide
        rows.append(current_row)

        # Sumamos una hora 
        current_date += timedelta(hours=1)

# Convertimos la lista de diccionarios en un DataFrame de Polars
df_complet = pl.DataFrame(rows)

# Guardamos el DataFrame en un archivo .parquet
df_complet.write_parquet("data/pl/4_all_dates_null_ctx_pl.parquet")


100%|██████████| 506/506 [00:22<00:00, 22.38it/s]


In [27]:
import polars as pl

# Leer el archivo .parquet 
df_dates = pl.read_parquet('data/pl/4_all_dates_null_ctx_pl.parquet')

In [32]:
df_dates

station_id,year,month,day,hour,percentage_docks_available
i64,i64,i64,i64,i64,f64
36,2020,1,1,0,0.753968
36,2020,1,1,1,0.670635
36,2020,1,1,2,0.678571
36,2020,1,1,3,0.650794
36,2020,1,1,4,0.694444
…,…,…,…,…,…
515,2023,12,31,19,0.815972
515,2023,12,31,20,0.791667
515,2023,12,31,21,0.791667
515,2023,12,31,22,0.791667


In [94]:
import polars as pl
from tqdm import tqdm

# Leer el archivo .parquet 
df_dates = pl.read_parquet('data/pl/4_all_dates_null_ctx_pl.parquet')

# Lista de rows finales
rows = []

# Obtener los IDs únicos de las estaciones
unique_station_ids = df_dates.select('station_id').unique().to_series().to_list()

# para cada station_id
for st_id in tqdm(unique_station_ids):
    # Filtrar y ordenar el DataFrame por fecha
    df_station = df_dates.filter(pl.col('station_id') == st_id).sort(['year', 'month', 'day', 'hour'])

    # Iterar sobre las filas a partir de la quinta fila y saltando de 5 en 5
    for i in range(4, df_station.height, 5):
        tmp_row = df_station.row(i).copy()
        # retrocedemos las 4 horas anteriores para guardar sus valores si los tiene
        for j in range(1, 5):
            # Buscamos la hora y su registro
            tmp_row[f'ctx-{j}'] = df_station.row(i-j)[5]  # Asumiendo que 'percentage_docks_available' es la sexta columna
        # añadimos el registro con los 4 ctx anteriores.
        rows.append(tmp_row)

# Convertimos la lista de diccionarios en un DataFrame de Polars
prediction_data = pl.DataFrame(rows)

# Guardar el DataFrame en un archivo .parquet
prediction_data.write_parquet("data/pl/5_compress_dates_null_ctx_4h_pl.parquet")

100%|██████████| 506/506 [32:41<00:00,  3.88s/it]


In [96]:
prediction_data

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
4,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
9,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
14,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
19,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
24,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
17371864,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
17371869,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
17371874,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
17371879,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [97]:
import pandas as pd

# Leer el archivo .parquet 
df_ctx4 = pd.read_parquet('data/5_compress_dates_null_ctx_4h.parquet')

In [98]:
df_ctx4

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3474012,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
3474013,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
3474014,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
3474015,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [99]:
# Verificar % de nulos por columna
df_ctx4.isnull().sum()/len(df_ctx4)*100

station_id                    0.000000
year                          0.000000
month                         0.000000
day                           0.000000
hour                          0.000000
percentage_docks_available    6.359986
ctx-1                         6.320148
ctx-2                         6.292312
ctx-3                         6.257338
ctx-4                         6.303855
dtype: float64

In [100]:
# Eliminar filas con valores nulos
df_ctx4_clean = df_ctx4.dropna()
df_ctx4_clean.to_parquet("data/6_compress_dates_clean_ctx_4h.parquet", index=False)

In [1]:
import pandas as pd

# Leer el archivo .parquet 
df_ctx4_clean = pd.read_parquet('data/6_compress_dates_clean_ctx_4h.parquet')

In [2]:
# Verificar si hay alguna columna con nulos
df_ctx4_clean.isnull().any()

station_id                    False
year                          False
month                         False
day                           False
hour                          False
percentage_docks_available    False
ctx-1                         False
ctx-2                         False
ctx-3                         False
ctx-4                         False
dtype: bool

In [3]:
df_ctx4_clean

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3223874,519,2023,12,31,1,0.631944,0.701389,0.684028,0.812500,0.875000
3223875,519,2023,12,31,6,0.704861,0.583333,0.583333,0.593750,0.621528
3223876,519,2023,12,31,11,0.958333,0.900641,0.861111,0.802083,0.791667
3223877,519,2023,12,31,16,0.833333,0.885417,0.892361,0.958333,0.958333


In [4]:
train = df_ctx4_clean[df_ctx4_clean['year']<2023]
validation = df_ctx4_clean[df_ctx4_clean['year']>=2023]

In [5]:
train

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1,2020,1,1,4,0.364815,0.283333,0.346296,0.394444,0.459259
1,1,2020,1,1,9,0.262963,0.248148,0.235185,0.233333,0.298148
2,1,2020,1,1,14,0.335185,0.381481,0.385185,0.337037,0.342593
3,1,2020,1,1,19,0.209259,0.079630,0.177778,0.211111,0.244444
4,1,2020,1,2,0,0.387037,0.420370,0.450000,0.492593,0.418519
...,...,...,...,...,...,...,...,...,...,...
3222263,519,2022,12,31,1,0.819444,0.798611,0.881944,0.965278,0.916667
3222264,519,2022,12,31,6,0.802083,0.791667,0.791667,0.791667,0.791667
3222265,519,2022,12,31,11,1.000000,0.965278,0.902778,0.871528,0.833333
3222266,519,2022,12,31,16,0.875000,0.944444,0.944444,0.965278,1.000000


In [7]:
x_train = train.drop(columns=['percentage_docks_available', 'year']).values
y_train = train['percentage_docks_available'].values


x_val = validation.drop(columns=['percentage_docks_available', 'year']).values
y_val = validation['percentage_docks_available'].values

In [120]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp310-cp310-macosx_12_0_arm64.whl (11.0 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl (30.3 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.0 scipy-1.13.1 threadpoolctl-3.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> 

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

#model = RandomForestRegressor()
model = linear_model.Ridge()
model.fit(y=y_train,X=x_train)

In [15]:
model.score(x_train, y_train)

0.845467153943007

In [16]:
model.score(x_val, y_val)

0.841064330391211

In [17]:
file_path = 'data/metadata_sample_submission_2024.csv'
df_temp = pd.read_csv(file_path, low_memory=False, index_col=0)


In [18]:
df_temp

Unnamed: 0_level_0,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1,1,5,0.781481,0.677778,0.696296,0.750000
1,1,1,1,10,0.737374,0.711111,0.711111,0.731624
2,1,1,1,15,0.827778,0.896296,0.901852,0.883333
3,1,1,1,20,0.825926,0.874074,0.927778,0.918519
4,2,1,1,3,0.592593,0.341954,0.275862,0.540230
...,...,...,...,...,...,...,...,...
171897,496,3,31,1,0.828704,0.787037,0.777778,0.853535
171898,496,3,31,6,0.930556,0.944444,0.935185,0.856481
171899,496,3,31,11,0.912037,0.884259,0.518519,0.157407
171900,496,3,31,16,0.245370,0.319444,0.277778,0.305556


In [20]:
x_test = df_temp.values
x_test

array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        6.77777778e-01, 6.96296296e-01, 7.50000000e-01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        7.11111111e-01, 7.11111111e-01, 7.31623932e-01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        8.96296296e-01, 9.01851852e-01, 8.83333333e-01],
       ...,
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        8.84259259e-01, 5.18518519e-01, 1.57407407e-01],
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        3.19444444e-01, 2.77777778e-01, 3.05555556e-01],
       [4.96000000e+02, 3.00000000e+00, 3.10000000e+01, ...,
        4.35185185e-01, 4.90740741e-01, 5.04629630e-01]])

In [21]:
y_test = model.predict(x_test)

In [26]:
df_submit = pd.DataFrame(y_test, columns=['percentage_docks_available'])

In [32]:
df_submit['index'] = df_submit.index
df_submit = df_submit[['index', 'percentage_docks_available']]
df_submit.to_csv('data/submition_ridge_2305.csv', index=False)
