In [15]:
from datetime import datetime, timedelta
def get_minMax_dates(date_string):
    """Return the first and last instant of the month encoded in a file name.

    The name must start with ``<year>_<month>_``, e.g.
    ``"2020_12_Abril_BicingNou_ESTACIONS"``; everything after the second
    field is ignored.

    Args:
        date_string: File name (or stem) whose first two ``_``-separated
            fields are the numeric year and the numeric month.

    Returns:
        A ``(start_date, end_date)`` tuple of ``datetime`` objects:
        ``start_date`` is day 1 of the month at 00:00:00 and ``end_date`` is
        the last day of that month at 23:59:59.

    Raises:
        ValueError: If the name has fewer than two fields, or the year/month
            fields are not valid numbers for a calendar date.
    """
    # Split off the year and the month; the rest of the name is irrelevant.
    year_str, month_str, *_ = date_string.split('_')

    # First day of that month; datetime() validates the month range (1-12).
    start_date = datetime(int(year_str), int(month_str), 1)

    # First day of the following month, rolling over to January of next year.
    if start_date.month == 12:
        next_month_start_date = datetime(start_date.year + 1, 1, 1)
    else:
        next_month_start_date = datetime(start_date.year, start_date.month + 1, 1)

    # One second before the next month starts. The previous one-hour offset
    # ended the range at 23:00:00 and, combined with an inclusive `<=` filter,
    # silently dropped the final hour of every month.
    end_date = next_month_start_date - timedelta(seconds=1)

    return (start_date, end_date)


In [16]:
# Quick manual check: unpack the (start, end) pair returned for a sample file-name stem.
a,b = get_minMax_dates("2020_12_Abril_BicingNou_ESTACIONS")

In [28]:
import pandas as pd
import os

# Folder that holds the monthly station-status CSV exports.
#subfolder = '../../data_csv_bicing/'
subfolder = 'data/'

# Station-status CSV files in the folder. Sorted so the row order of the
# concatenated output does not depend on the order os.listdir() returns.
files = sorted(f for f in os.listdir(subfolder) if f.endswith('ESTACIONS.csv'))

# One filtered DataFrame per CSV file.
df_list = []

for file in files:
    # Date range of the month encoded in the file name.
    start_date, end_date = get_minMax_dates(file)

    file_path = os.path.join(subfolder, file)
    # NOTE(review): the saved cell output shows warnings raised on this call;
    # presumably mixed-type columns. Consider passing explicit dtypes - TODO confirm.
    df_temp = pd.read_csv(file_path)

    # Parse 'last_reported' (epoch seconds) into a standalone Series instead of
    # adding and then dropping a temporary column. Dropping in place on the
    # filtered slice is what triggered SettingWithCopyWarning.
    last_reported_datetime = pd.to_datetime(df_temp['last_reported'], unit='s')

    # Keep only rows reported inside the file's month (both bounds inclusive).
    in_month = last_reported_datetime.between(start_date, end_date)
    df_filtered = df_temp.loc[in_month]

    df_list.append(df_filtered)

if not df_list:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise ValueError(f"No '*ESTACIONS.csv' files found in {subfolder!r}")

# Concatenate every month into a single DataFrame.
big_df = pd.concat(df_list, ignore_index=True)

# Save as parquet.
big_df.to_parquet('data/1_all_data_raw.parquet', index=False)


  df_temp = pd.read_csv(file_path)
  df_temp = pd.read_csv(file_path)


Fecha de inicio del mes: 2020-12-01 00:00:00
Fecha de finalización del mes: 2020-12-31 23:00:00


In [1]:
import pandas as pd
import numpy as np

# Read the raw .parquet file (same path the loading cell writes to).
df = pd.read_parquet('data/1_all_data_raw.parquet')

In [2]:
# Clean the data and collapse it to one row per station and hour.

# Columns that survive the cleaning step.
cols_to_keep = ['station_id', 'year', 'month', 'day', 'hour', 'num_bikes_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike','num_docks_available', 'last_reported']

df = (
    # Rows without a station id cannot be attributed to any station.
    df.dropna(subset=['station_id'])
      .assign(
          station_id=lambda frame: frame['station_id'].astype(int),
          # 'last_reported' is parsed as epoch seconds.
          last_reported=lambda frame: pd.to_datetime(frame['last_reported'], unit='s'),
      )
      # Break the timestamp into its calendar parts.
      .assign(
          hour=lambda frame: frame['last_reported'].dt.hour,
          day=lambda frame: frame['last_reported'].dt.day,
          month=lambda frame: frame['last_reported'].dt.month,
          year=lambda frame: frame['last_reported'].dt.year,
      )
      .loc[:, cols_to_keep]
)

# Average every remaining column within each (station, hour, day, month, year) group.
group_keys = ['station_id', 'hour', 'day', 'month', 'year']
df = df.groupby(group_keys).mean().reset_index()

# Save as parquet.
df.to_parquet("data/2_all_data_mean_hour.parquet", index=False)

In [48]:
import pandas as pd
import numpy as np

# Read the hourly-averaged .parquet file (same path the cleaning cell writes to).
df_clean = pd.read_parquet('data/2_all_data_mean_hour.parquet')

# Load the station information table.
df_station_info = pd.read_csv('data/Informacio_Estacions_Bicing.csv')

In [47]:
# Per-column flag: True when the column holds at least one missing value.
hay_nulos1 = df_clean.isna().any(axis=0)
hay_nulos2 = df_station_info.isna().any(axis=0)

print(hay_nulos1, hay_nulos2)

station_id                              False
hour                                    False
day                                     False
month                                   False
year                                    False
num_bikes_available                     False
num_bikes_available_types.mechanical    False
num_bikes_available_types.ebike         False
num_docks_available                     False
last_reported                           False
dtype: bool station_id                False
name                      False
physical_configuration    False
lat                       False
lon                       False
altitude                  False
address                   False
post_code                 False
capacity                  False
is_charging_station       False
nearby_distance           False
_ride_code_support        False
rental_uris                True
cross_street               True
dtype: bool


In [57]:

# The inner join drops rows whose station_id does not exist in the station registry.
df_merge = pd.merge(
    df_clean,
    df_station_info[["station_id", "capacity"]],
    how="inner",
    on="station_id",
)

# Ratio of available docks to the station's capacity.
df_merge = df_merge.assign(
    percentage_docks_available=df_merge["num_docks_available"].div(df_merge["capacity"])
)
df_merge.to_parquet("data/3_all_data_ctx.parquet", index=False)


In [1]:
import pandas as pd

# Read the merged .parquet file (same path the merge cell writes to).
df_merge = pd.read_parquet('data/3_all_data_ctx.parquet')

In [10]:
from datetime import datetime

# Earliest (year, month, day, hour) present in the data.
# The four columns are combined into one timestamp per row before taking the
# minimum. Taking the minimum of each column independently mixes parts from
# different rows and can produce a date that never occurs in the data
# (e.g. rows 2019-03-15 and 2020-01-02 would yield 2019-01-02).
row_timestamps = pd.to_datetime(df_merge[['year', 'month', 'day', 'hour']])

# Convert to a plain datetime object, as this cell produced before.
init_data = row_timestamps.min().to_pydatetime()
init_data


datetime.datetime(1970, 1, 1, 0, 0)

In [11]:
from datetime import datetime

# Earliest record of every station.
# One timestamp is built per row and then reduced per station. Taking the
# minimum of year/month/day/hour independently can report a date that the
# station never has, and filtering the whole frame once per station is slow.
row_timestamps = pd.to_datetime(df_merge[["year", "month", "day", "hour"]])

# sort=False keeps stations in order of first appearance, like .unique() did.
first_record = row_timestamps.groupby(df_merge["station_id"], sort=False).min()

for st_id, first_ts in first_record.items():
    init_data = first_ts.to_pydatetime()
    print(st_id, init_data)


1 2019-01-01 00:00:00
2 2019-01-01 00:00:00
3 2019-01-01 00:00:00
4 2019-01-01 00:00:00
5 2019-01-01 00:00:00
6 2019-01-01 00:00:00
7 2019-01-01 00:00:00
8 2019-01-01 00:00:00
9 2019-01-01 00:00:00
10 2020-01-01 00:00:00
11 2019-01-01 00:00:00
12 2019-01-01 00:00:00
13 2019-01-01 00:00:00
14 2019-01-01 00:00:00
15 2019-01-01 00:00:00
17 2019-01-01 00:00:00
18 2019-01-01 00:00:00
19 2019-01-01 00:00:00
20 2019-01-01 00:00:00
21 2019-01-01 00:00:00
22 2019-01-01 00:00:00
23 2019-01-01 00:00:00
24 2019-01-01 00:00:00
25 2019-01-01 00:00:00
26 2019-01-01 00:00:00
27 2019-01-01 00:00:00
28 2019-01-01 00:00:00
29 2019-01-01 00:00:00
30 2019-01-01 00:00:00
31 2019-01-01 00:00:00
32 2019-01-01 00:00:00
33 2019-01-01 00:00:00
34 2019-01-01 00:00:00
35 2019-01-01 00:00:00
36 2019-01-01 00:00:00
37 2019-01-01 00:00:00
39 2019-01-01 00:00:00
40 2019-01-01 00:00:00
41 2019-01-01 00:00:00
42 2019-01-01 00:00:00
43 1970-01-01 00:00:00
44 2019-01-01 00:00:00
45 2019-01-01 00:00:00
46 2019-01-01 00:00:

In [68]:
# TODO: review this strategy.
# NOTE(review): shift() lags by rows, so "ctx-N" is the value N hours earlier
# only where a station has consecutive hourly rows; gaps are not detected here.

time_keys = ["year", "month", "day", "hour"]
lag_columns = [f"ctx-{lag}" for lag in range(4, 0, -1)]  # ctx-4 ... ctx-1

# Collect one frame per station and concatenate once at the end; growing a
# DataFrame with concat inside the loop is quadratic.
station_frames = []

# sort=False keeps stations in order of first appearance.
for s, ctx in df_merge.groupby("station_id", sort=False):
    # sort_values returns a new frame, so the column assignments below do not
    # write into a slice of df_merge.
    ctx = ctx.sort_values(by=time_keys, ignore_index=True)

    # ctx-N holds the dock availability ratio N rows earlier.
    for lag in range(1, 5):
        ctx[f"ctx-{lag}"] = ctx["percentage_docks_available"].shift(lag)

    # Keep every 5th row starting at position 4, so each kept row has all four
    # lags filled and the windows do not overlap.
    station_frames.append(ctx.iloc[4::5])

prediction_data = pd.concat(station_frames, ignore_index=True)

# Keep the lag columns in the output: they were previously computed and then
# discarded by this selection, leaving the saved file without context features.
prediction_data = prediction_data[
    ["station_id", *time_keys, *lag_columns, "percentage_docks_available"]
]

prediction_data.to_parquet("data/4_data_ctx4h_v1.parquet", index=False)