In [49]:
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Lift every pandas display limit so frames are rendered in full:
# all rows, all columns, no line wrapping and untruncated cell contents.
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

In [50]:
# Load the pickled input frame.
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project's own pipeline.
joined_data_path = "../data/join/df_data.pk1"
with open(joined_data_path, "rb") as joined_data_file:
    df_data = pickle.load(joined_data_file)

In [51]:
# Inspect the dtype of every column in the loaded frame.
df_data.dtypes

ride_id                            object
rideable_type                      object
started_at                 datetime64[ns]
ended_at                   datetime64[ns]
start_station_id                   object
end_station_id                     object
member_casual                      object
year                                int64
month                               int64
day                                 int64
time_hms_ms               timedelta64[ns]
day_type                           object
event                                bool
temperature                       float64
wind_speed                        float64
relative_humidity                 float64
station_level_pressure            float64
dtype: object

In [52]:
# Preview the first 10 rows of the loaded data.
df_data.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,day_type,event,temperature,wind_speed,relative_humidity,station_level_pressure
0,98D355D9A9852BE9,classic_bike,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,Holiday,False,7.8,2.1,83.0,980.8
1,04706CA7F5BD25EE,electric_bike,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,Holiday,False,7.8,2.1,83.0,980.8
2,42178E850B92597A,electric_bike,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,Holiday,False,7.8,2.1,83.0,980.8
3,6B93C46E8F5B114C,classic_bike,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,Holiday,False,7.8,2.1,83.0,980.8
4,466943353EAC8022,classic_bike,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,Holiday,False,7.8,2.1,83.0,980.8
5,AC1F67BDCDDD5988,electric_bike,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,Holiday,False,7.8,2.1,83.0,980.8
6,A5BD5A4FD53D5414,electric_bike,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,Holiday,False,7.8,2.1,83.0,980.8
7,E93D5F426242BC48,electric_bike,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,Holiday,False,7.8,2.1,83.0,980.8
8,0C9545AFBACF60E1,classic_bike,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,Holiday,False,7.8,2.1,83.0,980.8
9,22E39FF7ECA32D5E,classic_bike,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,Holiday,False,7.8,2.1,83.0,980.817949


# Como se ha mencionado, se elimina la columna de presión atmosférica


In [53]:
# Discard the atmospheric pressure reading; it is not used from here on.
df_data = df_data.drop("station_level_pressure", axis="columns")

# Se realiza el One-Hot Encoding

## De rideable_type

In [54]:
# List the distinct bike types before one-hot encoding them.
df_data['rideable_type'].unique()

array(['classic_bike', 'electric_bike', 'docked_bike'], dtype=object)

In [55]:
# One-hot encode the bike type: the original column is replaced by one
# indicator column per category, each named "rideable_type_<category>".
bike_type_column = "rideable_type"
df_data = pd.get_dummies(
    df_data,
    columns=[bike_type_column],
    prefix=bike_type_column,
)

In [56]:
# Preview the frame after one-hot encoding rideable_type.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,day_type,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,Holiday,False,7.8,2.1,83.0,True,False,False
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,Holiday,False,7.8,2.1,83.0,False,False,True
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,Holiday,False,7.8,2.1,83.0,False,False,True
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,Holiday,False,7.8,2.1,83.0,True,False,False
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,Holiday,False,7.8,2.1,83.0,True,False,False
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,Holiday,False,7.8,2.1,83.0,False,False,True
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,Holiday,False,7.8,2.1,83.0,False,False,True
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,Holiday,False,7.8,2.1,83.0,False,False,True
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,Holiday,False,7.8,2.1,83.0,True,False,False
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,Holiday,False,7.8,2.1,83.0,True,False,False


## De member_casual

In [57]:
# List the distinct rider categories before mapping them to a boolean.
df_data['member_casual'].unique()

array(['casual', 'member'], dtype=object)

In [58]:
# Binary flag for the rider category: True for casual riders, False for members.
# Series.map leaves any label missing from the mapping as NaN.
casual_flag_by_label = {"casual": True, "member": False}
df_data["member_casual_bool"] = df_data["member_casual"].map(casual_flag_by_label)


In [59]:
# Preview the frame with the new member_casual_bool column.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,day_type,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,Holiday,False,7.8,2.1,83.0,True,False,False,True
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,Holiday,False,7.8,2.1,83.0,False,False,True,True
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,Holiday,False,7.8,2.1,83.0,False,False,True,True
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,Holiday,False,7.8,2.1,83.0,True,False,False,True
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,Holiday,False,7.8,2.1,83.0,True,False,False,True
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,Holiday,False,7.8,2.1,83.0,False,False,True,False
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,Holiday,False,7.8,2.1,83.0,False,False,True,False
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,Holiday,False,7.8,2.1,83.0,False,False,True,True
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,Holiday,False,7.8,2.1,83.0,True,False,False,False
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,Holiday,False,7.8,2.1,83.0,True,False,False,True


## De day_type

In [60]:
# List the distinct day types before one-hot encoding them.
df_data['day_type'].unique()

array(['Holiday', 'Weekend', 'Normal'], dtype=object)

In [61]:
# One-hot encode the day type: the original column is replaced by one
# indicator column per category, each named "day_type_<category>".
day_type_column = "day_type"
df_data = pd.get_dummies(
    df_data,
    columns=[day_type_column],
    prefix=day_type_column,
)

In [62]:
# Preview the frame after one-hot encoding day_type.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False


# Normalización de variables numéricas

Se usa una estandarización centrada en 0 y con desviación estándar 1 (`StandardScaler`)

## Temperatura

In [63]:
# Standardize temperature (zero mean, unit variance) into "temp_std".
# The fitted scaler is kept in `scaler_temperature` so it can be saved later.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# happens downstream, consider fitting on the training portion only.
temperature_values = df_data[["temperature"]]
scaler_temperature = StandardScaler()
scaler_temperature.fit(temperature_values)
df_data["temp_std"] = scaler_temperature.transform(temperature_values)
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614


## Wind speed

In [64]:
# Standardize wind speed (zero mean, unit variance) into "wind_std".
# The fitted scaler is kept in `scaler_wind` so it can be saved later.
wind_speed_values = df_data[["wind_speed"]]
scaler_wind = StandardScaler()
scaler_wind.fit(wind_speed_values)
df_data["wind_std"] = scaler_wind.transform(wind_speed_values)
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159


## Humedad relativa

Aunque ya está en un rango de 0% a 100%, se elige realizar la normalización igualmente

In [65]:
# Standardize relative humidity (zero mean, unit variance) into
# "rel_humidity_std". The fitted scaler is kept in `scaler_rel_humi` so it
# can be saved later.
humidity_values = df_data[["relative_humidity"]]
scaler_rel_humi = StandardScaler()
scaler_rel_humi.fit(humidity_values)
df_data["rel_humidity_std"] = scaler_rel_humi.transform(humidity_values)
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734


## Se guardan los scaler para su uso futuro

In [66]:
# Persist each fitted scaler so the exact same transformation can be applied
# to new data later. Keys are destination paths, values the objects to pickle.
scaler_destinations = {
    '../data/normalized/scalerTemperature.pkl': scaler_temperature,
    '../data/normalized/scalerWind.pkl': scaler_wind,
    '../data/normalized/scalerHumidity.pkl': scaler_rel_humi,
}

for destination_path, fitted_scaler in scaler_destinations.items():
    with open(destination_path, 'wb') as scaler_file:
        pickle.dump(fitted_scaler, scaler_file)

# Normalización de las estaciones de inicio y fin

In [67]:
# Encode station IDs as integer indices.
# One encoder is fitted on the union of start and end station IDs, so both
# columns share a single ID -> index mapping.
# NOTE(review): the indices are arbitrary category codes, not magnitudes;
# downstream models should treat them as categorical — confirm.
le_station = LabelEncoder()
le_station.fit(pd.concat([df_data["start_station_id"], df_data["end_station_id"]]))

df_data["start_station_idx"] = le_station.transform(df_data["start_station_id"])
df_data["end_station_idx"] = le_station.transform(df_data["end_station_id"])

In [68]:
# Inspect the labels (station IDs) learned by the encoder.
le_station.classes_

array(['021320', '1011', '1012', ..., 'chargingstx3', 'chargingstx4',
       'chargingstx5'], shape=(1695,), dtype=object)

In [69]:
# Look up the integer index assigned to one sample station ID.
le_station.transform(["TA1305000009"])[0]

np.int64(1533)

## Se guarda el LabelEncoder para traducir los datos en el futuro

In [70]:
# Persist the fitted station encoder so station indices can be translated
# back and forth later.
encoder_destination = '../data/normalized/label_encoder.pkl'
with open(encoder_destination, 'wb') as encoder_file:
    pickle.dump(le_station, encoder_file)

In [71]:
# Preview the frame with the encoded station index columns.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std,start_station_idx,end_station_idx
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,367,300
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1533,1533
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,383,383
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,1415,1675
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1651,1403
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734,473,1529
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1558,258


# Se convierte la columna de hora y mes a variables cíclicas para que el modelo pueda extraer mejor las características

In [72]:
# Express the time of day (a timedelta column) as fractional hours, then
# project it onto the unit circle so that times just before and just after
# midnight end up close to each other.
SECONDS_PER_HOUR = 3600
HOURS_PER_DAY = 24

df_data["hour_float"] = df_data["time_hms_ms"].dt.total_seconds() / SECONDS_PER_HOUR

hour_angle = 2 * np.pi * df_data["hour_float"] / HOURS_PER_DAY
df_data["hour_sin"] = np.sin(hour_angle)
df_data["hour_cos"] = np.cos(hour_angle)

In [73]:
# Preview the frame with the hour_float, hour_sin and hour_cos columns.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std,start_station_idx,end_station_idx,hour_float,hour_sin,hour_cos
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.001389,0.000364,1.0
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,367,300,0.016667,0.004363,0.99999
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1533,1533,0.021111,0.005527,0.999985
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.037222,0.009745,0.999953
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.043056,0.011272,0.999936
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,383,383,0.06,0.015707,0.999877
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,1415,1675,0.099722,0.026104,0.999659
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1651,1403,0.104167,0.027267,0.999628
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734,473,1529,0.1075,0.02814,0.999604
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1558,258,0.110278,0.028867,0.999583


# Se convierte la variable month a una variable cíclica

In [74]:
# Cyclic encoding of the month: map it onto the unit circle with a period
# of 12 so that December and January end up adjacent.
MONTHS_PER_YEAR = 12

month_angle = 2 * np.pi * df_data["month"] / MONTHS_PER_YEAR
df_data["month_sin"] = np.sin(month_angle)
df_data["month_cos"] = np.cos(month_angle)

In [75]:
# Preview the frame with the month_sin and month_cos columns.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std,start_station_idx,end_station_idx,hour_float,hour_sin,hour_cos,month_sin,month_cos
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.001389,0.000364,1.0,0.5,0.866025
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,367,300,0.016667,0.004363,0.99999,0.5,0.866025
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1533,1533,0.021111,0.005527,0.999985,0.5,0.866025
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.037222,0.009745,0.999953,0.5,0.866025
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.043056,0.011272,0.999936,0.5,0.866025
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,383,383,0.06,0.015707,0.999877,0.5,0.866025
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,1415,1675,0.099722,0.026104,0.999659,0.5,0.866025
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1651,1403,0.104167,0.027267,0.999628,0.5,0.866025
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734,473,1529,0.1075,0.02814,0.999604,0.5,0.866025
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1558,258,0.110278,0.028867,0.999583,0.5,0.866025


# Se añade una columna que contabilice la duración de cada viaje

In [76]:
# Ride duration in minutes, derived from the start and end timestamps.
# NOTE(review): nothing here filters rows whose ended_at precedes started_at;
# such rows would produce negative durations — confirm upstream cleaning.
ride_length = df_data['ended_at'] - df_data['started_at']
df_data['duration_min'] = ride_length.dt.total_seconds() / 60

In [77]:
# Preview the frame with the new duration_min column.
df_data.head(10)

Unnamed: 0,ride_id,started_at,ended_at,start_station_id,end_station_id,member_casual,year,month,day,time_hms_ms,event,temperature,wind_speed,relative_humidity,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std,start_station_idx,end_station_idx,hour_float,hour_sin,hour_cos,month_sin,month_cos,duration_min
0,98D355D9A9852BE9,2022-01-01 00:00:05,2022-01-01 00:01:48,623,623,casual,2022,1,1,0 days 00:00:05,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.001389,0.000364,1.0,0.5,0.866025,1.716667
1,04706CA7F5BD25EE,2022-01-01 00:01:00,2022-01-01 00:04:39,13325,13137,casual,2022,1,1,0 days 00:01:00,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,367,300,0.016667,0.004363,0.99999,0.5,0.866025,3.65
2,42178E850B92597A,2022-01-01 00:01:16,2022-01-01 00:32:14,TA1305000009,TA1305000009,casual,2022,1,1,0 days 00:01:16,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1533,1533,0.021111,0.005527,0.999985,0.5,0.866025,30.966667
3,6B93C46E8F5B114C,2022-01-01 00:02:14,2022-01-01 00:31:07,623,623,casual,2022,1,1,0 days 00:02:14,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.037222,0.009745,0.999953,0.5,0.866025,28.883333
4,466943353EAC8022,2022-01-01 00:02:35,2022-01-01 00:31:04,623,623,casual,2022,1,1,0 days 00:02:35,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.043056,0.011272,0.999936,0.5,0.866025,28.483333
5,AC1F67BDCDDD5988,2022-01-01 00:03:36,2022-01-01 00:04:02,13431,13431,member,2022,1,1,0 days 00:03:36,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,383,383,0.06,0.015707,0.999877,0.5,0.866025,0.433333
6,A5BD5A4FD53D5414,2022-01-01 00:05:59,2022-01-01 00:08:06,KA1503000038,TA1309000055,member,2022,1,1,0 days 00:05:59,False,7.8,2.1,83.0,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,1415,1675,0.099722,0.026104,0.999659,0.5,0.866025,2.116667
7,E93D5F426242BC48,2022-01-01 00:06:15,2022-01-01 00:13:24,TA1309000015,KA1503000022,casual,2022,1,1,0 days 00:06:15,False,7.8,2.1,83.0,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1651,1403,0.104167,0.027267,0.999628,0.5,0.866025,7.15
8,0C9545AFBACF60E1,2022-01-01 00:06:27,2022-01-01 00:09:06,18003,TA1305000003,member,2022,1,1,0 days 00:06:27,False,7.8,2.1,83.0,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734,473,1529,0.1075,0.02814,0.999604,0.5,0.866025,2.65
9,22E39FF7ECA32D5E,2022-01-01 00:06:37,2022-01-01 00:16:44,TA1306000013,13017,casual,2022,1,1,0 days 00:06:37,False,7.8,2.1,83.0,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1558,258,0.110278,0.028867,0.999583,0.5,0.866025,10.116667


# Se guardan los datos

In [78]:
# Save the fully engineered frame (original columns plus derived features).
normalized_frame_path = "../data/normalized/df_normalized.pk1"
df_data.to_pickle(normalized_frame_path)

# Se limpian las columnas que no van a ser usadas durante el entrenamiento del modelo

In [79]:
# Build the model input frame by removing identifiers, raw timestamps and
# every raw column that already has an encoded or standardized counterpart.
# NOTE(review): hour_float (the raw, non-cyclic hour) is kept next to
# hour_sin / hour_cos — confirm this is intended.
columns_excluded_from_training = [
    "ride_id",
    "started_at",
    "ended_at",
    "start_station_id",
    "end_station_id",
    "member_casual",
    "month",
    "day",  # the one-hot encoded day_type columns are used instead
    "time_hms_ms",
    "temperature",
    "wind_speed",
    "relative_humidity",
]
df_model = df_data.drop(columns=columns_excluded_from_training)

In [80]:
# Preview the first 10 rows of the model input frame.
df_model.head(10)

Unnamed: 0,year,event,rideable_type_classic_bike,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_bool,day_type_Holiday,day_type_Normal,day_type_Weekend,temp_std,wind_std,rel_humidity_std,start_station_idx,end_station_idx,hour_float,hour_sin,hour_cos,month_sin,month_cos,duration_min
0,2022,False,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.001389,0.000364,1.0,0.5,0.866025,1.716667
1,2022,False,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,367,300,0.016667,0.004363,0.99999,0.5,0.866025,3.65
2,2022,False,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1533,1533,0.021111,0.005527,0.999985,0.5,0.866025,30.966667
3,2022,False,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.037222,0.009745,0.999953,0.5,0.866025,28.883333
4,2022,False,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1013,1013,0.043056,0.011272,0.999936,0.5,0.866025,28.483333
5,2022,False,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,383,383,0.06,0.015707,0.999877,0.5,0.866025,0.433333
6,2022,False,False,False,True,False,True,False,False,-0.996614,-1.035159,1.399734,1415,1675,0.099722,0.026104,0.999659,0.5,0.866025,2.116667
7,2022,False,False,False,True,True,True,False,False,-0.996614,-1.035159,1.399734,1651,1403,0.104167,0.027267,0.999628,0.5,0.866025,7.15
8,2022,False,True,False,False,False,True,False,False,-0.996614,-1.035159,1.399734,473,1529,0.1075,0.02814,0.999604,0.5,0.866025,2.65
9,2022,False,True,False,False,True,True,False,False,-0.996614,-1.035159,1.399734,1558,258,0.110278,0.028867,0.999583,0.5,0.866025,10.116667


In [81]:
# Check the dtypes of the columns kept for the model.
df_model.dtypes

year                             int64
event                             bool
rideable_type_classic_bike        bool
rideable_type_docked_bike         bool
rideable_type_electric_bike       bool
member_casual_bool                bool
day_type_Holiday                  bool
day_type_Normal                   bool
day_type_Weekend                  bool
temp_std                       float64
wind_std                       float64
rel_humidity_std               float64
start_station_idx                int64
end_station_idx                  int64
hour_float                     float64
hour_sin                       float64
hour_cos                       float64
month_sin                      float64
month_cos                      float64
duration_min                   float64
dtype: object