In [1]:
# Thinking through the ETL
# Flow I want to run:
# RAW -> VALIDATED -> TIME SERIES DATA -> TRAINING DATA

# import libraries
import pandas as pd
from tqdm import tqdm
from typing import Optional, List
import plotly.express as px

# Read the validated raw rides from parquet and preview the first 20 rows
validated_rides_path = '../data/transformed/validated_yellow_tripdata_2023_01.parquet'
rides = pd.read_parquet(validated_rides_path)
rides.head(20)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:32:10,161
1,2023-01-01 00:55:08,43
2,2023-01-01 00:25:04,48
3,2023-01-01 00:03:48,138
4,2023-01-01 00:10:29,107
5,2023-01-01 00:50:34,161
6,2023-01-01 00:09:22,239
7,2023-01-01 00:27:12,142
8,2023-01-01 00:21:44,164
9,2023-01-01 00:39:42,141


In [2]:
# Truncate every pickup timestamp to the start of its hour so rides can be
# counted per hourly slot. The lowercase 'h' alias is used because the
# uppercase 'H' alias is deprecated in recent pandas releases.
rides['pickup_datetime'] = rides['pickup_datetime'].dt.floor('h')

In [3]:
# Display `rides` to inspect the pickup timestamps after the hourly flooring
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-01-01 00:00:00,161
1,2023-01-01 00:00:00,43
2,2023-01-01 00:00:00,48
3,2023-01-01 00:00:00,138
4,2023-01-01 00:00:00,107
...,...,...
3066761,2023-01-31 23:00:00,107
3066762,2023-01-31 23:00:00,112
3066763,2023-01-31 23:00:00,114
3066764,2023-01-31 23:00:00,230


In [10]:
# Count rides per (hour, pickup location); the group size is stored in a
# column named 'rides'.
agg = (
    rides
    .groupby(['pickup_datetime', 'pickup_location_id'])
    .size()
    .reset_index(name='rides')
)

agg

Unnamed: 0,pickup_datetime,pickup_location_id,rides
0,2023-01-01 00:00:00,4,19
1,2023-01-01 00:00:00,7,3
2,2023-01-01 00:00:00,12,1
3,2023-01-01 00:00:00,13,14
4,2023-01-01 00:00:00,24,20
...,...,...,...
71486,2023-01-31 23:00:00,261,5
71487,2023-01-31 23:00:00,262,11
71488,2023-01-31 23:00:00,263,41
71489,2023-01-31 23:00:00,264,40


In [13]:
# Some locations have no rides recorded for a given hour; fill those gaps with zero-ride rows


def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    """
    Add missing hourly slots to a DataFrame of ride counts, filling them with zeros.

    Every pickup location present in `agg_rides` receives one row for each hour
    between the earliest and the latest 'pickup_datetime' found in the whole
    frame (both ends included). Hours without a matching input row get a
    'rides' value of 0.

    Parameters:
    - agg_rides (pd.DataFrame): Ride counts with columns 'pickup_datetime',
      'pickup_location_id' and 'rides'. Timestamps are matched against an
      hourly range, so they should already be floored to the hour, and each
      (pickup_datetime, pickup_location_id) pair should appear only once.

    Returns:
    - pd.DataFrame: A new DataFrame with columns 'pickup_datetime', 'rides' and
      'pickup_location_id' (in that order) and a fresh 0..n-1 index. Rows are
      grouped by location, in order of first appearance in `agg_rides`, and
      sorted by time within each location. An empty input yields an empty
      frame with those same columns.

    Example:
    >>> rides = pd.DataFrame({
    ...     'pickup_datetime': pd.to_datetime(['2022-01-01 00:00:00', '2022-01-01 02:00:00']),
    ...     'pickup_location_id': [1, 1],
    ...     'rides': [10, 20],
    ... })
    >>> add_missing_slots(rides)
          pickup_datetime  rides  pickup_location_id
    0 2022-01-01 00:00:00     10                   1
    1 2022-01-01 01:00:00      0                   1
    2 2022-01-01 02:00:00     20                   1
    """
    output_columns = ['pickup_datetime', 'rides', 'pickup_location_id']

    # With no rows there is no min/max timestamp to build the hourly range from,
    # so return an empty frame with the expected columns instead of failing.
    if agg_rides.empty:
        return agg_rides.loc[:, output_columns].reset_index(drop=True)

    location_ids = agg_rides['pickup_location_id'].unique()
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in recent pandas.
    full_range = pd.date_range(
        agg_rides['pickup_datetime'].min(),
        agg_rides['pickup_datetime'].max(),
        freq='h',
    )

    # Collect one completed frame per location and concatenate once at the end;
    # concatenating inside the loop would re-copy the accumulated rows each time.
    per_location = []
    for location_id in tqdm(location_ids):
        is_location = agg_rides['pickup_location_id'] == location_id
        agg_rides_i = agg_rides.loc[is_location, ['pickup_datetime', 'rides']]
        agg_rides_i = agg_rides_i.set_index('pickup_datetime')
        # Ensure a DatetimeIndex (also covers string timestamps) so labels line
        # up with `full_range` during the reindex.
        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)
        agg_rides_i['pickup_location_id'] = location_id
        per_location.append(agg_rides_i)

    output = pd.concat(per_location)

    # The reindexed frames carry an unnamed index, which reset_index exposes as 'index'.
    return output.reset_index().rename(columns={'index': 'pickup_datetime'})


In [14]:
# Expand the aggregated counts so every location has a row for every hourly slot
agg_rides_all_slots = add_missing_slots(agg)

100%|██████████| 257/257 [00:01<00:00, 210.13it/s]


In [16]:
# Show only the hourly slots that belong to pickup location 1
agg_rides_all_slots[agg_rides_all_slots['pickup_location_id'].eq(1)]

Unnamed: 0,pickup_datetime,rides,pickup_location_id
125736,2023-01-01 00:00:00,0,1
125737,2023-01-01 01:00:00,0,1
125738,2023-01-01 02:00:00,0,1
125739,2023-01-01 03:00:00,0,1
125740,2023-01-01 04:00:00,0,1
...,...,...,...
126475,2023-01-31 19:00:00,0,1
126476,2023-01-31 20:00:00,0,1
126477,2023-01-31 21:00:00,0,1
126478,2023-01-31 22:00:00,0,1


In [17]:
def plot_rides(
        rides: pd.DataFrame,
        locations: Optional[List[int]] = None
) -> None:
    """
    Plot ride counts over time, for one or more locations.

    Parameters:
    - rides (pd.DataFrame): Frame with the columns 'pickup_datetime', 'rides'
      and 'pickup_location_id'.
    - locations (Optional[List[int]]): Pickup location ids to keep. Any sized
      collection accepted by `Series.isin` works (list, tuple, set, numpy
      array, pandas Series). `None` or an empty collection plots every
      location in `rides`.

    Returns:
    - None: the figure is rendered with `fig.show()` and not returned.
    """
    # Explicit None/length check instead of truthiness: `if locations` raises
    # for numpy arrays and pandas Series with more than one element.
    if locations is not None and len(locations) > 0:
        rides_to_plot = rides[rides['pickup_location_id'].isin(locations)]
    else:
        rides_to_plot = rides

    # One line per pickup location, distinguished by colour.
    fig = px.line(
        rides_to_plot,
        x='pickup_datetime',
        y='rides',
        color='pickup_location_id',
        title='Rides over time'
    )

    fig.show()

In [20]:
# Plot the hourly ride counts for pickup location 43 only
plot_rides(agg_rides_all_slots, locations=[43])


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [21]:
# Persist the hourly time-series data. The file name carries 2023_01 to match
# the January 2023 source file read at the top of this notebook (the name used
# before said 2022_01); any downstream reader expecting the old name must be
# pointed at this one.
agg_rides_all_slots.to_parquet('../data/transformed/ts_data_rides_2023_01.parquet')