In [18]:
import pandas as pd
import itertools
import plotly.express as px
from tqdm import tqdm
import os

In [32]:
transformed = os.path.join(".", "data/silver/2023/")

# Construct the full path to the Parquet file
file_path = os.path.join(transformed, "ts_data_2023-01.parquet")

In [33]:
rides = pd.read_parquet(file_path)
#/Users/borja/Documents/Somniumrema/projects/ml/taxi_demand_predictor/data/silver/2023/ts_data_2023-01.parquet

FileNotFoundError: [Errno 2] No such file or directory: './data/silver/2023/ts_data_2023-01.parquet'

In [None]:
def add_missing_times(data: pd.DataFrame, freq: str = 'h') -> pd.DataFrame:
    # Generate all possible intervals
    all_intervals = pd.date_range(start=data['pickup_time_hour'].min(), 
                                  end=data['pickup_time_hour'].max(), 
                                  freq=freq)
    
    # Generate all possible locations
    all_locations = data['pickup_location_id'].unique()

    # Create a full DataFrame with all intervals and locations
    full_index = pd.MultiIndex.from_product([all_intervals, all_locations], 
                                            names=['pickup_time_hour', 'pickup_location_id'])
    
    full_data = pd.DataFrame(index=full_index).reset_index()
    
    # Merge with progress bar
    with tqdm(total=len(full_data), desc="Merging data", unit="rows") as pbar:
        full_data = full_data.merge(data, on=['pickup_time_hour', 'pickup_location_id'], how='left').fillna(0)
        pbar.update(len(full_data))
    
    return full_data

In [None]:
def plot_rides(df: pd.DataFrame, location: str, time_col: str = 'pickup_datetime', location_col: str = 'pickup_location_id', ride_count_col: str = 'ride_count') -> None:
    
    # Ensure the time column is in datetime format
    df[time_col] = pd.to_datetime(df[time_col])
    
    # Filter the DataFrame for the specified location
    df_location = df[df[location_col] == location]
    
    # Group by the time column and sum the ride counts
    df_grouped = df_location.groupby(time_col)[ride_count_col].sum().reset_index()
    
    # Plot the data using Plotly
    fig = px.line(df_grouped, x=time_col, y=ride_count_col, title=f'Number of Rides by hour for Location {location}')
    
    fig.show()

In [None]:
rides

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_time
0,2024-01-01 00:57:55,186,2024-01-01 00:57:55
1,2024-01-01 00:03:00,140,2024-01-01 00:03:00
2,2024-01-01 00:17:06,236,2024-01-01 00:17:06
3,2024-01-01 00:36:38,79,2024-01-01 00:36:38
4,2024-01-01 00:46:51,211,2024-01-01 00:46:51
...,...,...,...
2964619,2024-01-31 23:45:59,107,2024-01-31 23:45:59
2964620,2024-01-31 23:13:07,114,2024-01-31 23:13:07
2964621,2024-01-31 23:19:00,211,2024-01-31 23:19:00
2964622,2024-01-31 23:07:23,107,2024-01-31 23:07:23


In [None]:
# Group by pickup_quarter_hour and pickup_location, and count the rides
rides_1 = (
    rides0
    .assign(
        pickup_time_hour=lambda df: df["pickup_time"].dt.floor('h').astype('datetime64[us]')
    )
   .groupby(['pickup_time_hour', 'pickup_location_id'])
   .size()
   .reset_index(name='ride_count')
)

In [None]:
rides_1

Unnamed: 0,pickup_time_hour,pickup_location_id,ride_count
0,2024-01-01 00:00:00,4,25
1,2024-01-01 00:00:00,7,4
2,2024-01-01 00:00:00,9,1
3,2024-01-01 00:00:00,10,6
4,2024-01-01 00:00:00,12,4
...,...,...,...
77525,2024-01-31 23:00:00,260,2
77526,2024-01-31 23:00:00,261,12
77527,2024-01-31 23:00:00,262,9
77528,2024-01-31 23:00:00,263,53


In [None]:
plot_rides(rides_1, 42)

KeyError: 'pickup_datetime'

In [None]:
rides = add_missing_times(rides_1)

In [None]:
rides_1

In [None]:
plot_rides(rides_1, location=43, time_col='pickup_time_hour', location_col='pickup_location_id', ride_count_col='ride_count')

In [None]:
rides.to_parquet(transformed + "rides_by_hour_location_2024_01.parquet", index=False)