In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Append the parent of the current working directory to the import path so the
# `src` imports below resolve. This depends on the kernel's working directory
# at the time the cell runs.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import src.config as config

In [3]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    """Read recent rows from the rides feature group.

    The window starts at "now minus ``hours``" in UTC, rounded down to the
    start of that hour, and has no upper bound.

    Args:
        hours: Look-back length in hours; passed to ``timedelta(hours=...)``.

    Returns:
        Whatever ``query.read()`` returns for version 1 of the feature group
        named by ``config.FEATURE_GROUP_NAME``, restricted to rows whose
        ``pickup_hour`` is at or after the window start.
    """
    now_utc = pd.Timestamp.now(tz="Etc/UTC")
    # Floor to the hour so the row for the partially elapsed starting hour is included.
    window_start = (now_utc - timedelta(hours=hours)).floor('h')

    feature_store = get_feature_store()
    rides_fg = feature_store.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1,
    )

    recent_rides = rides_fg.select_all().filter(rides_fg.pickup_hour >= window_start)
    return recent_rides.read()

In [6]:
# Observed rides for a 12-hour look-back window (see `fetch_hourly_rides`).
df = fetch_hourly_rides(hours=12)

2025-03-04 00:58:02,611 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 00:58:02,616 INFO: Initializing external client
2025-03-04 00:58:02,616 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 00:58:03,339 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215649
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.32s) 


In [7]:
# Display the fetched rides frame.
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-03-03 14:00:00+00:00,232,9
1,2025-03-03 15:00:00+00:00,224,5
2,2025-03-03 13:00:00+00:00,215,2
3,2025-03-03 15:00:00+00:00,40,1
4,2025-03-03 16:00:00+00:00,175,1
...,...,...,...
4513,2025-03-04 05:00:00+00:00,67,0
4514,2025-03-04 05:00:00+00:00,154,0
4515,2025-03-04 05:00:00+00:00,135,0
4516,2025-03-04 05:00:00+00:00,212,0


In [8]:
# Load predictions via `fetch_predictions` (imported from src.inference).
# NOTE(review): 240 is presumably a look-back window in hours, by analogy with
# `fetch_hourly_rides` — confirm against src.inference.
df_pred = fetch_predictions(240)

2025-03-04 00:58:15,459 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 00:58:15,466 INFO: Initializing external client
2025-03-04 00:58:15,466 INFO: Base URL: https://c.app.hopsworks.ai:443


2025-03-04 00:58:16,289 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215649
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.40s) 


In [9]:
# Display the fetched predictions frame.
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,95,1.0,2025-03-03 03:00:00+00:00
1,179,1.0,2025-03-03 03:00:00+00:00
2,127,0.0,2025-03-03 03:00:00+00:00
3,90,71.0,2025-03-03 03:00:00+00:00
4,79,98.0,2025-03-03 03:00:00+00:00
...,...,...,...
3007,12,1.0,2025-03-04 06:00:00+00:00
3008,161,216.0,2025-03-04 06:00:00+00:00
3009,195,0.0,2025-03-04 06:00:00+00:00
3010,46,0.0,2025-03-04 06:00:00+00:00


In [10]:
# Inner join: keep only (location, hour) pairs present in both the observed
# rides and the predictions.
merged_df = df.merge(
    df_pred,
    how='inner',
    on=['pickup_location_id', 'pickup_hour'],
)

In [11]:
# Display the joined actual-vs-predicted frame.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2025-03-03 16:00:00+00:00,175,1,0.0
1,2025-03-03 16:00:00+00:00,250,1,0.0
2,2025-03-03 16:00:00+00:00,160,0,0.0
3,2025-03-03 16:00:00+00:00,247,2,0.0
4,2025-03-03 16:00:00+00:00,166,31,11.0
...,...,...,...,...
2003,2025-03-04 05:00:00+00:00,67,0,0.0
2004,2025-03-04 05:00:00+00:00,154,0,0.0
2005,2025-03-04 05:00:00+00:00,135,0,0.0
2006,2025-03-04 05:00:00+00:00,212,0,0.0


In [12]:
# Preview only: the hour-floored timestamps are displayed, not written back to `df`.
df['pickup_hour'].pipe(pd.to_datetime).dt.floor('h')

0      2025-03-03 14:00:00+00:00
1      2025-03-03 15:00:00+00:00
2      2025-03-03 13:00:00+00:00
3      2025-03-03 15:00:00+00:00
4      2025-03-03 16:00:00+00:00
                  ...           
4513   2025-03-04 05:00:00+00:00
4514   2025-03-04 05:00:00+00:00
4515   2025-03-04 05:00:00+00:00
4516   2025-03-04 05:00:00+00:00
4517   2025-03-04 05:00:00+00:00
Name: pickup_hour, Length: 4518, dtype: datetime64[us, Etc/UTC]

In [13]:
# Signed error: positive means the model over-predicted, negative under-predicted.
merged_df['difference'] = merged_df['predicted_demand'].sub(merged_df['rides'])

In [14]:
# Sorted view for inspection; the result is not assigned, so `merged_df`
# itself keeps its original row order.
merged_df.sort_values(by=["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
48,2025-03-03 16:00:00+00:00,2,0,0.0,0.0
466,2025-03-03 18:00:00+00:00,2,0,0.0,0.0
724,2025-03-03 19:00:00+00:00,2,0,0.0,0.0
858,2025-03-03 20:00:00+00:00,2,0,0.0,0.0
1194,2025-03-03 22:00:00+00:00,2,0,0.0,0.0
...,...,...,...,...,...
973,2025-03-03 20:00:00+00:00,263,87,52.0,-35.0
1188,2025-03-03 22:00:00+00:00,263,45,135.0,90.0
1260,2025-03-04 02:00:00+00:00,263,6,20.0,14.0
1619,2025-03-04 04:00:00+00:00,263,8,44.0,36.0


In [15]:
# Display the joined frame again, now including the `difference` column.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
0,2025-03-03 16:00:00+00:00,175,1,0.0,-1.0
1,2025-03-03 16:00:00+00:00,250,1,0.0,-1.0
2,2025-03-03 16:00:00+00:00,160,0,0.0,0.0
3,2025-03-03 16:00:00+00:00,247,2,0.0,-2.0
4,2025-03-03 16:00:00+00:00,166,31,11.0,-20.0
...,...,...,...,...,...
2003,2025-03-04 05:00:00+00:00,67,0,0.0,0.0
2004,2025-03-04 05:00:00+00:00,154,0,0.0,0.0
2005,2025-03-04 05:00:00+00:00,135,0,0.0,0.0
2006,2025-03-04 05:00:00+00:00,212,0,0.0,0.0


In [16]:
import pandas as pd
import plotly.express as px

# Floor timestamps to the hour on NEW frames. `assign` returns a copy, so the
# `df` / `df_pred` frames displayed by earlier cells are not mutated (the
# previous `df1 = df` alias plus column assignment changed them in place).
rides_hourly = df.assign(
    pickup_hour=pd.to_datetime(df['pickup_hour']).dt.floor('h')
)
preds_hourly = df_pred.assign(
    pickup_hour=pd.to_datetime(df_pred['pickup_hour']).dt.floor('h')
)

# Inner-join actuals and predictions on location and hour, then compute the
# absolute error per (location, hour) row.
merged_df = pd.merge(
    rides_hourly,
    preds_hourly,
    on=['pickup_location_id', 'pickup_hour'],
).assign(
    absolute_error=lambda frame: (frame['predicted_demand'] - frame['rides']).abs()
)

# Mean absolute error (MAE) across all locations for each pickup hour.
mae_by_hour = (
    merged_df.groupby('pickup_hour')['absolute_error']
    .mean()
    .reset_index()
    .rename(columns={'absolute_error': 'MAE'})
)

# Line chart of MAE over time, with a marker at each hourly point.
fig = px.line(
    mae_by_hour,
    x='pickup_hour',
    y='MAE',
    title='Mean Absolute Error (MAE) by Pickup Hour',
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
    markers=True
)

# Show the plot
fig.show()

In [17]:
# Unweighted mean of the per-hour MAE values: every hour counts equally,
# regardless of how many rows were joined for it.
mae_by_hour.loc[:, "MAE"].mean()

7.871513944223108