In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [3]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    query = fg.select_all()
    query = query.filter(fg.hour >= current_hour)

    return query.read()

In [4]:
df = fetch_hourly_rides(12)

2025-05-11 10:35:53,418 INFO: Initializing external client
2025-05-11 10:35:53,419 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 10:35:54,046 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1231009
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 


In [5]:
df

Unnamed: 0,hour,start_station_id,ride_count
0,2025-05-11 08:00:00+00:00,5905.14,0
1,2025-05-11 01:00:00+00:00,5905.14,0
2,2025-05-11 04:00:00+00:00,5905.14,0
3,2025-05-11 09:00:00+00:00,5905.14,3
4,2025-05-11 03:00:00+00:00,5905.14,0
5,2025-05-11 02:00:00+00:00,5905.14,0
6,2025-05-10 22:00:00+00:00,5905.14,0
7,2025-05-11 03:00:00+00:00,5329.03,0
8,2025-05-11 06:00:00+00:00,5905.14,0
9,2025-05-10 22:00:00+00:00,5329.03,3


In [6]:
df_pred = fetch_predictions(5)

2025-05-11 10:35:57,317 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-11 10:35:57,323 INFO: Initializing external client
2025-05-11 10:35:57,323 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-11 10:35:57,818 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1231009
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.28s) 


In [7]:
df_pred

Unnamed: 0,start_station_id,predicted_demand,hour
0,6140.05,25.0,2025-05-11 13:00:00+00:00
1,5905.14,19.0,2025-05-11 13:00:00+00:00
2,5329.03,13.0,2025-05-11 13:00:00+00:00
3,6140.05,10.0,2025-05-11 15:00:00+00:00
4,5905.14,5.0,2025-05-11 15:00:00+00:00
5,5329.03,12.0,2025-05-11 15:00:00+00:00


In [8]:
merged_df = pd.merge(df, df_pred, on=['start_station_id'])#, 'hour'])

In [9]:
merged_df

Unnamed: 0,hour_x,start_station_id,ride_count,predicted_demand,hour_y
0,2025-05-11 08:00:00+00:00,5905.14,0,19.0,2025-05-11 13:00:00+00:00
1,2025-05-11 08:00:00+00:00,5905.14,0,5.0,2025-05-11 15:00:00+00:00
2,2025-05-11 01:00:00+00:00,5905.14,0,19.0,2025-05-11 13:00:00+00:00
3,2025-05-11 01:00:00+00:00,5905.14,0,5.0,2025-05-11 15:00:00+00:00
4,2025-05-11 04:00:00+00:00,5905.14,0,19.0,2025-05-11 13:00:00+00:00
...,...,...,...,...,...
97,2025-05-11 12:00:00+00:00,6140.05,7,10.0,2025-05-11 15:00:00+00:00
98,2025-05-11 14:00:00+00:00,6140.05,27,25.0,2025-05-11 13:00:00+00:00
99,2025-05-11 14:00:00+00:00,6140.05,27,10.0,2025-05-11 15:00:00+00:00
100,2025-05-11 13:00:00+00:00,6140.05,20,25.0,2025-05-11 13:00:00+00:00


In [10]:
import pandas as pd  
import plotly.express as px
df1 = df
df2 = df_pred

# Merge the DataFrames on 'pickup_location_id' and 'pickup_hour'  
merged_df = pd.merge(df1, df2, on=['start_station_id', 'hour'])  

# Calculate the absolute error  
merged_df['absolute_error'] = abs(merged_df['predicted_demand'] - merged_df['ride_count'])  

# Group by 'pickup_hour' and calculate the mean absolute error (MAE)  
mae_by_hour = merged_df.groupby('hour')['absolute_error'].mean().reset_index()  
mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)  

# Create a Plotly plot  
fig = px.line(  
    mae_by_hour,  
    x='hour',  
    y='MAE',  
    title='Mean Absolute Error (MAE) by Pickup Hour',  
    labels={'hour': 'Start Hour', 'MAE': 'Mean Absolute Error'},  
    markers=True  
)  

# Show the plot  
fig.show()