In [1]:
import src.config as config

In [2]:
import pandas as pd

# Reference "now" for this monitoring run, kept as a fixed timestamp.
reference_time_str = '2023-02-28 09:00:00'
current_date = pd.Timestamp(reference_time_str)

## Get pointers to the 2 feature groups we need to create our feature view

- One with the model predictions (predicted demand)
- The other with the actual demand (observed rides)

In [3]:
from src.feature_store_api import get_feature_group

# Feature group that stores the model predictions.
predictions_fg = get_feature_group(name=config.FEATURE_GROUP_MODEL_PREDICTIONS)

# Feature group that stores the actual demand.
actuals_fg = get_feature_group(name=config.FEATURE_GROUP_NAME)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/12447




Connected. Call `.close()` to terminate connection gracefully.
Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/12447
Connected. Call `.close()` to terminate connection gracefully.


## Query to generate our feature view from these 2 feature groups

In [4]:
from datetime import timedelta

# Lower bound of the monitoring window: 30 days before `current_date`.
window_start = current_date - timedelta(days=30)

# Join predictions with actuals on (pickup_hour, pickup_location_id) and keep
# only predictions whose pickup_hour is at or after the window start.
query = (
    predictions_fg.select_all()
    .join(actuals_fg.select_all(), on=['pickup_hour', 'pickup_location_id'])
    .filter(predictions_fg.pickup_hour >= window_start)
)

In [5]:
from src.feature_store_api import get_feature_store

feature_store = get_feature_store()

try:
    # Create the monitoring feature view from `query`. On repeated runs the
    # view presumably exists already and this call is expected to fail.
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_MONITORING,
        version=1,
        query=query
    )
except Exception as err:
    # Catch `Exception` instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate. The underlying error is
    # printed as well, so a genuine failure (not just "already exists") is
    # visible instead of being silently mislabelled.
    print('Feature view already existed. Skip creation.')
    print(f'(create_feature_view raised: {err!r})')


# Fetch the feature view (version 1), whether it was just created or already existed.
predictions_and_actuals_fv = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_MONITORING,
    version=1
)

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/12447
Connected. Call `.close()` to terminate connection gracefully.
Feature view already existed. Skip creation.


In [6]:
# Pull the joined predictions/actuals rows from the feature view.
# No explicit start_time / end_time window is passed to the call.
predictions_and_actuals = predictions_and_actuals_fv.get_batch_data()



2023-02-28 13:19:08,877 INFO: USE `taxi_demand_featurestore`
2023-02-28 13:19:09,385 INFO: WITH right_fg0 AS (SELECT *
FROM (SELECT `fg1`.`pickup_location_id` `pickup_location_id`, `fg1`.`predicted_demand` `predicted_demand`, `fg1`.`pickup_hour` `pickup_hour`, `fg1`.`pickup_location_id` `join_pk_pickup_location_id`, `fg1`.`pickup_hour` `join_pk_pickup_hour`, `fg1`.`pickup_hour` `join_evt_pickup_hour`, `fg0`.`rides` `rides`, RANK() OVER (PARTITION BY `fg1`.`pickup_hour`, `fg1`.`pickup_location_id`, `fg1`.`pickup_hour` ORDER BY `fg0`.`pickup_hour` DESC) pit_rank_hopsworks
FROM `taxi_demand_featurestore`.`model_predictions_feature_group_1` `fg1`
INNER JOIN `taxi_demand_featurestore`.`time_series_hourly_feature_group_1` `fg0` ON `fg1`.`pickup_hour` = `fg0`.`pickup_hour` AND `fg1`.`pickup_location_id` = `fg0`.`pickup_location_id` AND `fg1`.`pickup_hour` >= `fg0`.`pickup_hour`) NA
WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`pickup_location_id` `pickup_location_id`, `right_fg0`.`pred



In [7]:
# Display the fetched batch data as the cell output.
predictions_and_actuals

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour,rides
0,1,0.0,2023-02-27 00:00:00,0
1,2,0.0,2023-02-27 00:00:00,0
2,3,0.0,2023-02-27 00:00:00,0
3,4,4.0,2023-02-27 00:00:00,2
4,5,0.0,2023-02-27 00:00:00,0
...,...,...,...,...
6885,261,11.0,2023-02-28 09:00:00,15
6886,262,132.0,2023-02-28 09:00:00,155
6887,263,147.0,2023-02-28 09:00:00,144
6888,264,48.0,2023-02-28 09:00:00,61


In [50]:
# Short alias for the fetched predictions-and-actuals data (same object, no copy);
# the bare `df` on the last line displays it as the cell output.
df = predictions_and_actuals
df

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour,rides
0,1,0.0,2023-02-27 00:00:00,0
1,2,0.0,2023-02-27 00:00:00,0
2,3,0.0,2023-02-27 00:00:00,0
3,4,4.0,2023-02-27 00:00:00,2
4,5,0.0,2023-02-27 00:00:00,0
...,...,...,...,...
6885,261,11.0,2023-02-28 09:00:00,15
6886,262,132.0,2023-02-28 09:00:00,155
6887,263,147.0,2023-02-28 09:00:00,144
6888,264,48.0,2023-02-28 09:00:00,61


In [51]:
# Overall MAE: a single number computed over every row in `df`
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(
    y_true=df['rides'],
    y_pred=df['predicted_demand'],
)
print(f'{mae=:.2f}')

mae=2.52


In [52]:
# MAE per pickup_location_id
# One MAE value per location, computed over all rows of that location in `df`.
mae_per_location = (
    df
    # Select the two metric columns explicitly so the applied function never
    # depends on whether pandas passes the grouping column to `apply`.
    .groupby('pickup_location_id')[['rides', 'predicted_demand']]
    .apply(lambda g: mean_absolute_error(g['rides'], g['predicted_demand']))
    .reset_index()
    # `apply` returns an unnamed Series, which `reset_index` exposes as column 0.
    .rename(columns={0: 'mae'})
)

import plotly.express as px

fig = px.bar(
    mae_per_location,
    x='pickup_location_id', y='mae',
    template='plotly_dark',
    # The x-axis is the location id, not time, so the title says so.
    title="Mean Absolute Error per pickup location"
)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.



In [36]:
# MAE per pickup_hour
# https://stackoverflow.com/a/47914634
import plotly.express as px

# One MAE value per hour, computed over all rows sharing that pickup_hour.
hourly_errors = df.groupby('pickup_hour').apply(
    lambda hour_rows: mean_absolute_error(
        hour_rows['rides'], hour_rows['predicted_demand']
    )
)

# Turn the unnamed result (column 0 after reset_index) into a tidy frame
# sorted chronologically.
mae_per_hour = (
    hourly_errors
    .reset_index()
    .rename(columns={0: 'mae'})
    .sort_values(by='pickup_hour')
)

fig = px.bar(
    mae_per_hour,
    x='pickup_hour',
    y='mae',
    template='plotly_dark',
    title="Mean Absolute Error over time",
)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.



In [53]:
# Total rides per location over all rows in `df`.
rides_per_location = df.groupby('pickup_location_id')['rides'].sum()

# Ids of the 10 locations with the highest total rides, busiest first.
top_locations_by_demand = (
    rides_per_location
    .sort_values(ascending=False)
    .reset_index()
    .head(10)['pickup_location_id']
)
print(f'{top_locations_by_demand=}')

# One bar chart of hourly MAE for each of the top locations.
for location_id in top_locations_by_demand:

    location_rows = df[df.pickup_location_id == location_id]

    hourly_errors = location_rows.groupby('pickup_hour').apply(
        lambda hour_rows: mean_absolute_error(
            hour_rows['rides'], hour_rows['predicted_demand']
        )
    )
    mae_per_hour = (
        hourly_errors
        .reset_index()
        .rename(columns={0: 'mae'})
        .sort_values(by='pickup_hour')
    )

    fig = px.bar(
        mae_per_hour,
        x='pickup_hour',
        y='mae',
        template='plotly_dark',
        title=f"{location_id=}",
    )
    fig.show()

top_locations_by_demand=0    237
1    132
2    236
3    161
4    162
5    138
6    186
7    142
8    170
9    230
Name: pickup_location_id, dtype: int64



distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.




distutils Version classes are deprecated. Use packaging.version instead.



In [None]:
import plotly.express as px 

# Line chart (with point markers) of `ts_values` against `ts_dates`.
# NOTE(review): `ts_dates`, `ts_values` and `title` are not defined in any cell
# visible in this notebook, and this cell has no execution count. Unless they
# are defined elsewhere, it will fail on a fresh kernel — TODO define them or
# remove this cell.
fig = px.line(
    x=ts_dates, y=ts_values,
    template='plotly_dark',
    markers=True, title=title
)