In [None]:
import os
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
from feast import Entity, FeatureStore
from feast.repo_config import RepoConfig, RegistryConfig
from feast.infra.offline_stores.contrib.postgres_offline_store.postgres import PostgreSQLOfflineStoreConfig
from feast.infra.online_stores.redis import RedisOnlineStoreConfig

from sklearn.linear_model import LinearRegression

## Feature Stores' Configuration

### Alice's Configuration

In [None]:
# PostgreSQL offline store used to read Alice's historical driver data.
# Credentials are read from the environment so real secrets never have to be
# committed with the notebook; the defaults keep the demo setup working as-is.
offline_store_config_alice = PostgreSQLOfflineStoreConfig(
    host="postgresql-offline-store.default.svc.cluster.local",
    database="driver_data",
    db_schema="driver_data",
    user=os.environ.get("FEAST_OFFLINE_STORE_USER", "bob"),
    password=os.environ.get("FEAST_OFFLINE_STORE_PASSWORD", "bob"),
)

In [None]:
# Redis online store used to read Alice's latest feature values.
# Username/password come from the environment (demo defaults preserved) instead
# of being baked into the connection string.
_redis_user = os.environ.get("FEAST_ONLINE_STORE_USER", "bob")
_redis_password = os.environ.get("FEAST_ONLINE_STORE_PASSWORD", "bob")

online_store_config_alice = RedisOnlineStoreConfig(
    connection_string=(
        "redis-online-store.default.svc.cluster.local:6379,"
        f"username={_redis_user},password={_redis_password},db=0"
    )
)

In [None]:
# Registry settings for Alice's project, collected in one mapping and then
# handed to RegistryConfig as keyword arguments.
# NOTE(review): "KubeflowRegistryStore" and `readMode` look like options of a
# custom registry store -- confirm their exact semantics against that store.
_registry_settings_alice = dict(
    registry_store_type="KubeflowRegistryStore",
    path="",
    project="kubeflow-alice",
    readMode=True,
)
registry_config_alice = RegistryConfig(**_registry_settings_alice)

In [None]:
# Assemble the complete repo configuration for Alice's project from the
# registry, offline-store and online-store configs built above.
_repo_settings_alice = dict(
    project="kubeflow-alice",
    registry=registry_config_alice,
    provider="local",
    offline_store=offline_store_config_alice,
    online_store=online_store_config_alice,
)
repo_config_alice = RepoConfig(**_repo_settings_alice)

In [None]:
# Feature store handle for Alice's project, built purely from the in-memory
# config (no repository path on disk is given).
fs_alice = FeatureStore(repo_path=None, config=repo_config_alice)

### Charlie's Configuration

In [None]:
# PostgreSQL offline store used to read Charlie's historical driver stream data.
# Credentials are read from the environment so real secrets never have to be
# committed with the notebook; the defaults keep the demo setup working as-is.
offline_store_config_charlie = PostgreSQLOfflineStoreConfig(
    host="postgresql-offline-store.default.svc.cluster.local",
    database="driver_stream_data",
    db_schema="driver_stream_data",
    user=os.environ.get("FEAST_OFFLINE_STORE_USER", "bob"),
    password=os.environ.get("FEAST_OFFLINE_STORE_PASSWORD", "bob"),
)

In [None]:
# Redis online store used to read Charlie's latest feature values.
# Username/password come from the environment (demo defaults preserved) instead
# of being baked into the connection string.
_redis_user = os.environ.get("FEAST_ONLINE_STORE_USER", "bob")
_redis_password = os.environ.get("FEAST_ONLINE_STORE_PASSWORD", "bob")

online_store_config_charlie = RedisOnlineStoreConfig(
    connection_string=(
        "redis-online-store.default.svc.cluster.local:6379,"
        f"username={_redis_user},password={_redis_password},db=0"
    )
)

In [None]:
# Registry settings for Charlie's project, collected in one mapping and then
# handed to RegistryConfig as keyword arguments.
# NOTE(review): "KubeflowRegistryStore" and `readMode` look like options of a
# custom registry store -- confirm their exact semantics against that store.
_registry_settings_charlie = dict(
    registry_store_type="KubeflowRegistryStore",
    path="",
    project="kubeflow-charlie",
    readMode=True,
)
registry_config_charlie = RegistryConfig(**_registry_settings_charlie)

In [None]:
# Assemble the complete repo configuration for Charlie's project from the
# registry, offline-store and online-store configs built above.
_repo_settings_charlie = dict(
    project="kubeflow-charlie",
    registry=registry_config_charlie,
    provider="local",
    offline_store=offline_store_config_charlie,
    online_store=online_store_config_charlie,
)
repo_config_charlie = RepoConfig(**_repo_settings_charlie)

In [None]:
# Feature store handle for Charlie's project, built purely from the in-memory
# config (no repository path on disk is given).
fs_charlie = FeatureStore(repo_path=None, config=repo_config_charlie)

## Share Features

Bob already has a dataset that contains drivers' hourly scores for the last 5 days.

In [None]:
# Time window: the five full days ending at today's UTC midnight.
# datetime.utcnow() is deprecated since Python 3.12, so take an aware "now" in
# UTC and strip the tzinfo again; the result is the same naive UTC midnight the
# rest of this cell expects.
end_date = datetime.now(timezone.utc).replace(
    hour=0, minute=0, second=0, microsecond=0, tzinfo=None
)
start_date = end_date - timedelta(days=5)

# One UTC-localized, millisecond-rounded timestamp per hour in
# [start_date, end_date) -- inclusive="left" leaves out end_date itself.
dates_df = pd.DataFrame(
    {
        "event_timestamp": [
            pd.Timestamp(dt, unit="ms", tz="UTC").round("ms")
            for dt in pd.date_range(
                start=start_date, end=end_date, freq="60min", inclusive="left"
            )
        ]
    }
)

In [None]:
# Twenty consecutive driver ids: 1001 through 1020 inclusive.
drivers = np.arange(1001, 1021, dtype=np.int_)

# Single-column frame holding the entity key.
drivers_df = pd.DataFrame({"driver_id": drivers})

In [None]:
# Pair every driver with every hourly timestamp (cross join).
drivers_df_full = drivers_df.join(dates_df, how='cross')

rows = drivers_df_full["event_timestamp"].count()

# Synthetic float32 scores in [0, 1). A seeded generator makes the notebook
# reproducible under Restart Kernel -> Run All instead of changing every run.
score_rng = np.random.default_rng(seed=42)
drivers_df_full["score"] = score_rng.random(size=rows, dtype=np.float32)

In [None]:
# Preview the first five driver/timestamp/score rows.
drivers_df_full.head(n=5)

Bob enriches the existing dataset, in a point-in-time-correct way, with the drivers' daily stats that Alice has shared with him.

In [None]:
# Ask Alice's feature store for the daily-stats features matching each
# (driver_id, event_timestamp) row of the entity frame ...
_daily_stats_job = fs_alice.get_historical_features(
    features=[
        "driver_daily_stats_fv:profit",
        "driver_daily_stats_fv:acc_rate",
    ],
    entity_df=drivers_df_full,
)
# ... and materialize the result as a pandas DataFrame.
driver_daily_stats = _daily_stats_job.to_df()

In [None]:
# Preview the first five rows enriched with Alice's daily stats.
driver_daily_stats.head(n=5)

Bob further enriches the dataset with the zone feature, using the drivers' historical locations that Charlie has shared with him.

In [None]:
# Use the already-enriched frame as the entity frame and ask Charlie's feature
# store for the location features and the zone feature ...
_full_stats_job = fs_charlie.get_historical_features(
    features=[
        "driver_locations_fv:lat",
        "driver_locations_fv:lon",
        "driver_zones_odfv:zone",
    ],
    entity_df=driver_daily_stats,
)
# ... and materialize the result as a pandas DataFrame.
driver_full_stats = _full_stats_job.to_df()

In [None]:
# Preview the first five rows of the fully enriched dataset.
driver_full_stats.head(n=5)

## Train Model

In [None]:
# Model inputs: zone, acceptance rate and profit -- in this exact column order.
train_X = driver_full_stats.loc[:, ["zone", "acc_rate", "profit"]]

In [None]:
# Preview the first five training feature rows.
train_X.head(n=5)

In [None]:
# Training target: the score column, kept as a one-column DataFrame.
train_Y = driver_full_stats.loc[:, ["score"]]

In [None]:
# Preview the first five training targets.
train_Y.head(n=5)

In [None]:
# Fit a linear regression of score on (zone, acc_rate, profit).
# The fit call stays the last expression so the cell still displays the
# fitted estimator.
model = LinearRegression()
model.fit(X=train_X, y=train_Y)

## Evaluate Model

In [None]:
# Three hand-written evaluation examples, one per driver.
driver = [1001, 1002, 1003]
zone = [1, 3, 2]
acc_rate = [0.35, 0.61, 0.48]
profit = [43.23, 24.5, 50.24]
score = [1.19, 0.79, 1.27]

# The columns must appear in the same order as in train_X
# (zone, acc_rate, profit): scikit-learn checks feature names and their order
# against those seen during fit, so a different order makes model.score fail.
test_X = pd.DataFrame(
    {"zone": zone, "acc_rate": acc_rate, "profit": profit},
    columns=["zone", "acc_rate", "profit"],
)

# Expected scores for the three examples, as a one-column frame like train_Y.
test_Y = pd.DataFrame(score, columns=["score"])

In [None]:
# Display the hand-written evaluation features.
test_X

In [None]:
# Display the expected scores for the evaluation examples.
test_Y

In [None]:
# Score the fitted model on the hand-written examples
# (for scikit-learn regressors, `score` reports R^2).
performance = model.score(X=test_X, y=test_Y)
print("Model performance:", performance)

## Make Predictions

In this section we simulate a real-time prediction. Assume that a user requests a ride and that drivers 1001 to 1005 are the closest ones. The goal is to find the most suitable driver for the ride. We send a request to the inference service with the list of driver ids.

In [None]:
# Candidate drivers for the ride request: ids 1001 through 1005 inclusive.
driver_ids = list(range(1001, 1006))

The inference service fetches the latest feature data for the given ids using the Online Store.

In [None]:
# Look up the daily stats (profit, acceptance rate) of each candidate driver
# in Alice's online store; one entity row per driver id.
driver_features = fs_alice.get_online_features(
    features=[
        "driver_daily_stats_fv:profit",
        "driver_daily_stats_fv:acc_rate",
    ],
    entity_rows=[{"driver_id": candidate_id} for candidate_id in driver_ids],
)

In [None]:
# Convert the online-store response into a DataFrame and display it.
driver_features_df = driver_features.to_df()
driver_features_df

In [None]:
# Look up the location and zone features of each candidate driver in
# Charlie's online store; one entity row per driver id.
# NOTE: the name `driver_loactions` (sic) is kept because the next cell reads it.
driver_loactions = fs_charlie.get_online_features(
    features=[
        "driver_locations_fv:lat",
        "driver_locations_fv:lon",
        "driver_zones_odfv:zone",
    ],
    entity_rows=[{"driver_id": candidate_id} for candidate_id in driver_ids],
)

In [None]:
# Convert the location/zone response into a DataFrame and display it.
# `driver_loactions` (sic) is the name assigned by the retrieval cell.
driver_locations_df = driver_loactions.to_df()
driver_locations_df

In [None]:
# Attach each driver's zone to the daily-stats features, joining on driver_id.
# Any "zone" column left over from a previous run of this cell is dropped
# first, so re-running the cell does not produce zone_x / zone_y duplicates
# (which would break the later column selection for the model).
driver_features_df = (
    driver_features_df
    .drop(columns=["zone"], errors="ignore")
    .merge(driver_locations_df[["driver_id", "zone"]], on="driver_id")
)

In [None]:
# Display the combined per-driver features (daily stats plus zone).
driver_features_df

The inference service uses the model to make a prediction for every driver

In [None]:
# Predict a score per driver; the inputs are selected in the same column
# order the model was trained on (zone, acc_rate, profit).
driver_features_df["score"] = model.predict(
    X=driver_features_df.loc[:, ["zone", "acc_rate", "profit"]]
)

In [None]:
# Show each candidate driver next to its predicted score.
driver_features_df.loc[:, ["driver_id", "score"]]

The inference service returns the id of the driver with the highest predicted score.

In [None]:
# Find the row position of the highest predicted score, then read the driver
# id stored at that same position.
top_position = driver_features_df["score"].argmax()
best_driver = driver_features_df["driver_id"].iloc[top_position]
print(f"Prediction for best driver id: {best_driver}")