# Fetch Features for Training Guide

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sqlalchemy

from datetime import datetime, timedelta
from feast import Entity, FeatureStore, FeatureView, Field
from feast.infra.offline_stores.contrib.postgres_offline_store.postgres_source import (
    PostgreSQLSource,
)
from feast.infra.offline_stores.contrib.postgres_offline_store.postgres import PostgreSQLOfflineStoreConfig
from feast.infra.online_stores.redis import RedisOnlineStoreConfig
from feast.repo_config import RepoConfig, RegistryConfig
from feast.types import Float32, Int64

from kale.common import mlmdutils, artifacts
from kale.ml import Signature

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
def get_sqlalchemy_engine(config):
    """Create a SQLAlchemy engine for the PostgreSQL database in ``config``.

    Args:
        config: Mapping with the keys ``user``, ``password``, ``host``,
            ``port``, ``database`` and ``db_schema``.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``. Connections are
        opened with ``search_path`` set to ``config["db_schema"]``.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # in a user name or password cannot corrupt the URL structure.
    user = quote_plus(str(config["user"]))
    password = quote_plus(str(config["password"]))
    location = f"{config['host']}:{config['port']}/{config['database']}"
    url = f"postgresql+psycopg2://{user}:{password}@{location}"

    # The printed URL has the password masked: cell output is stored with the
    # notebook and must not leak the credential.
    masked_url = f"postgresql+psycopg2://{user}:***@{location}"
    print("Connecting to", config["db_schema"], "schema using:", masked_url)

    return sqlalchemy.create_engine(
        url,
        client_encoding="utf8",
        connect_args={"options": f"-c search_path={config['db_schema']}"},
    )

In [None]:
# Connection settings for the PostgreSQL database queried in this guide;
# passed to get_sqlalchemy_engine().
db_config = dict(
    user="postgres",
    password="postgres",
    host="postgresql-offline-store.default.svc.cluster.local",
    port=5432,
    database="postgres",
    db_schema="public",
)

In [None]:
# Time window of trip records used for training: the 30 days ending now.
# Both bounds are naive datetimes expressed in UTC.
END_DATE = datetime.utcnow()
START_DATE = END_DATE + timedelta(days=-30)

## Model Building & Training

For the purpose of this guide, we decided to use supervised learning to create a model that predicts whether a driver will complete an order.

The model takes the following inputs:
- daily acceptance rate = accepted trip orders / total trip orders (per day)
- daily completion rate = completed trips / accepted trip orders (per day)
- daily trips = count of completed trips in a day
- daily profit = total profit from trips completed in a day

and the output we get is a boolean flag.

### Load Dataset

We start by fetching the `trip_records` dataset.

This is our entity dataframe (our entity is the driver), thus we only need to fetch the `driver_id`, `event_timestamp` and `completed` columns.

In [None]:
# SQLAlchemy engine for the database and schema described by db_config.
con = get_sqlalchemy_engine(db_config)

In [None]:
# Entity key, event timestamp and label for every trip whose timestamp lies
# between START_DATE and END_DATE. The dates are interpolated via str().
query = (
    "SELECT driver_id, event_timestamp, completed "
    "FROM trip_records "
    f"WHERE event_timestamp BETWEEN '{START_DATE}' AND '{END_DATE}'"
)

In [None]:
# Run the query through the engine and load the result set into a DataFrame.
driver_orders = pd.read_sql(query, con)

In [None]:
# Encode the label as an integer: 1 where `completed` equals True, 0 for
# everything else (False and missing values alike). The comparison is done on
# the whole column at once, which avoids one Python call per row and also
# works when the query returned no rows (a row-wise apply on an empty frame
# returns a DataFrame, which cannot be assigned to a single column).
driver_orders["completed"] = driver_orders["completed"].eq(True).astype(int)

In [None]:
# Preview the first five rows of the entity dataframe.
driver_orders.head(5)

### Enrich Dataset using Feast

We enrich the dataset using the `get_historical_features()` method.

This method joins historical feature data from one or more feature views to an entity dataframe by using a time travel join. It is generally used either for training or for batch scoring.

In [None]:
# Offline store settings are read from db_config instead of being repeated
# here, so the store Feast queries and the engine used to fetch the entity
# dataframe cannot drift apart.
offline_store_config = PostgreSQLOfflineStoreConfig(
    host=db_config["host"],
    port=db_config["port"],
    database=db_config["database"],
    db_schema=db_config["db_schema"],
    user=db_config["user"],
    password=db_config["password"],
)

In [None]:
# Redis online store: "<host>:<port>" followed by comma-separated options.
online_store_config = RedisOnlineStoreConfig(
    connection_string=(
        "redis-online-store.default.svc.cluster.local:6379,"
        "username=default,password=redis,db=0"
    )
)

In [None]:
# Registry settings. The project name here is the same one passed to
# RepoConfig; the path is left empty.
registry_config = RegistryConfig(
    registry_store_type="KubeflowRegistryStore",
    path="",
    project="kubeflow-user"
)

In [None]:
# Combine the registry, offline store and online store configurations into a
# single repository configuration for the "kubeflow-user" project.
repo_config = RepoConfig(
    project="kubeflow-user",
    registry=registry_config,
    provider="local",
    offline_store=offline_store_config,
    online_store=online_store_config
)

In [None]:
# Build the feature store directly from the in-memory configuration; no
# repository path is supplied.
fs = FeatureStore(config=repo_config, repo_path=None)

In [None]:
# Join the four daily driver statistics from the `daily_driver_stats_fv`
# feature view onto the entity dataframe and return the result as a DataFrame.
driver_stats = fs.get_historical_features(
    entity_df=driver_orders,
    features=[
        f"daily_driver_stats_fv:{feature}"
        for feature in ("comp_rate", "acc_rate", "trips", "profit")
    ],
).to_df()

### Build Model

Building a model is usually a complex and experimental phase of the process. In this guide we decide to use a simple linear regression model.

In [None]:
# Linear regression estimator created with its default settings.
model = LinearRegression()

### Train Model

We are now ready to split our dataset and train the model using the training dataset.

In [None]:
# Model inputs: the four feature columns fetched from the feature store.
x = driver_stats.loc[:, ["comp_rate", "acc_rate", "trips", "profit"]]

In [None]:
# Training target. The double brackets keep `y` as a one-column DataFrame:
# `y.shape[1]` and `y.dtypes` are read later when the model Signature is built.
y = driver_stats[["completed"]]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit on the training split only; the test split is kept for evaluation.
model.fit(x_train, y_train)

## Model Evaluation

The next step is to evaluate the model; once its performance is satisfactory, we are ready to deploy it.

In [None]:
# Score the fitted model on the held-out split.
# NOTE(review): for LinearRegression, scikit-learn documents score() as the
# coefficient of determination (R^2), not a classification accuracy - confirm
# this is the metric intended for a completed / not-completed target.
performance = model.score(x_test, y_test)

In [None]:
# Report the score obtained on the held-out split.
print(f"Model performance: {performance}")

## Model Packaging

To deploy a model, we will first package it and store it as an artifact in MLMD.

When trying to make a prediction we just want to provide a driver id and get back whether this driver will complete the ride (True or False).

We will use the online store to get the latest features for each driver, which will be used as input for the model.

Thus, we need to create a Transformer.

### Create Transformer

During its `__init__()` method, the Transformer configures the feature store.

Every time it receives an input (driver_id), it uses `get_online_features()` to fetch the latest feature data (comp_rate, acc_rate, trips, profit) of the driver.

Then, the Transformer channels the feature data to the model, which returns the prediction.

In [None]:
# Directory that holds the transformer package. The trailing slash matters:
# the file path is later built by appending "transformer.py" to this string.
ASSETS_PATH = "/home/jovyan/transformer_package/"

In [None]:
# Create the package directory. exist_ok=True lets this cell be re-run
# without raising FileExistsError once the directory is already there.
os.makedirs(ASSETS_PATH, exist_ok=True)

In [None]:
# Source of the serving-time transformer module. It is kept as a string so it
# can be written to transformer.py inside the package directory.
#
# The module imports pandas itself (preprocess builds a DataFrame) and
# postprocess returns its input, so the predictor's response reaches the
# caller rather than being replaced by None.
#
# NOTE(review): preprocess forwards every key returned by
# get_online_features().to_dict(); presumably that includes `driver_id` next
# to the four training features - confirm the predictor expects that layout.
TRANSFORMER_CODE = """
import kfserving
import pandas as pd
from typing import Dict

from feast import FeatureStore
from feast.infra.online_stores.redis import RedisOnlineStoreConfig
from feast.repo_config import RepoConfig, RegistryConfig

class Transformer(kfserving.KFModel):
    def __init__(self, model_name: str, predictor_host: str, protocol: str = "v1"):
        super().__init__(model_name)
        self.predictor_host = predictor_host
        self.protocol = protocol

        online_store_config = RedisOnlineStoreConfig(
            connection_string="redis-online-store.default.svc.cluster.local:6379,username=default,password=redis,db=0"
        )

        registry_config = RegistryConfig(
            registry_store_type="KubeflowRegistryStore",
            path="",
            project="kubeflow-user"
        )

        repo_config = RepoConfig(
            project="kubeflow-user",
            registry=registry_config,
            provider="local",
            online_store=online_store_config
        )

        self.fs = FeatureStore(config=repo_config, repo_path=None)

    def preprocess(self, inputs: Dict):
        enriched_data = self.fs.get_online_features(
            entity_rows=[{"driver_id": driver_id} for driver_id in inputs["instances"]],
            features=[
                "daily_driver_stats_fv:comp_rate",
                "daily_driver_stats_fv:acc_rate",
                "daily_driver_stats_fv:trips",
                "daily_driver_stats_fv:profit",
            ],
        )
        return {'instances': pd.DataFrame.from_dict(enriched_data.to_dict())}

    def postprocess(self, inputs: Dict):
        # Hand the predictor's response back to the caller unchanged.
        return inputs
"""

In [None]:
# Write the transformer source into the package directory. os.path.join
# gives the right path whether or not ASSETS_PATH ends with a separator, and
# the explicit encoding keeps the file independent of the kernel's locale.
with open(os.path.join(ASSETS_PATH, "transformer.py"), "w", encoding="utf-8") as f:
    f.write(TRANSFORMER_CODE)

### Submit Transformer

In [None]:
# MLMD handle; used to link the transformer artifact as an output.
mlmd = mlmdutils.get_mlmd_instance()

In [None]:
# Submit the package directory as a Transformer artifact. module_name and
# class_name refer to transformer.py and the Transformer class it defines.
transformer_artifact = artifacts.Transformer(
    name="FeastTransformer",
    transformer_dir=ASSETS_PATH,
    module_name="transformer",
    class_name="Transformer",
    is_stateful=True
).submit_artifact()

In [None]:
# Link the submitted transformer artifact as an output, by its id.
mlmd.link_artifact_as_output(transformer_artifact.id)

### Submit Model

In [None]:
# Same call as in the "Submit Transformer" section; presumably repeated so
# this section can be run without the previous one.
mlmd = mlmdutils.get_mlmd_instance()

In [None]:
# Model signature: sizes are [1, <number of columns>] and dtypes are taken
# from the feature frame `x` and the target frame `y`.
signature = Signature(
    input_size=[1, x.shape[1]],
    output_size=[1, y.shape[1]],
    input_dtype=x.dtypes,
    output_dtype=y.dtypes,
)

In [None]:
# Submit the fitted estimator as an SklearnModel artifact, together with its
# signature and descriptive metadata.
model_artifact = artifacts.SklearnModel(
    model=model,
    description="A driver ranking Linear Regression model",
    version="1.0.0",
    author="Kale",
    signature=signature,
    tags={"app": "feast-guide"}).submit_artifact()

In [None]:
# Link the submitted model artifact as an output, by its id.
mlmd.link_artifact_as_output(model_artifact.id)