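# Batch Inference Pipeline

This notebook logs in to Hopsworks, downloads the registered XGBoost model with the lowest RMSE, builds a model input from the latest 28 rows of the `amazon_fv` feature view, predicts seven hourly values, and uploads them to the `amazon_stock_predictions` feature group.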


## 1. Import necessary libraries


In [1]:
import datetime
import os

import hopsworks
import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
# Pull variables from a local .env file (if present) into the process environment
load_dotenv()

# Read the Hopsworks API key; the value is None when the variable is not set
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


## 2. Log in to Hopsworks, get the batch data and model


### 2.1 Log in to Hopsworks


In [2]:
# Log in to Hopsworks.
# Fail fast when the key is missing: wrapping a missing key in str() would send
# the literal string "None" as the credential and surface only as an opaque
# authentication failure.
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; export it or add it to the .env file."
    )
project = hopsworks.login(api_key_value=hopsworks_api_key)

# Handles used by the rest of the notebook: feature store, model registry,
# and version 1 of the "amazon_fv" feature view
fs = project.get_feature_store()
mr = project.get_model_registry()
fv = fs.get_feature_view("amazon_fv", version=1)

2025-03-08 10:07:09,705 INFO: Initializing external client
2025-03-08 10:07:09,709 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-08 10:07:15,347 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212597


### 2.2 Download the best model based on RMSE


In [3]:
# Selection criteria: rank registered models by RMSE and take the minimum
EVALUATION_METRIC = "rmse"
SORT_METRICS_BY = "min"

# Ask the model registry for the best-ranked version of the XGBoost model
best_model = mr.get_best_model(
    "amazon_stock_price_prediction_model_xgboost",
    EVALUATION_METRIC,
    SORT_METRICS_BY,
)


In [4]:
# Download the model artifact into a local directory.
# exist_ok=True creates the directory (and any missing parents) when needed and
# is a no-op when it already exists, so no separate existence check is required.
model_dir = "../models/xgboost_model"
os.makedirs(model_dir, exist_ok=True)
best_model.download(model_dir)

Downloading model artifact (0 dirs, 1 files)... DONE

'../models/xgboost_model'

### 2.3 Prepare the data for inference


In [5]:
# Number of most recent rows fed to the model. The rows shown in the output
# below are hourly bars, so the window is 28 hourly observations.
window_size = 28

# Read the batch data from the feature view and keep the latest `window_size`
# rows in chronological order
last_batch_data = fv.get_batch_data().sort_values('datetime').tail(window_size)

# A shorter window would give the flattened model input the wrong number of
# features, so stop here with a clear message instead of failing at predict time.
if len(last_batch_data) != window_size:
    raise ValueError(
        f"Expected {window_size} rows of batch data, got {len(last_batch_data)}."
    )

last_batch_data

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.17s) 


Unnamed: 0,datetime,open,high,close,low,volume,rsi,cci
3458,2025-03-04 14:30:00+00:00,200.110001,202.100006,198.350006,197.432007,18755863,24.2647,-298.490628
1444,2025-03-04 15:30:00+00:00,198.270004,199.990005,199.404999,197.929993,8047180,27.833037,-236.149018
656,2025-03-04 16:30:00+00:00,199.389999,203.449997,203.065002,199.350006,6681653,38.635048,-133.357042
3435,2025-03-04 17:30:00+00:00,203.050003,203.759995,202.899994,201.910004,4598446,38.3563,-95.23819
2238,2025-03-04 18:30:00+00:00,202.899994,205.759995,204.850006,202.690002,4444550,43.54052,-52.653373
1774,2025-03-04 19:30:00+00:00,204.869995,206.800003,206.199997,204.389999,5216931,46.871706,-19.635454
457,2025-03-04 20:30:00+00:00,206.220001,206.389999,203.850006,203.029999,5483834,42.203689,-44.191887
61,2025-03-05 14:30:00+00:00,204.759995,206.409897,205.195007,203.259995,7710718,45.54635,-28.092445
2482,2025-03-05 15:30:00+00:00,205.205002,206.550003,205.445007,204.429993,3715484,46.169563,-12.394222
1439,2025-03-05 16:30:00+00:00,205.449997,207.350006,207.345001,204.070007,4547543,50.780061,9.106479


In [9]:
# Flatten the window into a single row for XGBoost: drop the timestamp column
# and lay the remaining values out row by row as a (1, rows * features) array
feature_values = last_batch_data.drop(columns='datetime').to_numpy()
last_batch_data_reshaped = feature_values.reshape(1, -1)
last_batch_data_reshaped.shape

(1, 196)

## 3. Inference


### 3.1 Load the model


In [10]:
# Deserialize the model file from the local model directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted model registry.
model_path = "../models/xgboost_model/xgboost_model.pkl"
model = joblib.load(model_path)

### 3.2 Make predictions


In [11]:
# Run the model on the flattened feature window (a single input row)
predictions = model.predict(last_batch_data_reshaped)

In [12]:
# Display the raw prediction array
predictions

array([[194.25198, 198.81454, 197.05263, 196.35185, 192.01303, 194.39842,
        193.24477]], dtype=float32)

### 3.3 Save the predictions to a DataFrame


In [13]:
# Hourly timestamps of one session. They match the UTC bar times of the
# feature data (14:30 ... 20:30 +00:00), so the date must be taken in UTC too.
session_times = ["14:30:00", "15:30:00", "16:30:00", "17:30:00", "18:30:00", "19:30:00", "20:30:00"]

# Use the current UTC date: date.today() follows the machine's local timezone
# and can be a day off from the UTC calendar the session times refer to.
today = datetime.datetime.now(datetime.timezone.utc).date()
today_str = today.strftime("%Y-%m-%d")

# NOTE(review): the date is not checked against the trading calendar, so a run
# on a weekend or holiday labels the predictions with a non-trading day.
# Confirm whether the next trading session should be used instead.
time_index = [f"{today_str} {t}" for t in session_times]

time_index

['2025-03-08 14:30:00',
 '2025-03-08 15:30:00',
 '2025-03-08 16:30:00',
 '2025-03-08 17:30:00',
 '2025-03-08 18:30:00',
 '2025-03-08 19:30:00',
 '2025-03-08 20:30:00']

In [14]:
# Build the predictions DataFrame: one row per timestamp in `time_index`.
# pandas is imported with the other dependencies in the first cell.
predictions_df = pd.DataFrame({
    # Parse the timestamp strings into datetime64 values up front
    "datetime": pd.to_datetime(time_index),
    # Drop the singleton batch axis so the values form a 1-D column
    "prediction": predictions.squeeze(),
})

predictions_df

Unnamed: 0,datetime,prediction
0,2025-03-08 14:30:00,194.251984
1,2025-03-08 15:30:00,198.814545
2,2025-03-08 16:30:00,197.052628
3,2025-03-08 17:30:00,196.351852
4,2025-03-08 18:30:00,192.013031
5,2025-03-08 19:30:00,194.398422
6,2025-03-08 20:30:00,193.244766


### 3.4 Upload the predictions DataFrame to the feature store


In [15]:
# The feature group needs a primary key: use the timestamp rendered as a
# 'YYYY-MM-DD HH:MM:SS' string, which gives one distinct id per predicted hour
id_values = predictions_df['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
predictions_df['id'] = id_values
predictions_df

Unnamed: 0,datetime,prediction,id
0,2025-03-08 14:30:00,194.251984,2025-03-08 14:30:00
1,2025-03-08 15:30:00,198.814545,2025-03-08 15:30:00
2,2025-03-08 16:30:00,197.052628,2025-03-08 16:30:00
3,2025-03-08 17:30:00,196.351852,2025-03-08 17:30:00
4,2025-03-08 18:30:00,192.013031,2025-03-08 18:30:00
5,2025-03-08 19:30:00,194.398422,2025-03-08 19:30:00
6,2025-03-08 20:30:00,193.244766,2025-03-08 20:30:00


In [16]:
# Fetch the predictions feature group (version 1), creating it if it does not
# exist yet. 'id' is the primary key and 'datetime' the event-time column.
amazon_prediction_fg = fs.get_or_create_feature_group(
    name="amazon_stock_predictions",
    version=1,
    description="Amazon stock predictions",
    primary_key=["id"],
    event_time="datetime",
    online_enabled=True,
)

# Write the prediction rows to the feature group
amazon_prediction_fg.insert(predictions_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1212597/fs/1200226/fg/1403702


Uploading Dataframe: 100.00% |██████████| Rows 7/7 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: amazon_stock_predictions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1212597/jobs/named/amazon_stock_predictions_1_offline_fg_materialization/executions


(Job('amazon_stock_predictions_1_offline_fg_materialization', 'SPARK'), None)