# Batch Inference Pipeline

In this notebook, we will do the following tasks:
1. Create a batch inference pipeline using the pre-trained model.
2. Run the pipeline and get the predictions.


In [1]:
import hopsworks
import os
import json
import torch
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Populate os.environ from a local .env file, if one is present.
load_dotenv()

# API key used to authenticate against Hopsworks (None when the variable is unset).
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")


In [2]:
# Fail fast when the key is missing: wrapping None in str() would send the
# literal string "None" as the API key and surface as a confusing auth error.
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; define it in the environment or in the .env file."
    )

# Log in to the Hopsworks project and get a handle to its feature store.
project = hopsworks.login(api_key_value=hopsworks_api_key)
fs = project.get_feature_store()


2025-02-22 16:22:26,928 INFO: Initializing external client
2025-02-22 16:22:26,930 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-22 16:22:30,567 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212597


In [3]:
# Create the feature group that will store the predictions.
# NOTE(review): `event_time` is documented as the name of a single feature
# (a string), so it is passed as "datetime" rather than as a one-element list.
amazon_stock_predictions_fg = fs.create_feature_group(
    "amazon_stock_predictions",
    version=1,
    description="Predicted prices for Amazon stocks",
    online_enabled=True,
    primary_key=["id"],
    event_time="datetime",
)





### Get the model from model registry

In [4]:
# Handle to the project's model registry.
mr = project.get_model_registry()

# Selection criteria: the registered model with the smallest MSE is picked.
EVALUATION_METRIC = "mean_squared_error"
SORT_METRICS_BY = "min"

# Look up the best registered version of the model under those criteria.
best_model = mr.get_best_model(
    "amazon_stock_price_prediction_model_torch",
    EVALUATION_METRIC,
    SORT_METRICS_BY,
)


In [5]:
# Download the registered model artifact and load its weights.
model_dir = "../models/amazon_stock_price_prediction_model_torch"
best_model.download(model_dir)

# map_location="cpu" keeps the load working on CPU-only hosts even if the
# checkpoint was saved from a GPU; the notebook runs the model on CPU anyway.
# weights_only=True restricts unpickling to tensors/primitive containers.
state_dict = torch.load(f"{model_dir}/model.pt", map_location="cpu", weights_only=True)

# Hyper-parameters saved alongside the preprocessor; they are needed to rebuild
# the architecture before the weights can be loaded into it.
with open("../preprocessor/hyper_params.json", "r") as f:
    hyper_params = json.load(f)

Downloading model artifact (0 dirs, 1 files)... DONE

In [6]:
# Create the model
import torch
from torch import nn
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced by the linear head.
        num_layers: Number of stacked LSTM layers.
        device: Stored on the instance for backward compatibility. The forward
            pass derives the device from its input, so this value cannot go
            stale after the module is moved with ``.to(...)``.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, device: str = 'cpu'):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.device = device

        # batch_first=True: inputs are laid out as (batch, seq_len, input_dim).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the LSTM over ``x`` and project the last time step.

        Args:
            x: Input of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Zero initial states, created directly on the input's device and dtype
        # so they always match the tensor being processed.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output at the final time step feeds the linear head.
        return self.fc(out[:, -1, :])
        


In [7]:
# Rebuild the architecture from the saved hyper-parameters.
model_kwargs = dict(
    input_dim=hyper_params['input_size'],
    hidden_dim=hyper_params['hidden_size'],
    output_dim=hyper_params['forecast_steps'],
    num_layers=hyper_params['num_layers'],
)
model = LSTMModel(**model_kwargs, device='cpu').to('cpu')

# Restore the trained weights into the freshly built network.
model.load_state_dict(state_dict)


<All keys matched successfully>

### Get the feature view

In [8]:
# Feature view that serves the inference data.
amazon_fv = fs.get_feature_view(name="amazon_fv")





In [9]:
batch_data = amazon_fv.get_batch_data()

# Model input: the most recent `window_size` rows in time order, with the
# timestamp column removed so only the feature columns remain.
window_size = hyper_params['window_size']
sample = (
    batch_data
    .sort_values('datetime')
    .drop(columns='datetime')
    .tail(window_size)
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.92s) 


In [10]:
import pandas as pd
# Parse the timestamps as timezone-aware UTC values and order rows chronologically.
batch_data = (
    batch_data
    .assign(datetime=pd.to_datetime(batch_data['datetime'], utc=True))
    .sort_values('datetime')
)

# Show the most recent rows.
batch_data.tail(6)

Unnamed: 0,datetime,open,high,close,low,volume,rsi,cci
842,2025-02-21 15:30:00+00:00,219.480804,219.839996,219.264999,218.070007,6202619,26.950381,-214.759373
354,2025-02-21 16:30:00+00:00,219.25,219.740005,217.630005,217.399994,4778204,23.486556,-201.945381
623,2025-02-21 17:30:00+00:00,217.630005,217.690598,216.049698,215.949997,7934021,20.715212,-215.456538
2537,2025-02-21 18:30:00+00:00,216.039993,216.5,214.960007,214.740005,6070526,19.046318,-207.864825
1956,2025-02-21 19:30:00+00:00,214.970001,216.479904,216.439896,214.750107,5888288,27.579504,-155.573258
3223,2025-02-21 20:30:00+00:00,216.440002,216.850006,216.490005,215.630005,5715615,27.856803,-121.195865


In [11]:
# Switch to evaluation mode before predicting.
model.eval()

with torch.inference_mode():
    # Add a leading batch dimension: the frame becomes a single-sample batch.
    sample_tensor = torch.tensor(sample.to_numpy(), dtype=torch.float32).unsqueeze(0).to('cpu')
    outputs = model(sample_tensor)

In [12]:
# Turn the model output into a single column with one row per predicted value.
outputs = outputs.reshape(outputs.numel(), 1)

In [14]:
# Get the actual values for comparison
import yfinance as yf

# Download one day (`period='1d'`) of hourly AMZN bars and keep the closing prices.
# `auto_adjust` is pinned explicitly because its default changed between
# yfinance releases; True matches the default in effect when this was last run.
actual_values = yf.download(
    'AMZN',
    period='1d',
    interval='1h',
    auto_adjust=True,
    multi_level_index=False,
)['Close']

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [None]:
import pandas as pd

# Flatten the model output to a plain 1-D NumPy array so the table holds
# ordinary floats instead of tensor objects.
predicted_values = outputs.detach().cpu().numpy().ravel()
actual_closes = np.asarray(actual_values).ravel()
n_steps = len(predicted_values)

# Fail with a clear message when the downloaded bars do not line up with the
# forecast horizon (e.g. the trading session is still in progress).
if len(actual_closes) != n_steps:
    raise ValueError(
        f"Expected {n_steps} actual values to compare against, got {len(actual_closes)}."
    )

# Label each row with the time of day of the most recent `n_steps` bars, in
# chronological order to match the downloaded actuals.
# NOTE(review): this assumes the model emits its forecast steps in
# chronological order as well - confirm against the training targets.
time_stamps = batch_data.tail(n_steps)['datetime'].dt.time.values

predicted_df = pd.DataFrame(
    {"Predicted": predicted_values, "Actual": actual_closes},
    index=time_stamps,
)

In [23]:
import dataframe_image as dfi

# Make sure the output folder exists, then render the comparison table as a PNG.
assets_dir = "../assets"
os.makedirs(assets_dir, exist_ok=True)

dfi.export(
    predicted_df,
    f"{assets_dir}/actual_vs_predictions.png",
    table_conversion='matplotlib',
)