# Training Pipeline

This notebook performs the following tasks:

1. Get the data from the feature store
2. Preprocess the data
3. Train the model
4. Evaluate the model
5. Register the model in the model registry


In [2]:
# Import the required libraries
import os
import hopsworks
import numpy as np
import pandas as pd

from dotenv import load_dotenv

# Load settings from the .env file
load_dotenv()

# Read the Hopsworks API key from the environment (None when it is not set)
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")
 

In [5]:
# Log in to Hopsworks and get a handle to the project's feature store.
# Fail fast when the key is missing: wrapping it in str() would turn None into
# the literal string "None" and send that as the credential.
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; add it to the .env file or the environment."
    )

project = hopsworks.login(api_key_value=hopsworks_api_key)
fs = project.get_feature_store()



2025-02-20 22:13:03,111 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-20 22:13:03,114 INFO: Initializing external client
2025-02-20 22:13:03,115 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-20 22:13:06,138 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212597


### Feature Selection


In [6]:
# Look up version 1 of the "amazon_stock_prices" feature group
amazon_fg = fs.get_feature_group(name="amazon_stock_prices", version=1)

In [10]:
# Build a query over the columns used for training
feature_columns = ["date", "open", "high", "close", "low"]
selected_features = amazon_fg.select(feature_columns)

# Preview the first 5 rows of the selected features
selected_features.show(5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.87s) 


Unnamed: 0,date,open,high,close,low
0,2020-07-24 18:00:00+00:00,3001.07,3015.165,3004.82,2999.01
1,2021-12-03 20:00:00+00:00,3353.08,3392.99,3391.14,3340.0
2,2021-03-17 19:00:00+00:00,3167.44,3173.05,3134.8,3132.15
3,2023-02-14 16:00:00+00:00,98.47,98.855,97.97,97.53
4,2024-10-22 15:00:00+00:00,190.045,190.3,189.84,189.39


### Feature View Creation


In [29]:
# Get or create version 1 of the "amazon_fv" feature view from the
# selected-features query, with "close" declared as the label column
amazon_fv = fs.get_or_create_feature_view(
    name="amazon_fv",
    version=1,
    query=selected_features,
    labels=["close"],
)

### Training Dataset Creation


In [71]:
# Read the whole feature group into `df`; it is used below to work out the
# date boundaries of the train/validation/test splits
df = amazon_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.26s) 


In [72]:
# Order the rows by "date" and use it as the index.
# The guard keeps this cell idempotent: after the first run "date" is the
# index rather than a column, so an unguarded re-run would raise KeyError.
if "date" in df.columns:
    df = df.sort_values("date").set_index("date")

df.head()

Unnamed: 0_level_0,close,high,low,open,id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02 15:00:00+00:00,1515.03,1518.0,1479.59,1486.18,2019-01-02 15:00:00
2019-01-02 16:00:00+00:00,1513.5,1529.825,1513.18,1516.4,2019-01-02 16:00:00
2019-01-02 17:00:00+00:00,1533.5,1533.5,1512.93,1513.01,2019-01-02 17:00:00
2019-01-02 18:00:00+00:00,1549.0,1553.09,1533.93,1533.93,2019-01-02 18:00:00
2019-01-02 19:00:00+00:00,1543.23,1552.66,1539.15,1547.68,2019-01-02 19:00:00


In [73]:
def get_fractional_dates(column: pd.Series):
    """Derive 70/15/15 split boundary dates from the index of ``column``.

    Boundaries are chosen by row position, not by calendar arithmetic: the
    validation split starts at row ``int(0.7 * n)`` and the test split at row
    ``int(0.85 * n)``, and each preceding split ends on the row just before.
    When the index holds several rows per day, an end date can therefore be
    the same calendar date as the following start date.

    Args:
        column: A pandas object whose index labels render as
            ``"<date> <time>"`` (or just ``"<date>"``) under ``str()``.
            Presumably the index is already sorted chronologically; the
            function does not sort it.

    Returns:
        A 6-tuple of date strings
        ``(train_start, train_end, val_start, val_end, test_start, test_end)``,
        each being the text before the first space of the index label.

    Raises:
        IndexError: If ``column`` is empty.
    """
    n_rows = len(column)
    val_pos = int(0.7 * n_rows)    # first row of the validation split
    test_pos = int(0.85 * n_rows)  # first row of the test split

    def _date_at(position):
        # Keep only the part of the label before the first space
        label = str(column.index[position])
        return label.split(" ")[0]

    # Row positions in the order the results are returned
    positions = (0, val_pos - 1, val_pos, test_pos - 1, test_pos, n_rows - 1)
    return tuple(_date_at(position) for position in positions)

In [74]:
# Compute the split boundaries from `df`, the date-indexed frame built above.
# The previous argument, `features`, is not defined anywhere in this notebook
# and raised NameError on a fresh kernel.
train_start, train_end, val_start, val_end, test_start, test_end = get_fractional_dates(df)
train_start, train_end, val_start, val_end, test_start, test_end

('2019-01-02',
 '2023-04-18',
 '2023-04-19',
 '2024-03-19',
 '2024-03-20',
 '2025-02-19')

#### Split the data into train, val and test splits

In [75]:
# Create time-based train/validation/test splits from the feature view.
# The validation bounds are passed as `validation_start`/`validation_end`,
# the parameter names given in the Hopsworks API reference; the shorter
# `val_start`/`val_end` spellings are not parameters of this method.
X_train, X_val, X_test, y_train, y_val, y_test = amazon_fv.train_validation_test_split(
    train_start=train_start,
    train_end=train_end,
    validation_start=val_start,
    validation_end=val_end,
    test_start=test_start,
    test_end=test_end,
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.72s) 

2025-02-20 22:56:51,089 INFO: Provenance cached data - overwriting last accessed/created training dataset from 6 to 7.


### Modelling