# Training Pipeline

In this notebook, we will do the following tasks

1. Get the data from the feature store
2. Preprocess the data
3. Train the model
4. Evaluate the model
5. Register the model to model registry


In [1]:
# Import the required libraries
import os
import hopsworks
import numpy as np
import pandas as pd

from dotenv import load_dotenv
from xgboost import XGBRegressor

# Load the .env file 
load_dotenv()

# Get the envrioment variables
hopsworks_api_key = os.getenv("HOPSWORKS_API_KEY")
 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# login hopsworks and get the features group
project = hopsworks.login(api_key_value=str(hopsworks_api_key))
fs = project.get_feature_store()

2025-03-07 18:52:59,251 INFO: Initializing external client
2025-03-07 18:52:59,251 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-07 18:53:03,406 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1212597


## Feature View


### Feature Selection


In [3]:
amazon_fg = fs.get_feature_group("amazon_stock_prices", version=1)

In [4]:
# Select features for training data
selected_features = amazon_fg.select(["datetime", "open", "high", "close", "low", "volume", "rsi", "cci"])

# View the first 5 rows of selected features
selected_features.show(5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.34s) 


Unnamed: 0,datetime,open,high,close,low,volume,rsi,cci
0,2023-05-05 18:30:00+00:00,105.309998,105.75,105.489998,105.300003,6771547,58.103335,131.610247
1,2024-12-10 20:30:00+00:00,224.509995,225.419907,225.149994,224.200195,3000920,53.145605,-89.708144
2,2023-11-22 20:30:00+00:00,146.729996,146.860001,146.720001,146.389999,4478865,60.736397,59.546794
3,2024-10-10 14:30:00+00:00,186.720001,186.850006,186.211304,185.830002,2915806,62.999684,122.281284
4,2023-11-22 17:30:00+00:00,147.007996,147.195007,147.020004,146.782501,3321822,62.992331,94.186858


### Feature View Creation


In [5]:
# Get or create feature view
amazon_fv = fs.get_or_create_feature_view(
    name= "amazon_fv", 
    version=1,
    query = selected_features,
)

## Training Dataset Creation


In [6]:
# Get the data to calculate date ranges
df = amazon_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.56s) 


In [7]:
# Sort the values according to the date
df = df.sort_values("datetime").set_index("datetime")

df.head()

Unnamed: 0_level_0,close,high,low,open,volume,id,rsi,cci
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-03-08 19:30:00+00:00,93.464996,93.650002,93.220001,93.485001,4095763,2023-03-08 19:30:00,40.642694,-57.933234
2023-03-08 20:30:00+00:00,93.919998,94.029999,93.404999,93.464996,5037603,2023-03-08 20:30:00,46.62374,-25.644676
2023-03-09 14:30:00+00:00,95.720001,95.800003,93.559998,93.68,12940669,2023-03-09 14:30:00,62.655323,99.133179
2023-03-09 15:30:00+00:00,95.180901,96.160004,95.139999,95.720001,7181789,2023-03-09 15:30:00,57.121687,162.29182
2023-03-09 16:30:00+00:00,95.366699,95.824997,95.099998,95.190002,5973566,2023-03-09 16:30:00,58.48263,159.562871


In [8]:
def get_fractional_dates(column: pd.Series):
    total_length = len(column)
    
    idx_0 = 0  # First index
    idx_70 = int(0.7 * total_length)
    idx_85 = int(0.85 * total_length)
    idx_last = total_length - 1  # Last index

    def extract_date(idx):
        return str(column.index[idx]).split(" ")[0]  # Convert to string and get date part

    train_start = extract_date(idx_0)
    val_start = extract_date(idx_70)
    test_start = extract_date(idx_85)

    train_end = extract_date(idx_70 - 1)  # One day before val_start
    val_end = extract_date(idx_85 - 1)  # One day before test_start
    test_end = extract_date(idx_last)  # Last available date

    return train_start, train_end, val_start, val_end, test_start, test_end

In [9]:
train_start, train_end, val_start, val_end, test_start, test_end = get_fractional_dates(df)
train_start, train_end, val_start, val_end, test_start, test_end

('2023-03-08',
 '2024-07-29',
 '2024-07-29',
 '2024-11-12',
 '2024-11-12',
 '2025-03-05')

#### Split the data into train, val and test splits


In [10]:
train, val, test, _, _, _ =  amazon_fv.train_validation_test_split(
    train_start=train_start,
    train_end = train_end,
    val_start = val_start,
    val_end = val_end,
    test_start=test_start,
    test_end = test_end    
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.84s) 




In [11]:
# Drop the datetime column
train.drop("datetime", axis=1, inplace=True)
val.drop("datetime", axis=1, inplace=True)
test.drop("datetime", axis=1, inplace=True)

In [15]:
def generate_sequence(data: pd.DataFrame, window_size:int = 24, forecast_steps: int = 7):

    X = []
    y = []

    for i in range(len(data) - window_size - forecast_steps):
        X.append(data.iloc[i:i+window_size])
        y.append(data.iloc[i+window_size:i+window_size+forecast_steps, 2])

    return np.array(X), np.array(y)

In [None]:
# Generate sequences
X_train, y_train = generate_sequence(train, window_size=24, forecast_steps=7)

# check the shapes of the generated sequences
print(np.allclose(X_train[0], train.iloc[:24].values))
print(np.allclose(y_train[0], train['close'].iloc[24:24+7]))

True
True


In [23]:
X_val, y_val = generate_sequence(val, window_size=24, forecast_steps=7)

# check the shapes of the generated sequences
print(np.allclose(X_val[0], val.iloc[:24]))
print(np.allclose(y_val[0], val['close'].iloc[24: 24+7]))

True
True


In [None]:
X_test, y_test = generate_sequence(test, window_size=28, forecast_steps=7)

# check the shapes of the generated sequences
print(np.allclose(X_test[0], test.iloc[:24]))
print(np.allclose(y_test[0], test['close'].iloc[24: 24+7]))

True
True


In [26]:
# Print thes shapes of training validation and test sets
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((2395, 24, 7), (2395, 7), (494, 24, 7), (494, 7), (486, 24, 7), (486, 7))

## Modelling


In [28]:
from sklearn.metrics import root_mean_squared_error

# Reshape the data
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
X_val_reshaped = X_val.reshape(X_val.shape[0], -1)

# Initialize and train the model
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
model.fit(X_train_reshaped, y_train)

# Make predictions on the validation set
y_pred_val = model.predict(X_val_reshaped)

# Calculate the validation RMSE
val_rmse = root_mean_squared_error(y_val, y_pred_val)
print(f'Validation RMSE: {val_rmse}')

Validation RMSE: 35.78784679520415


In [30]:
# Save the metrics to a dict

metrics = {
    'rmse': val_rmse
}


In [29]:
# Make predictions on the test set

forecast = model.predict(X_test[-1].reshape(1, -1))
forecast

array([[150.06346, 166.70276, 147.3533 , 143.6741 , 125.94607, 142.8028 ,
        167.77267]], dtype=float32)

## Register the model to model registry

In [None]:
import joblib
# Save the model to the disk
model_dir = "../models/xgboost_model"
model_path = os.path.join(model_dir, "xgboost_model.pkl")


if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    
joblib.dump(model, model_path)

['../models/xgboost_model\\xgboost_model.pkl']

In [33]:
# get the model registry
mr = project.get_model_registry()


In [None]:

# Create a model in the model registry
model = mr.python.create_model(
    name="amazon_stock_price_prediction_model_xgboost",
    description = "XGBoost model for predicting Amazon stock prices",
    input_example = X_train[0],
    feature_view = amazon_fv
)

# Save model to the model registry
model.save(model_dir)

Uploading: 100.000%|██████████| 2412107/2412107 elapsed<00:04 remaining<00:001.30s/it]
Uploading: 100.000%|██████████| 3079/3079 elapsed<00:02 remaining<00:000:13,  3.35s/it]
Model export complete: 100%|██████████| 6/6 [00:14<00:00,  2.46s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1212597/models/amazon_stock_price_prediction_model_xgboost/1





Model(name: 'amazon_stock_price_prediction_model_xgboost', version: 1)