# LightGBM Prediction Model

In this notebook, we train a LightGBM model to predict realized volatility, using the features generated in `feature_eng_notebook`.

In [2]:
# libraries and settings
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import optuna
import os 
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Load the intermediate training table, discard rows containing any missing
# value, and add a categorical `stock_id` column taken from the integer that
# precedes the first '-' in `row_id`.
df_train = (
    pd.read_csv('../data/intermediate/df_train.csv')
    .dropna()
    .assign(
        stock_id=lambda frame: frame['row_id']
        .apply(lambda row_id: int(row_id.split('-')[0]))
        .astype('category')
    )
)

## Training and Fine-Tuning the LGBM model

In [7]:
# Single-slot holder initialised with no model.
# NOTE(review): nothing in the visible code reads or updates this after
# initialisation - confirm whether it is still needed.
best_model_container = dict(model=None)

# --------------------------
# RMSPE loss (gradient + hessian)
# --------------------------
def rmspe_objective(y_pred, dataset):
    """Custom objective for the squared percentage error.

    The per-sample loss is ``((y_true - y_pred) / y_true) ** 2``. This returns
    its first and second derivatives with respect to ``y_pred``.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Current predictions, one per sample.
    dataset : object
        Anything exposing ``get_label()`` that returns the true targets as an
        array aligned with ``y_pred``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(grad, hess)``: per-sample gradient and hessian of the loss.
    """
    y_true = dataset.get_label()

    # A label of (almost) zero would otherwise produce inf/NaN derivatives.
    # Only such labels are clamped; every other label is used unchanged.
    eps = 1e-12
    denom = np.where(np.abs(y_true) < eps, eps, y_true)

    residual = (y_true - y_pred) / denom

    # d/dy_pred of residual**2 is -2 * residual / denom; the second derivative
    # is the constant 2 / denom**2.
    grad = -2.0 * residual / denom
    hess = 2.0 / denom ** 2

    return grad, hess

# --------------------------
# RMSPE evaluation function
# --------------------------
def rmspe_eval(y_pred, dataset):
    """Root mean squared percentage error, in LightGBM's ``feval`` return format.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predictions, one per sample.
    dataset : object
        Anything exposing ``get_label()`` that returns the true targets as an
        array aligned with ``y_pred``.

    Returns
    -------
    tuple
        ``('RMSPE', score, False)``. The trailing ``False`` is LightGBM's
        "is higher better" flag, i.e. lower scores are better.
    """
    y_true = dataset.get_label()

    # Clamp (almost) zero labels so the metric stays finite; with a raw zero
    # denominator a single such row turns the whole score into inf or NaN.
    eps = 1e-12
    denom = np.where(np.abs(y_true) < eps, eps, y_true)

    pct_error = (y_true - y_pred) / denom
    score = np.sqrt(np.mean(pct_error ** 2))
    return 'RMSPE', score, False


def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation RMSPE.

    Samples a hyperparameter set from ``trial``, splits ``df_train`` 80/20 with a
    fixed seed, trains with the custom ``rmspe_objective`` while monitoring
    ``rmspe_eval`` on the held-out part, and scores the resulting model on that
    same held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to store the trained model.

    Returns
    -------
    float
        RMSPE of the trained model on the validation split (lower is better).
    """
    params = {
        'objective': rmspe_objective,
        # Built-in metrics are disabled; validation uses `rmspe_eval` via `feval`.
        'metric': 'none',
        'verbosity': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'num_boost_round': trial.suggest_int('num_boost_round', 600, 1000),
        'early_stopping_rounds': 50
    }

    # Every column except the target and the row identifier is used as a feature.
    X = df_train.drop(columns=['target', 'row_id'])
    y = df_train['target']

    # Fixed seed: every trial is trained and scored on the same split.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    categorical_features = ['stock_id']

    lgb_train = lgb.Dataset(
        X_train,
        y_train,
        categorical_feature=categorical_features
    )

    lgb_val = lgb.Dataset(
        X_val,
        y_val,
        categorical_feature=categorical_features,
        reference=lgb_train
    )

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_val],
        feval=rmspe_eval
    )

    y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

    # Validation RMSPE. (Almost) zero targets are clamped so that a single such
    # row cannot turn the trial value into inf/NaN; other targets are unchanged.
    y_val_true = y_val.to_numpy()
    eps = 1e-12
    denom = np.where(np.abs(y_val_true) < eps, eps, y_val_true)
    pct_error = (y_val_true - y_val_pred) / denom
    rmspe_score = float(np.sqrt(np.mean(pct_error ** 2)))

    # Keep the trained booster on the trial so the best one can be fetched from
    # `study.best_trial.user_attrs["model"]` afterwards. Note that every trial
    # therefore keeps its model in memory for the lifetime of the study.
    trial.set_user_attr("model", model)

    return rmspe_score


# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameters (the train/validation split inside `objective` is already seeded).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best RMSPE:", study.best_value)
print("Best Parameters:")
print(study.best_params)


# Trained booster that `objective` attached to the best-scoring trial.
best_model = study.best_trial.user_attrs["model"]

[I 2025-07-05 23:57:23,463] A new study created in memory with name: no-name-7e70d250-1633-4db1-a2d1-5d03b29ab5d7
[I 2025-07-05 23:58:02,146] Trial 0 finished with value: 0.23631128656606631 and parameters: {'learning_rate': 0.009135168854849492, 'num_leaves': 62, 'max_depth': 6, 'min_data_in_leaf': 10, 'feature_fraction': 0.8995861493609176, 'bagging_fraction': 0.7450178728664922, 'bagging_freq': 5, 'lambda_l1': 2.828367901987114, 'lambda_l2': 1.1963377793696401, 'num_boost_round': 998}. Best is trial 0 with value: 0.23631128656606631.
[I 2025-07-05 23:58:13,582] Trial 1 finished with value: 0.23108472686151604 and parameters: {'learning_rate': 0.08492343017049693, 'num_leaves': 39, 'max_depth': 6, 'min_data_in_leaf': 57, 'feature_fraction': 0.7009842917838829, 'bagging_fraction': 0.6092944117851753, 'bagging_freq': 6, 'lambda_l1': 2.7424725459982504, 'lambda_l2': 0.7728249490311628, 'num_boost_round': 808}. Best is trial 1 with value: 0.23108472686151604.
[I 2025-07-05 23:58:37,491] 

Best RMSPE: 0.2306230863164971
Best Parameters:
{'learning_rate': 0.08591330360774954, 'num_leaves': 73, 'max_depth': 7, 'min_data_in_leaf': 71, 'feature_fraction': 0.8947105935504415, 'bagging_fraction': 0.6032728220618001, 'bagging_freq': 4, 'lambda_l1': 4.582124444115889, 'lambda_l2': 2.6503402021640348, 'num_boost_round': 815}


In [27]:
# Make sure the target directory exists; otherwise the save fails on a fresh checkout.
os.makedirs('../models', exist_ok=True)
best_model.save_model('../models/lgbm_model.txt')

<lightgbm.basic.Booster at 0x2320d803490>