# LightGBM Prediction Model

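This notebook trains a LightGBM model to predict realized volatility, using the features generated in `feature_eng_notebook`. Hyperparameters are tuned with Optuna, and every candidate model is scored by RMSPE on a hold-out made of the rows in the last 20% of the `time_id_ordered` range.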

In [1]:
# libraries and settings
import warnings
from pathlib import Path

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
# NOTE(review): with no category/module filter this silences every Python warning for the
# rest of the session, including NumPy RuntimeWarnings such as divide-by-zero, which the
# RMSPE code below could otherwise surface. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the engineered features, the raw targets and the time_id ordering table.
df_train = pd.read_csv('../data/df_train.csv')
train = pd.read_csv('../data/train.csv')
time_order_df = pd.read_csv('../data/time_order.csv')

# Build the modelling frame in a single chain:
#   1. attach the target (index-aligned, so both CSVs are assumed to share the same row
#      order - TODO confirm against the feature-engineering notebook),
#   2. drop every row that contains a missing value,
#   3. split `row_id` ("<stock_id>-<time_id>") into its two integer parts,
#   4. join the ordering table on `time_id` and sort by `time_id_ordered`.
df_train = (
    df_train
    .assign(target=train['target'])
    .dropna()
    .assign(
        stock_id=lambda frame: frame['row_id'].apply(lambda rid: int(rid.split('-')[0])).astype('category'),
        time_id=lambda frame: frame['row_id'].apply(lambda rid: int(rid.split('-')[1])),
    )
    .merge(time_order_df, on='time_id')
    .sort_values('time_id_ordered')
)

## Training and Hyperparameter Tuning of the LightGBM Model

In [3]:
# Holder for a trained model. It is not referenced again in the visible cells (the best
# model is read back from the study's user attributes instead).
# NOTE(review): remove if it is unused elsewhere.
best_model_container = {'model': None}

# --------------------------
# RMSPE loss (gradient + hessian)
# --------------------------
def rmspe_objective(y_pred, dataset):
    """Custom LightGBM objective for the squared percentage error ``((y - p) / y) ** 2``.

    Parameters
    ----------
    y_pred : array-like
        Current predictions for every row of ``dataset``.
    dataset : object exposing ``get_label()``
        Training data; only its labels are read.

    Returns
    -------
    tuple
        ``(grad, hess)``: the first and second derivative of the per-row loss with
        respect to the prediction, i.e. ``-2 * (y - p) / y**2`` and ``2 / y**2``.

    Notes
    -----
    ``eps`` is 0, so it offers no protection: a label equal to zero results in a
    division by zero.
    """
    eps = 0
    labels = dataset.get_label()
    denom = labels + eps

    relative_error = (labels - y_pred) / denom
    gradient = -2.0 * relative_error / denom
    hessian = 2.0 / denom ** 2

    return gradient, hessian

# --------------------------
# RMSPE evaluation function
# --------------------------
def rmspe_eval(y_pred, dataset):
    """Custom LightGBM evaluation metric: root mean squared percentage error.

    Parameters
    ----------
    y_pred : array-like
        Predictions for every row of ``dataset``.
    dataset : object exposing ``get_label()``
        Evaluation data; only its labels are read.

    Returns
    -------
    tuple
        ``('RMSPE', value, False)``; the trailing ``False`` marks the metric as
        lower-is-better.

    Notes
    -----
    ``eps`` is 0, so a label equal to zero results in a division by zero.
    """
    eps = 0
    labels = dataset.get_label()
    relative_error = (labels - y_pred) / (labels + eps)
    rmspe_value = np.sqrt(np.square(relative_error).mean())
    return 'RMSPE', rmspe_value, False


def _chronological_split(frame, train_fraction=0.8):
    """Split ``frame`` into (train, validation) parts by its ``time_id_ordered`` column.

    Rows whose ``time_id_ordered`` is at most ``train_fraction`` times the largest value
    go to the training part; rows strictly above that cutoff form the validation part.
    """
    ordered_time = frame['time_id_ordered']
    cutoff = ordered_time.max() * train_fraction
    return frame[ordered_time <= cutoff], frame[ordered_time > cutoff]


def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation RMSPE.

    Samples a hyper-parameter set from ``trial``, trains on the earlier part of the
    module-level ``df_train`` and scores on the later part (see
    ``_chronological_split``). Training uses the custom ``rmspe_objective`` loss and
    ``rmspe_eval`` metric.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to attach the trained model.

    Returns
    -------
    float
        RMSPE of the model on the validation part, predicted at the model's
        ``best_iteration`` (lower is better).

    Side effects
    ------------
    Stores the trained booster on the trial under the user attribute ``"model"``.
    """
    params = {
        'objective': rmspe_objective,
        'metric': 'none',  # built-in metrics off; only the custom feval is reported
        'verbosity': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'num_boost_round': trial.suggest_int('num_boost_round', 600, 1000),
        'early_stopping_rounds': 50
    }

    # The shared frame is deliberately NOT re-sorted here: the split is a boolean mask on
    # `time_id_ordered`, so it does not depend on row order, and sorting in place on every
    # trial would mutate module-level state from inside the objective.
    train_part, val_part = _chronological_split(df_train)

    # Identifier, ordering and label columns are not model inputs.
    non_feature_columns = ['target', 'row_id', 'time_id', 'time_id_ordered']
    X_train = train_part.drop(columns=non_feature_columns)
    X_val = val_part.drop(columns=non_feature_columns)
    y_train = train_part['target']
    y_val = val_part['target']

    categorical_features = ['stock_id']

    lgb_train = lgb.Dataset(
        X_train,
        y_train,
        categorical_feature=categorical_features
    )

    lgb_val = lgb.Dataset(
        X_val,
        y_val,
        categorical_feature=categorical_features,
        reference=lgb_train
    )

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_val],
        feval=rmspe_eval
    )

    # Score the validation part at the best iteration with the same RMSPE definition
    # used by `rmspe_eval`.
    y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    pct_error = (y_val - y_val_pred) / y_val
    rmspe_score = np.sqrt(np.mean(pct_error ** 2))

    # The caller reads the winning booster back from this attribute.
    # NOTE(review): this keeps one booster per trial alive and presumably only works with
    # the in-memory study storage used in this notebook - confirm before switching storage.
    trial.set_user_attr("model", model)

    return rmspe_score


# Seed the sampler so that the sequence of suggested hyper-parameters is repeatable
# across "Restart Kernel -> Run All"; an unseeded sampler explores a different set of
# configurations on every run.
OPTUNA_SEED = 42

study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
study.optimize(objective, n_trials=50)

print("Best RMSPE:", study.best_value)
print("Best Parameters:")
print(study.best_params)

# Booster trained during the winning trial (attached by `objective` as a user attribute).
best_model = study.best_trial.user_attrs["model"]

[I 2025-08-12 19:34:14,024] A new study created in memory with name: no-name-a93f5c17-38ab-4432-813f-9bb55699d5e3
[I 2025-08-12 19:34:31,002] Trial 0 finished with value: 0.25632836334893205 and parameters: {'learning_rate': 0.01743776849492105, 'num_leaves': 32, 'max_depth': 4, 'min_data_in_leaf': 47, 'feature_fraction': 0.7337152178219352, 'bagging_fraction': 0.6576178480880025, 'bagging_freq': 10, 'lambda_l1': 2.0556658714205134, 'lambda_l2': 1.4233039209478338, 'num_boost_round': 653}. Best is trial 0 with value: 0.25632836334893205.
[I 2025-08-12 19:34:48,874] Trial 1 finished with value: 0.251305013115712 and parameters: {'learning_rate': 0.0935184996422937, 'num_leaves': 53, 'max_depth': 3, 'min_data_in_leaf': 47, 'feature_fraction': 0.9785295436573411, 'bagging_fraction': 0.6922022192648933, 'bagging_freq': 6, 'lambda_l1': 1.812135464305611, 'lambda_l2': 1.6373945791212818, 'num_boost_round': 957}. Best is trial 1 with value: 0.251305013115712.
[I 2025-08-12 19:35:05,978] Trial

Best RMSPE: 0.23939975288290602
Best Parameters:
{'learning_rate': 0.09778061003075043, 'num_leaves': 74, 'max_depth': 7, 'min_data_in_leaf': 16, 'feature_fraction': 0.8901027340775708, 'bagging_fraction': 0.7692196875273071, 'bagging_freq': 6, 'lambda_l1': 4.727082704441052, 'lambda_l2': 4.43290536399525, 'num_boost_round': 704}


In [4]:
# Persist the best booster in LightGBM's text format. The target folder is created first
# so that a fresh checkout without ../models does not fail at the very last step of a
# long tuning run.
model_path = Path('../models/lgbm_model.txt')
model_path.parent.mkdir(parents=True, exist_ok=True)
best_model.save_model(str(model_path))

<lightgbm.basic.Booster at 0x1fce9f59c30>