# Weighted Regression Baseline

The idea of this notebook is to show that a simple weight help optimise the RMSPE as discussed here : https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/250324

I got the idea to work on individual stock regression and the importance of taking the custom evaluation into account in my EDA Notebook here : https://www.kaggle.com/lucasmorin/target-error-exploration-stock-time-clustering


In [1]:
import os
from sklearn.metrics import r2_score
import glob
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data_dir = '/content/drive/MyDrive/Colab Notebooks/RBS DL 2025/PRO/data'

In [4]:
train = pd.read_csv(f'{data_dir}/train.csv')
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [5]:
book_example = pd.read_parquet(f'{data_dir}/book_train.parquet/stock_id=0')
trade_example =  pd.read_parquet(f'{data_dir}/trade_train.parquet/stock_id=0')
stock_id = '0'
book_example = book_example[book_example['time_id']==5]
book_example.loc[:,'stock_id'] = stock_id
trade_example = trade_example[trade_example['time_id']==5]
trade_example.loc[:,'stock_id'] = stock_id

In [6]:
book_example['wap'] = (book_example['bid_price1'] * book_example['ask_size1'] +
                                book_example['ask_price1'] * book_example['bid_size1']) / (
                                       book_example['bid_size1']+ book_example['ask_size1'])

In [7]:
fig = px.line(book_example, x="seconds_in_bucket", y="wap", title='WAP of stock_id_0, time_id_5')
fig.show()

In [8]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff()

In [9]:
book_example.loc[:,'log_return'] = log_return(book_example['wap'])
book_example = book_example[~book_example['log_return'].isnull()]

In [10]:
fig = px.line(book_example, x="seconds_in_bucket", y="log_return", title='Log return of stock_id_0, time_id_5')
fig.show()

In [11]:
def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))
realized_vol = realized_volatility(book_example['log_return'])
print(f'Realized volatility for stock_id 0 on time_id 5 is {realized_vol}')

Realized volatility for stock_id 0 on time_id 5 is 0.004499364172786558


In [12]:
list_order_book_file_train = glob.glob(f'{data_dir}/book_train.parquet/*')

In [13]:
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)

model_dict = {}

def realized_volatility_per_time_id_linear(file_path, prediction_column_name, train_test = True):
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] =(df_book_data['bid_price1'] * df_book_data['ask_size1']+df_book_data['ask_price1'] * df_book_data['bid_size1'])  / (
                                      df_book_data['bid_size1']+ df_book_data[
                                  'ask_size1'])
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].apply(log_return)
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock =  pd.DataFrame(df_book_data.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()
    df_realized_vol_per_stock = df_realized_vol_per_stock.rename(columns = {'log_return':prediction_column_name})
    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x:f'{stock_id}-{x}')

    poly = PolynomialFeatures(degree=3)

    # if train_test:

    #     df_realized_vol_per_stock_joined = train.merge(df_realized_vol_per_stock[['row_id',prediction_column_name]], on = ['row_id'], how = 'right')

    #     weights = 1/np.square(df_realized_vol_per_stock_joined.target)

    #     X = np.array(df_realized_vol_per_stock_joined[[prediction_column_name]]).reshape(-1, 1)
    #     X_ = poly.fit_transform(X)
    #     y = df_realized_vol_per_stock_joined.target


    #     reg = LinearRegression().fit(X_, y, sample_weight = weights)
    #     #df_realized_vol_per_stock[[prediction_column_name]] = reg.predict(X_)
    #     df_realized_vol_per_stock_joined = df_realized_vol_per_stock_joined.reset_index(drop=True)
    #     df_realized_vol_per_stock_joined[prediction_column_name] = pd.Series(reg.predict(X_), dtype=np.float32)

    #     model_dict[stock_id] = reg
    if train_test:
        df_realized_vol_per_stock_joined = train.merge(
            df_realized_vol_per_stock[['row_id', prediction_column_name]],
            on='row_id', how='right'
        )

        weights = 1 / np.square(df_realized_vol_per_stock_joined.target)

        X = np.array(df_realized_vol_per_stock_joined[[prediction_column_name]]).reshape(-1, 1)
        X_ = poly.fit_transform(X)
        y = df_realized_vol_per_stock_joined.target

        reg = LinearRegression().fit(X_, y, sample_weight=weights)

        # ✅ FIX — assign raw NumPy values
        df_realized_vol_per_stock_joined[prediction_column_name] = reg.predict(X_).astype(np.float32)

        model_dict[stock_id] = reg

        return df_realized_vol_per_stock_joined[['row_id', prediction_column_name]]

    else:

        reg = model_dict[stock_id]

        X = np.array(df_realized_vol_per_stock[[prediction_column_name]]).reshape(-1, 1)
        X_ = poly.fit_transform(X)
        df_realized_vol_per_stock[[prediction_column_name]] = reg.predict(X_)

    #return df_realized_vol_per_stock[['row_id',prediction_column_name]]
    return df_realized_vol_per_stock_joined[['row_id', prediction_column_name]]


In [14]:
print("Shape of df_realized_vol_per_stock:", df_realized_vol_per_stock.shape)
print("Length of reg.predict(X_):", len(reg.predict(X_)))
print("Index of df_realized_vol_per_stock:", df_realized_vol_per_stock.index)

NameError: name 'df_realized_vol_per_stock' is not defined

In [15]:
def past_realized_volatility_per_stock_linear(list_file,prediction_column_name, train_test = True):
    df_past_realized = pd.DataFrame()
    for file in list_file:
        df_past_realized = pd.concat([df_past_realized,
                                     realized_volatility_per_time_id_linear(file,prediction_column_name,train_test)])
    return df_past_realized

df_past_realized_train = past_realized_volatility_per_stock_linear(list_file=list_order_book_file_train,prediction_column_name='pred')

TypeError: incompatible index of inserted column with frame index

In [None]:
train = train[['row_id','target']]
df_joined = train.merge(df_past_realized_train[['row_id','pred']], on = ['row_id'], how = 'left')

In [None]:
from sklearn.metrics import r2_score
def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
print(f'Performance of the naive prediction: R2 score: {R2}, RMSPE: {RMSPE}')

Submission

In [None]:
list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')
df_naive_pred_test = df_past_realized_train = past_realized_volatility_per_stock_linear(list_file=list_order_book_file_test,
                                                           prediction_column_name='target', train_test = False)
df_naive_pred_test.to_csv('submission.csv',index = False)