In [4]:
#Importing the libraries needed
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Load training and test data
train_data = pd.read_csv('/kaggle/input/taramani-quant-research-contest-tqrc/train_data.csv')
test_data = pd.read_csv('/kaggle/input/taramani-quant-research-contest-tqrc/final_test_data.csv')

# Features for Limit Order Book Alpha Prediction

We list the features that we have to predict the alpha from the Limit Order Book (LOB).

## 1. Price Features
- **Mid-price**:  
  $$ \text{midprice}_i = \frac{\text{bid_price}_i + \text{ask_price}_i}{2} $$
  
- **Weighted mid-price**:  
  $$ \text{weighted_mid_price} = \frac{\text{bid_price}_1 \cdot \text{ask_volume}_1 + \text{ask_price}_1 \cdot \text{bid_volume}_1}{\text{bid_volume}_1 + \text{ask_volume}_1} $$

## 2. Spread Features
- **Bid-ask spread**:  
  $$ \text{spread}_i = \text{ask_price}_i - \text{bid_price}_i $$

- **Spread skew**:  
  $$ \text{spread_skew} = \frac{\text{spread}_1}{\text{spread}_5} $$

## 3. Volume Imbalance Features
- **Volume imbalance**:  
  $$ \text{volume_imbalance}_i = \frac{\text{bid_volume}_i - \text{ask_volume}_i}{\text{bid_volume}_i + \text{ask_volume}_i} $$

- **Bid-ask imbalance**:  
  Ratio of total bid volume to total ask volume across all levels.

## 4. Order Flow Features
- **Order flow imbalance**:  
  Measures the imbalance in price changes between bid and ask sides.

## 5. Market Depth Features
- **Total depth**:  
  Sum of volumes across all levels on both sides.
  
- **Market depth ratio**:  
  Ratio of total bid volume to total ask volume.

## 6. Price Movement and Momentum Features
- **Price momentum**:  
  $$ \text{price_momentum} = \frac{\text{last_trade_price} - \text{midprice}_1}{\text{midprice}_1} $$

- **Mid-price slope**:  
  Rate of change of mid-price across levels.

## 7. Liquidity Features
- **Liquidity impact**:  
  Ratio of recent order counts to total order count.

- **Liquidity weight**:  
  Ratio of level 1 volume to level 2 volume.

## 8. Market Pressure Features
- **Market pressure**:  
  $$ \text{market_pressure} = \frac{\text{recent_buy_orders} - \text{recent_sell_orders}}{\text{total_orders}} $$

- **Order book pressure**:  
  Ratio of total bid volume to total ask volume.

## 9. Price Sensitivity Features
- **Bid/Ask price sensitivity**:  
  Rate of price change relative to volume change.

## 10. VWAP Features
- **Volume Weighted Average Price (VWAP)**:  
  $$ \text{VWAP}_i = \frac{\text{bid_price}_i \cdot \text{bid_volume}_i + \text{ask_price}_i \cdot \text{ask_volume}_i}{\text{bid_volume}_i + \text{ask_volume}_i} $$

## 11. Relative Volume Features
- **Relative volume at best bid/ask**:  
  Ratio of volume at best bid/ask to total volume on that side.

## 12. Statistical Features
- **Mean and cumulative sum of prices and volumes across levels.**
- **Volume momentum**:  
  Rate of change of volume across levels.

## 13. Market Skew and Depth Imbalance Features
- **Market skew**:  
  Measure of asymmetry in volume distribution.

- **Depth imbalance**:  
  Various ratios comparing bid and ask volumes at different levels.

These features capture different aspects of the limit order book dynamics, including price trends, volume imbalances, liquidity, and market pressure.


In [5]:
# Feature Engineering
def create_features(df):
    # Mid-Price
    for i in range(1, 6):
        df[f'midprice_{i}'] = (df[f'bid_price_{i}'] + df[f'ask_price_{i}']) / 2
    
    df['weighted_mid_price_1'] = (df['bid_price_1'] * df['ask_volume_1'] + df['ask_price_1'] * df['bid_volume_1']) / (df['bid_volume_1'] + df['ask_volume_1'])

    # Price Spread, Weighted Spread, Normalized Spread, and Volume Imbalance
    for i in range(1, 6):
        df[f'spread_{i}'] = df[f'ask_price_{i}'] - df[f'bid_price_{i}']
        df[f'weighted_spread_{i}'] = df[f'ask_volume_{i}'] * df[f'ask_price_{i}'] - df[f'bid_price_{i}'] * df[f'bid_volume_{i}']
        df[f'norm_spread_{i}'] = df[f'spread_{i}'] / df[f'midprice_{i}']
        df[f'volume_imbalance_{i}'] = (df[f'bid_volume_{i}'] - df[f'ask_volume_{i}']) / (df[f'bid_volume_{i}'] + df[f'ask_volume_{i}'])


    df['bid_ask_imbalance'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) / (df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1) + df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1))

    # Order Flow (difference in price levels over time)
    df['order_flow'] = (df['bid_price_1'] - df['ask_price_1']) + (df['bid_volume_1'] - df['ask_volume_1'])

    # Depth (Total Volume at Top 5 Levels)
    df['depth'] = (df[['bid_volume_1', 'bid_volume_2', 'bid_volume_3', 'bid_volume_4', 'bid_volume_5']].sum(axis=1) +
                   df[['ask_volume_1', 'ask_volume_2', 'ask_volume_3', 'ask_volume_4', 'ask_volume_5']].sum(axis=1))


   # Basic price features
    df['midprice_1'] = (df['bid_price_1'] + df['ask_price_1']) / 2
    df['midprice_5'] = (df['bid_price_5'] + df['ask_price_5']) / 2
    df['weighted_mid_price_1'] = (df['bid_price_1'] * df['ask_volume_1'] + df['ask_price_1'] * df['bid_volume_1']) / (df['bid_volume_1'] + df['ask_volume_1'])

    # Spread features
    df['spread_1'] = df['ask_price_1'] - df['bid_price_1']
    df['spread_5'] = df['ask_price_5'] - df['bid_price_5']
    df['spread_skew'] = df['spread_1'] / df['spread_5']
    df['spread_depth_ratio_1'] = df['spread_1'] / (df['bid_volume_1'] + df['ask_volume_1'] + 1e-9)

    # Volume imbalance features
    for i in range(1, 6):
        df[f'volume_imbalance_{i}'] = (df[f'bid_volume_{i}'] - df[f'ask_volume_{i}']) / (df[f'bid_volume_{i}'] + df[f'ask_volume_{i}'] + 1e-9)
    df['bid_ask_imbalance'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) / (df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1) + df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1))

    # Order flow features
    df['order_flow'] = (df['bid_price_1'] - df['ask_price_1']) + (df['bid_volume_1'] - df['ask_volume_1'])
    df['order_flow_imbalance'] = ((df['bid_price_1'] - df['bid_price_2']) - (df['ask_price_1'] - df['ask_price_2'])) / ((df['bid_price_1'] + df['ask_price_1']) / 2)

    # Market depth features
    df['depth'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) + df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    df['market_depth_ratio'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) / df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1)

    # Price movement and momentum features
    df['price_momentum'] = (df['last_trade_price'] - df['midprice_1']) / df['midprice_1']
    df['midprice_slope'] = (df['midprice_5'] - df['midprice_1']) / 5
    df['bid_ask_slope'] = (df['ask_price_5'] - df['ask_price_1']) / (df['bid_price_1'] - df['bid_price_5'])

    # Liquidity features
    df['liquidity_impact'] = (df['recent_buy_order_count'] + df['recent_sell_order_count']) / df['total_order_count']
    df['liquidity_weight'] = (df['bid_volume_1'] + df['ask_volume_1']) / (df['bid_volume_2'] + df['ask_volume_2'] + 1e-9)

    # Market pressure features
    df['market_pressure'] = (df['recent_buy_order_count'] - df['recent_sell_order_count']) / df['total_order_count']
    df['order_book_pressure'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) / (df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1) + 1e-6)

    # Price sensitivity features
    df['bid_price_sensitivity'] = (df['bid_price_5'] - df['bid_price_1']) / df[['bid_volume_1', 'bid_volume_5']].sum(axis=1)
    df['ask_price_sensitivity'] = (df['ask_price_5'] - df['ask_price_1']) / df[['ask_volume_1', 'ask_volume_5']].sum(axis=1)

    # Volume-weighted average price (VWAP) features
    for i in range(1, 6):
        df[f'vwap_{i}'] = (df[f'bid_price_{i}'] * df[f'bid_volume_{i}'] + df[f'ask_price_{i}'] * df[f'ask_volume_{i}']) / (df[f'bid_volume_{i}'] + df[f'ask_volume_{i}'] + 1e-9)

    # Relative volume features
    df['rvbb'] = df['bid_volume_1'] / df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    df['rvba'] = df['ask_volume_1'] / df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    df['rvbf'] = df['rvba'] + df['rvbb']

    # Price difference features
    for i in range(1, 5):
        df[f'bid_price_diff_{i}_{i+1}'] = df[f'bid_price_{i}'] - df[f'bid_price_{i+1}']
        df[f'ask_price_diff_{i}_{i+1}'] = df[f'ask_price_{i+1}'] - df[f'ask_price_{i}']

    # Mean price and volume features
    df['ask_price_mean'] = df[[f'ask_price_{i}' for i in range(1, 6)]].mean(axis=1)
    df['bid_price_mean'] = df[[f'bid_price_{i}' for i in range(1, 6)]].mean(axis=1)
    df['ask_volume_mean'] = df[[f'ask_volume_{i}' for i in range(1, 6)]].mean(axis=1)
    df['bid_volume_mean'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].mean(axis=1)

    # Cumulative sum features
    df['ask_price_cumsum'] = df[[f'ask_price_{i}' for i in range(1, 6)]].cumsum(axis=1).iloc[:, -1]
    df['bid_price_cumsum'] = df[[f'bid_price_{i}' for i in range(1, 6)]].cumsum(axis=1).iloc[:, -1]
    df['ask_vol_cumsum'] = df[[f'ask_volume_{i}' for i in range(1, 6)]].cumsum(axis=1).iloc[:, -1]
    df['bid_vol_cumsum'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].cumsum(axis=1).iloc[:, -1]
    df['accumulated_price_diff'] = df['ask_price_cumsum'] - df['bid_price_cumsum']
    df['accumulated_volume_diff'] = df['ask_vol_cumsum'] - df['bid_vol_cumsum']

    # Volume momentum features
    df['bid_volume_momentum'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].diff(axis=1).sum(axis=1)
    df['ask_volume_momentum'] = df[[f'ask_volume_{i}' for i in range(1, 6)]].diff(axis=1).sum(axis=1)

    # Market skew feature
    df['market_skew'] = ((df['bid_volume_1'] - df['ask_volume_1']) + (df['bid_volume_2'] - df['ask_volume_2'])) / ((df['bid_volume_1'] + df['ask_volume_1']) + (df['bid_volume_2'] + df['ask_volume_2']) + 1e-9)

    # Depth imbalance features
    df['depth_imbalance'] = (df['bid_volume_1'] + df['bid_volume_2']) / (df['ask_volume_1'] + df['ask_volume_2'] + 1e-9)
    # First, create individual depth imbalance features
    for level in range(1, 6):
        df[f'depth_imbalance_{level}'] = df[f'bid_volume_{level}'] / (df[f'ask_volume_{level}'] + 1e-9)

    # Then, create the cumulative depth imbalance
    df['cumulative_depth_imbalance'] = df[[f'depth_imbalance_{level}' for level in range(1, 6)]].sum(axis=1)

    # Create depth imbalance full
    df['depth_imbalance_full'] = df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1) / df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    # Additional ratio features
    df['depth_ratio_bid'] = df['bid_volume_1'] / df[[f'bid_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    df['depth_ratio_ask'] = df['ask_volume_1'] / df[[f'ask_volume_{i}' for i in range(1, 6)]].sum(axis=1)
    df['volume_concentration_1'] = df['bid_volume_1'] / (df['bid_volume_1'] + df['bid_volume_2'] + 1e-9)
    
    #Volatility Estimator (Microprice)
    df['microprice'] = (df['bid_price_1'] * df['ask_volume_1'] + df['ask_price_1'] * df['bid_volume_1']) / \
                       (df['bid_volume_1'] + df['ask_volume_1'])
    #Weighted Price Depth
    df['weighted_price_depth'] =  ((df[[f'bid_price_{i}' for i in range(1, 6)]] * df[[f'bid_volume_{i}' for i in range(1, 6)]]).sum(axis=1) + (df[[f'ask_price_{i}' for i in range(1, 6)]] * df[[f'ask_volume_{i}' for i in range(1, 6)]]).sum(axis=1))  / ((df[[f'bid_volume_{i}' for i in range(1, 6)]]).sum(axis=1) + (df[[f'ask_volume_{i}' for i in range(1, 6)]]).sum(axis=1))

    #Order Imbalance
    df['order_imbalance'] = (df[['bid_volume_1', 'bid_volume_2', 'bid_volume_3', 'bid_volume_4', 'bid_volume_5']].sum(axis=1) - 
                             df[['ask_volume_1', 'ask_volume_2', 'ask_volume_3', 'ask_volume_4', 'ask_volume_5']].sum(axis=1)) / \
                            (df[['bid_volume_1', 'bid_volume_2', 'bid_volume_3', 'bid_volume_4', 'bid_volume_5']].sum(axis=1) + 
                             df[['ask_volume_1', 'ask_volume_2', 'ask_volume_3', 'ask_volume_4', 'ask_volume_5']].sum(axis=1))

    return df

# Apply features on both train and test data
train_data = create_features(train_data)
test_data = create_features(test_data)

In [6]:
#The features to be applied
features = [
    'accumulated_price_diff', 'accumulated_volume_diff', 'ask_price_1', 'ask_price_2', 'ask_price_3', 'ask_price_4',
    'ask_price_5', 'ask_price_diff_1_2', 'ask_price_diff_2_3', 'ask_price_diff_3_4', 'ask_price_diff_4_5',
    'ask_price_mean', 'ask_volume_1', 'ask_volume_2', 'ask_volume_3', 'ask_volume_4', 'ask_volume_5',
    'ask_volume_mean', 'ask_volume_momentum', 'ask_vol_cumsum', 'bid_ask_imbalance', 'bid_ask_slope',
    'bid_price_1', 'bid_price_2', 'bid_price_3', 'bid_price_4', 'bid_price_5', 'bid_price_diff_1_2',
    'bid_price_diff_2_3', 'bid_price_diff_3_4', 'bid_price_diff_4_5', 'bid_price_mean', 'bid_price_sensitivity',
    'bid_volume_1', 'bid_volume_2', 'bid_volume_3', 'bid_volume_4', 'bid_volume_5', 'bid_volume_mean',
    'bid_volume_momentum', 'bid_vol_cumsum', 'depth', 'depth_imbalance', 'depth_imbalance_1', 'depth_imbalance_2',
    'depth_imbalance_3', 'depth_imbalance_4', 'depth_imbalance_5', 'last_trade_price', 'liquidity_impact',
    'liquidity_weight', 'market_depth_ratio', 'market_pressure', 'market_skew', 'midprice_1', 'midprice_2',
    'midprice_3', 'midprice_4', 'midprice_5', 'midprice_slope', 'microprice', 'order_flow_imbalance',
    'order_imbalance', 'price_momentum', 'recent_buy_order_count', 'recent_sell_order_count', 'rvba', 'rvbb', 'rvbf',
    'spread_depth_ratio_1', 'spread_skew', 'volume_concentration_1', 'volume_imbalance_1', 'volume_imbalance_2',
    'volume_imbalance_3', 'volume_imbalance_4', 'volume_imbalance_5', 'vwap_1', 'vwap_2', 'vwap_3', 'vwap_4',
    'vwap_5', 'weighted_mid_price_1', 'weighted_price_depth', 'weighted_spread_1', 'weighted_spread_2',
    'weighted_spread_3', 'weighted_spread_4', 'weighted_spread_5'
]

## Note on Deprecation Warnings

You may see the following deprecation warnings when running the code:

/tmp/ipykernel_56674/3187598405.py:6: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  X = X.fillna(method='ffill').fillna(method='bfill')
/tmp/ipykernel_56674/3187598405.py:7: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  y = y.fillna(method='ffill').fillna(method='bfill')


These errors are just deprecation ones and do not affect the functionality of our code, and hence can be ignored for working purposes.

In [7]:
# Prepare the data
X = train_data[features]
y = train_data['actual_returns']

# Fill NaN values
X = X.fillna(method='ffill').fillna(method='bfill')
y = y.fillna(method='ffill').fillna(method='bfill')

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train a Ridge regression model with cross-validation
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
ridge = Ridge(random_state=42)
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on validation set
y_val_pred = best_model.predict(X_val)

# Calculate correlation
correlation = np.corrcoef(y_val, y_val_pred)[0, 1]
print(f"Validation Correlation: {correlation}")
print(f"Best alpha: {grid_search.best_params_['alpha']}")

# Predict on test data
test_features = test_data[features].fillna(method='ffill').fillna(method='bfill')
test_features_scaled = scaler.transform(test_features)
test_predictions = best_model.predict(test_features_scaled)

  X = X.fillna(method='ffill').fillna(method='bfill')
  y = y.fillna(method='ffill').fillna(method='bfill')


Validation Correlation: 0.42694585508662025
Best alpha: 0.01


  test_features = test_data[features].fillna(method='ffill').fillna(method='bfill')


In [8]:

# Create submission file
submission = pd.DataFrame({
    'timestamp_code': test_data['timestamp_code'],
    'predicted_returns': test_predictions
})

# Ensure we have exactly 104980 rows
assert len(submission) == 104980, f"Submission has {len(submission)} rows instead of 104980"

submission.to_csv('submission.csv', index=False)
print(f"Submission file created with {len(submission)} rows")

Submission file created with 104980 rows
