# Training a Model for each Feature List

In this notebook, we train a diffrent lag_reg_ensemble model with different list of features chosen based on our clustering data. Then each model is evaluated on the test set and saved; we will ensemble all these models using ridge regression in a diffrerent notebook.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.preprocessing import StandardScaler
import os
import gc
import warnings
from tqdm.notebook import tqdm
import time
from collections import defaultdict
warnings.filterwarnings('ignore')

In [2]:
class IncrementalLagEnsemble:
    """Incrementally trains models on different lag configurations using SGD."""
    
    def __init__(self, feature_batch_size=20, lag_batch_size=5, n_epochs=9):
        self.feature_batch_size = feature_batch_size
        self.lag_batch_size = lag_batch_size
        self.n_epochs = n_epochs  
        self.models = {}
        self.scalers = {}
        self.feature_names = None
        self.performance_history = defaultdict(list)
        
        # Define lag strategies with odd/even splits for many configurations
        self.lag_strategies = {
            # Original micro lags
            'micro': [1, 2, 3, 4, 5],
            'micro_odd': [1, 3, 5, 7, 9],
            'micro_even': [2, 4, 6, 8, 10],
            
            # Ultra short with odd/even
            'ultra_short': [6, 8, 10, 12, 15],
            'ultra_short_odd': [7, 9, 11, 13, 15, 17],
            'ultra_short_even': [6, 8, 10, 12, 14, 16],
            
            # Short with odd/even
            'short': [20, 25, 30, 40, 50],
            'short_odd': [21, 25, 31, 41, 51],
            'short_even': [20, 24, 30, 40, 50],
            
            # Short medium with odd/even
            'short_medium': [60, 75, 90, 105, 120],
            'short_medium_odd': [61, 75, 91, 105, 121],
            'short_medium_even': [60, 74, 90, 104, 120],
            
            # Medium with odd/even
            'medium': [150, 180, 210, 240, 300],
            'medium_odd': [151, 181, 211, 241, 301],
            
            # Medium long with odd/even
            'medium_long': [360, 420, 480, 540, 600],
            'medium_long_odd': [361, 421, 481, 541, 601],
            
            # Long with odd/even
            'long': [720, 840, 960, 1080, 1200],
            'long_odd': [721, 841, 961, 1081, 1201],
            
            # Very long with odd/even
            'very_long': [1440, 1800, 2160, 2520, 2880],
            'very_long_odd': [1441, 1801, 2161, 2521, 2881],
            
            # Ultra long (keeping original only due to very large values)
            'ultra_long': [3600, 4320, 5040, 5760, 7200]
        }
        
    def create_lag_features_batch(self, df, lag_list):
        """Create lag features for a batch of lags."""
        lag_features = []
        
        for lag in lag_list:
            lagged = df.shift(-lag)
            lagged.columns = [f'{col}_lag_{lag}' for col in df.columns]
            lag_features.append(lagged)
        
        result = pd.concat([df] + lag_features, axis=1)
        result = result.fillna(0)
        
        return result
    
    def train_sgd_model(self, X, y, model_name):
        """Train SGD model incrementally with more epochs."""
        print(f"  Training SGD model: {model_name} ({self.n_epochs} epochs)")
        
        # Initialize model and scaler if not exists
        if model_name not in self.models:
            self.models[model_name] = SGDRegressor(
                loss='huber',
                penalty='elasticnet',
                alpha=0.0001,
                l1_ratio=0.15,
                learning_rate='invscaling',
                eta0=0.01,
                power_t=0.25,
                random_state=42,
                warm_start=True,
                max_iter=1000,
                tol=1e-3
            )
            self.scalers[model_name] = StandardScaler()
            
        model = self.models[model_name]
        scaler = self.scalers[model_name]
        
        # Train in epochs with smaller chunks for better convergence
        chunk_size = 25000  # Slightly smaller chunks for more updates
        for epoch in range(self.n_epochs):
            # Shuffle indices for each epoch
            indices = np.random.permutation(len(X))
            
            for start_idx in range(0, len(X), chunk_size):
                end_idx = min(start_idx + chunk_size, len(X))
                
                # Get shuffled chunk
                chunk_indices = indices[start_idx:end_idx]
                X_chunk = X.iloc[chunk_indices]
                y_chunk = y[chunk_indices]
                
                # Scale
                if start_idx == 0 and epoch == 0:
                    X_scaled = scaler.fit_transform(X_chunk)
                else:
                    X_scaled = scaler.transform(X_chunk)

                
                # Partial fit
                model.partial_fit(X_scaled, y_chunk)

            # Print progress
            if epoch % 3 == 0:
                print(f"    Epoch {epoch+1}/{self.n_epochs} completed")
        
        return model
    
    def train_feature_batch(self, feature_batch, X_full, y, strategy_name, lag_list):
        """Train on a batch of features with specific lags."""
        print(f"\n  Processing feature batch ({len(feature_batch)} features) with {strategy_name} lags")
        
        # Select feature batch
        X_batch = X_full[feature_batch].copy()
        
        # Create lag features
        X_with_lags = self.create_lag_features_batch(X_batch, lag_list)
        
        # Train SGD model
        model_name = f"{strategy_name}_{feature_batch[0]}_{feature_batch[-1]}"
        self.train_sgd_model(X_with_lags, y, model_name)
        
        # Clean up
        del X_batch, X_with_lags
        gc.collect()
        
    def fit(self, X, y):
        """Fit ensemble using incremental training."""
        print("Training Incremental Lag Ensemble with Odd/Even Lag Configurations...")
        print(f"Total epochs per model: {self.n_epochs}")
        
        self.feature_names = X.columns.tolist()
        n_features = len(self.feature_names)
        
        # Split features into batches
        feature_batches = []
        for i in range(0, n_features, self.feature_batch_size):
            batch = self.feature_names[i:i+self.feature_batch_size]
            feature_batches.append(batch)
        
        print(f"Split {n_features} features into {len(feature_batches)} batches")
        print(f"Total lag strategies (including odd/even): {len(self.lag_strategies)}")
        
        # Calculate total lag values
        total_lags = sum(len(lags) for lags in self.lag_strategies.values())
        print(f"Total unique lag values: {total_lags}")
        
        # Train models for each combination of feature batch and lag strategy
        total_models = len(feature_batches) * len(self.lag_strategies)
        model_count = 0
        
        for strategy_name, lag_list in self.lag_strategies.items():
            print(f"\nTraining {strategy_name} strategy (lags: {lag_list})")
            
            for batch_idx, feature_batch in enumerate(feature_batches):
                model_count += 1
                print(f"Progress: {model_count}/{total_models} models")
                
                self.train_feature_batch(feature_batch, X, y, strategy_name, lag_list)
                
                # Clean up periodically
                if batch_idx % 2 == 0:
                    gc.collect()
        
        
        print(f"\nTotal models trained: {len(self.models)}")
        
    def predict_batch(self, X, feature_batch, strategy_name, lag_list):
        """Make predictions for a specific feature batch and lag strategy."""
        model_name = f"{strategy_name}_{feature_batch[0]}_{feature_batch[-1]}"
        
        if model_name not in self.models:
            return None
            
        # Select features
        X_batch = X[feature_batch].copy()
        
        # Create lag features
        X_with_lags = self.create_lag_features_batch(X_batch, lag_list)
        
        # Scale and predict
        X_scaled = self.scalers[model_name].transform(X_with_lags)
        predictions = self.models[model_name].predict(X_scaled)
        
        # Clean up
        del X_batch, X_with_lags, X_scaled
        gc.collect()
        
        return predictions
    
    def predict(self, X):
        """Make ensemble predictions."""
        all_predictions = {}
        
        # Recreate feature batches
        n_features = len(self.feature_names)
        feature_batches = []
        for i in range(0, n_features, self.feature_batch_size):
            batch = self.feature_names[i:i+self.feature_batch_size]
            if all(col in X.columns for col in batch):
                feature_batches.append(batch)
        
        # Get predictions from each model
        for strategy_name, lag_list in self.lag_strategies.items():
            for feature_batch in feature_batches:
                pred = self.predict_batch(X, feature_batch, strategy_name, lag_list)
                if pred is not None:
                    model_name = f"{strategy_name}_{feature_batch[0]}_{feature_batch[-1]}"
                    all_predictions[model_name] = pred
        
        return pd.DataFrame(all_predictions)


## Find the Features_list Based on the X Clusters

We sample one by one from each cluster untill all the X features are sampled. We remove the ones with less than 10 features in them.

In [5]:
clusters = pd.read_csv('clusters.csv', index_col=0)

clusters_dict = defaultdict(list)

for c in np.unique(clusters):
    X = clusters[clusters['0'] == c]
    clusters_dict[c] = X.index.tolist()

total = 0

features_list = []

while total<780:
    features = []
    for k in clusters_dict.keys():
        if len(clusters_dict[k])>0:
            features.append(clusters_dict[k][-1])
            clusters_dict[k].pop()
            total += 1
    features_list.append(features)
    features = list([])
while len(features_list[-1])<10:
    features_list.pop()

In [33]:
train = pd.read_parquet('data/train.parquet')

test = pd.read_parquet('data/test.parquet')

order_features = train.columns.tolist()[:5]

Y = train['label']

In [34]:
row_id = pd.read_csv('closest_rows.csv', index_col=0)

new = test.reset_index()

new['row_id'] = row_id['0']

new.head()

Unnamed: 0,ID,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,...,X773,X774,X775,X776,X777,X778,X779,X780,label,row_id
0,1,0.317,8.102,13.164,10.272,23.436,-0.341229,0.041851,-0.020094,-0.206221,...,-0.043417,1.521787,1.548965,1.495735,1.16673,0.281056,-0.187831,-0.599553,0,112334
1,2,2.608,2.111,123.562,40.163,163.725,-1.029564,-1.382505,-1.214935,-1.020241,...,-0.07709,-0.703054,-0.716951,-0.721292,-0.674619,-0.639318,-0.736268,-0.86222,0,69300
2,3,2.768,10.787,126.137,118.266,244.403,-2.59409,-5.486158,-4.744466,-3.930152,...,-0.030627,-0.703514,-0.717525,-0.731701,-0.750998,-0.789366,-0.850941,-1.033131,0,152075
3,4,0.948,12.157,16.069,31.723,47.792,0.240745,0.997585,1.028965,1.081052,...,-0.03338,1.521167,1.551771,1.582833,1.62583,1.762155,1.911924,1.962445,0,255828
4,5,1.084,3.493,32.679,37.327,70.006,0.067189,0.772852,0.772152,0.714846,...,-0.004915,-0.703161,-0.7169,-0.714699,-0.652209,-0.623165,-0.699887,-0.640094,0,390226


In [36]:
new = new.sort_values('row_id')

new.head()

Unnamed: 0,ID,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,...,X773,X774,X775,X776,X777,X778,X779,X780,label,row_id
192617,192618,1.037,20.607,20.377,18.854,39.231,1.90712,0.428308,0.981918,1.342953,...,-0.046392,1.523177,1.547193,1.46273,1.084867,0.196849,-0.24948,-0.644863,0,-1
188498,188499,24.169,2.624,7.885,9.266,17.151,0.555521,0.46112,0.72813,0.766869,...,-0.012611,1.52299,1.553631,1.584729,1.626858,1.580276,1.221571,0.683121,0,-1
424484,424485,9.5,10.256,27.151,12.778,39.929,0.696916,-0.095659,-0.216179,-0.155187,...,-0.017567,-0.703437,-0.717389,-0.731427,-0.747933,-0.755011,-0.796735,-0.778168,0,-1
424482,424483,1.91,4.205,121.387,90.664,212.051,1.207702,0.99058,1.02987,1.112571,...,-0.046124,-0.703374,-0.669391,-0.506669,-0.38612,-0.519117,-0.647202,-0.578245,0,-1
268120,268121,4.567,4.285,37.114,25.157,62.271,-1.007777,-0.636524,-0.791924,-0.824496,...,-0.02553,1.521299,1.55117,1.54263,1.314757,0.461959,-0.050952,-0.498569,0,-1


## Include the Order Book Data

We include the order book data in all the models.

In [10]:
X_trains = {}

X_tests = {}

for i in range(len(features_list)):
    X_trains[i] = train[order_features+features_list[i]]
    X_tests[i] = new[order_features+features_list[i]]

## Training the Models

Each model is trained and evaluated on the test set; we store each test prediction in a parquet file. For each model, we use a different feature batch size based on the number of features in the specific features_list used for that model.

In [23]:
%%time

train_preds = []

test_preds = []

for i in [0,1,2]:
    ensemble = IncrementalLagEnsemble(
      feature_batch_size=20,  # Slightly smaller batches due to more features
      lag_batch_size=5,
      n_epochs=9  # Increased to 9 as requested
    )

    ensemble.fit(X_trains[i], Y)

    train_preds.append(ensemble.predict(X_trains[i]))

    test_preds.append(ensemble.predict(X_tests[i]))

    print()
    print(f"Model {i} Finished")
    print()

    del ensemble
    gc.collect()

train_preds = pd.concat(train_preds, axis=1)
test_preds = pd.concat(test_preds, axis=1)

train_preds.to_parquet('train_preds_[0,1,2].parquet')
test_preds.to_parquet('test_preds_[0,1,2].parquet')

del test_preds, train_preds
gc.collect()
    


Training Incremental Lag Ensemble with Odd/Even Lag Configurations...
Total epochs per model: 9
Split 179 features into 9 batches
Total lag strategies (including odd/even): 21
Total unique lag values: 107

Training micro strategy (lags: [1, 2, 3, 4, 5])
Progress: 1/189 models

  Processing feature batch (20 features) with micro lags
  Training SGD model: micro_bid_qty_X501 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 2/189 models

  Processing feature batch (20 features) with micro lags
  Training SGD model: micro_X498_X715 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 3/189 models

  Processing feature batch (20 features) with micro lags
  Training SGD model: micro_X727_X685 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 4/189 models

  Processing feature batch (20 features) with micro lags
  Training SGD model: micro_X653_X97 (9 epochs)
    Epoc

0

In [15]:
%%time

train_preds = []

test_preds = []

for i in [3,4,5,7]:
    ensemble = IncrementalLagEnsemble(
      feature_batch_size=15,  # Slightly smaller batches due to more features
      lag_batch_size=5,
      n_epochs=9  # Increased to 9 as requested
    )

    ensemble.fit(X_trains[i], Y)

    train_preds.append(ensemble.predict(X_trains[i]))

    test_preds.append(ensemble.predict(X_tests[i]))

    print()
    print(f"Model {i} Finished")
    print()

    del ensemble
    gc.collect()

train_preds = pd.concat(train_preds, axis=1)
test_preds = pd.concat(test_preds, axis=1)

train_preds.to_parquet('train_preds_[3,4,5,7].parquet')
test_preds.to_parquet('test_preds_[3,4,5,7].parquet')

del test_preds, train_preds
gc.collect()
    


Training Incremental Lag Ensemble with Odd/Even Lag Configurations...
Total epochs per model: 9
Split 89 features into 6 batches
Total lag strategies (including odd/even): 21
Total unique lag values: 107

Training micro strategy (lags: [1, 2, 3, 4, 5])
Progress: 1/126 models

  Processing feature batch (15 features) with micro lags
  Training SGD model: micro_bid_qty_X502 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 2/126 models

  Processing feature batch (15 features) with micro lags
  Training SGD model: micro_X660_X24 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 3/126 models

  Processing feature batch (15 features) with micro lags
  Training SGD model: micro_X135_X119 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 4/126 models

  Processing feature batch (15 features) with micro lags
  Training SGD model: micro_X324_X470 (9 epochs)
    Epoch

0

In [16]:
%%time

train_preds = []

test_preds = []

for i in [6,8]:
    ensemble = IncrementalLagEnsemble(
      feature_batch_size=12,  # Slightly smaller batches due to more features
      lag_batch_size=5,
      n_epochs=9  # Increased to 9 as requested
    )

    ensemble.fit(X_trains[i], Y)

    train_preds.append(ensemble.predict(X_trains[i]))

    test_preds.append(ensemble.predict(X_tests[i]))

    print()
    print(f"Model {i} Finished")
    print()

    del ensemble
    gc.collect()

train_preds = pd.concat(train_preds, axis=1)
test_preds = pd.concat(test_preds, axis=1)

train_preds.to_parquet('train_preds_[6,8].parquet')
test_preds.to_parquet('test_preds_[6,8].parquet')

del test_preds, train_preds
gc.collect()
    

Training Incremental Lag Ensemble with Odd/Even Lag Configurations...
Total epochs per model: 9
Split 35 features into 3 batches
Total lag strategies (including odd/even): 21
Total unique lag values: 107

Training micro strategy (lags: [1, 2, 3, 4, 5])
Progress: 1/63 models

  Processing feature batch (12 features) with micro lags
  Training SGD model: micro_bid_qty_X94 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 2/63 models

  Processing feature batch (12 features) with micro lags
  Training SGD model: micro_X76_X456 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed
Progress: 3/63 models

  Processing feature batch (11 features) with micro lags
  Training SGD model: micro_X184_X567 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed

Training micro_odd strategy (lags: [1, 3, 5, 7, 9])
Progress: 4/63 models

  Processing feature batch (12 features) with micro_odd lags
  Trai

0

In [17]:
%%time

train_preds = []

test_preds = []

for i in [9,10,11]:
    ensemble = IncrementalLagEnsemble(
      feature_batch_size=20,  # Slightly smaller batches due to more features
      lag_batch_size=5,
      n_epochs=9  # Increased to 9 as requested
    )

    ensemble.fit(X_trains[i], Y)

    train_preds.append(ensemble.predict(X_trains[i]))

    test_preds.append(ensemble.predict(X_tests[i]))

    print()
    print(f"Model {i} Finished")
    print()

    del ensemble
    gc.collect()

train_preds = pd.concat(train_preds, axis=1)
test_preds = pd.concat(test_preds, axis=1)

train_preds.to_parquet('train_preds_[9,10,11].parquet')
test_preds.to_parquet('test_preds_[9,10,11].parquet')

del test_preds, train_preds
gc.collect()

Training Incremental Lag Ensemble with Odd/Even Lag Configurations...
Total epochs per model: 9
Split 17 features into 1 batches
Total lag strategies (including odd/even): 21
Total unique lag values: 107

Training micro strategy (lags: [1, 2, 3, 4, 5])
Progress: 1/21 models

  Processing feature batch (17 features) with micro lags
  Training SGD model: micro_bid_qty_X282 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed

Training micro_odd strategy (lags: [1, 3, 5, 7, 9])
Progress: 2/21 models

  Processing feature batch (17 features) with micro_odd lags
  Training SGD model: micro_odd_bid_qty_X282 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed

Training micro_even strategy (lags: [2, 4, 6, 8, 10])
Progress: 3/21 models

  Processing feature batch (17 features) with micro_even lags
  Training SGD model: micro_even_bid_qty_X282 (9 epochs)
    Epoch 1/9 completed
    Epoch 4/9 completed
    Epoch 7/9 completed

Training

0