# The First Model

We are going to use the X_clustered.parquet file (constructed in the data engineering & feature selection notebook) to train our first model.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.preprocessing import StandardScaler
import os
import gc
import warnings
from tqdm.notebook import tqdm
from collections import defaultdict
warnings.filterwarnings('ignore')

## Loading the Clustered Data

We need the temporal order of the test data; we use the file closest_rows.csv to reconstruct that time order.

In [3]:
#### training the model with the clustered data set

# Clustered feature matrices for the train and test sets.
train = pd.read_parquet('X_clustered.parquet')
test = pd.read_parquet('X_clustered_test.parquet')

# The target column is taken from the raw training file.
Y = pd.read_parquet('data/train.parquet')['label']

# closest_rows.csv is used to reconstruct the time order of the test rows;
# its column '0' is attached below as `row_id`.
row_id = pd.read_csv('closest_rows.csv', index_col=0)

# Move the test index into a regular column, attach row_id (aligned on the
# fresh 0..n-1 index) and order the rows by it.
new = (
    test.reset_index()
    .assign(row_id=row_id['0'])
    .sort_values('row_id')
)

In [13]:
# Feature columns are everything between the first column (the former index
# moved in by reset_index) and the last column (row_id).
features = list(new.columns[1:-1])

X_test = new.loc[:, features]

### Train & Val Data

We use the last 100000 data points of train as our validation set.

In [13]:
#### training the model with the original data set

# Number of trailing rows held out as the validation set.
VAL_SIZE = 100_000

train = pd.read_parquet('data/train.parquet')

Y = train['label']

# Select the features by name instead of by position, so the label can never
# end up in X if the column order of the file changes.
features = [col for col in train.columns if col != 'label']

X = train[features]

# Without this guard a too-small frame would silently give an empty train set.
assert len(train) > VAL_SIZE, (
    f"train has {len(train)} rows; need more than VAL_SIZE={VAL_SIZE}"
)

# Positional split: everything except the last VAL_SIZE rows is used for training.
X_train = X.iloc[:-VAL_SIZE]

Y_train = Y.iloc[:-VAL_SIZE]

X_val = X.iloc[-VAL_SIZE:]

Y_val = Y.iloc[-VAL_SIZE:]

## The Model

Our model is an ensemble of many linear regression models, each trained with sklearn's SGDRegressor. Each linear regression model uses a fixed number of features (20 in the following code) together with lagged versions of those features. The lag shifts are chosen to capture different temporal patterns, and we train one linear regression model per pattern. The final model is the simple average of all these models.

In [5]:
class IncrementalLagEnsemble:
    """Ensemble of SGD-trained linear models, one per (lag strategy, feature batch).

    The feature columns are split into consecutive batches of
    ``feature_batch_size`` columns. For every batch and every entry of
    ``self.lag_strategies`` a separate ``SGDRegressor`` is trained on the batch
    columns plus shifted copies of them. ``predict`` returns the weighted
    average of all models; ``fit`` assigns equal weights.

    Parameters
    ----------
    feature_batch_size : int
        Number of feature columns handled by each model.
    lag_batch_size : int
        Stored on the instance for backward compatibility; not used by any
        method of this class.
    n_epochs : int
        Number of shuffled passes over the data made for each model.
    random_state : int or None
        Seed for the row shuffling and for each ``SGDRegressor``. ``None``
        makes training non-deterministic.
    """

    def __init__(self, feature_batch_size=20, lag_batch_size=5, n_epochs=9,
                 random_state=42):
        self.feature_batch_size = feature_batch_size
        self.lag_batch_size = lag_batch_size  # kept for compatibility; unused
        self.n_epochs = n_epochs
        self.random_state = random_state
        # Dedicated generator so shuffling is reproducible and independent of
        # the global NumPy random state.
        self._rng = np.random.default_rng(random_state)
        self.models = {}
        self.scalers = {}
        self.feature_names = None
        self.model_weights = {}
        self.performance_history = defaultdict(list)  # never written by this class

        # Shift sizes (in rows) per strategy. Several families also have
        # variants built mostly from odd or even shifts.
        self.lag_strategies = {
            # Micro shifts, with odd and even variants
            'micro': [1, 2, 3, 4, 5],
            'micro_odd': [1, 3, 5, 7, 9],
            'micro_even': [2, 4, 6, 8, 10],

            # Ultra short, with odd and even variants
            'ultra_short': [6, 8, 10, 12, 15],
            'ultra_short_odd': [7, 9, 11, 13, 15, 17],
            'ultra_short_even': [6, 8, 10, 12, 14, 16],

            # Short, with odd and even variants
            'short': [20, 25, 30, 40, 50],
            'short_odd': [21, 25, 31, 41, 51],
            'short_even': [20, 24, 30, 40, 50],

            # Short medium, with odd and even variants
            'short_medium': [60, 75, 90, 105, 120],
            'short_medium_odd': [61, 75, 91, 105, 121],
            'short_medium_even': [60, 74, 90, 104, 120],

            # Medium, with an odd variant only
            'medium': [150, 180, 210, 240, 300],
            'medium_odd': [151, 181, 211, 241, 301],

            # Medium long, with an odd variant only
            'medium_long': [360, 420, 480, 540, 600],
            'medium_long_odd': [361, 421, 481, 541, 601],

            # Long, with an odd variant only
            'long': [720, 840, 960, 1080, 1200],
            'long_odd': [721, 841, 961, 1081, 1201],

            # Very long, with an odd variant only
            'very_long': [1440, 1800, 2160, 2520, 2880],
            'very_long_odd': [1441, 1801, 2161, 2521, 2881],

            # Ultra long (base variant only, due to the very large values)
            'ultra_long': [3600, 4320, 5040, 5760, 7200]
        }

    @staticmethod
    def _model_name(strategy_name, feature_batch):
        """Return the dictionary key of the model for a strategy/batch pair."""
        return f"{strategy_name}_{feature_batch[0]}_{feature_batch[-1]}"

    def _feature_batches(self):
        """Split ``self.feature_names`` into consecutive batches."""
        step = self.feature_batch_size
        return [
            self.feature_names[i:i + step]
            for i in range(0, len(self.feature_names), step)
        ]

    def create_lag_features_batch(self, df, lag_list):
        """Return ``df`` plus one shifted copy of every column per entry of ``lag_list``.

        Shifted columns are named ``{col}_lag_{lag}``. All missing values in
        the result (including any already present in ``df``) are replaced by 0.
        """
        # NOTE(review): shift(-lag) moves values *up*, so row i receives the
        # value from row i + lag. If rows are in ascending time order these are
        # lead (future) values rather than lags -- confirm this is intended.
        shifted_frames = []
        for lag in lag_list:
            lagged = df.shift(-lag)
            lagged.columns = [f'{col}_lag_{lag}' for col in df.columns]
            shifted_frames.append(lagged)

        result = pd.concat([df] + shifted_frames, axis=1)
        return result.fillna(0)

    def train_sgd_model(self, X, y, model_name):
        """Train (or continue training) the model stored under ``model_name``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix; rows are selected positionally with ``.iloc``.
        y : array-like
            Targets, matched to the rows of ``X`` by position.
        model_name : str
            Key under which the model and its scaler are stored.

        Returns
        -------
        The trained model object.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.
        """
        print(f"  Training SGD model: {model_name} ({self.n_epochs} epochs)")

        # Create the model and its scaler on first use.
        if model_name not in self.models:
            self.models[model_name] = SGDRegressor(
                loss='huber',
                penalty='elasticnet',
                alpha=0.0001,
                l1_ratio=0.15,
                learning_rate='invscaling',
                eta0=0.01,
                power_t=0.25,
                random_state=self.random_state,
                warm_start=True,
                max_iter=1000,
                tol=1e-3
            )
            self.scalers[model_name] = StandardScaler()

        model = self.models[model_name]
        scaler = self.scalers[model_name]

        # Use a plain array so targets are always picked by position. Indexing
        # a pandas Series with an integer array is label-based and breaks (or
        # mis-aligns) when the Series does not have a default 0..n-1 index.
        y_values = np.asarray(y)
        n_rows = len(X)
        if len(y_values) != n_rows:
            raise ValueError(
                f"X and y must have the same length, got {n_rows} and {len(y_values)}"
            )

        chunk_size = 25000  # rows per partial_fit call
        for epoch in range(self.n_epochs):
            # Fresh row order for every epoch.
            indices = self._rng.permutation(n_rows)

            for start_idx in range(0, n_rows, chunk_size):
                chunk_indices = indices[start_idx:start_idx + chunk_size]
                X_chunk = X.iloc[chunk_indices]
                y_chunk = y_values[chunk_indices]

                # The scaler is fitted once, on the first (random) chunk it
                # sees. It is not refitted on later calls, so the scaling stays
                # consistent with the weights learned so far.
                if not hasattr(scaler, 'mean_'):
                    X_scaled = scaler.fit_transform(X_chunk)
                else:
                    X_scaled = scaler.transform(X_chunk)

                model.partial_fit(X_scaled, y_chunk)

            # Print progress
            if epoch % 3 == 0:
                print(f"    Epoch {epoch+1}/{self.n_epochs} completed")

        return model

    def train_feature_batch(self, feature_batch, X_full, y, strategy_name, lag_list):
        """Build the shifted features for one feature batch and train its model."""
        print(f"\n  Processing feature batch ({len(feature_batch)} features) with {strategy_name} lags")

        X_batch = X_full[feature_batch].copy()
        X_with_lags = self.create_lag_features_batch(X_batch, lag_list)

        model_name = self._model_name(strategy_name, feature_batch)
        self.train_sgd_model(X_with_lags, y, model_name)

        # Release the widened frame before the next model is built.
        del X_batch, X_with_lags
        gc.collect()

    def fit(self, X, y):
        """Train one model per (lag strategy, feature batch) and set equal weights.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        print("Training Incremental Lag Ensemble with Odd/Even Lag Configurations...")
        print(f"Total epochs per model: {self.n_epochs}")

        self.feature_names = X.columns.tolist()
        n_features = len(self.feature_names)
        feature_batches = self._feature_batches()

        print(f"Split {n_features} features into {len(feature_batches)} batches")
        print(f"Total lag strategies (including odd/even): {len(self.lag_strategies)}")

        # Count distinct shift sizes; the same value appears in several strategies.
        unique_lags = {lag for lags in self.lag_strategies.values() for lag in lags}
        print(f"Total unique lag values: {len(unique_lags)}")

        total_models = len(feature_batches) * len(self.lag_strategies)
        model_count = 0

        for strategy_name, lag_list in self.lag_strategies.items():
            print(f"\nTraining {strategy_name} strategy (lags: {lag_list})")

            for feature_batch in feature_batches:
                model_count += 1
                print(f"Progress: {model_count}/{total_models} models")

                # train_feature_batch already runs gc.collect() when it finishes.
                self.train_feature_batch(feature_batch, X, y, strategy_name, lag_list)

        # Equal weights: the ensemble output is a simple average.
        for model_name in self.models:
            self.model_weights[model_name] = 1.0 / len(self.models)

        print(f"\nTotal models trained: {len(self.models)}")

    def predict_batch(self, X, feature_batch, strategy_name, lag_list):
        """Predict with the model of one strategy/batch pair.

        Returns ``None`` when no model is stored for that pair.
        """
        model_name = self._model_name(strategy_name, feature_batch)

        if model_name not in self.models:
            return None

        X_batch = X[feature_batch].copy()
        X_with_lags = self.create_lag_features_batch(X_batch, lag_list)

        X_scaled = self.scalers[model_name].transform(X_with_lags)
        predictions = self.models[model_name].predict(X_scaled)

        del X_batch, X_with_lags, X_scaled
        gc.collect()

        return predictions

    def predict(self, X):
        """Return the weighted average of all model predictions for ``X``.

        Feature batches with any column missing from ``X`` are skipped. If no
        model can be applied, an array of zeros of length ``len(X)`` is returned.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.feature_names is None:
            raise RuntimeError("IncrementalLagEnsemble.predict called before fit")

        usable_batches = [
            batch for batch in self._feature_batches()
            if all(col in X.columns for col in batch)
        ]

        # Keep a running weighted sum instead of storing every prediction
        # vector, so memory use does not grow with the number of models.
        weighted_sum = None
        weight_total = 0.0

        for strategy_name, lag_list in self.lag_strategies.items():
            for feature_batch in usable_batches:
                pred = self.predict_batch(X, feature_batch, strategy_name, lag_list)
                if pred is None:
                    continue

                model_name = self._model_name(strategy_name, feature_batch)
                weight = self.model_weights.get(model_name, 1.0)

                if weighted_sum is None:
                    weighted_sum = np.zeros(len(pred), dtype=float)
                weighted_sum += weight * np.asarray(pred, dtype=float)
                weight_total += weight

        if weighted_sum is None:
            return np.zeros(len(X))
        return weighted_sum / weight_total


## Training the Model

We train the model for 9 epochs (each SGDRegressor trains for 9 epochs). The model consists of 252 smaller linear regression models. We tried feature_batch_size = (15, 20, 25), and 20 gives the best validation score.

In [None]:
## training with the original dataset

# One SGD model per (lag strategy, batch of 20 features), 9 epochs each.
ensemble_original = IncrementalLagEnsemble(n_epochs=9, lag_batch_size=5, feature_batch_size=20)
ensemble_original.fit(X=X_train, y=Y_train)

## Run the Model on the Test Set

In [27]:
# NOTE(review): `ensemble` is not defined in any cell shown in this notebook;
# the training cell above creates `ensemble_original`. X_test holds the
# clustered features, so this presumably expects an IncrementalLagEnsemble
# fitted on the clustered training data -- confirm, and define it before this
# cell so that Restart Kernel -> Run All works.
y_test_pred = ensemble.predict(X_test)

In [31]:
# Attach the predictions as a new column; values are matched to the rows by position.
new = new.assign(pred=y_test_pred)

In [35]:
# Reorder the rows by their ID column.
new_sorted = new.sort_values(by='ID')

In [37]:
# Preview the first five rows.
new_sorted.head(n=5)

Unnamed: 0,ID,bid_qty,ask_qty,buy_qty,sell_qty,volume,1,2,3,4,...,167,168,169,170,171,172,173,174,row_id,pred
0,1,0.317,8.102,13.164,10.272,23.436,0.026487,0.053709,-0.095664,-0.832326,...,0.033602,1.217343,4.68054,1.38604,3.495303,2.506787,0.180057,0.982409,112334,-0.068603
1,2,2.608,2.111,123.562,40.163,163.725,-0.021333,0.007705,-0.101896,-0.704061,...,-1.677375,-0.244482,3.746843,1.754129,0.531684,-0.339125,-0.398399,-0.936513,69300,0.180521
2,3,2.768,10.787,126.137,118.266,244.403,0.108185,-0.021955,-0.089188,-0.226271,...,0.307522,0.529769,-0.875603,0.122851,-0.579363,0.14365,-0.203422,-0.047355,152075,-0.512381
3,4,0.948,12.157,16.069,31.723,47.792,0.023319,0.015206,-0.085059,-0.536935,...,-0.04623,0.533205,-0.198328,0.616502,0.105334,-0.043589,-0.123433,-0.09369,255828,-0.062439
4,5,1.084,3.493,32.679,37.327,70.006,-0.108662,-0.095576,-0.111953,-0.500968,...,0.272161,1.456934,0.063305,0.877043,-0.852162,-0.592059,-0.659861,-0.166776,390226,-0.008075


In [39]:
y_pred = new_sorted['pred']

submission = pd.read_csv('data/sample_submission.csv')

# Predictions are attached by position, so a length mismatch would otherwise
# produce missing or misplaced values without any error.
assert len(submission) == len(y_pred), (
    f"sample submission has {len(submission)} rows but there are {len(y_pred)} predictions"
)

submission['prediction'] = y_pred.to_numpy()

assert submission['prediction'].notna().all(), "submission contains missing predictions"

submission.to_csv('submission.csv', index=False)

# Read the file back to check what was actually written.
out = pd.read_csv('submission.csv')

out.head()

Unnamed: 0,ID,prediction
0,1,-0.068603
1,2,0.180521
2,3,-0.512381
3,4,-0.062439
4,5,-0.008075
