In [1]:
## Load the stock data into `df`

In [3]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import pickle

# Read the training CSV into `df`. The path is relative, so it resolves
# against the notebook's working directory.
df = pd.read_csv('../optiver_train.csv')

## Initial Training

In [7]:
def generate_features(df):
    """Return a feature-augmented copy of ``df`` and the list of feature names.

    Two size-imbalance ratios (``imb_s1``, ``imb_s2``) are added, followed by
    one ``<a>_<b>_diff`` column for every pair of price columns, where ``a``
    precedes ``b`` in the price list below. The caller's frame is left
    untouched because all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the size and price columns referenced in the body.

    Returns
    -------
    tuple
        ``(augmented_frame, feature_names)`` where ``feature_names`` lists the
        base columns, the two imbalance columns and then the pairwise
        difference columns in creation order.
    """
    price_cols = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    feature_cols = [
        'seconds_in_bucket', 'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size',
        'bid_size', 'ask_size', 'reference_price', 'far_price', 'near_price', 'ask_price',
        'bid_price', 'wap', 'imb_s1', 'imb_s2',
    ]

    # Work on a private copy so the input frame is never mutated.
    out = df.copy()

    # Normalised imbalance: (x - y) / (x + y) for the two size pairs.
    bid_sz, ask_sz = out['bid_size'], out['ask_size']
    out['imb_s1'] = (bid_sz - ask_sz) / (bid_sz + ask_sz)
    imb_sz, matched_sz = out['imbalance_size'], out['matched_size']
    out['imb_s2'] = (imb_sz - matched_sz) / (matched_sz + imb_sz)

    # Every (earlier, later) pair of price columns, in list order.
    n_prices = len(price_cols)
    pairs = [
        (price_cols[lo], price_cols[hi])
        for lo in range(n_prices)
        for hi in range(lo + 1, n_prices)
    ]
    for a, b in pairs:
        diff_col = f'{a}_{b}_diff'
        out[diff_col] = out[a] - out[b]
        feature_cols.append(diff_col)

    return out, feature_cols

In [10]:
def train_initial_model(df_train, start_date_id, end_date_id):
    """Cross-validate XGBoost on a date window and pickle the last fold's model.

    Rows whose ``date_id`` lies in ``[start_date_id, end_date_id]`` (both ends
    inclusive) are split with a 5-fold ``TimeSeriesSplit``. One booster is
    trained per fold, with the fold's held-out rows used as the evaluation set
    for early stopping. The mean of the per-fold ``best_score`` values is
    printed, and the booster from the final fold is written to
    ``XGBoost_<start_date_id>_<end_date_id>.pickle`` in the working directory.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``date_id``, ``stock_id`` and ``target``; every other column is
        fed to the model as a feature.
    start_date_id, end_date_id
        Inclusive bounds of the training window.

    Returns
    -------
    tuple
        ``(last_fold_booster, model_filename)``.
    """
    # Restrict to the requested window (inclusive on both sides).
    day = df_train['date_id']
    window = df_train[(day >= start_date_id) & (day <= end_date_id)].copy()

    # Everything except the label and the two identifier columns is an input.
    feature_names = window.drop(columns=['target', 'date_id', 'stock_id']).columns.tolist()
    X = window[feature_names].values
    y = window['target'].values

    xgboost_models, xgboost_cv_errors = [], []
    splitter = TimeSeriesSplit(n_splits=5)

    for fit_rows, holdout_rows in splitter.split(X):
        dtrain = xgb.DMatrix(X[fit_rows], label=y[fit_rows])
        dholdout = xgb.DMatrix(X[holdout_rows], label=y[holdout_rows])

        booster = xgb.train(
            {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'learning_rate': 0.01},
            dtrain,
            num_boost_round=50,
            evals=[(dholdout, 'eval')],
            early_stopping_rounds=30,
            verbose_eval=False,
        )
        xgboost_models.append(booster)
        # best_score is the eval metric ('mae') recorded by early stopping.
        xgboost_cv_errors.append(booster.best_score)

    average_mae = np.mean(xgboost_cv_errors)
    print(f"Average MAE across all folds: {average_mae}")

    # Persist the booster from the last (largest-training-set) fold.
    final_model = xgboost_models[-1]
    model_filename = f'XGBoost_{start_date_id}_{end_date_id}.pickle'
    with open(model_filename, 'wb') as f:
        pickle.dump(final_model, f)

    return final_model, model_filename


In [11]:
# Feature engineering + initial-training window
start_date, end_date = 50, 400

# Rows without a target are dropped before features are built;
# generate_features works on its own copy of the frame it is given.
df_train, feature_names = generate_features(df.dropna(subset=['target']))

initial_model, model_filename = train_initial_model(
    df_train,
    start_date_id=start_date,
    end_date_id=end_date,
)
print("Initial model trained and saved as:", model_filename)

Average MAE across all folds: 6.650049333042716
Initial model trained and saved as: XGBoost_50_400.pickle


## Continual Training

In [15]:
def incremental_training(df_train, date_ids, initial_model, retrain_day):
    """Continue boosting ``initial_model`` on all rows with ``date_id <= retrain_day``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``date_id``, ``stock_id`` and ``target``; every other column is
        used as a feature.
    date_ids
        Not used by this function. Kept only so existing positional callers
        keep working.
    initial_model
        Passed to ``xgb.train`` as ``xgb_model`` so that training continues
        from it and adds 50 more boosting rounds.
    retrain_day
        Inclusive upper bound on ``date_id`` for the training rows.

    Returns
    -------
    tuple
        ``(model, model_filename)``; the model is also pickled to
        ``XGBoost_retrain_<retrain_day>.pickle`` in the working directory.

    Raises
    ------
    ValueError
        If no row satisfies ``date_id <= retrain_day``.
    """
    # Filter data up to (and including) the retrain_day.
    # NOTE(review): there is no lower bound, so this also includes days the
    # initial model was presumably already fitted on -- confirm that
    # re-using them for the continuation rounds is intended.
    df_train_filtered = df_train[df_train['date_id'] <= retrain_day].copy()
    if df_train_filtered.empty:
        raise ValueError(f"No training rows with date_id <= {retrain_day}")

    # Prepare features and target
    feature_names = df_train_filtered.drop(columns=['target', 'date_id', 'stock_id']).columns.tolist()
    X_train = df_train_filtered[feature_names].values
    y_train = df_train_filtered['target'].values

    params = {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'learning_rate': 0.01}

    # Silence XGBoost only while this function trains. config_context restores
    # the previous global verbosity on exit, so the setting does not leak
    # into later cells the way xgb.set_config would.
    with xgb.config_context(verbosity=0):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        model = xgb.train(params, dtrain, num_boost_round=50, verbose_eval=False, xgb_model=initial_model)

    # Save the final model
    model_filename = f'XGBoost_retrain_{retrain_day}.pickle'
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)

    return model, model_filename

In [16]:
retrain_day = 401
# `date_ids` was not defined anywhere above, so this cell raised NameError on
# a fresh kernel. incremental_training does not read the argument, but it is a
# required positional parameter, so pass the sorted distinct training days.
date_ids = sorted(df_train['date_id'].unique())
final_model, model_filename = incremental_training(df_train, date_ids, initial_model, retrain_day)
print("Final model trained and saved as:", model_filename)

Final model trained and saved as: XGBoost_retrain_401.pickle


## Inference

In [17]:
def inference_next_day(df_train, model_file):
    """Predict the day after the one encoded in ``model_file`` and save a CSV.

    The last training day is read from the file name: the text after the final
    ``_`` and before the first following ``.`` (``'XGBoost_50_400.pickle'``
    gives 400). Rows with ``date_id == last_day + 1`` are scored and written,
    with an added ``prediction`` column, to ``inference_<day>.csv`` in the
    working directory.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``date_id`` and ``stock_id``. A ``target`` column is dropped
        from the model inputs when present but is not required. All remaining
        columns are passed to the model in frame order, so they must match
        the columns used at training time.
    model_file : str
        Path to a pickled model whose name ends in ``_<date_id>.<ext>``.

    Returns
    -------
    str
        Name of the CSV file that was written.

    Raises
    ------
    ValueError
        If the day cannot be parsed from ``model_file`` or if ``df_train``
        has no rows for the inference day.
    """
    # Validate the file name before touching the file system.
    day_token = model_file.split('_')[-1].split('.')[0]
    try:
        last_day = int(day_token)
    except ValueError as exc:
        raise ValueError(
            f"Cannot infer the last training day from model file name {model_file!r}; "
            "expected a name ending in '_<date_id>.pickle'"
        ) from exc
    inference_day = last_day + 1

    # SECURITY: pickle.load can execute arbitrary code. Only load model files
    # produced by this notebook or another trusted source.
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    # Filter data for inference day
    df_inference = df_train[df_train['date_id'] == inference_day].copy()
    if df_inference.empty:
        raise ValueError(f"No rows with date_id == {inference_day} to run inference on")

    # Prepare features for inference. The identifiers must be present; the
    # label is optional so unlabeled data can be scored as well.
    feature_names = (
        df_inference
        .drop(columns=['date_id', 'stock_id'])
        .drop(columns=['target'], errors='ignore')
        .columns.tolist()
    )
    X_inference = df_inference[feature_names].values

    # Perform inference
    d_inference = xgb.DMatrix(X_inference)
    predictions = model.predict(d_inference)

    # Add predictions to the DataFrame
    df_inference['prediction'] = predictions

    # Save the results
    inference_filename = f'inference_{inference_day}.csv'
    df_inference.to_csv(inference_filename, index=False)

    return inference_filename


# Score the day after the one encoded in the model file name (400 -> date_id 401).
# NOTE(review): the file name is hard-coded; it matches the model saved by the
# initial-training cell only while that cell's window stays 50..400.
inference_file = inference_next_day(df_train, 'XGBoost_50_400.pickle')
print("Inference results saved as:", inference_file)

Inference results saved as: inference_401.csv
