In [2]:
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis

# Remove annoying warnings lol
simplefilter(action="ignore")

low_cpu_mode = True


In [3]:

df = pd.read_csv("train.csv")
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df_shape = df.shape

In [4]:
if low_cpu_mode:
	df = df.sample(n=100000)
	print(df.shape)

(100000, 17)


In [5]:
split_day = 435
df_train = df[df["date_id"] < split_day]
df_valid = df[df["date_id"] == split_day]

# Display a message indicating offline mode and the shapes of the training and validation sets
print(f"train : {df_train.shape}, valid : {df_valid.shape}")

train : (90350, 17), valid : (213, 17)


In [33]:
def create_timeseries(df):
    # Sort the DataFrame
    df.sort_values(by=['stock_id', 'date_id', 'seconds_in_bucket'], inplace=True)

    # Create a MultiIndex from the unique values of date_id and seconds_in_bucket
    multi_index = pd.MultiIndex.from_product(
        [df['date_id'].unique(), df['seconds_in_bucket'].unique()],
        names=['date_id', 'seconds_in_bucket']
    )

    # Function to reindex each group
    def reindex_group(group):
        group.set_index(['date_id', 'seconds_in_bucket'], inplace=True)
        group = group.dropna(subset=['stock_id'])
        return group.reindex(multi_index).reset_index()

    # Group by stock_id and apply the reindexing function
    time_series_df = df.groupby('stock_id').apply(reindex_group).reset_index(drop=True)
    time_series_df = time_series_df.dropna(subset=['stock_id'])

    return time_series_df

time_series_df_train = create_timeseries(df_train)
time_series_df_valid = create_timeseries(df_valid)
time_series_df = create_timeseries(df)

# Display a message indicating offline mode and the shapes of the training and validation sets
print(f"train : {time_series_df_train.shape}, valid : {time_series_df_valid.shape}")


train : (90350, 17), valid : (213, 17)


In [7]:
import torch
import torch.nn as nn

loss = nn.L1Loss()

In [28]:
# OVERALL PREDICTION ATTEMPT

import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
import itertools

# Parameter Search
p = d = q = range(0, 5)  
pdq = list(itertools.product(p, d, q))

best_aic = float("inf")
best_pdq = None
best_model = None

for param in pdq:
    try:
        sample_df = time_series_df_train[time_series_df_train['stock_id'] == 0]
        model = ARIMA(sample_df['target'], order=param)
        results = model.fit()

        if results.aic < best_aic:
            best_aic = results.aic
            best_pdq = param
            best_model = results
    except:
        continue

print(f"Best ARIMA{best_pdq} model with AIC: {best_aic}")


total_predictions = 0
total_loss = 0

# Iterate over each stock_id
for stock_id in time_series_df_train['stock_id'].unique():
    print(f"Processing stock_id: {stock_id}")

    try:
        # Filter data for the current stock_id
        df_stock = time_series_df_train[time_series_df_train['stock_id'] == stock_id]

        # Fit ARIMA model (using differenced data if necessary)
        model = ARIMA(df_stock['target'].dropna(), order=best_pdq)
        results = model.fit()

        valid_targets = time_series_df_valid[time_series_df_valid['stock_id'] == stock_id]['target'].dropna()

        forecast = results.forecast(steps=len(valid_targets))
        
        total_predictions += 1
        mae_loss = loss(torch.tensor(forecast.values), torch.tensor(valid_targets.values)).item()
        total_loss += mae_loss

        print(f'Current MAE: {mae_loss}. Rolling MAE Loss: {total_loss / total_predictions}')
    except:
        print('Error in predicting stock')

print(f'Final MAE: {total_loss / total_predictions}')


Best ARIMA(2, 0, 3) model with AIC: 2922.9557616292586
Processing stock_id: 0.0
Current MAE: 4.105270747428987. Rolling MAE Loss: 4.105270747428987
Processing stock_id: 1.0
Current MAE: 4.9150880123501235. Rolling MAE Loss: 4.510179379889555
Processing stock_id: 2.0
Current MAE: 6.14368855058773. Rolling MAE Loss: 5.054682436788947
Processing stock_id: 3.0
Current MAE: 0.8018717618161673. Rolling MAE Loss: 3.991479768045752
Processing stock_id: 4.0
Current MAE: 4.152122369942367. Rolling MAE Loss: 4.023608288425075
Processing stock_id: 5.0
Current MAE: 3.500529195737872. Rolling MAE Loss: 3.9364284396438745
Processing stock_id: 6.0
Error in predicting stock
Processing stock_id: 7.0
Error in predicting stock
Processing stock_id: 8.0
Error in predicting stock
Processing stock_id: 9.0
Current MAE: 3.7021879345285287. Rolling MAE Loss: 3.9029655103416823
Processing stock_id: 10.0
Error in predicting stock
Processing stock_id: 11.0
Current MAE: 9.020639626734864. Rolling MAE Loss: 4.5426747

In [55]:
# INDIVIDUAL STOCK 
test_stock_id = 6


# Parameter Search
p = q = range(0, 5)
d = range(0, 1)  
pdq = list(itertools.product(p, d, q))

best_aic = float("inf")
best_pdq = None
best_model = None

for param in pdq:
    try:
        sample_df = time_series_df[time_series_df['stock_id'] == test_stock_id]
        model = ARIMA(sample_df['target'], order=param)
        results = model.fit()

        if results.aic < best_aic:
            best_aic = results.aic
            best_pdq = param
            best_model = results
    except:
        continue


print(f"Best ARIMA{best_pdq} model with AIC: {best_aic}")

n_splits = 5

# Calculate the size of each fold (using K-fold val)
df_stock = time_series_df[time_series_df['stock_id'] == test_stock_id]
n_samples = len(df_stock)
fold_size = n_samples // n_splits
scores = []

print(n_samples, fold_size)


print(f"Processing stock_id: {test_stock_id}")

for i in range(n_splits):
    train = df_stock.iloc[:i * fold_size + fold_size]
    test = df.iloc[i * fold_size + fold_size:(i + 1) * fold_size + fold_size]

    model = ARIMA(train['target'], order=best_pdq) 
    results = model.fit()

    test_targets = test['target'].dropna()
    forecast = results.forecast(steps=len(test))


    # Evaluate the model
    mae_loss = loss(torch.tensor(forecast.values), torch.tensor(test_targets.values)).item()
    print(f'Fold {i} Loss: {mae_loss}')
    scores.append(mae_loss)

# Calculate average score
average_score = np.mean(scores)
print(f"Average Score: {average_score}")

Best ARIMA(0, 0, 0) model with AIC: 3375.112843739838
473 94
Processing stock_id: 6
Fold 0 Loss: 5.014735106348673
Fold 1 Loss: 5.022173192045805
Fold 2 Loss: 3.9358587830444627
Fold 3 Loss: 4.17663872913984
Fold 4 Loss: 4.387835446347974
Average Score: 4.507448251385351


In [None]:
# Function to generate imbalance features
def imbalance_features(df):
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using Pandas eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    
    # Create features for pairwise price imbalances
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Calculate additional features
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # Replace infinite values with 0
    return df.replace([np.inf, -np.inf], 0)

# Function to generate time and stock-related features
def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  # Seconds
    df["minute"] = df["seconds_in_bucket"] // 60  # Minutes
    
    return df

# Function to generate all features by combining imbalance and other features
def generate_all_features(df):
    
    # Generate imbalance features
    df = imbalance_features(df)
    df = other_features(df)
    #gc.collect()  # Perform garbage collection to free up memory
    
    return df
