# TradeIn-TradeOut

Predicting the closing price movements of hundreds of Nasdaq-listed stocks using data from each stock's order book and closing auction.

In [None]:
import pandas as pd
from strictyaml import load, YAMLError, Map, Str
from collections import OrderedDict

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn import neighbors
from sklearn import ensemble

Class that allows one to load and save datasets.

In [None]:
class FileOperations:
    """Static helpers for reading the raw CSV dataset and writing a prepared one."""

    @staticmethod
    def load_raw_dataset(filepath: str) -> pd.DataFrame:
        """Read the raw CSV at ``filepath`` with an explicit dtype per column.

        Identifier/counter columns are parsed as nullable ``Int64``, the
        price/size/target columns as ``float32`` and ``row_id`` as ``string``.
        """
        integer_columns = ("stock_id", "date_id", "seconds_in_bucket", "time_id")
        float_columns = (
            "imbalance_size",
            "imbalance_buy_sell_flag",
            "reference_price",
            "matched_size",
            "far_price",
            "near_price",
            "bid_price",
            "bid_size",
            "ask_price",
            "ask_size",
            "wap",
            "target",
        )

        # Assemble the column -> dtype mapping group by group.
        dtypes = dict.fromkeys(integer_columns, "Int64")
        dtypes.update(dict.fromkeys(float_columns, "float32"))
        dtypes["row_id"] = "string"

        return pd.read_csv(filepath, dtype=dtypes)

    @staticmethod
    def save_prep_dataset(df: pd.DataFrame, filepath: str) -> None:
        """Write ``df`` to ``filepath`` as CSV using the pandas defaults.

        With the defaults the DataFrame index is written as the first column.
        """
        df.to_csv(filepath)

Class that allows one to import Configuration Parameters from config.yaml

In [None]:
# Expected layout of config.yaml: four required string entries.
schema = Map({
    key: Str()
    for key in ("inputDirectory", "inputFile", "outputDirectory", "outputFile")
})

try:
    # The context manager closes the handle when the block ends.
    with open("./config.yaml", "r") as file:
        configDataMap: OrderedDict = load(yaml_string=file.read(), schema=schema).data

except YAMLError as error:
    # The configuration could not be parsed/validated: report it and stop.
    print(error)
    exit()


class ConfigReader:
    """Expose the entries of the loaded configuration as instance attributes."""

    def __init__(self):
        # Each attribute mirrors the same-named key of configDataMap;
        # a key that is absent yields None (dict.get semantics).
        for key in ("inputDirectory", "inputFile", "outputDirectory", "outputFile"):
            setattr(self, key, configDataMap.get(key))

Class that provides preprocessing functionality. This includes:
1. Removing null values from specified columns.
2. Filling null values with user specified values or the mean values.
3. Get the number of null values per column.

In [None]:
class Preprocess:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def get_null_values_per_column(self):
        print(self.df.isnull().sum())
        return self

    def remove_null_values(self, column: str):
        self.df = self.df.dropna(subset=[column])
        return self

    def fill_null_values(self, column: list[str] = None, value: int | float = None, type_fill: str = None):
        if type_fill is None:
            self.df.loc[:, column] = self.df[column].fillna(value)
            return self

        elif type_fill == 'mean':
            input_cols = [c for c in self.df.columns if c != "row_id"]
            output_cols = [c for c in self.df.columns if c != "row_id"]

            self.impute_columns(input_cols, output_cols)
            return self

    def impute_columns(self, input_cols, output_cols):
        for input_col, output_col in zip(input_cols, output_cols):
            mean_value = self.df[input_col].mean()

            self.df.loc[:, output_col] = self.df[input_col].fillna(mean_value)
        return self

Class that engineers new features from the dataset. These include:
1. Spread: The gap between the Bid and Ask prices.
2. Imbalance Ratio: Ratio of the imbalance size to the matched sizes.
3. Volume: The sum of the bid and ask sizes.
4. Mid-Price: The midpoint between the bid price and the ask price.
5. Liquidity Imbalance: The ratio of the difference in the bid and ask size to the total volume.
6. Match Ratio: The ratio of the matched size to the total volume.
7. Minutes: The seconds in the bucket converted to minutes 

In [None]:
class FeatureEngineer:
    """Chainable builder that appends derived columns to a copy of a DataFrame.

    Each public method adds one column and returns ``self``; the resulting
    frame is available as ``.df``.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        # Work on a private copy so the caller's frame is never modified.
        self.df = df.copy()

    def _store(self, name, values):
        """Write ``values`` into column ``name`` and return ``self`` for chaining."""
        self.df.loc[:, name] = values
        return self

    def generate_spread(self):
        """Add ``spread``: ``bid_price`` minus ``ask_price``."""
        frame = self.df
        return self._store('spread', frame['bid_price'].sub(frame['ask_price']))

    def generate_imbalance_ratio(self):
        """Add ``imbalance_ratio``: ``imbalance_size`` divided by ``matched_size``."""
        frame = self.df
        return self._store('imbalance_ratio', frame['imbalance_size'].div(frame['matched_size']))

    def generate_volume(self):
        """Add ``volume``: ``bid_size`` plus ``ask_size``."""
        frame = self.df
        return self._store('volume', frame['bid_size'].add(frame['ask_size']))

    def generate_mid_price(self):
        """Add ``mid_price``: the average of ``ask_price`` and ``bid_price``."""
        frame = self.df
        price_sum = frame['ask_price'].add(frame['bid_price'])
        return self._store('mid_price', price_sum.div(2))

    def liquidity_imbalance(self):
        """Add ``liquidity_imbalance``: (bid_size - ask_size) / (bid_size + ask_size)."""
        frame = self.df
        size_gap = frame['bid_size'].sub(frame['ask_size'])
        size_total = frame['bid_size'].add(frame['ask_size'])
        return self._store('liquidity_imbalance', size_gap.div(size_total))

    def match_ratio(self):
        """Add ``matched_ratio``: ``matched_size`` / (bid_size + ask_size)."""
        frame = self.df
        size_total = frame['bid_size'].add(frame['ask_size'])
        return self._store('matched_ratio', frame['matched_size'].div(size_total))

    def generate_minutes(self):
        """Add ``minutes``: ``seconds_in_bucket`` floor-divided by 60."""
        return self._store('minutes', self.df['seconds_in_bucket'].floordiv(60))

Function that splits the dataset into training and test sets using an 80/20 split.


In [None]:
def split_data(df: pd.DataFrame):
    """Split ``df`` 80/20 into train and test parts and separate the target.

    The split uses a fixed ``random_state`` of 42. Returns
    ``(train_x, train_y, test_x, test_y)`` where the ``*_y`` values are the
    ``target`` column and the ``*_x`` values are the remaining columns.
    """
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

    def separate(part):
        # (features without the target column, target column)
        return part.drop(columns='target'), part['target']

    train_x, train_y = separate(train_part)
    test_x, test_y = separate(test_part)

    return train_x, train_y, test_x, test_y


def to_pandas(iterator):
    """Yield a single DataFrame built from all items of ``iterator``."""
    frame = pd.DataFrame(iterator)
    yield frame

Main program

In [None]:
# End-to-end run: load the raw data, clean it, engineer features, split it
# 80/20 and compare three regressors by mean squared error.
fileOperator = FileOperations()
configOperator = ConfigReader()

# The path is formed by plain concatenation, so inputDirectory must already
# end with a path separator.
rawDataFrame = fileOperator.load_raw_dataset(configOperator.inputDirectory + configOperator.inputFile)

# Cleaning: rows without a target are dropped, missing far/near prices become
# 0.0, and every remaining null is replaced by its column mean.
processedDataFrame = \
    Preprocess(rawDataFrame) \
        .get_null_values_per_column() \
        .remove_null_values('target') \
        .fill_null_values(['far_price', 'near_price'], 0.0) \
        .fill_null_values(type_fill="mean") \
        .df

# Where the buy/sell flag is 0 the imbalance size is forced to 0.0.
processedDataFrame.loc[processedDataFrame['imbalance_buy_sell_flag'] == 0.0, 'imbalance_size'] = 0.0

engineeredDataFrame = \
    FeatureEngineer(processedDataFrame) \
        .generate_spread() \
        .generate_imbalance_ratio() \
        .generate_volume() \
        .generate_mid_price() \
        .liquidity_imbalance() \
        .match_ratio() \
        .generate_minutes() \
        .df

print("\nRaw DataFrame Example Data")
print(rawDataFrame.head())

print("\nNull Values in Raw DataFrame")
Preprocess(rawDataFrame).get_null_values_per_column()

print("\nData Frame After Feature Engineering")
print(engineeredDataFrame.head())

print("\nNow splitting the dataset into Training and Test Sets based on a 80/20 split.")
# row_id is a string identifier; scikit-learn estimators only accept numeric
# feature matrices, so it is removed before the data reaches the models.
# errors='ignore' keeps this working if the column is already absent.
modelDataFrame = engineeredDataFrame.drop(columns=['row_id'], errors='ignore')
trainX, trainY, crossValX, crossValY = split_data(modelDataFrame)


print("Training the following Machine Learning Models:")
print("1. Linear Regression")
# Ridge (L2-regularised linear regression) is the linear model used here.
linearRegModel = linear_model.Ridge(alpha=.5)
linearRegModel.fit(trainX, trainY)
linearTarget = linearRegModel.predict(crossValX)
lTError = mean_squared_error(crossValY, linearTarget)

# print("2. Elastic Net")
# elasticNetModel = linear_model.ElasticNet(alpha=.5, tol=0.001)
# elasticNetModel.fit(trainX, trainY)
# elasticNetTarget = elasticNetModel.predict(crossValX)
# eNTError = mean_squared_error(crossValY, elasticNetTarget)

print("3. K Nearest Neighbors")
knnModel = neighbors.KNeighborsRegressor(n_neighbors=5, weights="uniform", n_jobs=5)
knnModel.fit(trainX, trainY)
knnModelTarget = knnModel.predict(crossValX)
kMTError = mean_squared_error(crossValY, knnModelTarget)

print("4. Random Forest")
# 24 trees, each fitted on a 70% sample of the training rows.
randomForestModel = ensemble.RandomForestRegressor(24, max_samples=0.7, n_jobs=5)
randomForestModel.fit(trainX, trainY)
randomForestTarget = randomForestModel.predict(crossValX)
rFTError = mean_squared_error(crossValY, randomForestTarget)

print("Here are the generated statistics for the tested algorithms on the cross validation set.")
print("MSE for Linear Regression:", lTError)
print("MSE for K Nearest Neighbors:", kMTError)
print("MSE for Random Forest:", rFTError)

# fileOperator.save_prep_dataset(engineeredDataFrame, configOperator.outputDirectory + configOperator.outputFile)

# References
- https://www.kaggle.com/code/hrhuynguyen/eda-for-training-dataset
- https://www.kaggle.com/code/yekenot/feature-elimination-by-catboost
- https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/453609
- https://www.kaggle.com/code/sohier/optiver-2023-basic-submission-demo
- https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/443396
- https://www.kaggle.com/code/zulqarnainali/explained-singel-model-optiver
- https://www.kaggle.com/code/verracodeguacas/fold-cv
- https://www.kaggle.com/code/cv13j0/optiver-ml-trading-at-the-close
- https://www.kaggle.com/code/jirkaborovec/optiver-eda-pytorch-regression
- https://www.kaggle.com/code/aniketkolte04/optiver-2023-eda-pytorch-lstm-attention-model
- https://francescobranda.netlify.app/post/distributed_deep_learning/
- https://github.com/maxpumperla/elephas#basic-spark-integration