In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

%load_ext autoreload
%autoreload 2

# Notebook-level constants.
# NOTE(review): n_stocks is not referenced in the visible cells; presumably the
# number of distinct stocks in the dataset — confirm against the data.
n_stocks = 24
# Number of venue ids: compute_features builds one "venue_<i>" count column
# for every i in range(n_venues).
n_venues = 6

In [3]:
# Load the training rows and their labels from disk.
X_train = pd.read_parquet('./data/X_train.parquet')
y_train = pd.read_parquet('./data/y_train.parquet')

# Attach the labels to every row of the matching observation ("obs_id" is the join key).
df_train = X_train.merge(y_train, on="obs_id")

# Derived columns:
#   - "bid_ask_spread": ask minus bid, stored as float16 to save memory
#   - "Limit Order":    True when the event price sits exactly on the best bid or best ask
df_train["bid_ask_spread"] = (df_train["ask"] - df_train["bid"]).astype(np.float16)
df_train["Limit Order"] = df_train["price"].eq(df_train["bid"]) | df_train["price"].eq(df_train["ask"])

# Compact integer encoding for the stock identifier.
df_train["eqt_code_cat"] = df_train["eqt_code_cat"].astype(np.int16)

# Number of distinct observations in the training set.
n_obs = X_train["obs_id"].nunique(dropna=False)

In [4]:
# Load the test rows from disk.
X_test = pd.read_parquet('./data/X_test.parquet')

# df_test is an alias of X_test (not a copy): the derived columns below are
# therefore added to X_test as well.
df_test = X_test

# Derived columns, built the same way as for the training frame:
#   - "bid_ask_spread": ask minus bid, stored as float16 to save memory
#   - "Limit Order":    True when the event price sits exactly on the best bid or best ask
df_test["bid_ask_spread"] = (df_test["ask"] - df_test["bid"]).astype(np.float16)
df_test["Limit Order"] = df_test["price"].eq(df_test["bid"]) | df_test["price"].eq(df_test["ask"])

# Number of distinct observations in the test set.
n_obs_test = X_test["obs_id"].nunique(dropna=False)

In [6]:
def _merge_outlier_stat(features_df, df, outlier_masks, value_col, agg, prefix, extra_mask=None):
    """Left-merge one per-observation outlier statistic per factor into ``features_df``.

    Parameters
    ----------
    features_df : pd.DataFrame
        Feature table with one row per ``obs_id``.
    df : pd.DataFrame
        Event-level rows; must contain ``obs_id`` and ``value_col``.
    outlier_masks : list of (int, pd.Series)
        ``(factor, boolean mask over df)`` pairs; one feature column is created per pair.
    value_col : str
        Column of ``df`` that is aggregated.
    agg : str
        Name of the groupby reduction to apply (``"count"`` or ``"mean"``).
    prefix : str
        Prefix of the created column; the factor is appended to it.
    extra_mask : pd.Series of bool, optional
        Additional row filter (side / action) AND-ed with each outlier mask.

    Returns
    -------
    pd.DataFrame
        ``features_df`` with the new columns; every NaN left in the table is replaced by 0.
    """
    for factor, outlier_mask in outlier_masks:
        print("     Factor", '{:02d}'.format(factor), end="\r")
        rows = outlier_mask if extra_mask is None else (extra_mask & outlier_mask)
        grouped = df.loc[rows, ["obs_id", value_col]].groupby("obs_id", as_index=False)
        stat = getattr(grouped, agg)().rename(columns={value_col: prefix + str(factor)})
        features_df = features_df.merge(stat, on="obs_id", how="left")

    # Observations without any matching row get 0 instead of NaN.
    return features_df.fillna(0)


def compute_features(df, n_obs, nb_ticks_max=None, factors=(1, 2, 3), venue_count=None):
    """Aggregate event-level rows into one feature row per observation.

    Parameters
    ----------
    df : pd.DataFrame
        Event-level rows. The columns read are ``obs_id``, ``venue``, ``price``,
        ``bid``, ``ask``, ``bid_ask_spread``, ``bid_size``, ``ask_size``,
        ``side``, ``action`` and ``flux``.
    n_obs : int
        Number of observations; the result has one row for each ``obs_id`` in
        ``range(n_obs)``.
    nb_ticks_max : int or float, optional
        When given, only rows whose price lies within ``nb_ticks_max`` ticks
        (tick size 0.01) below the bid / above the ask are kept.
    factors : iterable of int
        Distances, in ticks, used to define outliers: a row is an outlier for
        ``factor`` when its price is at least ``factor`` ticks below the bid or
        above the ask.
    venue_count : int, optional
        Number of venue ids (``0 .. venue_count - 1``) to build count columns
        for. Defaults to the notebook-level ``n_venues`` constant.

    Returns
    -------
    pd.DataFrame
        Columns, in order: ``obs_id``; ``venue_<i>`` (row count per venue);
        the per-observation means of ``bid_ask_spread``, ``bid_size`` and
        ``ask_size``; then for every factor ``nb_outliers_<f>``,
        ``ask_price_outliers_<f>``, ``bid_price_outliers_<f>``,
        ``ask_flux_outliers_add_<f>``, ``bid_flux_outliers_add_<f>``,
        ``ask_flux_outliers_upd_<f>`` and ``bid_flux_outliers_upd_<f>``.
        Missing values are replaced by 0.
    """
    tick_size = 0.01
    if venue_count is None:
        # Fall back to the notebook-level constant, as the original cell did.
        venue_count = n_venues

    features_df = pd.DataFrame({"obs_id": range(n_obs)})

    if nb_ticks_max is not None:
        df = df[(df["price"] >= df["bid"] - nb_ticks_max*tick_size) & (df["price"] <= df["ask"] + nb_ticks_max*tick_size)]

    # NUMBER OF ROWS PER VENUE
    venues_count = (
        df[["obs_id", "venue", "price"]]
        .groupby(["obs_id", "venue"], as_index=False)
        .count()
        .rename(columns={"price": "count"})
    )
    for i in range(venue_count):
        curr_df = (
            venues_count.loc[venues_count["venue"] == i, ["obs_id", "count"]]
            .rename(columns={"count": "venue_" + str(i)})
        )
        features_df = features_df.merge(curr_df, on="obs_id", how="left")

    features_df = features_df.fillna(0)

    # PER-OBSERVATION MEANS OF THE BOOK STATE
    for column in ("bid_ask_spread", "bid_size", "ask_size"):
        curr_df = df[["obs_id", column]].groupby("obs_id", as_index=False).mean()
        features_df = features_df.merge(curr_df, on="obs_id", how="left")

    # The outlier mask only depends on the factor, so build it once per factor
    # instead of once per feature family.
    outlier_masks = [
        (factor, (df["price"] <= df["bid"] - factor*tick_size) | (df["price"] >= df["ask"] + factor*tick_size))
        for factor in factors
    ]
    is_ask = df["side"] == "A"
    is_bid = df["side"] == "B"
    is_add = df["action"] == "A"
    is_update = df["action"] == "U"

    print("Computing Number of outliers per factor...")
    # NB OUTLIERS PER FACTOR
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "price", "count", "nb_outliers_")

    print("Computing Number of outliers per factor and side...")
    print(" ASK side...")
    # ASK PRICE OF OUTLIERS PER FACTOR
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "price", "mean", "ask_price_outliers_", is_ask)

    print(" BID side...")
    # BID PRICE OF OUTLIERS PER FACTOR
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "price", "mean", "bid_price_outliers_", is_bid)

    print("Computing Number of outliers per factor and action...")
    print(" ASK side, ADD action...")
    # FLUX OF OUTLIERS PER FACTOR ASK SIDE (ADDITION)
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "flux", "mean", "ask_flux_outliers_add_", is_ask & is_add)

    print(" BID side, ADD action...")
    # FLUX OF OUTLIERS PER FACTOR BID SIDE (ADDITION)
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "flux", "mean", "bid_flux_outliers_add_", is_bid & is_add)

    # The two families below filter on action == "U"; the progress messages
    # used to say "DELETE", which did not match the filter or the column names.
    print(" ASK side, UPDATE action...")
    # FLUX OF OUTLIERS PER FACTOR ASK SIDE (UPDATE)
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "flux", "mean", "ask_flux_outliers_upd_", is_ask & is_update)

    print(" BID side, UPDATE action...")
    # FLUX OF OUTLIERS PER FACTOR BID SIDE (UPDATE)
    features_df = _merge_outlier_stat(features_df, df, outlier_masks, "flux", "mean", "bid_flux_outliers_upd_", is_bid & is_update)

    return features_df

In [7]:
# Use the same outlier distances (in ticks) for both splits so that the
# train and test feature tables have identical columns.
outlier_factors = [1, 5, 10, 50, 100]
features_train_df = compute_features(df_train, n_obs, factors=outlier_factors)
features_test_df = compute_features(df_test, n_obs_test, factors=outlier_factors)

Computing Number of outliers per factor...
Computing Number of outliers per factor and side...
 ASK side...
 BID side...100
Computing Number of outliers per factor and action...
 ASK side, ADD action...
 BID side, ADD action...
 ASK side, DELETE action...
 BID side, DELETE action...
Computing Number of outliers per factor...
Computing Number of outliers per factor and side...
 ASK side...
 BID side...100
Computing Number of outliers per factor and action...
 ASK side, ADD action...
 BID side, ADD action...
 ASK side, DELETE action...
 BID side, DELETE action...
     Factor 100

In [9]:
# Save features
path = "features/"
model_prefix = "model "
# Make sure the root folder exists so the cell also works on a fresh checkout.
os.makedirs(path, exist_ok=True)
# Names of the folders directly under `path` (no recursion into sub-folders).
folder_names = [entry.name for entry in os.scandir(path) if entry.is_dir()]
# Parse the whole numeric suffix of every "model <n>" folder. Reading only the
# last character would turn "model 10" into 0 and make the next mkdir collide.
model_indices = [
    int(name[len(model_prefix):])
    for name in folder_names
    if name.startswith(model_prefix) and name[len(model_prefix):].isdecimal()
]
# No existing model folder -> start numbering at 1.
last_model_idx = max(model_indices, default=0)
# Create the new folder; os.mkdir raises if it unexpectedly already exists,
# so an earlier export is never overwritten silently.
new_folder_name = os.path.join(path, model_prefix + str(last_model_idx + 1))
os.mkdir(new_folder_name)
features_train_df.to_parquet(os.path.join(new_folder_name, "features_train.parquet"))
features_test_df.to_parquet(os.path.join(new_folder_name, "features_test.parquet"))