In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Train on these
normal_market_path = "training_data_normal_market.parquet"
stressed_market_path = "training_data_stressed_market.parquet"
hft_dominated_path = "training_data_hft_dominated.parquet"

# Just to visualize
mini_flash_crash_path = "training_data_mini_flash_crash.parquet"
flash_crash_path = "training_data_flash_crash.parquet"

# Master file
master_path = "master_training_data.parquet"

In [2]:
# Concatting the files that we will actually train on
files = [
    normal_market_path,
    stressed_market_path,
    hft_dominated_path,
]

df = pd.read_parquet(master_path)

### Visualizing the data

In [13]:
normal_market_df = pd.read_parquet(normal_market_path)

print(normal_market_df.head(n=100))

    step    bid     ask     mid  spread       scenario     timestamp
0      0    0.0     0.0    0.00     0.0  normal_market  1.768680e+09
1      1  999.9  1000.0  999.95     0.1  normal_market  1.768680e+09
2      2  999.9  1000.0  999.95     0.1  normal_market  1.768680e+09
3      3  999.8  1000.0  999.90     0.2  normal_market  1.768680e+09
4      4  999.8   999.9  999.85     0.1  normal_market  1.768680e+09
..   ...    ...     ...     ...     ...            ...           ...
95    95  999.8   999.9  999.85     0.1  normal_market  1.768680e+09
96    96  999.8   999.9  999.85     0.1  normal_market  1.768680e+09
97    97  999.8   999.9  999.85     0.1  normal_market  1.768680e+09
98    98  999.8   999.9  999.85     0.1  normal_market  1.768680e+09
99    99  999.8   999.9  999.85     0.1  normal_market  1.768680e+09

[100 rows x 7 columns]


In [3]:
stressed_market_df = pd.read_parquet(stressed_market_path)

print(stressed_market_df.head(n=100))

    step    bid     ask      mid  spread         scenario     timestamp
0      0    0.0     0.0     0.00     0.0  stressed_market  1.768680e+09
1      1  997.0  1003.0  1000.00     6.0  stressed_market  1.768680e+09
2      2  997.4  1000.0   998.70     2.6  stressed_market  1.768680e+09
3      3  996.8   998.7   997.75     1.9  stressed_market  1.768680e+09
4      4  997.6   997.9   997.75     0.3  stressed_market  1.768680e+09
..   ...    ...     ...      ...     ...              ...           ...
95    95  993.6   994.0   993.80     0.4  stressed_market  1.768680e+09
96    96  993.8   994.0   993.90     0.2  stressed_market  1.768680e+09
97    97  993.9   994.3   994.10     0.4  stressed_market  1.768680e+09
98    98  993.9   994.4   994.15     0.5  stressed_market  1.768680e+09
99    99  993.9   995.8   994.85     1.9  stressed_market  1.768680e+09

[100 rows x 7 columns]


In [4]:
hft_dominated_df = pd.read_parquet(hft_dominated_path)

print(hft_dominated_df.head(n=100))

    step    bid     ask      mid  spread       scenario     timestamp
0      0    0.0     0.0     0.00     0.0  hft_dominated  1.768680e+09
1      1  999.7  1000.3  1000.00     0.6  hft_dominated  1.768680e+09
2      2  999.5  1000.5  1000.00     1.0  hft_dominated  1.768680e+09
3      3  999.6  1000.0   999.80     0.4  hft_dominated  1.768680e+09
4      4  999.6  1000.0   999.80     0.4  hft_dominated  1.768680e+09
..   ...    ...     ...      ...     ...            ...           ...
95    95  999.0   999.6   999.30     0.6  hft_dominated  1.768680e+09
96    96  999.3   999.4   999.35     0.1  hft_dominated  1.768680e+09
97    97  999.3   999.5   999.40     0.2  hft_dominated  1.768680e+09
98    98  999.3   999.6   999.45     0.3  hft_dominated  1.768680e+09
99    99  999.3   999.5   999.40     0.2  hft_dominated  1.768680e+09

[100 rows x 7 columns]


In [5]:
mini_flash_crash_df = pd.read_parquet(mini_flash_crash_path)

print(mini_flash_crash_df.head(n=100))

    step     bid     ask      mid  spread          scenario     timestamp
0      0     0.0     0.0     0.00     0.0  mini_flash_crash  1.768680e+09
1      1   999.9  1000.1  1000.00     0.2  mini_flash_crash  1.768680e+09
2      2   999.9  1000.1  1000.00     0.2  mini_flash_crash  1.768680e+09
3      3  1000.0  1001.2  1000.60     1.2  mini_flash_crash  1.768680e+09
4      4  1000.0  1002.2  1001.10     2.2  mini_flash_crash  1.768680e+09
..   ...     ...     ...      ...     ...               ...           ...
95    95  1004.3  1005.6  1004.95     1.3  mini_flash_crash  1.768680e+09
96    96  1004.9  1005.7  1005.30     0.8  mini_flash_crash  1.768680e+09
97    97  1005.3  1005.7  1005.50     0.4  mini_flash_crash  1.768680e+09
98    98  1005.1  1005.7  1005.40     0.6  mini_flash_crash  1.768680e+09
99    99  1004.3  1005.4  1004.85     1.1  mini_flash_crash  1.768680e+09

[100 rows x 7 columns]


In [6]:
flash_crash_df = pd.read_parquet(flash_crash_path)

print(flash_crash_df.head(n=100))

    step     bid     ask      mid  spread     scenario     timestamp
0      0     0.0     0.0     0.00     0.0  flash_crash  1.768680e+09
1      1   999.8  1000.2  1000.00     0.4  flash_crash  1.768680e+09
2      2   999.9  1000.1  1000.00     0.2  flash_crash  1.768680e+09
3      3  1000.0  1000.1  1000.05     0.1  flash_crash  1.768680e+09
4      4   999.9  1000.2  1000.05     0.3  flash_crash  1.768680e+09
..   ...     ...     ...      ...     ...          ...           ...
95    95  1000.0  1000.4  1000.20     0.4  flash_crash  1.768680e+09
96    96  1000.1  1000.3  1000.20     0.2  flash_crash  1.768680e+09
97    97  1000.1  1000.3  1000.20     0.2  flash_crash  1.768680e+09
98    98   999.9  1000.5  1000.20     0.6  flash_crash  1.768680e+09
99    99  1000.0  1000.4  1000.20     0.4  flash_crash  1.768680e+09

[100 rows x 7 columns]


In [3]:
# Remove invalid rows
df = df[(df["bid"] > 0) & (df["ask"] > 0) & (df["mid"] > 0)]

# Encode labels
le = LabelEncoder()
df["y"] = le.fit_transform(df["scenario"])

# Build X and y
X = df.drop(columns=["scenario", "y", "timestamp", "step"])
y = df["y"]

print(dict(zip(le.classes_, le.transform(le.classes_))))

TypeError: string indices must be integers, not 'str'

In [9]:
# XGBoost 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softmax",
    num_class=3
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=le.classes_))

                 precision    recall  f1-score   support

  hft_dominated       0.96      0.99      0.97      9000
  normal_market       1.00      1.00      1.00      9000
stressed_market       0.94      0.83      0.88      2175

       accuracy                           0.98     20175
      macro avg       0.97      0.94      0.95     20175
   weighted avg       0.98      0.98      0.97     20175

