In [None]:
# Install CatBoost (requesting its `gpu` extra) into the notebook environment via pip.
!pip install catboost[gpu]



In [None]:
# Mount Google Drive so the data files below are reachable under /content/download.
from google.colab import drive

drive.mount(mountpoint='/content/download')

Drive already mounted at /content/download; to attempt to forcibly remount, call drive.mount("/content/download", force_remount=True).


In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
from scipy.special import logsumexp

from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import mean_squared_error, classification_report

In [None]:
# Read the raw CSV from the mounted drive, report its shape, and preview the last rows.
df = pd.read_csv(filepath_or_buffer="/content/download/MyDrive/1-09-1-20.csv")
print(df.shape)
df.tail(5)

In [None]:
# Column layout: three leading metadata columns, then for each side (bid, ask)
# ten consecutive (<side><n>, <side><n>vol) pairs -- 43 names in total.
new_columns = ['index', 'timestamp', 'time'] + [
    f'{side}{level}{suffix}'
    for side in ('bid', 'ask')
    for level in range(1, 11)
    for suffix in ('', 'vol')
]
df.columns = new_columns

# Mid price: arithmetic mean of the bid1 and ask1 columns.
df['mid'] = (df['bid1'] + df['ask1']) / 2

print(df.head())

In [None]:
# Build the target `mom`: direction of the mid price between each row and the
# first row whose timestamp is at least `interval_set` later.
#   1.0  -> future mid is higher than the current mid
#  -1.0  -> future mid is lower
#   0.0  -> unchanged (and always for the last row, which has no future)
#   NaN  -> either mid is missing, so no direction can be assigned

# Prediction horizon added to each timestamp (assumes timestamps are in
# milliseconds, i.e. a 5 s look-ahead -- TODO confirm units).
interval_set = 5 * 1000
df['timestamp_predict'] = df['timestamp'] + interval_set

# np.searchsorted requires `timestamp` to be sorted ascending; it returns, for
# every target timestamp, the position of the first row at or after it.
timestamps = df['timestamp'].values
predicted_timestamps = df['timestamp_predict'].values
closest_indices = np.searchsorted(timestamps, predicted_timestamps)

# Targets that fall past the end of the data are clamped to the last row.
future_positions = np.minimum(closest_indices, len(df) - 1)

# Work positionally on plain arrays: this replaces a Python-level loop over
# every row and does not depend on the frame having a 0..n-1 index.
mid_now = df['mid'].to_numpy(dtype=float)
mid_future = mid_now[future_positions]

mom = np.select(
    [mid_future > mid_now, mid_future < mid_now, mid_future == mid_now],
    [1.0, -1.0, 0.0],
    default=np.nan,  # incomparable (NaN) mids must not keep a raw price as label
)
if len(mom):
    mom[-1] = 0.0  # the last row has nothing after it; label it as "no move"
df['mom'] = mom


In [None]:
## y (`mom`) is a three-class label: -1 (mid falls), 0 (unchanged), 1 (mid rises).
display(df['mom'].describe())

In [None]:
# Column-name groups for the ten numbered bid/ask columns: prices and their
# matching "<name>vol" volume columns.
bid_cols = [f'bid{level}' for level in range(1, 11)]
bid_vol_cols = [f'{price_col}vol' for price_col in bid_cols]
ask_cols = [f'ask{level}' for level in range(1, 11)]
ask_vol_cols = [f'{price_col}vol' for price_col in ask_cols]

# Group label -> member columns; the labels become prefixes of derived features.
group_cols = dict(
    bid_cols=bid_cols,
    bid_vol_cols=bid_vol_cols,
    ask_cols=ask_cols,
    ask_vol_cols=ask_vol_cols,
)

* Additional features could include the rank of each level, i.e. which bid/ask number holds the max/min, etc.
* Features that combine the aggregated features (e.g. max bid divided by max ask).

In [None]:
# Per-row aggregate features for every column group (bid/ask prices and volumes).
for group, cols in group_cols.items():
    print(group)
    block = df[cols]

    df[f"{group}_max"] = block.max(axis=1)
    df[f"{group}_min"] = block.min(axis=1)
    # "spread" here is the ratio max / min of the group, not a difference.
    df[f"{group}_spread"] = df[f"{group}_max"].div(df[f"{group}_min"])

    # logsumexp must be taken across the group's columns for each row.
    # DataFrame.apply(logsumexp) works column-wise and yields one value per
    # column, which cannot align with the row index and left this feature NaN.
    df[f"{group}_logsumexp"] = logsumexp(block.to_numpy(dtype=float), axis=1)

    # Additional features: position (0-based) within the group of the column
    # holding the row's maximum / minimum value.
    position = {name: idx for idx, name in enumerate(cols)}
    df[f"{group}_rank_max"] = block.idxmax(axis=1).map(position)
    df[f"{group}_rank_min"] = block.idxmin(axis=1).map(position)


# Split into train and test sets, and build the model

In [None]:
# Keep only rows whose timestamp is at least `interval_set` before the final
# timestamp, i.e. rows whose look-ahead target still lies inside the data.
end_time = df['timestamp'].iloc[-1] - interval_set
df = df.loc[df['timestamp'] <= end_time]

In [None]:
# Ordered 80/20 split: the first 80% of rows form the training set and the
# remaining rows the test set (no shuffling).
split_index = int(0.8 * len(df))
train, test = df.iloc[:split_index], df.iloc[split_index:]

In [None]:
# Persist both splits to Drive.
# index=False: without it the row index is written as an unnamed first column,
# which read_csv brings back as "Unnamed: 0" and which then slips into the
# feature matrix as a row counter whose range differs between train and test.
train.to_csv("/content/download/MyDrive/download/train_hft.csv", index=False)
test.to_csv("/content/download/MyDrive/download/test_hft_nodates.csv", index=False)

In [None]:
# Reload the saved train / test splits from Drive.
train, test = map(
    pd.read_csv,
    (
        "/content/download/MyDrive/download/train_hft.csv",
        "/content/download/MyDrive/download/test_hft_nodates.csv",
    ),
)

In [None]:
import torch

# Report which torch device is available: CUDA when present, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Target is the `mom` label; features are every other column except the
# identifier / time columns listed here.
y = train['mom']
X = train.drop(columns=['mom', 'index', 'timestamp', 'time'])

In [None]:
from sklearn.model_selection import KFold

# Hyper-parameters shared by every fold's classifier.
model_params = dict(
    iterations=1000,
    depth=5,
    learning_rate=0.1,
    loss_function='MultiClass',
    verbose=False,
    task_type="GPU",
)

# Define k-fold cross-validation
# NOTE(review): the rows are time-ordered and each label looks `interval_set`
# ahead, so shuffled folds probably give optimistic scores -- consider an
# ordered split (e.g. sklearn's TimeSeriesSplit) and confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
cv_results = []  # accuracy on each fold's held-out part
models = []      # one independently fitted classifier per fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh estimator per fold: refitting and re-appending a single shared
    # object would leave `models` holding N references to the last fit only.
    model = CatBoostClassifier(**model_params)

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)
    models.append(model)

    # Evaluate the model
    score = model.score(X_test, y_test)
    cv_results.append(score)


In [None]:
print(cv_results)

[0.5957740863342134, 0.5945243711218623, 0.5951090234128952, 0.5948694667435035, 0.5956501199458568]


In [None]:
from scipy.stats import mode

# Hold-out labels and features, using the same column drops as for training.
y_test = test['mom'].tolist()
X_test = test.drop(columns=['mom', 'index', 'timestamp', 'time'])

# One column of predicted class labels per fitted model.
predictions = np.zeros((len(X_test), len(models)))
for i, model in enumerate(models):
    predictions[:, i] = np.ravel(model.predict(X_test))

# Take the mean of the predictions
mean_predictions = np.mean(predictions, axis=1)

# Final label: -1 when the mean is at or below -0.79, 1 when it is at or
# above 0.79, otherwise 0.
final_predictions = np.select(
    [mean_predictions <= -0.79, mean_predictions >= 0.79],
    [-1, 1],
    default=0,
)

print("Final Predictions:", final_predictions)

Final Predictions: [-1  1 -1 ...  1  1  1]


In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Compute the hold-out metrics first, then report them in a fixed order.
accuracy = accuracy_score(y_true=y_test, y_pred=final_predictions)
# F1 averaged over classes, weighted by each class's support.
f1 = f1_score(y_true=y_test, y_pred=final_predictions, average='weighted')
conf_matrix = confusion_matrix(y_true=y_test, y_pred=final_predictions)

print("Accuracy:", accuracy)
print("F1-Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.4431952504121045
F1-Score: 0.3674930125362931
Confusion Matrix:
 [[102104   4024 165957]
 [ 33230   3125 177506]
 [ 30965   3789 225470]]


## Feature importances


In [None]:
# `train_pool` was referenced without ever being created, so this cell failed
# with NameError on a fresh kernel. Build it from the training features/labels.
train_pool = Pool(X, y)

# Importance score per feature, paired with the feature's column name.
feature_importances = model.get_feature_importance(train_pool)
feature_names = X.columns

# Print features from most to least important, skipping scores of 0.2 or less.
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    if score > 0.2:
        print('{0}: {1:.2f}'.format(name, score))

In [None]:
import shap
shap.initjs()

# `train_pool` was referenced without ever being created, so this cell failed
# with NameError on a fresh kernel. Build it from the training features/labels.
train_pool = Pool(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_pool)

# visualize the training set predictions
# NOTE(review): the indexing below selects row 0 and (up to) the first 300
# columns, i.e. it explains a single training row, not a sample of rows. If a
# multi-row plot was intended, slice rows instead (ideally a random sample).
# NOTE(review): the model is trained with loss_function='MultiClass'; the shape
# of `shap_values` and `expected_value` for multi-class models depends on the
# installed shap version, so this call may need a per-class selection -- confirm.
shap.force_plot(explainer.expected_value,shap_values[0,:300], X.iloc[0,:300])

In [None]:
# Overview plot of the SHAP values for all features, using X for the feature values.
shap.summary_plot(shap_values, features=X)