In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class Network(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features in each input row.
        hidden_size: width of the hidden layer.
        output_size: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Keep the layer sizes available as plain attributes.
        self.input_size, self.hidden_size, self.output_size = (
            input_size,
            hidden_size,
            output_size,
        )

        # Layers are created in this order so seeded initialisation is stable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output of the second linear layer (no final activation)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    



In [4]:
df = pd.read_csv('cleaned_dataset.csv')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Keep only the rows with trading_day == 25 and the twelve price/volume columns
# listed below. Selecting by an explicit column list already leaves
# 'trading_day' out, so no separate drop is needed. .copy() detaches the result
# from `df`.
data = df.loc[
    df['trading_day'] == 25,
    [
        'AskPrice1', 'AskVolume1', 'BidPrice1', 'BidVolume1',
        'AskPrice2', 'AskVolume2', 'BidPrice2', 'BidVolume2',
        'AskPrice3', 'AskVolume3', 'BidPrice3', 'BidVolume3',
    ],
].copy()

# Positional split: the first 20,000 rows for training, the next 5,000 for testing.
# Each split is an independent copy rather than a slice of `data`, so adding
# columns to a split later does not raise SettingWithCopyWarning or depend on
# view-vs-copy behaviour.
train_data = data.iloc[:20000, :].copy()
test_data = data.iloc[20000:25000, :].copy()
train_data

Unnamed: 0,AskPrice1,AskVolume1,BidPrice1,BidVolume1,AskPrice2,AskVolume2,BidPrice2,BidVolume2,AskPrice3,AskVolume3,BidPrice3,BidVolume3
1887801,769.0,40,768.5,116,769.5,293,768.0,293,770.0,603,767.5,154
1887802,768.5,5,768.0,293,769.0,132,767.5,184,769.5,338,767.0,323
1887803,768.5,4,768.0,303,769.0,311,767.5,305,769.5,396,767.0,364
1887804,769.0,185,768.5,24,769.5,409,768.0,357,770.0,754,767.5,317
1887805,769.0,163,768.5,23,769.5,417,768.0,458,770.0,765,767.5,319
...,...,...,...,...,...,...,...,...,...,...,...,...
1907796,773.0,1077,772.5,291,773.5,1149,772.0,550,774.0,1272,771.5,542
1907797,773.0,1072,772.5,294,773.5,1149,772.0,558,774.0,1272,771.5,542
1907798,773.0,1073,772.5,295,773.5,1149,772.0,558,774.0,1272,771.5,542
1907799,773.0,1071,772.5,294,773.5,1149,772.0,559,774.0,1272,771.5,542


In [5]:
def mid_price_change_label(dataset):
    """Label each row with the direction of the next mid-price move.

    The mid price is the average of 'BidPrice1' and 'AskPrice1'. Each row's
    label describes the change from that row's mid price to the next row's:
    1 if it rises, -1 if it falls, 0 if it is unchanged. The last row has no
    successor and is labelled 0.

    Args:
        dataset: DataFrame containing 'BidPrice1' and 'AskPrice1' columns.
            It is not modified.

    Returns:
        A new DataFrame with the columns of `dataset` (minus any 'mid_price' /
        'mid_price_change' columns) plus an integer 'mid_price_change_label'
        column.
    """
    mid_price = (dataset['BidPrice1'] + dataset['AskPrice1']) / 2
    # diff() is "current minus previous"; shift(-1) moves it up one row so each
    # row carries the change that happens on the following row. The final row
    # ends up NaN.
    next_change = mid_price.diff().shift(-1)
    # NaN compares False in both directions, so the final row becomes 0.
    label = (next_change > 0).astype('int64') - (next_change < 0).astype('int64')
    # Build the result on a new frame so the caller's DataFrame keeps its
    # original columns.
    return (
        dataset
        .drop(columns=['mid_price', 'mid_price_change'], errors='ignore')
        .assign(mid_price_change_label=label)
    )


In [16]:
# Flatten each window of `window_size` consecutive rows (10 by default) into one
# wide row, with one column per (original column, lag) pair. No averaging is applied.

def transform_dataframe(df, window_size=10):
    """Flatten sliding windows of consecutive rows into single wide rows.

    Output row ``i`` holds the values of input rows ``i .. i + window_size - 1``
    laid out row by row. Columns are named ``<col>_lag_<k>``: ``lag_<window_size>``
    is the first (oldest) row of the window and ``lag_1`` is the last.

    Args:
        df: source DataFrame; every column is included in each window.
        window_size: number of consecutive rows per window.

    Returns:
        A DataFrame with ``len(df) - window_size`` rows (empty when that is not
        positive), ``window_size * len(df.columns)`` columns and a fresh
        0-based index.
    """
    columns = [
        f"{col}_lag_{lag}"
        for lag in range(window_size, 0, -1)
        for col in df.columns
    ]

    # Convert to an array once; slicing the array per window avoids repeating
    # the DataFrame -> array conversion on every iteration.
    values = df.to_numpy()

    # Stop at len(df) - window_size, not one later, so that input row
    # i + window_size always exists and can serve as the target for window i.
    rows = [
        values[start:start + window_size].flatten()
        for start in range(len(values) - window_size)
    ]

    return pd.DataFrame(rows, columns=columns)

# Display the windowed version of train_data; the result is shown, not stored.
transform_dataframe(train_data, window_size=10)


Unnamed: 0,AskPrice1_lag_10,AskVolume1_lag_10,BidPrice1_lag_10,BidVolume1_lag_10,AskPrice2_lag_10,AskVolume2_lag_10,BidPrice2_lag_10,BidVolume2_lag_10,AskPrice3_lag_10,AskVolume3_lag_10,...,BidPrice1_lag_1,BidVolume1_lag_1,AskPrice2_lag_1,AskVolume2_lag_1,BidPrice2_lag_1,BidVolume2_lag_1,AskPrice3_lag_1,AskVolume3_lag_1,BidPrice3_lag_1,BidVolume3_lag_1
0,769.0,40.0,768.5,116.0,769.5,293.0,768.0,293.0,770.0,603.0,...,768.0,70.0,769.0,379.0,767.5,342.0,769.5,461.0,767.0,412.0
1,768.5,5.0,768.0,293.0,769.0,132.0,767.5,184.0,769.5,338.0,...,768.0,134.0,769.0,389.0,767.5,353.0,769.5,461.0,767.0,413.0
2,768.5,4.0,768.0,303.0,769.0,311.0,767.5,305.0,769.5,396.0,...,768.0,189.0,769.0,392.0,767.5,366.0,769.5,461.0,767.0,439.0
3,769.0,185.0,768.5,24.0,769.5,409.0,768.0,357.0,770.0,754.0,...,768.0,359.0,769.0,409.0,767.5,326.0,769.5,466.0,767.0,434.0
4,769.0,163.0,768.5,23.0,769.5,417.0,768.0,458.0,770.0,765.0,...,768.0,294.0,769.0,417.0,767.5,324.0,769.5,473.0,767.0,442.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19985,773.0,1127.0,772.5,101.0,773.5,1162.0,772.0,565.0,774.0,1272.0,...,772.5,291.0,773.5,1149.0,772.0,550.0,774.0,1272.0,771.5,542.0
19986,773.0,1127.0,772.5,100.0,773.5,1160.0,772.0,565.0,774.0,1273.0,...,772.5,291.0,773.5,1149.0,772.0,550.0,774.0,1272.0,771.5,542.0
19987,773.0,1106.0,772.5,111.0,773.5,1160.0,772.0,569.0,774.0,1273.0,...,772.5,294.0,773.5,1149.0,772.0,558.0,774.0,1272.0,771.5,542.0
19988,773.0,1108.0,772.5,147.0,773.5,1160.0,772.0,569.0,774.0,1273.0,...,772.5,295.0,773.5,1149.0,772.0,558.0,774.0,1272.0,771.5,542.0


In [26]:
# Random forest classifier fitted on renewed_train_data.
from sklearn.ensemble import RandomForestClassifier

# Every column except the last is a feature; the last column is the target.
X = renewed_train_data.loc[:, renewed_train_data.columns[:-1]]
y = renewed_train_data.loc[:, renewed_train_data.columns[-1]]

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X, y)

# Accuracy on the same rows the model was fitted on (training accuracy),
# not a held-out estimate.
clf.score(X, y)


0.97365

In [None]:
# Convert each split in one chain: DataFrame -> NumPy array -> float tensor on `device`.
# Note that this rebinds train_data / test_data from DataFrames to tensors.
train_data = torch.from_numpy(train_data.to_numpy()).float().to(device)
test_data = torch.from_numpy(test_data.to_numpy()).float().to(device)



In [None]:
# Binary label: 1 when the mid-price change is positive, otherwise 0.
# `data` as built from the CSV has no 'mid_price_change' column, so derive it
# here with the same definition mid_price_change_label uses (next row's mid
# price minus this row's), unless an earlier cell already added the column.
if 'mid_price_change' in data.columns:
    _mid_price_change = data['mid_price_change']
else:
    _mid_price = (data['BidPrice1'] + data['AskPrice1']) / 2
    _mid_price_change = _mid_price.diff().shift(-1)
# NaN (the last row of a derived change) compares False and is labelled 0.
data['label'] = (_mid_price_change > 0).astype('int64')
