CREATE A MODEL THAT CAN PREDICT FUTURE KALSHI PRICES

Take each market in each day of data. Starting at 9:30 a.m., take 3 hours (180 minutes) of data as the input, then take the following half hour (30 minutes) as the output. Slide the window forward by 30 minutes and repeat. This yields 5 slices per day. Return two arrays: one with the five input slices and one with the five output slices.

In [8]:
import pandas as pd
import numpy as np

# Synthetic stand-in for one market-day: 390 rows whose two columns both count
# 1..390, so any slice taken from it can be checked by eye.
data = [[n, n] for n in range(1, 391)]
df = pd.DataFrame(data, columns=['Column1', 'Column2'])

def split_a_market(market_df, window_size=210, stride=30, input_size=180,
                   max_windows=5):
    """Slice one market-day into sliding (input, output) window pairs.

    Every window spans ``window_size`` consecutive rows of ``market_df``. The
    first ``input_size`` rows of a window become the model input and the
    remaining rows become the target. Consecutive windows start ``stride``
    rows apart. With the defaults this is 180 rows of input followed by 30
    rows of output (3 hours / 30 minutes if the frame holds one row per
    minute, as the notebook text describes).

    Windows that would run past the end of the frame are dropped instead of
    being emitted short, so every returned slice has the same length and the
    results always stack into regular 3-D arrays.

    Args:
        market_df: DataFrame holding one day of one market, one row per step.
        window_size: Total rows per window (input + output).
        stride: Row offset between the starts of consecutive windows.
        input_size: Rows at the start of each window used as input.
        max_windows: Cap on the number of windows returned; ``None`` returns
            every complete window.

    Returns:
        Tuple ``(input_array, output_array)`` of NumPy arrays shaped
        ``(n_windows, input_rows, n_columns)`` and
        ``(n_windows, output_rows, n_columns)``. ``n_windows`` is 0 when the
        frame is shorter than one window.
    """
    # Only start positions that leave room for a complete window.
    starts = range(0, len(market_df) - window_size + 1, stride)
    if max_windows is not None:
        starts = starts[:max_windows]

    input_array = []
    output_array = []
    for start in starts:
        window = market_df.iloc[start:start + window_size]
        input_array.append(window.iloc[:input_size].values)
        output_array.append(window.iloc[input_size:].values)

    if not input_array:
        # Keep the 3-D layout so callers can slice and concatenate as usual.
        n_cols = market_df.shape[1]
        input_rows = min(input_size, window_size)
        return (np.empty((0, input_rows, n_cols)),
                np.empty((0, window_size - input_rows, n_cols)))

    return np.array(input_array), np.array(output_array)
    

Take each CSV file and run split_a_market on it to get that day's input/output windows. Then append those windows to the combined x (inputs) and y (targets) arrays.

In [45]:
import os

filespath = '../data_storage/ml_training_data/combined_data'
# Sorted listing: os.listdir returns entries in arbitrary order, and the sample
# order feeds the seeded train/test split, so a stable order keeps runs
# reproducible.
file_list = [
    os.path.join(filespath, name)
    for name in sorted(os.listdir(filespath))
    if os.path.isfile(os.path.join(filespath, name))
]

# Collect per-file windows and concatenate once at the end; growing an array
# with np.append copies the whole array on every call.
input_chunks = []
output_chunks = []

for fp in file_list:
    try:
        df = pd.read_csv(fp)
        # The 'time' column is dropped rather than encoded as a feature.
        df = df.drop(columns=['time'])

        inputs, outputs = split_a_market(df)
        # Only the first remaining column is kept as the prediction target.
        outputs = outputs[:, :, :1]
    except (OSError, KeyError, ValueError, IndexError) as err:
        # Unreadable or malformed CSV, missing 'time' column, or a day whose
        # windows could not be stacked/sliced: skip the file, but say so.
        print(f'Skipping {fp}: {type(err).__name__}: {err}')
        continue

    input_chunks.append(inputs)
    output_chunks.append(outputs)

# The empty seed arrays fix the expected (180, 4) / (30, 1) layout and keep x
# and y well-formed even when no file could be loaded.
x = np.concatenate([np.empty((0, 180, 4)), *input_chunks], axis=0)
y = np.concatenate([np.empty((0, 30, 1)), *output_chunks], axis=0)

x.shape, y.shape

((2250, 180, 4), (2250, 30, 1))

Split the data into training (80%) and test (20%) sets

In [46]:
from sklearn.model_selection import train_test_split
import torch

X_input = x
Y_output = y

# train_test_split only indexes along the first (sample) axis, so the 3-D
# arrays are passed as they are; no flatten/reshape round trip is needed.
# NOTE(review): windows cut from the same day overlap heavily, so a random
# split puts near-duplicate samples in both sets and the test loss is likely
# optimistic. Consider splitting by day/file instead - confirm with the author.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_input, Y_output, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

# Cast to float32 so the tensors built below are float32.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
Y_train = Y_train.astype(np.float32)
Y_test = Y_test.astype(np.float32)

# torch.tensor copies the data and keeps the float32 dtype.
X_train_tensor = torch.tensor(X_train)
X_test_tensor = torch.tensor(X_test)
Y_train_tensor = torch.tensor(Y_train)
Y_test_tensor = torch.tensor(Y_test)



X_train shape: (1800, 180, 4)
X_test shape: (450, 180, 4)
Y_train shape: (1800, 30, 1)
Y_test shape: (450, 30, 1)


In [47]:
# Force both feature tensors into (samples, 180, features). The sample and
# feature counts are read from the tensors themselves, so a tensor whose middle
# dimension is already 180 comes out unchanged.
X_train_tensor = X_train_tensor.reshape(
    X_train_tensor.size(0), 180, X_train_tensor.size(2)
)
X_test_tensor = X_test_tensor.reshape(
    X_test_tensor.size(0), 180, X_test_tensor.size(2)
)

print("Training Shape:", X_train_tensor.shape, Y_train_tensor.shape)
print("Testing Shape:", X_test_tensor.shape, Y_test_tensor.shape)

Training Shape: torch.Size([1800, 180, 4]) torch.Size([1800, 30, 1])
Testing Shape: torch.Size([450, 180, 4]) torch.Size([450, 30, 1])


In [52]:
import torch
import torch.nn as nn

class FinancialLSTM(nn.Module):
    """Stacked LSTM with a linear head applied to the last ``horizon`` steps.

    The input sequence is run through a batch-first ``nn.LSTM``. The hidden
    outputs of the final ``horizon`` time steps are each mapped through the
    same linear layer, giving one ``output_size``-wide prediction per step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per output step.
        horizon: Number of trailing time steps to emit (default 30).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 horizon=30):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.horizon = horizon
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions shaped ``(batch, horizon, output_size)``.

        ``x`` is ``(batch, seq_len, input_size)``. If ``seq_len`` is shorter
        than ``horizon``, every available step is returned.
        """
        # nn.LSTM starts from zero hidden/cell state when none is supplied,
        # on the same device and dtype as its weights.
        out, _ = self.lstm(x)
        # Explicit start index: a plain ``-horizon:`` slice would return the
        # whole sequence for horizon == 0.
        start = max(out.size(1) - self.horizon, 0)
        return self.fc(out[:, start:, :])

# Hyperparameters
num_layers = 2
hidden_size = 64
input_size = 4   # features per time step (last dimension of X_train_tensor)
output_size = 1  # predicted values per time step (last dimension of Y_train_tensor)

# Model, loss and optimizer
model = FinancialLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Full-batch training: each epoch is a single gradient step on the whole
# training set.
num_epochs = 1000
for epoch in range(num_epochs):
    optimizer.zero_grad()

    outputs = model(X_train_tensor)
    loss = criterion(outputs, Y_train_tensor)

    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


Epoch [1/100], Loss: 1449.3834228515625
Epoch [2/100], Loss: 1445.1053466796875
Epoch [3/100], Loss: 1440.9029541015625
Epoch [4/100], Loss: 1436.7242431640625
Epoch [5/100], Loss: 1432.5262451171875
Epoch [6/100], Loss: 1428.2593994140625
Epoch [7/100], Loss: 1423.8709716796875
Epoch [8/100], Loss: 1419.30517578125
Epoch [9/100], Loss: 1414.5057373046875
Epoch [10/100], Loss: 1409.4154052734375
Epoch [11/100], Loss: 1403.9793701171875
Epoch [12/100], Loss: 1398.147705078125
Epoch [13/100], Loss: 1391.880615234375
Epoch [14/100], Loss: 1385.15185546875
Epoch [15/100], Loss: 1377.9569091796875
Epoch [16/100], Loss: 1370.317626953125
Epoch [17/100], Loss: 1362.2841796875
Epoch [18/100], Loss: 1353.927978515625
Epoch [19/100], Loss: 1345.3282470703125
Epoch [20/100], Loss: 1336.5552978515625
Epoch [21/100], Loss: 1327.6630859375
Epoch [22/100], Loss: 1318.6971435546875
Epoch [23/100], Loss: 1309.7037353515625
Epoch [24/100], Loss: 1300.73486328125
Epoch [25/100], Loss: 1291.841796875
Epoc

In [53]:
# Score the trained model on the held-out set; no gradients are needed here.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    test_loss = criterion(outputs, Y_test_tensor)

    # Show the first test sample: predicted sequence next to the actual one.
    output, actual = outputs[0, :, 0], Y_test_tensor[0, :, 0]
    print(output, actual)

    print(f'Test Loss: {test_loss.item()}')

tensor([10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977,
        10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977,
        10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977,
        10.6977, 10.6977, 10.6977, 10.6977, 10.6977, 10.6977]) tensor([32., 33., 33., 34., 35., 34., 32., 32., 31., 32., 31., 31., 32., 33.,
        34., 33., 34., 35., 34., 33., 34., 35., 37., 37., 37., 36., 34., 33.,
        32., 32.])
Test Loss: 959.7330932617188
