In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("data/HMM-SPY.csv")
features = ["Open", "Adj Close", "MA_12", "MA_21", "HMM", "lrets_bench", "lrets_strat"]
df = df[features]
df.head()

Unnamed: 0,Open,Adj Close,MA_12,MA_21,HMM,lrets_bench,lrets_strat
0,256.820007,242.321487,233.62229,238.259514,2,0.009351,0.0
1,257.559998,243.454041,234.584504,238.076622,3,0.004663,-0.0
2,256.26001,244.31279,236.014256,237.912283,2,0.003521,0.0
3,257.679993,244.407166,237.952047,237.749759,3,0.000386,-0.0
4,256.859985,242.916016,238.834431,237.456805,2,-0.00612,0.003731


In [3]:
df["lrets_bench_roll"] = df["lrets_bench"].rolling(window=10).mean()
df["TARGET"] = df["lrets_strat"].shift(1)
df.dropna(inplace=True)

In [4]:
# get non_stationary columns
non_stationaries = []
for column in df.columns:
    dftest = adfuller(df[column].values)
    p_value = dftest[1]
    t_test = dftest[0] < dftest[4]["1%"]
    if p_value > 0.05 or not t_test:
        non_stationaries.append(column)
df[non_stationaries] = np.log(df[non_stationaries])

In [5]:
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

In [6]:
#split the data
X_data = df.iloc[:, : -1].values
y_data = df.iloc[:, -1].values

In [7]:
def split_sequence(input_data, n_steps):
    X = list()
    for i in range(len(input_data)):
        end_ix = i + n_steps
        if end_ix > len(input_data)-1:
            break
        seq_x = input_data[i:end_ix, :-1]        
        X.append(seq_x)
    return np.array(X)

In [8]:
n_timesteps = 8

X = split_sequence(X_data, n_timesteps)
y = y_data[n_timesteps: ]

In [9]:
# summarize and check the X sequence
print("")
print("X summary:")
counts = 0
for i in range(len(X)):
    counts += 1
    if counts <=2:
        print(X[i])
    else:
        break

# summarize and check the y sequence
print("")
print("y summary:")
counts = 0
for i in range(len(y)):
    counts += 1
    if counts <=2:
        print(y[i])
    else:
        break


X summary:
[[0.20064088 0.1777452  0.0472111  0.         0.33333333 0.50481327
  0.53223262]
 [0.19651226 0.1803982  0.05237563 0.00496689 0.33333333 0.58226423
  0.59579465]
 [0.19242205 0.18107234 0.05665925 0.01136369 0.66666667 0.57456887
  0.59579465]
 [0.20465531 0.191766   0.06085597 0.02012078 1.         0.61353273
  0.42164448]
 [0.19334353 0.18208334 0.06354648 0.02499517 0.66666667 0.53429341
  0.63750451]
 [0.19605276 0.1803982  0.06564806 0.02923554 0.66666667 0.56539409
  0.68835771]
 [0.20206512 0.20033197 0.06962947 0.0346993  1.         0.64946563
  0.7835711 ]
 [0.21426199 0.21142959 0.07541099 0.04019253 0.66666667 0.61510364
  0.79955837]]
[[0.19651226 0.1803982  0.05237563 0.00496689 0.33333333 0.58226423
  0.59579465]
 [0.19242205 0.18107234 0.05665925 0.01136369 0.66666667 0.57456887
  0.59579465]
 [0.20465531 0.191766   0.06085597 0.02012078 1.         0.61353273
  0.42164448]
 [0.19334353 0.18208334 0.06354648 0.02499517 0.66666667 0.53429341
  0.63750451]
 [0

In [10]:
class dataset(Dataset):
    def __init__(self,X,y):
        self.X = torch.tensor(X,dtype=torch.float32)
        self.y = torch.tensor(y,dtype=torch.float32)
        self.length = self.X.shape[0]
    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]
    def __len__(self):
        return self.length

# Trainset
trainset = dataset(X, y)

# Data Loader
trainloader = DataLoader(trainset, batch_size=20, shuffle=False)

In [11]:
print(trainset.X.shape)
trainset.X[0:3]

torch.Size([835, 8, 7])


tensor([[[0.2006, 0.1777, 0.0472, 0.0000, 0.3333, 0.5048, 0.5322],
         [0.1965, 0.1804, 0.0524, 0.0050, 0.3333, 0.5823, 0.5958],
         [0.1924, 0.1811, 0.0567, 0.0114, 0.6667, 0.5746, 0.5958],
         [0.2047, 0.1918, 0.0609, 0.0201, 1.0000, 0.6135, 0.4216],
         [0.1933, 0.1821, 0.0635, 0.0250, 0.6667, 0.5343, 0.6375],
         [0.1961, 0.1804, 0.0656, 0.0292, 0.6667, 0.5654, 0.6884],
         [0.2021, 0.2003, 0.0696, 0.0347, 1.0000, 0.6495, 0.7836],
         [0.2143, 0.2114, 0.0754, 0.0402, 0.6667, 0.6151, 0.7996]],

        [[0.1965, 0.1804, 0.0524, 0.0050, 0.3333, 0.5823, 0.5958],
         [0.1924, 0.1811, 0.0567, 0.0114, 0.6667, 0.5746, 0.5958],
         [0.2047, 0.1918, 0.0609, 0.0201, 1.0000, 0.6135, 0.4216],
         [0.1933, 0.1821, 0.0635, 0.0250, 0.6667, 0.5343, 0.6375],
         [0.1961, 0.1804, 0.0656, 0.0292, 0.6667, 0.5654, 0.6884],
         [0.2021, 0.2003, 0.0696, 0.0347, 1.0000, 0.6495, 0.7836],
         [0.2143, 0.2114, 0.0754, 0.0402, 0.6667, 0.6151, 0.

In [12]:
trainset.y[:10]

tensor([ 0.0098, -0.0001,  0.0086,  0.0013, -0.0068, -0.0081,  0.0000,  0.0045,
         0.0095, -0.0046])

In [13]:
from torch import nn
from torch.autograd import Variable

class LSTM_Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTM_Net, self).__init__()        
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 5)
        self.fc2 = nn.Linear(5, output_dim)
        
    def forward(self, x):
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))                            
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
        ula, (h_out, _) = self.lstm(x, (h0, c0))
        h_out = h_out.view(-1, self.hidden_dim)
        out = self.fc1(h_out)
        out = self.fc2(out)
        
        return out

In [14]:
input_dim = X.shape[2]
hidden_dim = 100
layer_dim = 1
output_dim = 1
model = LSTM_Net(input_dim, hidden_dim, layer_dim, output_dim)


In [15]:
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())

torch.Size([400, 7])
torch.Size([400, 100])
torch.Size([400])
torch.Size([400])
torch.Size([5, 100])
torch.Size([5])
torch.Size([1, 5])
torch.Size([1])


In [16]:
epochs = 1000

criterion = torch.nn.MSELoss()    
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-1)

losses = []

iter = 0
for epoch in range(epochs):
    for i, (X_train, y_train) in enumerate(trainloader):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train.unsqueeze(dim=1))                       
        loss.backward()
        optimizer.step()
        iter += 1
        
    if iter % 100 == 0:
        print(f"Loss: {loss}")
        losses.append(loss.detach())

Loss: 6.920337796145759e-07
Loss: 3.6115384318691213e-06
Loss: 2.1946823835605755e-05
Loss: 1.209432127780019e-07
Loss: 8.741676538193133e-06
Loss: 3.0876442451699404e-06
Loss: 1.890551061478618e-06
Loss: 0.00011230882228119299
Loss: 3.2188805221267103e-07


KeyboardInterrupt: 

In [None]:
plt.plot(losses)
plt.show()