In [2]:
!pip install fastparquet
!pip install pyarrow



In [14]:
import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
device = "cuda:0" if torch.cuda.is_available() else "cpu"
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [15]:
df = pd.read_parquet('./BTC-USDT.parquet')

df.drop(['volume', 'quote_asset_volume', 'number_of_trades', 'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume'],1,inplace=True)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2347102 entries, 2017-08-17 04:00:00 to 2022-02-07 23:59:00
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   open    float32
 1   high    float32
 2   low     float32
 3   close   float32
dtypes: float32(4)
memory usage: 53.7 MB


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,open,high,low,close
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-17 04:00:00,4261.47998,4261.47998,4261.47998,4261.47998
2017-08-17 04:01:00,4261.47998,4261.47998,4261.47998,4261.47998
2017-08-17 04:02:00,4280.560059,4280.560059,4280.560059,4280.560059
2017-08-17 04:03:00,4261.47998,4261.47998,4261.47998,4261.47998
2017-08-17 04:04:00,4261.47998,4261.47998,4261.47998,4261.47998


In [18]:
def normalize_data(df):
    min_max_scaler = sklearn.preprocessing.MinMaxScaler()
    opens = df.open.values.reshape(-1,1)
    lows = df.low.values.reshape(-1,1)
    highs = df.high.values.reshape(-1,1)
    closes = df['close'].values.reshape(-1,1)
    scale = min_max_scaler.fit(opens + highs + lows + closes)
    df['open'] = scale.transform(opens)
    df['high'] = scale.transform(highs)
    df['low'] = scale.transform(lows)
    df['close'] = scale.transform(closes)
    return df

def load_data(stock, seq_len):
    data_raw = stock.values # convert to numpy array
    data = []
    
    # create all possible sequences of length seq_len
    for index in range(len(data_raw) - seq_len): 
        data.append(data_raw[index: index + seq_len])
    
    data = np.array(data); 
    test_set_size = int(np.round(test_set_size_percentage/100*data.shape[0]))
    train_set_size = data.shape[0] - test_set_size
    
    x_train = data[:train_set_size,:-1,:]
    y_train = data[:train_set_size,-1,:]
    
    x_test = data[train_set_size:,:-1,:]
    y_test = data[train_set_size:,-1,:]
    
    return [x_train, y_train, x_test, y_test]

In [19]:
valid_set_size_percentage = 10 
test_set_size_percentage = 10 
# normalize stock
df_stock_norm = df.copy()
df_stock_norm = normalize_data(df_stock_norm)

# create train, test data
seq_len = 20 # choose sequence length
x_train, y_train, x_test, y_test = load_data(df_stock_norm, seq_len)
print('x_train.shape = ',x_train.shape)
print('y_train.shape = ', y_train.shape)
print('x_test.shape = ', x_test.shape)
print('y_test.shape = ',y_test.shape)

x_train.shape =  (2112374, 19, 4)
y_train.shape =  (2112374, 4)
x_test.shape =  (234708, 19, 4)
y_test.shape =  (234708, 4)


In [12]:
class Model(nn.Module):

    def __init__(self):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size=4, hidden_size=128, num_layers=1, bidirectional=False, batch_first=True, dropout = 0.2)
        self.fn = nn.Linear(128, 4)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        batchsize, max_len, emb_dim = x.shape
        output, (h, c) = self.lstm(x)
        output = self.relu(output[:,-1,:])
        output = self.fn(output)

        return output

In [32]:
index_in_epoch = 0;
perm_array  = np.arange(x_train.shape[0])
np.random.shuffle(perm_array)

# function to get the next batch
def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array   
    start = index_in_epoch
    index_in_epoch += batch_size
    
    if index_in_epoch > x_train.shape[0]:
        np.random.shuffle(perm_array) # shuffle permutation array
        start = 0 # start next epoch
        index_in_epoch = batch_size
        
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]

# parameters
learning_rate = 0.001
batch_size = 128
n_epochs = 3
train_set_size = x_train.shape[0]
test_set_size = x_test.shape[0]

model = Model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

model.train()
cnt = 0
avg_loss = 0.
for epoch in range(int(n_epochs * train_set_size/ batch_size)):
    optimizer.zero_grad()
    x_batch, y_batch = get_next_batch(batch_size)
    x_batch = torch.tensor(x_batch).to(device)
    y_batch = torch.tensor(y_batch)
    y_pred = model(x_batch).cpu()
    loss = criterion(y_pred, y_batch)
    avg_loss += loss.item()
    loss.backward()
    optimizer.step()
    
    cnt += 1
    if cnt % 1000 == 0:
        print("batch count:{}, avg train loss:{}".format(cnt, avg_loss / 1000))
        avg_loss = 0.

torch.save(model.state_dict(), "./parameters.pt")

  "num_layers={}".format(dropout, num_layers))


batch count:1000, avg train loss:7.895523555878903e-05
batch count:2000, avg train loss:4.832209505032381e-07
batch count:3000, avg train loss:3.7643369036466365e-07
batch count:4000, avg train loss:3.028107323945761e-07
batch count:5000, avg train loss:2.689605074106538e-07
batch count:6000, avg train loss:2.476487094371649e-07
batch count:7000, avg train loss:2.1073113575642567e-07
batch count:8000, avg train loss:1.8187151296000793e-07
batch count:9000, avg train loss:1.6813274816840362e-07
batch count:10000, avg train loss:1.5091208274586876e-07
batch count:11000, avg train loss:1.545325941272324e-07
batch count:12000, avg train loss:1.2945435915412418e-07
batch count:13000, avg train loss:9.927452048863473e-08
batch count:14000, avg train loss:1.1354343699299108e-07
batch count:15000, avg train loss:1.1212307348973382e-07
batch count:16000, avg train loss:1.2857228527263232e-07
batch count:17000, avg train loss:6.218717319184108e-08
batch count:18000, avg train loss:7.933534823756

In [33]:
model = Model().to(device)
model.load_state_dict(torch.load("./parameters.pt"))
model.eval()
criterion = torch.nn.MSELoss(reduction="sum")

  "num_layers={}".format(dropout, num_layers))


In [37]:
loss = 0.
batch_size = 256
num = math.ceil(test_set_size / batch_size)
start = 0
with torch.no_grad():
  for cnt in range(num):
    x_batch = x_test[start:min(start+batch_size, test_set_size),:,:]
    y_batch = y_test[start:min(start+batch_size, test_set_size),:]
    y_pred = model(torch.tensor(x_batch).to(device)).cpu()
    loss += criterion(y_pred, torch.tensor(y_batch))
    start += batch_size
    #print(y_batch[:5,:])
    #print(y_pred[:5,:].cpu().numpy())

print("test mse loss:{}".format(loss / test_set_size))

test mse loss:2.3527253745214693e-07
