In [25]:
import torch
import csv
import math
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset, DataLoader, sampler
import random

torch.set_default_dtype(torch.float64)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
class MyDataset(torch.utils.data.Dataset):


    def __init__(self, X, Y, output_normalized, classification):
        super().__init__()
        self.X = torch.from_numpy(X)
        self.Y = torch.from_numpy(Y)

    def __getitem__(self, index):
        x = self.X[index]
        y = self.Y[index]
        return x, y

    def __len__(self):
        return self.X.shape[0]


In [34]:
class LSTM(torch.nn.Module):
    def __init__(self, classification=False, class_number=1):
        super().__init__()
        input_size = 7

        self.BN_before = torch.nn.BatchNorm1d(10)
        self.embedding=torch.nn.LSTM(
            input_size = input_size,
            hidden_size=32,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.rnn=torch.nn.LSTM(
            input_size= 64,
            hidden_size=128,
            dropout=0.5,
            num_layers=2,
            batch_first=True
        )
        self.linear1 = torch.nn.Linear(128, 1)
        self.relu = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, class_number)

    def forward(self,x):
        out = x
#         out = self.BN_before(out)
        out, _ = self.embedding(x)
        output,(h_n,c_n) = self.rnn(out)
        out = output[:,-1,:] 
        out = self.linear1(out)
#         out = self.relu(out)
#         out = self.linear2(out)
        
        return out

In [28]:
seed = 4
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


train_epoch = 5
batch_size = 16
learning_rate = 0.0001
output_normalized = False
output_delta = True
classification = False
if classification:
    output_delta = True

In [37]:
# load the dataset
train_dataframe = read_csv('./train_data.csv', engine='python')

train_dataframe = train_dataframe.drop(columns=['Unnamed: 0', 'Time'])
# train_dataframe = train_dataframe.drop(columns=['Unnamed: 0', 'Time'])
train_ndarray = train_dataframe.values
train_dataframe

Unnamed: 0,Date,MidPrice,LastPrice,Volume,BidPrice1,BidVolume1,AskPrice1,AskVolume1
0,2018-06-01,3.7865,3.786,1081100.0,3.786,21500.0,3.787,12900.0
1,2018-06-01,3.7835,3.784,1425900.0,3.783,44200.0,3.784,119200.0
2,2018-06-01,3.7835,3.784,1485500.0,3.783,42700.0,3.784,61100.0
3,2018-06-01,3.7845,3.785,1551600.0,3.784,145900.0,3.785,16900.0
4,2018-06-01,3.7835,3.784,2063300.0,3.783,347700.0,3.784,200100.0
5,2018-06-01,3.7835,3.784,2408600.0,3.783,352600.0,3.784,70700.0
6,2018-06-01,3.7860,3.785,2494800.0,3.784,415000.0,3.788,5800.0
7,2018-06-01,3.7860,3.785,2494800.0,3.784,133200.0,3.788,85900.0
8,2018-06-01,3.7860,3.784,2611500.0,3.784,16500.0,3.788,85900.0
9,2018-06-01,3.7855,3.784,2619200.0,3.784,8800.0,3.787,80100.0


In [38]:
X, delta_Y, original_Y = [],[], []
for i in range(len(train_ndarray)-30):
    if (i+1) % 100000 == 0:
        print("{}/{}".format(i, len(train_ndarray)-30))
    tmp = train_ndarray[i:i+30]
    if tmp[0,0] != tmp[-1,0]:
        continue
    tmp = tmp[:, 1:].astype(np.float64)
    delta_ave_midPrice = np.average(tmp[10:, 0]) - tmp[9, 0]
    X.append(tmp[:10])
    delta_Y.append([float(delta_ave_midPrice), float(tmp[9, 0])])
    original_Y.append([float(np.average(tmp[10:, 0])), float(tmp[9, 0])])

X, delta_Y, original_Y = np.array(X, dtype=np.float64), np.array(delta_Y, dtype=np.float64), np.array(original_Y, dtype=np.float64)

99999/430009
199999/430009
299999/430009
399999/430009


In [39]:
if not output_delta:
    delta_Y = np.delete(delta_Y, 1, axis=0)
    orignal_Y = np.delete(original_Y, 1, axis=0)
    
if not output_delta:
    Y = original_Y
else:
    Y = delta_Y
    
if classification:
    Y = delta_Y.copy()
    for i in range(len(Y)):
        if Y[i][0] > 5e-4:
            Y[i][0] = 0
        elif Y[i][0] > -5e-4:
            Y[i][0] = 1
        else:
            Y[i][0] = 2


X_mean = np.mean(X, axis=(1, ))[:, np.newaxis, :]
X_std = np.std(X, axis=(1, ))[:, np.newaxis, :]
X_std[X_std==0] = 1
X_normalized = (X-X_mean) / X_std

train_number = int(0.8 * len(X))

# train_X, train_Y = X_normalized[-train_number:], Y[-train_number:]
# val_X, val_Y =  X_normalized[:-train_number], original_Y[:-train_number]

train_X, train_Y = X_normalized[:train_number], Y[:train_number]
val_X, val_Y =  X_normalized[train_number:], original_Y[train_number:]

# train_X, train_Y = X_normalized[:], Y[:]
# val_X, val_Y =  X_normalized[:], original_Y[:]

In [40]:
print(train_X[0])

[[ 1.36907069  2.23606798 -1.7060623   2.52357307 -0.87046968  0.62576814
  -1.1113685 ]
 [-1.12014875 -0.74535599 -1.08194394 -0.91766294 -0.71998867 -1.08087224
   0.82659856]
 [-1.12014875 -0.74535599 -0.97406269 -0.91766294 -0.72993234 -1.08087224
  -0.23262897]
 [-0.29040893  0.74535599 -0.85441587  0.22941573 -0.04580722 -0.51199211
  -1.03844406]
 [-1.12014875 -0.74535599  0.07180619 -0.91766294  1.29194909 -1.08087224
   2.30149541]
 [-1.12014875 -0.74535599  0.6968296  -0.91766294  1.32443178 -1.08087224
  -0.05761031]
 [ 0.95420079  0.74535599  0.85285919  0.22941573  1.73808883  1.19464826
  -1.24080939]
 [ 0.95420079  0.74535599  0.85285919  0.22941573 -0.12999703  1.19464826
   0.21950257]
 [ 0.95420079 -0.74535599  1.06409647  0.22941573 -0.90361527  1.19464826
   0.21950257]
 [ 0.53933088 -0.74535599  1.07803415  0.22941573 -0.95465949  0.62576814
   0.11376213]]


In [41]:
train_dataset = MyDataset(train_X, train_Y, output_normalized, classification)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size, 
    shuffle=True,
    drop_last=True
)

# build val set
val_dataset = MyDataset(val_X, val_Y, False, False)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size, 
    drop_last=True
)

net = LSTM(classification, 1).to(device)

optimizer = torch.optim.Adam(net.parameters(),lr=learning_rate)

if classification:
    loss_F =  torch.nn.CrossEntropyLoss()
else:
    loss_F = torch.nn.MSELoss()

In [42]:
for epoch in range(10): 
    net.train()
    print("epoch:", epoch)
    sum_loss = 0
    for step, input_data in enumerate(train_dataloader):
        x, y = input_data        
        x, y = x.to(device), y[:, 0].to(device)
        pred_y = net(x)
        
        pred_y = pred_y.flatten()
        loss = loss_F(pred_y, y) # 计算loss
        
        sum_loss += loss
        
        if (step+1) %1000 == 0: # 每50步，计算精度
            if output_normalized:
                print("{}/{} steps".format(step, len(train_dataloader)), float(loss), pred_y[0], float(y[0]))
            else:
                print("{}/{} steps".format(step, len(train_dataloader)), float(loss), pred_y[0], float(y[0]))
        # print(x[:2], pred_y[:2])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("epoch: {}, average train loss: {}".format(epoch, sum_loss/(len(train_dataloader)+1)))

    net.eval()
    sum_loss = 0

    for step, input_data in enumerate(val_dataloader):

        x, y = input_data
        x_cuda, y_cuda = x.to(device), y.to(device)
        base = y_cuda[:, 1]
        y_true = y_cuda[:, 0]
        pred_y = net(x_cuda)
        pred_y = pred_y.flatten()
        
        if classification:
            pred_y = torch.argmax(pred_y, dim=1).double()
            pred_y[pred_y == 0] = 5e-4
            pred_y[pred_y == 1] = -5e-4
            pred_y[pred_y == 2] = 0
            
#         result = base
        result = base + pred_y
        

        sum_loss += float(torch.sum((result - y_true) ** 2))
        if step % 500 ==0:
            print("val {}/{}".format(step, len(val_dataloader)), pred_y[0], base[0], y_true[0])

    sum_loss = math.sqrt(sum_loss / (len(val_dataloader) * batch_size))
    print("epoch: {}, average validation loss: {}".format(epoch, sum_loss))




epoch: 0
999/21380 steps 1.309699246614057e-06 tensor(-0.0007, device='cuda:0', grad_fn=<SelectBackward>) -0.0007749999999995261
1999/21380 steps 1.4210195728775925e-06 tensor(-0.0002, device='cuda:0', grad_fn=<SelectBackward>) -0.00035000000000051656
2999/21380 steps 3.8117205967528162e-06 tensor(-0.0012, device='cuda:0', grad_fn=<SelectBackward>) -0.00035000000000096065
3999/21380 steps 1.4666276923486484e-06 tensor(0.0007, device='cuda:0', grad_fn=<SelectBackward>) -0.0028000000000001357
4999/21380 steps 1.4121313803255308e-06 tensor(0.0007, device='cuda:0', grad_fn=<SelectBackward>) 0.0013999999999998458
5999/21380 steps 7.253380630442219e-07 tensor(0.0007, device='cuda:0', grad_fn=<SelectBackward>) 0.00044999999999939533
6999/21380 steps 7.20623736928299e-07 tensor(-0.0007, device='cuda:0', grad_fn=<SelectBackward>) 4.440892098500626e-16
7999/21380 steps 9.120897901201701e-07 tensor(0.0008, device='cuda:0', grad_fn=<SelectBackward>) 0.0017999999999998018
8999/21380 steps 4.6329893

KeyboardInterrupt: 

In [17]:
# load the dataset
test_dataframe = read_csv('test_data.csv', engine='python')

test_dataframe = test_dataframe.drop(columns=['Date', 'Unnamed: 0', 'Time',])
test_dataset = test_dataframe.values

In [11]:
torch.save(net.state_dict(), 'lstm_10.pkl')

In [92]:
def getFeature(x):
    output,(h_n,c_n) = net.rnn(x)
    out = output[:,-1,:]
    return out

In [18]:
print(len(test_dataset))


for i in range(0, len(test_dataset), 10):
    x_mean = np.mean(test_dataset[i:i+10], axis=(0, ))
    x_std = np.std(test_dataset[i:i+10], axis=(0, )) + 0.01
    x = (test_dataset[i:i+10] - x_mean) / x_std
    x = torch.from_numpy(x).unsqueeze(0).to(device)
    
    pred_y = net(x)
    if classification:
        pred_y = torch.argmax(pred_y, dim=1).double()
        pred_y[pred_y == 0] = 5e-4
        pred_y[pred_y == 2] = -5e-4
        pred_y[pred_y == 1] = 0
    
    result = test_dataset[i + 9, 0] + pred_y.item()
    print("{},{}".format(int(i/10)+1,float(result)))

10000
1,3.4291405025194197
2,3.431116271222207
3,3.430322923428899
4,3.426032008645647
5,3.417267879920264
6,3.4203688102141605
7,3.4186306787800778
8,3.4257191924495007
9,3.4265733345329075
10,3.4266107906698458
11,3.4236249843390834
12,3.425133869536198
13,3.4257942492611613
14,3.4274579795325653
15,3.426875142426522
16,3.4228843460356995
17,3.418749564618708
18,3.4166822925835834
19,3.413574625959804
20,3.4092982098849838
21,3.4084722553843214
22,3.4028813703105794
23,3.398428002134042
24,3.3955467651540707
25,3.3979423682748022
26,3.400152072543379
27,3.3991465296968766
28,3.400492658994874
29,3.4006911131657427
30,3.4014454170562303
31,3.3994696510834186
32,3.398993828118952
33,3.39995902131892
34,3.39863510622423
35,3.397296655912337
36,3.396069112231956
37,3.4002415348591684
38,3.399158974865559
39,3.398027919456597
40,3.3966991632817
41,3.3914908102560255
42,3.3875434927147516
43,3.3894924329674225
44,3.390335551504096
45,3.3903357329439854
46,3.3890806095731567
47,3.3879161871

539,3.2362475181569046
540,3.2382449978059253
541,3.2338713704586746
542,3.232089061547603
543,3.2283736378015084
544,3.228191810969857
545,3.2273009064646945
546,3.2246393636191115
547,3.2266706928412905
548,3.2250182240783567
549,3.2261731859883995
550,3.2207941915231912
551,3.2182271271793215
552,3.217854678236937
553,3.2210661882451084
554,3.2223514569147995
555,3.2204588286262674
556,3.2204722501989167
557,3.2244004375281796
558,3.2234759823738304
559,3.221066416379209
560,3.2162302437277157
561,3.2197620977067456
562,3.218396029048682
563,3.218904409424157
564,3.216900153432463
565,3.2137315570938547
566,3.212298129415117
567,3.2137122997020304
568,3.208434292353302
569,3.204255128835026
570,3.2015552584797233
571,3.2032041048735382
572,3.205483121691059
573,3.2065770793376465
574,3.2082281757029705
575,3.2065479490870783
576,3.2068934534851663
577,3.2087763845095156
578,3.2095539568287412
579,3.211616917121592
580,3.2103965833126042
581,3.2069748112382412
582,3.204100437514384
5

930,3.0832488879018203
931,3.083378786665089
932,3.089655947532554
933,3.093442590937173
934,3.103621898512082
935,3.118294092303544
936,3.112205307102022
937,3.1195287148281583
938,3.1132143996673336
939,3.124169770201858
940,3.1255224135462387
941,3.122845835182577
942,3.120280130663332
943,3.1244840882843543
944,3.1216519251624533
945,3.131513136977668
946,3.1295098006678366
947,3.1243531474264423
948,3.123097375582478
949,3.125762363361561
950,3.1190355513587966
951,3.11849497983227
952,3.1157903157967364
953,3.1129725044242944
954,3.1171513560645696
955,3.1168638809681743
956,3.1089055722018344
957,3.110588678777121
958,3.1106792022852727
959,3.110250365153658
960,3.1119991587911895
961,3.113625670923327
962,3.1180106515182477
963,3.124595941723572
964,3.1261584445369976
965,3.124004055690296
966,3.120919079288559
967,3.1323306345594704
968,3.132583215019254
969,3.1387091399249196
970,3.1482018296631216
971,3.146132759932768
972,3.144394362683347
973,3.151705667894672
974,3.154544

In [74]:
path = "../working/classification_result.csv"
with open(path,'w') as f:
    csv_write = csv.writer(f)
    csv_head = ["caseid","midprice"]
    csv_write.writerow(csv_head)
    
