# Guess 3 days' movement
# For CN stock market

In [1]:
import pandas as pd
from torch.optim import Adam
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils import rnn
import math
import numpy as np
import torch
from ray.matchbox import DF_Dataset,Trainer

#### Obvious improvements to make

* More features(only five for now)
* Deeper layers in LSTM
* Bidirectional LSTM
* Attentional RNN
* Try GRU instead of LSTM (use nn.GRU)
* Put on other metrics other than Mean Square Error

In [2]:
data_df = pd.read_csv("/data/stock_hist_data.csv")
data_df.sample(10)

Unnamed: 0,sn,ts_code,trade_date,open,high,low,close,pre_close,change,pct_change,vol,amount,indu,indu_en
6869938,868,600505.SH,20150407,11.9,12.39,11.75,12.2,11.79,0.41,3.48,219963.83,264456.47,水力发电,Hydroelectric power generation
7291021,2630,000798.SZ,20070703,6.05,6.14,5.6,5.91,6.03,-0.12,-1.99,38017.04,22256.3768,渔业,fishery
907333,641,600839.SH,20160311,3.57,3.62,3.54,3.59,3.61,-0.02,-0.55,542643.63,193820.835,家用电器,Household appliances
7795884,372,601788.SH,20170419,14.91,14.93,14.63,14.77,14.94,-0.17,-1.14,181845.85,267951.776,证券,Securities
2805154,2223,000576.SZ,20090105,2.84,2.93,2.8,2.93,2.79,0.14,5.02,79376.91,22729.7654,造纸,papermaking
2106634,2790,002081.SZ,20070118,30.15,31.43,28.96,30.1,29.91,0.19,0.64,10062.1,29805.594,装修装饰,decoration decoration
6946605,1549,002134.SZ,20111111,9.86,9.99,9.68,9.7,9.86,-0.16,-1.62,30484.97,29999.7885,元器件,components
590093,214,002736.SZ,20171207,12.0,12.01,11.79,11.79,12.03,-0.24,-2.0,60164.27,71462.78,证券,Securities
2960929,497,002681.SZ,20160608,16.8,17.22,16.7,16.77,16.97,-0.2,-1.18,153983.04,260673.4279,家用电器,Household appliances
4882645,3518,000571.SZ,20020905,5.01,5.04,4.93,4.96,5.01,-0.05,-1.0,16590.47,8263.8751,综合类,Miscellaneous


In [3]:
grouped_stock = list(data_df.groupby("ts_code"))

In [4]:
data_df["o_h"] = (data_df.high - data_df.open)
data_df["o_c"] = (data_df.close - data_df.open)
data_df["o_l"] = (data_df.low - data_df.open)
data_df["h_c"] = (data_df.close - data_df.high)
data_df["l_c"] = (data_df.close - data_df.low)
data_df["l_h"] = (data_df.high - data_df.low)

In [5]:
data_df["oh_p"] = data_df.o_h/data_df.l_h
data_df["oc_p"] = data_df.o_c/data_df.l_h
data_df["ol_p"] = data_df.o_l/data_df.l_h
data_df["hc_p"] = data_df.h_c/data_df.l_h
data_df["lc_p"] = data_df.l_c/data_df.l_h

In [6]:
data_df = data_df[["sn","ts_code","trade_date","oc_p","ol_p","lc_p","oc_p","hc_p","pct_change"]]

data_df = data_df.fillna(0.)

In [7]:
data_df.head(10)

Unnamed: 0,sn,ts_code,trade_date,oc_p,ol_p,lc_p,oc_p.1,hc_p,pct_change
0,0,603208.SH,20181026,0.172414,-0.413793,0.586207,0.172414,-0.413793,0.9291
1,1,603208.SH,20181025,0.218182,-0.436364,0.654545,0.218182,-0.345455,-2.1531
2,2,603208.SH,20181024,-0.068182,-0.181818,0.113636,-0.068182,-0.886364,-0.8539
3,3,603208.SH,20181023,-0.461538,-0.897436,0.435897,-0.461538,-0.564103,-0.8467
4,4,603208.SH,20181022,0.759259,-0.037037,0.796296,0.759259,-0.203704,4.5231
5,5,603208.SH,20181019,0.575,-0.25,0.825,0.575,-0.175,1.4464
6,6,603208.SH,20181018,0.355556,-0.622222,0.977778,0.355556,-0.022222,-0.4469
7,7,603208.SH,20181017,-0.15625,-0.984375,0.828125,-0.15625,-0.171875,0.7
8,8,603208.SH,20181016,-0.605263,-1.0,0.394737,-0.605263,-0.605263,-2.7237
9,9,603208.SH,20181015,0.058824,-0.602941,0.661765,0.058824,-0.338235,-1.9552


In [8]:
ts_code_ct = data_df.groupby("ts_code").count()[["sn"]]

In [9]:
ts_code_ct["sn_ct"] = ts_code_ct.sn.apply(lambda x:x>30)

In [10]:
ts_code_ct.sample(5)

Unnamed: 0_level_0,sn,sn_ct
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1
300419.SZ,826,True
000023.SZ,4000,True
002876.SZ,346,True
000528.SZ,4000,True
002305.SZ,2097,True


In [11]:
ts_code_ct = ts_code_ct.sort_values(by = "sn",ascending=False)
valid_ts_code = ts_code_ct[ts_code_ct.sn_ct]

In [12]:
valid_ts_code.tail(5)

Unnamed: 0_level_0,sn,sn_ct
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1
603590.SH,39,True
002933.SZ,38,True
603192.SH,38,True
601068.SH,35,True
002935.SZ,34,True


In [13]:
first = data_df.reset_index().groupby("ts_code").first()[["index"]].rename(columns={"index":"first"})
last = data_df.reset_index().groupby("ts_code").last()[["index"]].rename(columns={"index":"last"})

In [14]:
valid_ts_code = valid_ts_code.join(first).join(last)

In [15]:
valid_ts_code.head()

Unnamed: 0_level_0,sn,sn_ct,first,last
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000001.SZ,4000,True,4128141,4132140
600097.SH,4000,True,1276082,1280081
600112.SH,4000,True,2742589,2746588
600111.SH,4000,True,5101362,5105361
600110.SH,4000,True,2281116,2285115


In [16]:
dict(valid_ts_code.loc["600112.SH"])

{'sn': 4000, 'sn_ct': True, 'first': 2742589, 'last': 2746588}

In [17]:
def xy_pre_row(loc,stop):
    basic = data_df[loc["first"]:loc["first"]+stop]
    arr = basic[["oc_p","ol_p","lc_p","oc_p","hc_p"]].values
    arr_x,arr_y = arr[:-5,:],arr[-5:,:]
    return torch.FloatTensor(arr_x),torch.FloatTensor(arr_y)

def xy_pre(df):
    res_x,res_y = [],[]
    for i in range(len(df.index)):
        idx = df.index[i]
        loc_dict = dict(df.loc[idx])
        if i== 0:
            maxlen = loc_dict["last"]-loc_dict["first"]
            stop = min(maxlen,800)-int(np.random.rand()*.3*min(maxlen,800))
        x,y = xy_pre_row(loc_dict,stop)
        res_x.append(x)
        res_y.append(y)
    tensor_x = rnn.pad_sequence(res_x,batch_first=True,)
    tensor_y = rnn.pad_sequence(res_y,batch_first=True)
    return tensor_x,tensor_y

def x_pre_row(loc,stop):
    basic = data_df[loc["first"]:loc["first"]+stop]
    arr = basic[["oc_p","ol_p","lc_p","oc_p","hc_p"]].values
    return torch.FloatTensor(arr)

def x_pre(df):
    res_x,res_y = [],[]
    for i in range(len(df.index)):
        idx = df.index[i]
        loc_dict = dict(df.loc[idx])
        if i== 0:
            maxlen = loc_dict["last"]-loc_dict["first"]
            stop = min(maxlen,800)-int(np.random.rand()*.3*min(maxlen,800))
        x = x_pre_row(loc_dict,stop)
        res_x.append(x)
    tensor_x = rnn.pad_sequence(res_x,batch_first=True,)
    return tensor_x

def stock_collate(batch):
    arrs,_ = zip(*batch)
    arrs_x,arrs_y = zip(*arrs)
#     print(arrs_x)
    return arrs_x,arrs_y

In [18]:
ds = DF_Dataset(valid_ts_code,x_pre,lambda x:0,bs=12,shuffle=False)

In [19]:
# dl = DataLoader(ds,batch_size=1,shuffle=True,collate_fn=stock_collate)
dl = DataLoader(ds,batch_size=1,shuffle=True)

In [20]:
gen = iter(dl)
for i in range(1):
    x,y = next(gen)
    print(x[0],y[0])

tensor([[[-0.5750, -0.5750, -0.8250,  ..., -0.5750, -0.5750, -0.7500],
         [ 0.6000,  0.6000, -0.1429,  ...,  0.6000,  0.6000, -0.2571],
         [ 0.1515,  0.1515, -0.1515,  ...,  0.1515,  0.1515, -0.6970],
         ...,
         [ 0.5339,  0.5339, -0.4110,  ...,  0.5339,  0.5339, -0.0551],
         [-0.2105, -0.2105, -0.5307,  ..., -0.2105, -0.2105, -0.6798],
         [ 0.5191,  0.5191, -0.0255,  ...,  0.5191,  0.5191, -0.4554]],

        [[ 0.0000,  0.0000, -0.2857,  ...,  0.0000,  0.0000, -0.7143],
         [ 0.7000,  0.7000, -0.1000,  ...,  0.7000,  0.7000, -0.2000],
         [ 0.2000,  0.2000, -0.3000,  ...,  0.2000,  0.2000, -0.5000],
         ...,
         [-0.2059, -0.2059, -0.9412,  ..., -0.2059, -0.2059, -0.2647],
         [ 0.0625,  0.0625, -0.5000,  ...,  0.0625,  0.0625, -0.4375],
         [-0.4500, -0.4500, -0.8000,  ..., -0.4500, -0.4500, -0.6500]],

        [[ 1.0000,  1.0000,  0.0000,  ...,  1.0000,  1.0000,  0.0000],
         [ 0.6000,  0.6000, -0.3000,  ...,  0

In [21]:
from torch import nn
CUDA = torch.cuda.is_available()
print("with_gpu",CUDA)

with_gpu False


In [27]:
DIM = 1024
N_LAYER = 1
class money1(nn.Module):
    def __init__(self):
        super(money1,self).__init__()
        self.encoder = nn.LSTM(input_size = 7,
                               num_layers=2,
                          hidden_size = DIM,
                          
                          batch_first = True,bias = True)
        self.decoder = nn.LSTM(input_size = DIM,
                         num_layers = 2,
                          hidden_size = 7,
                          batch_first = True,bias = True)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self,x):
        output, (h_n,c_n) = self.encoder(x)
        outpout = self.relu(output)
        input_recon,_ = self.decoder(output)
        return input_recon

In [28]:
Money = money1()
if CUDA:
    torch.cuda.empty_cache()
    Money.cuda() 


opt = Adam(Money.parameters(),lr=1e-5)
loss_f = nn.MSELoss()

def action(*args,**kwargs):
    data_ = args[0][0].squeeze(0)
    x = data_[:,-3:,:]
    y = data_[:,:3,:]
    opt.zero_grad()
    if CUDA: 
        x,y = tuple(va.cuda() for va in (x,y))
    y_ = Money(x)
    loss = loss_f(y_,y)
    loss.backward()
    opt.step()
    return {"loss":loss.item()}

def val_action(*args,**kwargs):
    x,y = args[0]
    if CUDA: 
        x,y = tuple(va[0].cuda() for va in (x,y))
    else:
        x,y = tuple(va[0] for va in (x,y))
    y_,loss = Money(x,y)
    return {"loss":loss.item()}

trainer = Trainer(ds,batch_size=1,shuffle=True,print_on=5)
# trainer.train_data.collate_fn = stock_collate
trainer.action = action
trainer.val_action = val_action
trainer.train(20)

⭐[ep_0_i_294]	loss	0.250: 100%|██████████| 296/296 [01:13<00:00,  4.03it/s]
⭐[ep_1_i_294]	loss	0.236: 100%|██████████| 296/296 [01:12<00:00,  4.06it/s]
⭐[ep_2_i_294]	loss	0.213: 100%|██████████| 296/296 [01:13<00:00,  4.05it/s]
⭐[ep_3_i_294]	loss	0.231: 100%|██████████| 296/296 [01:13<00:00,  4.03it/s]
⭐[ep_4_i_294]	loss	0.214: 100%|██████████| 296/296 [01:13<00:00,  4.03it/s]
⭐[ep_5_i_294]	loss	0.221: 100%|██████████| 296/296 [01:13<00:00,  4.02it/s]
⭐[ep_6_i_294]	loss	0.202: 100%|██████████| 296/296 [01:13<00:00,  4.03it/s]
⭐[ep_7_i_294]	loss	0.201: 100%|██████████| 296/296 [01:13<00:00,  4.02it/s]
⭐[ep_8_i_294]	loss	0.217: 100%|██████████| 296/296 [01:12<00:00,  4.11it/s]
⭐[ep_9_i_294]	loss	0.184: 100%|██████████| 296/296 [01:13<00:00,  4.04it/s]
⭐[ep_10_i_294]	loss	0.207: 100%|██████████| 296/296 [01:12<00:00,  4.06it/s]
⭐[ep_11_i_294]	loss	0.212: 100%|██████████| 296/296 [01:13<00:00,  4.05it/s]
⭐[ep_12_i_294]	loss	0.185: 100%|██████████| 296/296 [01:13<00:00,  4.04it/s]
⭐[ep_13_i

In [26]:
trainer.train(20)

⭐[ep_0_i_294]	loss	0.113: 100%|██████████| 296/296 [00:27<00:00, 10.93it/s]
⭐[ep_1_i_294]	loss	0.130: 100%|██████████| 296/296 [00:27<00:00, 10.91it/s]
⭐[ep_2_i_294]	loss	0.093: 100%|██████████| 296/296 [00:25<00:00, 11.67it/s]
⭐[ep_3_i_294]	loss	0.122: 100%|██████████| 296/296 [00:27<00:00, 10.90it/s]
⭐[ep_4_i_294]	loss	0.110: 100%|██████████| 296/296 [00:26<00:00, 10.98it/s]
⭐[ep_5_i_294]	loss	0.115: 100%|██████████| 296/296 [00:26<00:00, 11.28it/s]
⭐[ep_6_i_294]	loss	0.114: 100%|██████████| 296/296 [00:23<00:00, 12.60it/s]
⭐[ep_7_i_294]	loss	0.130: 100%|██████████| 296/296 [00:23<00:00, 12.54it/s]
⭐[ep_8_i_294]	loss	0.109: 100%|██████████| 296/296 [00:26<00:00, 11.07it/s]
⭐[ep_9_i_294]	loss	0.108: 100%|██████████| 296/296 [00:27<00:00, 10.95it/s]
⭐[ep_10_i_294]	loss	0.119: 100%|██████████| 296/296 [00:27<00:00, 10.92it/s]
⭐[ep_11_i_294]	loss	0.123: 100%|██████████| 296/296 [00:27<00:00, 10.89it/s]
⭐[ep_12_i_294]	loss	0.101: 100%|██████████| 296/296 [00:27<00:00, 10.90it/s]
⭐[ep_13_i