# Guess Industry Name With The Stock Indices

In [1]:
import pandas as pd
from torch.optim import Adam
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils import rnn
import math
import numpy as np
import torch
from ray.matchbox import DF_Dataset,Trainer,save_model,load_model
from ray.armory import mem

#### Obvious improvements to make

* Try Attentional LSTM
* More features(only five for now)
* Deeper layers in LSTM
* Bidirectional LSTM
* Attentional RNN
* Try GRU instead of LSTM (use nn.GRU)
* Put on other metrics other than Mean Square Error

In [2]:
MODEL_PATH = "/data/money_move_.0.0.1.npy"
TRAIN = True
LAG = 5

In [3]:
data_df = pd.read_csv("/data/stock_hist_data.csv")
data_df.sample(10)

Unnamed: 0,sn,ts_code,trade_date,open,high,low,close,pre_close,change,pct_change,vol,amount,indu,indu_en
3471392,1581,002517.SZ,20110311,25.6,25.76,25.21,25.22,25.76,-0.54,-2.1,12457.99,31683.1269,互联网,the Internet
6210284,1989,601601.SH,20100816,23.08,23.88,22.82,23.69,23.15,0.54,2.33,315946.96,735954.525,保险,Insurance
5652124,2058,600351.SH,20100226,20.25,21.0,19.93,20.26,20.25,0.01,0.05,144343.74,296717.3,中成药,Chinese medicine
7804821,581,002511.SZ,20160607,16.24,16.56,16.05,16.33,16.24,0.09,0.55,54231.46,88293.8782,造纸,papermaking
1151582,3385,600867.SH,20040707,5.9,5.95,5.83,5.95,5.9,0.05,0.85,1608.9,948.991,生物制药,Biopharmaceutical
991881,452,603611.SH,20161216,34.2,35.33,34.02,35.14,34.14,1.0,2.93,18090.34,63153.987,运输设备,Transportation Equipment
1377894,3880,600177.SH,20020521,10.25,10.46,10.25,10.32,10.1,0.22,2.18,6389.1,6612.462,服饰,apparel
1482534,688,002591.SZ,20150512,13.32,13.67,13.04,13.55,13.3,0.25,1.88,104137.63,139229.4471,化工原料,Chemical raw materials
2465539,213,300488.SZ,20171208,30.69,32.37,30.45,31.7,30.69,1.01,3.29,4736.95,14902.479,机械基件,Mechanical base parts
6942488,1048,300121.SZ,20140513,6.9,7.38,6.88,7.0,6.9,0.1,1.45,118547.09,83822.6121,化工原料,Chemical raw materials


In [4]:
grouped_stock = list(data_df.groupby("ts_code"))

In [5]:
data_df["o_h"] = (data_df.high - data_df.open)
data_df["o_c"] = (data_df.close - data_df.open)
data_df["o_l"] = (data_df.low - data_df.open)
data_df["h_c"] = (data_df.close - data_df.high)
data_df["l_c"] = (data_df.close - data_df.low)
data_df["l_h"] = (data_df.high - data_df.low)

In [6]:
data_df["oh_p"] = data_df.o_h/data_df.l_h
data_df["oc_p"] = data_df.o_c/data_df.l_h
data_df["ol_p"] = data_df.o_l/data_df.l_h
data_df["hc_p"] = data_df.h_c/data_df.l_h
data_df["lc_p"] = data_df.l_c/data_df.l_h

In [7]:
data_df = data_df[["sn","ts_code","trade_date","oc_p","ol_p","lc_p","oc_p","hc_p","pct_change"]]

data_df = data_df.fillna(0.)

In [8]:
data_df.head(10)

Unnamed: 0,sn,ts_code,trade_date,oc_p,ol_p,lc_p,oc_p.1,hc_p,pct_change
0,0,603208.SH,20181026,0.172414,-0.413793,0.586207,0.172414,-0.413793,0.9291
1,1,603208.SH,20181025,0.218182,-0.436364,0.654545,0.218182,-0.345455,-2.1531
2,2,603208.SH,20181024,-0.068182,-0.181818,0.113636,-0.068182,-0.886364,-0.8539
3,3,603208.SH,20181023,-0.461538,-0.897436,0.435897,-0.461538,-0.564103,-0.8467
4,4,603208.SH,20181022,0.759259,-0.037037,0.796296,0.759259,-0.203704,4.5231
5,5,603208.SH,20181019,0.575,-0.25,0.825,0.575,-0.175,1.4464
6,6,603208.SH,20181018,0.355556,-0.622222,0.977778,0.355556,-0.022222,-0.4469
7,7,603208.SH,20181017,-0.15625,-0.984375,0.828125,-0.15625,-0.171875,0.7
8,8,603208.SH,20181016,-0.605263,-1.0,0.394737,-0.605263,-0.605263,-2.7237
9,9,603208.SH,20181015,0.058824,-0.602941,0.661765,0.058824,-0.338235,-1.9552


In [9]:
ts_code_ct = data_df.groupby("ts_code").count()[["sn"]]

In [10]:
ts_code_ct["sn_ct"] = ts_code_ct.sn.apply(lambda x:x>30)

In [11]:
ts_code_ct.sample(5)

Unnamed: 0_level_0,sn,sn_ct
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1
002105.SZ,2837,True
600019.SH,4000,True
300642.SZ,371,True
002277.SZ,2234,True
002030.SZ,3349,True


In [12]:
ts_code_ct = ts_code_ct.sort_values(by = "sn",ascending=False)
valid_ts_code = ts_code_ct[ts_code_ct.sn_ct]

In [13]:
valid_ts_code.tail(5)

Unnamed: 0_level_0,sn,sn_ct
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1
603590.SH,39,True
002933.SZ,38,True
603192.SH,38,True
601068.SH,35,True
002935.SZ,34,True


In [14]:
first = data_df.reset_index().groupby("ts_code").first()[["index"]].rename(columns={"index":"first"})
last = data_df.reset_index().groupby("ts_code").last()[["index"]].rename(columns={"index":"last"})

In [15]:
valid_ts_code = valid_ts_code.join(first).join(last)

In [16]:
valid_ts_code.head()

Unnamed: 0_level_0,sn,sn_ct,first,last
ts_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000001.SZ,4000,True,4128141,4132140
600097.SH,4000,True,1276082,1280081
600112.SH,4000,True,2742589,2746588
600111.SH,4000,True,5101362,5105361
600110.SH,4000,True,2281116,2285115


In [17]:
dict(valid_ts_code.loc["600112.SH"])

{'first': 2742589, 'last': 2746588, 'sn': 4000, 'sn_ct': True}

### Preprocess Function

In [18]:
def xy_pre_row(loc,stop):
    basic = data_df[loc["first"]:loc["first"]+stop]
    arr = basic[["oc_p","ol_p","lc_p","oc_p","hc_p"]].values
    arr_x,arr_y = arr[:-LAG,:],arr[-LAG:,:]
    return torch.FloatTensor(arr_x),torch.FloatTensor(arr_y)

def xy_pre(df):
    res_x,res_y = [],[]
    for i in range(len(df.index)):
        idx = df.index[i]
        loc_dict = dict(df.loc[idx])
        if i== 0:
            maxlen = loc_dict["last"]-loc_dict["first"]
            stop = min(maxlen,800)-int(np.random.rand()*.3*min(maxlen,800))
        x,y = xy_pre_row(loc_dict,stop)
        res_x.append(x)
        res_y.append(y)
    tensor_x = rnn.pad_sequence(res_x,batch_first=True,)
    tensor_y = rnn.pad_sequence(res_y,batch_first=True)
    return tensor_x,tensor_y

def x_pre_row(loc,stop):
    basic = data_df[loc["first"]:loc["first"]+stop]
    arr = basic[["oc_p","ol_p","lc_p","oc_p","hc_p"]].values
    return torch.FloatTensor(arr)

def x_pre(df):
    res_x,res_y = [],[]
    for i in range(len(df.index)):
        idx = df.index[i]
        loc_dict = dict(df.loc[idx])
        if i== 0:
            maxlen = loc_dict["last"]-loc_dict["first"]
            stop = min(maxlen,800)-int(np.random.rand()*.3*min(maxlen,800))
        x = x_pre_row(loc_dict,stop)
        res_x.append(x)
    tensor_x = rnn.pad_sequence(res_x,batch_first=True,)
    return tensor_x

def get_ts(df):
    return list(df.index)

def stock_collate(batch):
    arrs,_ = zip(*batch)
    arrs_x,arrs_y = zip(*arrs)
#     print(arrs_x)
    return arrs_x,arrs_y

def stock_collate_inf(batch):
    arrs,ts_list = zip(*batch)
#     print(arrs_x)
    return arrs,ts_list

In [19]:
split = (np.random.rand(len(valid_ts_code))>.7)

In [20]:
ds = DF_Dataset(valid_ts_code,x_pre,get_ts,bs=12,shuffle=False)
train_ds = DF_Dataset(valid_ts_code[~split],x_pre,lambda x:0,bs=12,shuffle=False)
valid_ds = DF_Dataset(valid_ts_code[split],x_pre,lambda x:0,bs=12,shuffle=False)

#### Testing dataset and how it performs in generator

In [21]:
# dl = DataLoader(ds,batch_size=1,shuffle=True,collate_fn=stock_collate)
dl = DataLoader(ds,batch_size=1,shuffle=True,collate_fn=stock_collate_inf)

In [22]:
gen = iter(dl)
for i in range(1):
    x,y = next(gen)
    print(x[0],y[0])

tensor([[[-0.4444, -0.4444, -0.8889,  ..., -0.4444, -0.4444, -0.5556],
         [ 0.4667,  0.4667, -0.4667,  ...,  0.4667,  0.4667, -0.0667],
         [-0.5000, -0.5000, -0.7500,  ..., -0.5000, -0.5000, -0.7500],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -1.0000],
         [ 0.8000,  0.8000, -0.0667,  ...,  0.8000,  0.8000, -0.1333],
         [-0.3143, -0.3143, -0.6857,  ..., -0.3143, -0.3143, -0.6286]],

        [[-0.3750, -0.3750, -0.6250,  ..., -0.3750, -0.3750, -0.7500],
         [ 0.6818,  0.6818, -0.1818,  ...,  0.6818,  0.6818, -0.1364],
         [ 0.2500,  0.2500, -0.3500,  ...,  0.2500,  0.2500, -0.4000],
         ...,
         [-0.3359, -0.3359, -0.7023,  ..., -0.3359, -0.3359, -0.6336],
         [ 0.7168,  0.7168, -0.2832,  ...,  0.7168,  0.7168,  0.0000],
         [ 0.7769,  0.7769, -0.2231,  ...,  0.7769,  0.7769,  0.0000]],

        [[ 0.6607,  0.6607,  0.0000,  ...,  0.6607,  0.6607, -0.3393],
         [ 0.6061,  0.6061,  0.0000,  ...,  0

In [23]:
from torch import nn
CUDA = torch.cuda.is_available()
print("with_gpu",CUDA)

with_gpu True


### Encoder Decoder Structure with some lag in days

In [24]:
DIM = 1024
N_LAYER = 1

class money1(nn.Module):
    def __init__(self):
        super(money1,self).__init__()
        self.encoder = nn.LSTM(input_size = 7,
                          hidden_size = DIM,
                          batch_first = True,bias = True)
        self.decoder = nn.LSTM(input_size = DIM,
                          hidden_size = 7,
                          batch_first = True,bias = True)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self,x):
        output, (h_n,c_n) = self.encoder(x)
        outpout = self.relu(output)
        input_recon,_ = self.decoder(output)
        return input_recon

In [25]:
Money = money1()
if CUDA:
    torch.cuda.empty_cache()
    Money.cuda() 

opt = Adam(Money.parameters(),lr=1e-5)
loss_f = nn.MSELoss()

infe_list = []

def action(*args,**kwargs):
    data_ = args[0][0].squeeze(0)
    x = data_[:,-LAG:,:]
    y = data_[:,:LAG,:]
    opt.zero_grad()
    if CUDA: 
        x,y = tuple(va.cuda() for va in (x,y))
    y_ = Money(x)
    loss = loss_f(y_,y)
    loss.backward()
    opt.step()
    return {"loss":loss.item()}

def val_action(*args,**kwargs):
    data_ = args[0][0].squeeze(0)
    x = data_[:,-LAG:,:]
    y = data_[:,:LAG,:]
    if CUDA: 
        x,y = tuple(va.cuda() for va in (x,y))
    y_ = Money(x)
    loss = loss_f(y_,y)
    return {"loss":loss.item()}

def inf_action(*args,**kwargs):
    """
    A step of inference action
    """
    x = args[0][0][0]
    ts = args[0][1][0]
    if CUDA:
        x = x.cuda()
    y_ = list(Money(x)[:,-LAG:,:].cpu().detach().numpy())
    savedf = pd.DataFrame({"ts":ts,"pred":y_})
    savedf["pred"] = savedf["pred"].apply(lambda x:x.reshape(-1).tolist())
    infe_list.append(savedf)
    return {"success":1}

trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=1,shuffle=True,print_on=5)
# trainer.train_data.collate_fn = stock_collate
trainer.action = action
trainer.val_action = val_action

In [26]:
infe = Trainer(ds, batch_size=1,shuffle=False, print_on=5)
infe.action = inf_action
infe.train_data.collate_fn = stock_collate_inf

In [None]:
if TRAIN:
    trainer.train(20)
    save_model(Money,MODEL_PATH)

⭐[ep_0_i_204]	loss	0.223: 100%|██████████| 206/206 [00:03<00:00, 52.82it/s]
😎[val_ep_0_i_89]	loss	0.231: 100%|██████████| 90/90 [00:01<00:00, 59.62it/s]
⭐[ep_1_i_204]	loss	0.217: 100%|██████████| 206/206 [00:03<00:00, 51.81it/s]
😎[val_ep_1_i_89]	loss	0.213: 100%|██████████| 90/90 [00:01<00:00, 59.49it/s]
⭐[ep_2_i_204]	loss	0.210: 100%|██████████| 206/206 [00:03<00:00, 52.40it/s]
😎[val_ep_2_i_89]	loss	0.204: 100%|██████████| 90/90 [00:01<00:00, 57.63it/s]
⭐[ep_3_i_204]	loss	0.198: 100%|██████████| 206/206 [00:03<00:00, 52.26it/s]
😎[val_ep_3_i_89]	loss	0.194: 100%|██████████| 90/90 [00:01<00:00, 59.34it/s]
⭐[ep_4_i_204]	loss	0.158: 100%|██████████| 206/206 [00:03<00:00, 52.26it/s]
😎[val_ep_4_i_89]	loss	0.173: 100%|██████████| 90/90 [00:01<00:00, 58.63it/s]
⭐[ep_5_i_204]	loss	0.138: 100%|██████████| 206/206 [00:03<00:00, 52.24it/s]
😎[val_ep_5_i_89]	loss	0.154: 100%|██████████| 90/90 [00:01<00:00, 60.98it/s]
⭐[ep_6_i_204]	loss	0.156: 100%|██████████| 206/206 [00:03<00:00, 52.09it/s]
😎[val_

In [None]:
if TRAIN:
    trainer.train(20)
    save_model(Money,MODEL_PATH)

⭐[ep_0_i_204]	loss	0.119: 100%|██████████| 206/206 [00:03<00:00, 53.15it/s]
😎[val_ep_0_i_89]	loss	0.114: 100%|██████████| 90/90 [00:01<00:00, 62.96it/s]
⭐[ep_1_i_134]	loss	0.124:  64%|██████▍   | 132/206 [00:02<00:01, 52.06it/s]

In [None]:
load_model(Money,MODEL_PATH)
infe.train(1)