In [1]:
import pandas as pd
import numpy as np
import torch
import os

In [2]:
stock = 'hs300'

In [3]:
basic_feature = ["open", "close", "high", "low", "volume", "turnover"]
# basic_feature = ["open", "close", "high", "low", "volume"]

In [4]:
target_return_span = 1
target = f"return+{target_return_span}"

In [5]:
time_span = 60
# stock_date = ['2018-01-01', '2021-12-31', '2022-01-02', '2022-10-04',  '2022-10-05', '2023-12-31']
stock_date = ['2018-01-01', '2021-12-31', '2022-01-02', '2022-09-29',  '2022-09-30', '2023-12-31']

In [6]:
tic_df_list = []
for tic_path in os.listdir(f"./{stock}"):
    # tic_df = pd.read_csv(f"./{stock}/{tic_path}")[["date", "open", "close", "high", "low", "volume", "tic"]]
    tic_df = pd.read_csv(f"./{stock}/{tic_path}")[["date", "open", "close", "high", "low", "volume", "turnover", "tic"]]
    tic_df[target] = tic_df.close.pct_change(target_return_span).shift(-1 * target_return_span)
    tic_df_list.append(tic_df)

tic_target_df = pd.concat(tic_df_list).sort_values(by="date").dropna()
tic_target_df.date = pd.DatetimeIndex(tic_target_df.date)

In [7]:
dataset_df = tic_target_df
dataset_df

Unnamed: 0,date,open,close,high,low,volume,turnover,tic,return+1
0,2018-01-02,13.35,13.70,13.93,13.32,208159255.0,2.856544e+09,000001.SZ,-0.027007
0,2018-01-02,4.65,4.72,4.84,4.61,257111210.0,1.216979e+09,000425.SZ,-0.006356
0,2018-01-02,7.13,7.18,7.23,7.12,49500856.0,3.559108e+08,601169.SH,0.011142
0,2018-01-02,17.05,17.24,17.29,17.01,97602162.0,1.678255e+09,601166.SH,-0.002320
0,2018-01-02,29.55,29.02,30.66,28.92,36416515.0,1.088832e+09,601155.SH,-0.000689
...,...,...,...,...,...,...,...,...,...
1455,2023-12-28,14.52,14.74,14.75,14.48,49900548.0,7.321915e+08,601166.SH,0.099729
1455,2023-12-28,19.47,20.20,20.30,19.33,57302141.0,1.147868e+09,002142.SZ,-0.004455
1455,2023-12-28,7.96,7.82,8.05,7.74,71736980.0,5.615205e+08,600011.SH,-0.015345
1455,2023-12-28,9.11,9.45,9.47,9.08,166159184.0,1.550257e+09,000001.SZ,-0.006349


In [8]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 203840 entries, 0 to 1455
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      203840 non-null  datetime64[ns]
 1   open      203840 non-null  float64       
 2   close     203840 non-null  float64       
 3   high      203840 non-null  float64       
 4   low       203840 non-null  float64       
 5   volume    203840 non-null  float64       
 6   turnover  203840 non-null  float64       
 7   tic       203840 non-null  object        
 8   return+1  203840 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 15.6+ MB


In [9]:
dataset_df['date'].nunique()

1456

In [10]:
train_df = dataset_df[(dataset_df.date >= stock_date[0]) & (dataset_df.date <= stock_date[1])]
val_df = dataset_df[(dataset_df.date >= stock_date[2]) & (dataset_df.date <= stock_date[3])]
test_df = dataset_df[(dataset_df.date >= stock_date[4]) & (dataset_df.date <= stock_date[5])]

In [11]:
print(test_df['date'].nunique(), ' ', train_df['date'].nunique(), ' ', val_df['date'].nunique())

302   973   181


In [12]:
tic_list = dataset_df['tic'].unique()

In [13]:
def df_2_array(dataset_df, feat_col, target, type):
    dataset_feat = []
    dataset_ret = []
    dataset_price = []
    for tic in tic_list:
        df = dataset_df[dataset_df.tic == tic]
        feat = df[feat_col].to_numpy()
        ret = df[target].to_numpy()
        price = df['close'].to_numpy()
        stock_feat = []
        stock_ret = []
        stock_price = []
        for i in range(time_span, feat.shape[0]):
            stock_feat.append(feat[i-time_span : i])
            stock_ret.append(ret[i])
            stock_price.append(price[i])
        stock_feat = np.array(stock_feat)
        stock_ret = np.array(stock_ret)
        stock_price = np.array(stock_price)
    
        dataset_feat.append(stock_feat)
        dataset_ret.append(stock_ret)
        dataset_price.append(stock_price)

    dataset_feat = np.array(dataset_feat).transpose((1, 2, 0, 3))
    dataset_ret = np.array(dataset_ret).transpose((1, 0))
    dataset_price = np.array(dataset_price).transpose((1, 0))
    
    dataset_feat_tensor = torch.tensor(dataset_feat, dtype=torch.float)
    dataset_ret_tensor = torch.tensor(dataset_ret, dtype=torch.float)
    dataset_price_tensor = torch.tensor(dataset_price, dtype=torch.float)
    
    torch.save(dataset_feat_tensor, f"./dataset/{stock}/{type}/feat.pt")
    torch.save(dataset_ret_tensor, f"./dataset/{stock}/{type}/ret.pt")
    torch.save(dataset_price_tensor, f"./dataset/{stock}/{type}/price.pt")    
    
    return dataset_feat, dataset_ret, dataset_price

In [14]:
_ = df_2_array(train_df, basic_feature, target, "train")
_ = df_2_array(val_df, basic_feature, target, "val")
dataset_feat, dataset_ret, dataset_price = df_2_array(test_df, basic_feature, target, "test")

In [15]:
dataset_feat.shape

(242, 60, 140, 6)

In [16]:
dataset_ret.shape

(242, 140)