In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import IterableDataset, Dataset, DataLoader
from tqdm import *
os.chdir(r"D:\Desktop\研一module1\AFT\data\AI量化模型预测挑战赛公开数据\train")
file_list = os.listdir()

In [3]:
stock_list = list(range(10))
date_list = list(range(64))
half_list = ['am', 'pm']


def gen_filename(stock_id, date_id, half_id):
    return "snapshot_sym{}_date{}_{}.csv".format(stock_id, date_id, half_id)

In [4]:
class StockDataset(Dataset):
    def __init__(self, data, seq_len, num_of_seconds):
        self.data = data
        self.seq_len = seq_len
        self.num_of_seconds = num_of_seconds

    def __len__(self):
        return len(self.data) * self.num_of_seconds

    def __getitem__(self, idx):
        date = idx // self.num_of_seconds
        second = idx % self.num_of_seconds
        X = self.data[date][:, max(0, second+1-self.seq_len):(second+1), :-1]
        y = self.data[date][:, second, -1]
        return X, y

In [5]:
[(10, 3998, 22),
 (8, 3998, 22)]

[(10, 3998, 22), (8, 3998, 22)]

In [None]:
(seq_len, batch_size, features_size)

255872

In [None]:
# def get_Dataset(date_start, date_end):
#     data_dict = []
#     for d in tqdm(range(date_start, date_end)):
#         daily_data = []
#         for s in stock_list:
#             if gen_filename(s, d, 'am') not in file_list or gen_filename(s, d, 'pm') not in file_list:
#                 continue
#             am_df = pd.read_csv(gen_filename(s, d, 'am'))
#             pm_df = pd.read_csv(gen_filename(s, d, 'pm'))
#             df = pd.concat([am_df, pm_df])
#             daily_data.append(np.array(df.iloc[:, 4:-4]))
#         daily_data = np.array(daily_data)
#         data_dict.append(daily_data)
#     ds = StockDataset(data_dict, 100, 3998)
#     dl = DataLoader(ds, batch_size=1)
#     return ds, dl

In [6]:
# v2
def get_Dataset(date_start, date_end):
    data_dict = []
    for d in tqdm(range(date_start, date_end)):
        daily_data = []
        for s in stock_list:
            if gen_filename(s, d, 'am') not in file_list or gen_filename(s, d, 'pm') not in file_list:
                continue
            am_df = pd.read_csv(gen_filename(s, d, 'am'))
            pm_df = pd.read_csv(gen_filename(s, d, 'pm'))
            df = pd.concat([am_df, pm_df])
            df = df.iloc[:, 4:-4]
            df = df.drop("amount_delta", axis=1)
            df = df.div(df.iloc[0]).replace([np.nan, np.inf, -np.inf], 0.0)
            daily_data.append(df)
        daily_data = np.array(daily_data)
        data_dict.append(daily_data)
    ds = StockDataset(data_dict, 600, 3998)
    dl = DataLoader(ds, batch_size=1)
    return ds, dl

In [7]:
train_ds, train_dl = get_Dataset(0, 48)
val_ds, val_dl = get_Dataset(48, 64)

100%|██████████| 48/48 [00:25<00:00,  1.86it/s]
100%|██████████| 16/16 [00:08<00:00,  1.78it/s]


In [11]:
class GRUmodel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUmodel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers, batch_first=True)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x, _ = self.gru(x)
        x = self.bn(x[:, -1, :])
        # x = x[:, -1, :]
        x = self.fc(x)
        return x

In [180]:
model = GRUmodel(input_size=23, hidden_size=16, num_layers=1, output_size=3)
X = torch.from_numpy(np.random.random(size=(3, 10, 23)).astype(np.float32))
model(X).sum(dim=1)

tensor([1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

In [9]:
def determine_seed(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


determine_seed()

tensor([0., 1., 1., 1., 1., 1., 1., 1.], dtype=torch.float64)

In [12]:
determine_seed()
model = GRUmodel(input_size=22, hidden_size=16, num_layers=1, output_size=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

train_losses = []
train_accs = []
val_losses = []
val_accs = []

num_epochs = 100
accumulation_steps = 8
alpha = 1
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0.0
    train_total = 0.0
    for i, (X, y) in enumerate(tqdm(train_dl)):
        if (i + 1) % 100 != 0:
            continue
        X, y = X[0].to(torch.float32).to(
            device), y[0].to(torch.int64).to(device)

        outputs = model(X)
        y_oh = torch.nn.functional.one_hot(y, num_classes=3).to(torch.float32)
        y_oh[:, [0, 2]] *= alpha
        loss = criterion(outputs, y_oh)
        loss.backward()
        # if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

        with torch.no_grad():
            _, predicted = torch.max(outputs.data, 1)
            train_correct += (predicted == y).sum().item()
            train_total += y.size(0)
            train_loss += loss.item()

    train_loss /= len(train_ds)
    train_losses.append(train_loss)
    train_acc = train_correct / train_total
    train_accs.append(train_acc)

    model.eval()
    val_loss = 0.0
    val_correct = 0.0
    val_total = 0.0
    with torch.no_grad():
        for i, (X, y) in enumerate(tqdm(val_dl)):
            if (i + 1) % 100 != 0:
                continue
            X, y = X[0].to(torch.float32).to(
                device), y[0].to(torch.int64).to(device)

            outputs = model(X)
            y_oh = torch.nn.functional.one_hot(
                y, num_classes=3).to(torch.float32)
            y_oh[:, [0, 2]] *= alpha
            loss = criterion(outputs, y_oh)

            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == y).sum().item()
            val_total += y.size(0)
            val_loss += loss.item()

    val_loss /= len(val_ds)
    val_losses.append(val_loss)
    val_acc = val_correct / val_total
    val_accs.append(val_acc)

    print(train_acc, train_loss, val_acc, val_loss)

#     if epoch == num_epochs - 1:
#         model.eval()
#         with torch.no_grad():
#             for date in tqdm(range(48,64)):
#                 daily_X, daily_y = gen_ts(stock_ids, date)
#                 predicted_value_today = []
#                 for second in list(range(3998)):
#                     X, y = daily_X[:, max(0, second+1-100):(second+1), :], daily_y[:, second]
#                     X = torch.from_numpy(X.astype(np.float32)).to(device)
#                     y = torch.from_numpy(y.astype(np.int64)).to(device)

#                     outputs = model(X)
#                     _, predicted = torch.max(outputs.data, 1)
#                     predicted_value_today.append(predicted.cpu().numpy())
#                 pd.DataFrame(predicted_value_today).to_csv("D:/jupyter/aft_lstm_predicted/date{}.csv".format(date))

100%|██████████| 191904/191904 [01:59<00:00, 1602.24it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1971.45it/s]
  0%|          | 274/191904 [00:00<01:10, 2726.66it/s]

0.595512533572068 0.008477134318400794 0.4988691437802908 0.009218500645690528


100%|██████████| 191904/191904 [02:00<00:00, 1591.30it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1965.77it/s]
  0%|          | 300/191904 [00:00<01:05, 2947.08it/s]

0.6312108325872874 0.007793136142450203 0.4894991922455573 0.009126355656315054


100%|██████████| 191904/191904 [02:00<00:00, 1597.83it/s]
100%|██████████| 63968/63968 [00:33<00:00, 1917.63it/s]
  0%|          | 334/191904 [00:00<00:57, 3315.77it/s]

0.6544315129811996 0.007576248557591798 0.4849757673667205 0.00937338842693092


100%|██████████| 191904/191904 [02:04<00:00, 1547.04it/s]
100%|██████████| 63968/63968 [00:34<00:00, 1831.64it/s]
  0%|          | 300/191904 [00:00<01:06, 2873.72it/s]

0.6648948075201433 0.007375785878292467 0.49596122778675283 0.009361325100198456


100%|██████████| 191904/191904 [02:03<00:00, 1551.09it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1951.95it/s]
  0%|          | 316/191904 [00:00<01:01, 3135.16it/s]

0.6745188003581021 0.0072006172410788456 0.5012924071082391 0.00947431609334574


100%|██████████| 191904/191904 [01:59<00:00, 1605.85it/s]
100%|██████████| 63968/63968 [00:31<00:00, 2014.16it/s]
  0%|          | 311/191904 [00:00<01:02, 3087.45it/s]

0.6859333034914951 0.0070856937378264775 0.5053311793214863 0.009937092477829382


100%|██████████| 191904/191904 [02:04<00:00, 1543.94it/s]
100%|██████████| 63968/63968 [00:33<00:00, 1892.41it/s]
  0%|          | 350/191904 [00:00<00:55, 3471.73it/s]

0.6880595344673232 0.0070634716781617524 0.5197092084006462 0.009535877959799951


100%|██████████| 191904/191904 [02:00<00:00, 1598.04it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1992.06it/s]
  0%|          | 324/191904 [00:00<00:59, 3217.72it/s]

0.6922560429722471 0.006991886313972358 0.5093699515347334 0.009806706069718455


100%|██████████| 191904/191904 [02:01<00:00, 1585.50it/s]
100%|██████████| 63968/63968 [00:33<00:00, 1933.41it/s]
  0%|          | 331/191904 [00:00<00:58, 3289.99it/s]

0.7079789615040286 0.0068469507609178306 0.5069466882067851 0.00962031239726622


100%|██████████| 191904/191904 [01:57<00:00, 1628.72it/s]
100%|██████████| 63968/63968 [00:31<00:00, 2015.86it/s]
  0%|          | 372/191904 [00:00<00:51, 3693.04it/s]

0.7074753804834378 0.006766625429533592 0.4922455573505654 0.010782443635955848


100%|██████████| 191904/191904 [01:58<00:00, 1612.99it/s]
100%|██████████| 63968/63968 [00:33<00:00, 1881.80it/s]
  0%|          | 585/191904 [00:00<01:11, 2679.34it/s]

0.708034914950761 0.006770064436055798 0.489983844911147 0.010076717493599984


100%|██████████| 191904/191904 [02:01<00:00, 1585.72it/s]
100%|██████████| 63968/63968 [00:33<00:00, 1936.24it/s]
  0%|          | 306/191904 [00:00<01:03, 3037.80it/s]

0.7140219337511191 0.006671267191150948 0.498546042003231 0.01065894517779216


100%|██████████| 191904/191904 [01:59<00:00, 1600.46it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1948.75it/s]
  0%|          | 300/191904 [00:00<01:06, 2875.79it/s]

0.7197291853178156 0.0065818658113797664 0.502746365105008 0.010419881476341873


100%|██████████| 191904/191904 [01:59<00:00, 1601.07it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1942.24it/s]
  0%|          | 344/191904 [00:00<00:56, 3417.20it/s]

0.7251566696508505 0.006518723665149123 0.5040387722132472 0.010926862779339443


100%|██████████| 191904/191904 [02:00<00:00, 1596.24it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1960.46it/s]
  0%|          | 341/191904 [00:00<00:56, 3385.74it/s]

0.7254364368845121 0.00651502508464477 0.5037156704361874 0.011189398713588804


100%|██████████| 191904/191904 [01:59<00:00, 1607.37it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1960.13it/s]
  0%|          | 337/191904 [00:00<00:57, 3345.51it/s]

0.7317032229185317 0.006394663923391282 0.49822294022617125 0.01080958306297414


100%|██████████| 191904/191904 [02:00<00:00, 1590.68it/s]
100%|██████████| 63968/63968 [00:34<00:00, 1859.43it/s]
  0%|          | 315/191904 [00:00<01:01, 3127.13it/s]

0.7350044762757386 0.006382662449320922 0.49176090468497574 0.011657925711207655


100%|██████████| 191904/191904 [01:58<00:00, 1621.42it/s]
100%|██████████| 63968/63968 [00:32<00:00, 1967.56it/s]
  0%|          | 337/191904 [00:00<00:57, 3345.56it/s]

0.7335496866606983 0.006363125576899983 0.4930533117932149 0.011248446391124824


100%|██████████| 191904/191904 [02:37<00:00, 1221.32it/s]
100%|██████████| 63968/63968 [01:02<00:00, 1029.70it/s]
  0%|          | 366/191904 [00:00<00:52, 3633.50it/s]

0.7365711727842436 0.006320625004865328 0.4812600969305331 0.011953923311746079


 43%|████▎     | 83025/191904 [16:22<21:28, 84.52it/s]    


KeyboardInterrupt: 

In [1]:
train_losses

NameError: name 'train_losses' is not defined

In [14]:
predicted_value_today = []
predicted_value_today.append(predicted.cpu().numpy())
predicted_value_today.append(predicted.cpu().numpy())

In [16]:
pd.DataFrame(predicted_value_today).to_csv(
    "D:/jupyter/aft_lstm_predicted/date{}.csv".format(date))

In [None]:
plt.plot(train_accs)
plt.plot(val_accs)
plt.plot(train_losses)
plt.plot(val_accs)

In [23]:
model.eval()
with torch.no_grad():
    for date in tqdm(range(48, 64)):
        daily_X, daily_y = gen_ts(stock_ids, date)
        predicted_value_today = []
        for second in list(range(3998)):
            X, y = daily_X[:, max(0, second+1-100)                           :(second+1), :], daily_y[:, second]
            X = torch.from_numpy(X.astype(np.float32)).to(device)
            y = torch.from_numpy(y.astype(np.int64)).to(device)

            outputs = model(X)
            _, predicted = torch.max(outputs.data, 1)
            predicted_value_today.append(predicted.cpu().numpy())
        pd.DataFrame(predicted_value_today).to_csv(
            "D:/jupyter/aft_lstm_predicted/date{}.csv".format(date))

100%|██████████| 16/16 [01:10<00:00,  4.42s/it]


In [21]:
pd.DataFrame(predicted_value_today)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,0,1,1,1,1
2,1,1,1,1,1,0,1,1,1,1
3,1,1,1,1,1,0,1,1,0,1
4,1,1,1,1,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
3993,1,1,1,1,1,1,1,1,1,1
3994,1,1,1,1,1,1,1,1,1,1
3995,1,1,1,1,1,1,1,1,1,1
3996,1,1,1,1,1,2,1,1,1,1


In [44]:
y_true = np.array([])
y_pred = np.array([])
for date in range(48, 64):
    yhat = pd.read_csv(
        "D:/jupyter/aft_lstm_predicted/date{}.csv".format(date), index_col=0)
    _, daily_y = gen_ts(stock_ids, date)
    y = daily_y.T
    yhat = np.array(yhat)
    y_true = np.append(y_true, y.reshape(-1))
    y_pred = np.append(y_pred, yhat.reshape(-1))

In [46]:
from sklearn.metrics import f1_score
f1 = f1_score(y_true, y_pred, average='macro')
f1

0.33371707990182164

In [48]:
(y_true == y_pred).mean()

0.6092094434313932

In [32]:
date = 48
yhat = pd.read_csv(
    "D:/jupyter/aft_lstm_predicted/date{}.csv".format(date), index_col=0)
_, daily_y = gen_ts(stock_ids, date)
y = daily_y.T

In [43]:
np.append(np.array([]), y.reshape(-1))

array([1., 1., 1., ..., 2., 0., 1.])

In [36]:
y.shape

(3998, 10)

In [25]:
pd.read_csv("D:/jupyter/aft_lstm_predicted/date{}.csv".format(date), index_col=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,1,1,0,1,0,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
3993,1,1,1,1,1,1,1,0,1,1
3994,1,1,1,1,1,1,1,0,1,1
3995,1,1,1,1,1,1,1,0,1,1
3996,1,1,1,1,1,1,1,0,1,1


In [55]:
list(range(48, 64))

[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]

In [7]:
X = gen_ts(stock_ids, date_id, time)
X = torch.from_numpy(X.astype(np.float32)).to(device)
model(X)

tensor([[-0.4212],
        [-0.4230],
        [ 0.6397],
        [ 0.1246],
        [-0.4212],
        [-0.4212],
        [ 0.6383],
        [-0.4212]], grad_fn=<AddmmBackward0>)

In [53]:
y

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [1.],
        [2.]])

In [32]:
stock_ids = list(range(10))
date_id = 0
time = 5
daily_batchs = []
for stock in stock_ids:
    if gen_filename(stock, date_id, 'am') not in file_list:
        continue
    am_df = pd.read_csv(gen_filename(stock, date_id, 'am'))
    pm_df = pd.read_csv(gen_filename(stock, date_id, 'pm'))
    df = pd.concat([am_df, pm_df])
    df = df.iloc[0:time, 4:-5]
    daily_batchs.append(np.array(df))
np.array(daily_batchs).shape

(8, 5, 23)

In [5]:
am_df = pd.read_csv(gen_filename(0, 0, 'am'))
pm_df = pd.read_csv(gen_filename(0, 0, 'pm'))

In [12]:
am_df

Unnamed: 0,uuid,date,time,sym,n_close,amount_delta,n_midprice,n_bid1,n_bsize1,n_bid2,...,n_asize3,n_ask4,n_asize4,n_ask5,n_asize5,label_5,label_10,label_20,label_40,label_60
0,0,0,09:40:03,0,0.015691,2594895.0,0.015270,0.015130,1.961524e-06,0.014850,...,1.005697e-05,0.016251,0.000005,0.016531,0.000016,1,0,0,0,0
1,1,0,09:40:06,0,0.015410,837314.0,0.015410,0.015130,3.424355e-06,0.014850,...,5.053418e-06,0.016531,0.000016,0.016811,0.000018,0,0,0,0,0
2,2,0,09:40:09,0,0.015130,184807.0,0.015270,0.015130,3.158386e-06,0.014850,...,1.083825e-05,0.016251,0.000005,0.016531,0.000017,1,0,0,0,0
3,3,0,09:40:12,0,0.015691,500046.0,0.015551,0.015410,4.155771e-07,0.015130,...,5.053418e-06,0.016531,0.000016,0.016811,0.000018,0,0,0,0,0
4,4,0,09:40:15,0,0.014290,1785635.0,0.014710,0.014290,9.308927e-06,0.014010,...,3.612296e-05,0.015971,0.000009,0.016251,0.000005,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,1994,0,11:19:45,0,0.007845,115107.0,0.008126,0.007845,1.163616e-07,0.007565,...,1.662308e-08,0.009246,0.000004,0.009526,0.000004,1,1,1,1,1
1995,1995,0,11:19:48,0,0.007845,35970.0,0.008126,0.007845,3.324617e-08,0.007565,...,1.662308e-08,0.009246,0.000004,0.009526,0.000004,1,1,1,1,1
1996,1996,0,11:19:51,0,0.008686,348378.0,0.008826,0.008686,5.352633e-08,0.007845,...,3.797876e-06,0.009807,0.000005,0.010087,0.000006,1,0,1,1,1
1997,1997,0,11:19:54,0,0.008686,54806.0,0.008266,0.007845,1.496078e-07,0.007565,...,3.797876e-06,0.009807,0.000005,0.010087,0.000006,2,0,1,1,1


In [15]:
am_df.columns[4:-5]

Index(['n_close', 'amount_delta', 'n_midprice', 'n_bid1', 'n_bsize1', 'n_bid2',
       'n_bsize2', 'n_bid3', 'n_bsize3', 'n_bid4', 'n_bsize4', 'n_bid5',
       'n_bsize5', 'n_ask1', 'n_asize1', 'n_ask2', 'n_asize2', 'n_ask3',
       'n_asize3', 'n_ask4', 'n_asize4', 'n_ask5', 'n_asize5'],
      dtype='object')