In [1]:
# Load IPython's autoreload extension ...
%load_ext autoreload
# ... and use mode 2 so that edits to the local modules imported below
# (data_r, model) are picked up without restarting the kernel.
%autoreload 2

In [2]:
import pickle
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from data_r import PatentDataset, Sequence
from model import Net

In [5]:
# train_set = PatentDataset()
# with open('./data/train_set.pkl', 'wb') as f:
#     pickle.dump(train_set, f)

In [6]:
# Restore the pickled dataset written by the (commented-out) cell above.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that were produced locally / come from a trusted source.
with open('./data/train_set.pkl', 'rb') as pkl_file:
    full_set = pickle.load(pkl_file)

In [147]:
# np.sort(pd.Series([seq.seq_t0 for seq in full_set]).unique())

In [148]:
# Per-sequence summary table:
#   full   - length of the sequence
#   T_0    - the sequence's seq_t0 attribute
#   n_cite - sum over the sequence's marks
seq_len_pd = pd.DataFrame(
    {
        'full': [len(seq) for seq in full_set],
        'T_0': [seq.seq_t0 for seq in full_set],
        'n_cite': [seq.marks.sum() for seq in full_set],
    }
)
seq_len_pd.head(1)

In [9]:
# np.sort(pd.Series().unique())

In [149]:
# # 2 * 4 * 3 = 24
# hidden_state = torch.arange(24).reshape(2,4,3)
# hidden_state.shape
# hidden_state
# hidden_state[[0,1], [3,0]].shape
# hidden_state.unsqueeze(1).expand(-1, 5, -1, -1).shape

### 在 full_set 中根据年份取 train_set

In [12]:
T = 1990      # cut-off year handed to Sequence.trunc (with abs_T=True)
min_len = 3   # truncated sequences shorter than this are dropped

# Truncate every sequence at T and record the truncated length next to the
# full-length statistics in seq_len_pd (column name is the year itself).
truncated_l = [seq.trunc(T, abs_T=True) for seq in full_set]
trunc_len_l = [len(seq) for seq in truncated_l]
seq_len_pd[T] = np.array(trunc_len_l)

# Keep only the sequences that are still long enough after truncation.
keep_l = [n_events >= min_len for n_events in trunc_len_l]
seq_l = [seq for seq, keep in zip(truncated_l, keep_l) if keep]
train_set = PatentDataset(seq_l=seq_l)

seq_len_pd.head(1)

Unnamed: 0,full,T_0,n_cite,1990
0,14,1977.0,22,9


In [13]:
# seq = full_set[np.random.randint(len(full_set))]
# seq.get_pred_target(T, n_period=5, abs_T=True)

In [14]:
# train_set.id2idx_pd

In [15]:
# seq_len_pd[seq_len_pd[T] < 5][T].value_counts()

In [16]:
# Shuffled mini-batches of 64 sequences; batching is delegated to the
# dataset's own collate function.
train_loader = DataLoader(
    train_set,
    batch_size=64,
    shuffle=True,
    collate_fn=PatentDataset.collate_batch,
)

In [38]:
# A single batch used for experimentation in the cells below.
# Because the loader shuffles, every run of this cell yields a different batch.
batch = next(iter(train_loader))
# print(*[b.shape for b in batch], sep='\n')

### 模型超参数

In [19]:
# Model hyperparameters (handed positionally to Net in the next cell)
hid_dim = 32
mlp_dim = 16

# Size of the first axis of the first sequence's feature array
first_seq = full_set[0]
feature_dim = first_seq.seq_features.shape[0]

In [20]:
# Instantiate the network (Net is imported from the local `model` module)
# from the hyperparameters defined in the previous cell.
model = Net(hid_dim, mlp_dim, feature_dim)

In [21]:
# Smoke-test the forward pass on the sample batch; the recorded output is a
# pair of scalar tensors that carry a grad_fn.
# NOTE(review): presumably (loss_self, loss_nonself), mirroring how
# model.train_batch is unpacked in the training loop - confirm in model.Net.
model(*batch)

(tensor(0.6119, grad_fn=<MeanBackward0>),
 tensor(0.4562, grad_fn=<MeanBackward0>))

# batch 上试验 mape 的计算

In [153]:
# The last element of the batch is used as the key into id2idx_pd
# (presumably the sequence ids - verify against PatentDataset.collate_batch).
batch_ids = batch[-1]

# Prediction targets come from the *untruncated* sequences in full_set.
batch_target = [
    full_set[idx].get_pred_target(T, n_period=5, abs_T=True)
    for idx in full_set.id2idx_pd[batch_ids]
]
batch_pred_inter_t = np.array([target[0] for target in batch_target])
batch_count_target = np.array([target[1] for target in batch_target])

# The observed history comes from the truncated sequences in train_set.
batch_count_hist = [train_set[idx].marks for idx in train_set.id2idx_pd[batch_ids]]
# batch_count_target: (batch_size, n_period, 2)
# batch_count_hist: (batch_size, seq_len_hist, 2); note that seq_len_hist varies per sequence

In [150]:
# list = []
# list.append([4]*5)
# list

# Scalar (0-dim) placeholder for the MAPE value.
# torch.tensor() needs a `data` argument and has no `size` parameter, so
# `torch.tensor(size=())` raises a TypeError; torch.zeros(()) builds the
# intended empty-shape tensor.
mape = torch.zeros(())


In [155]:
# For every sequence in the batch, add up the historical self (column 0) and
# non-self (column 1) counts and repeat each total 5 times - once per
# prediction period (matches n_period=5 used for get_pred_target).
self_count_hist = []
nonself_count_hist = []

for marks in batch_count_hist:
    self_total = sum(event[0] for event in marks)
    nonself_total = sum(event[1] for event in marks)
    # Each entry is wrapped in an extra list, i.e. it has shape (1, 5).
    self_count_hist.append([[self_total] * 5])
    nonself_count_hist.append([[nonself_total] * 5])

# self_count_hist: nested list of shape (batch_size, 1, n_period);
# the singleton axis is squeezed away after conversion to a tensor.

In [158]:

# Predict the counts for every prediction interval of the sample batch.
batch_count_pred = model.pred(batch, batch_pred_inter_t)
# batch_count_pred: same shape as batch_count_target

# torch.as_tensor accepts ndarrays *and* tensors, so re-running this cell
# (when batch_count_target already holds a tensor) no longer triggers the
# "copy construct from a tensor" UserWarning that torch.tensor() emits.
# Note: for an ndarray that is already float32, as_tensor shares its memory
# instead of copying; nothing below modifies these values in place.
batch_count_pred = torch.as_tensor(batch_count_pred, dtype=torch.float32)
batch_count_target = torch.as_tensor(batch_count_target, dtype=torch.float32)
# batch_count_target[:, :, 0]

  batch_count_target = torch.tensor(batch_count_target, dtype=torch.float32)


In [170]:
# Signed error (target - prediction), then split along the last axis into
# the self (index 0) and non-self (index 1) channels.
dif = batch_count_target - batch_count_pred
diff_self, diff_nonself = dif[:, :, 0], dif[:, :, 1]

# Target counts of the self channel, used in the MAPE denominator.
target_self = batch_count_target[:, :, 0]

# target_self.shape, diff_self.shape
# target_self, diff_self: (batch_size, n_period)

## batch_mape = abs(batch_count_pred - batch_count_target)/(batch_count_target + batch_count_hist)

In [184]:
# Convert the nested list of historical self counts to a float tensor.
# torch.as_tensor (instead of torch.tensor) keeps this cell idempotent:
# on a re-run self_count_hist is already a tensor and is passed through
# without the "copy construct from a tensor" UserWarning.
self_count_hist = torch.as_tensor(self_count_hist, dtype=torch.float32)
# print(self_count_hist.shape, diff_self.shape)

# (batch_size, 1, n_period) -> (batch_size, n_period).
# Only the singleton axis 1 is removed; a dimension-less torch.squeeze would
# also drop the batch axis for a batch of one sequence. The rank guard makes
# a re-run (tensor already 2-D) a no-op.
if self_count_hist.dim() == 3:
    self_count_hist = self_count_hist.squeeze(1)
# print(self_count_hist.shape)

# print(self_count_hist, target_self)

# MAPE of the self channel. The small constant guards the denominator:
# for some years the total count is 0, which would otherwise divide by zero.
batch_mape_self = torch.abs(diff_self) / (self_count_hist + target_self + 0.001)
batch_mape_self_mean = torch.mean(batch_mape_self)
print(batch_mape_self_mean)

tensor(6.3812)


  self_count_hist = torch.tensor(self_count_hist, dtype=torch.float32)


In [25]:
# Show the predicted counts for the last sequence of the batch
# (one row per prediction period, one column per channel).
# NOTE(review): in the recorded output both columns are identical for every
# period - confirm whether self / non-self predictions are expected to match.
model.pred(batch, batch_pred_inter_t)[-1]

array([[1.4370832, 1.4370832],
       [3.0253067, 3.0253067],
       [4.7805653, 4.7805653],
       [6.7204247, 6.7204247],
       [8.864304 , 8.864304 ]], dtype=float32)

# 模型训练，每个 epoch 的 mape

In [190]:
N_EPOCHS = 10
N_PERIOD = 5       # number of future periods requested from get_pred_target
MAPE_EPS = 0.001   # keeps the denominator non-zero when all counts are 0


def _hist_count_totals(marks_l, n_period):
    """Total historical self / non-self counts per sequence, repeated per period.

    Parameters
    ----------
    marks_l : iterable
        One marks array per sequence; every event is indexed as
        ``event[0]`` (self count) and ``event[1]`` (non-self count).
    n_period : int
        Number of prediction periods the totals are broadcast over.

    Returns
    -------
    torch.Tensor
        float32 tensor of shape (batch_size, n_period, 2).
    """
    totals = [
        [
            float(sum(event[0] for event in marks)),
            float(sum(event[1] for event in marks)),
        ]
        for marks in marks_l
    ]
    # Explicit reshape keeps the batch axis even for a batch of one sequence.
    totals_t = torch.tensor(totals, dtype=torch.float32).reshape(-1, 1, 2)
    return totals_t.expand(-1, n_period, -1)


def _total_count_mape(count_pred, count_target, hist_totals, eps=MAPE_EPS):
    """Mean relative error of the total (self + non-self) count.

    All inputs have shape (batch_size, n_period, 2). The error of the summed
    channels is divided by (historical total + target total + eps) and
    averaged over batch and periods.

    Returns
    -------
    torch.Tensor
        0-dim tensor holding the mean.
    """
    diff = count_target - count_pred
    abs_err = (diff[:, :, 0] + diff[:, :, 1]).abs()
    denom = (
        hist_totals[:, :, 0]
        + hist_totals[:, :, 1]
        + count_target[:, :, 1]
        + count_target[:, :, 0]
        + eps
    )
    return (abs_err / denom).mean()


total_mape = []

for epoch in range(N_EPOCHS):
    epoch_mape = []

    for batch in tqdm(train_loader):
        loss_self, loss_nonself = model.train_batch(batch)

        # The last batch element is the key into id2idx_pd
        # (presumably the sequence ids - verify against collate_batch).
        batch_ids = batch[-1]

        # Targets are taken from the untruncated sequences in full_set.
        batch_target = [
            full_set[i].get_pred_target(T, n_period=N_PERIOD, abs_T=True)
            for i in full_set.id2idx_pd[batch_ids]
        ]
        batch_pred_inter_t = np.array([target[0] for target in batch_target])
        batch_count_target = torch.tensor(
            np.array([target[1] for target in batch_target]), dtype=torch.float32
        )

        # Observed history is taken from the truncated sequences in train_set.
        batch_count_hist = [train_set[i].marks for i in train_set.id2idx_pd[batch_ids]]
        hist_totals = _hist_count_totals(batch_count_hist, N_PERIOD)

        # Note: the prediction is made after train_batch has updated the model
        # on this very batch.
        batch_count_pred = torch.tensor(
            model.pred(batch, batch_pred_inter_t), dtype=torch.float32
        )

        batch_mape = _total_count_mape(batch_count_pred, batch_count_target, hist_totals)
        # Store plain floats rather than 0-dim tensors so that np.mean does
        # not rely on implicit tensor -> ndarray conversion.
        epoch_mape.append(batch_mape.item())

    # Mean of the per-batch means (the last, smaller batch weighs the same
    # as a full batch).
    mape = float(np.mean(epoch_mape))
    total_mape.append(mape)

print(total_mape)



100%|██████████| 141/141 [00:00<00:00, 175.74it/s]
100%|██████████| 141/141 [00:00<00:00, 187.53it/s]
100%|██████████| 141/141 [00:00<00:00, 184.62it/s]
100%|██████████| 141/141 [00:00<00:00, 187.34it/s]
100%|██████████| 141/141 [00:00<00:00, 188.93it/s]
100%|██████████| 141/141 [00:00<00:00, 188.89it/s]
100%|██████████| 141/141 [00:00<00:00, 188.27it/s]
100%|██████████| 141/141 [00:00<00:00, 189.17it/s]
100%|██████████| 141/141 [00:00<00:00, 188.54it/s]
100%|██████████| 141/141 [00:00<00:00, 187.07it/s]

[0.26019937, 0.2627829, 0.26301336, 0.26284942, 0.26185748, 0.26218975, 0.26140717, 0.26108715, 0.26313737, 0.26181853]





In [28]:
# Shape probe on the second batch element: drop the last step along axis 1
# and select channel 0 along axis 2 (indexing with the list [0] keeps that
# axis, so the result stays 3-D).
batch[1][:, :-1, [0]].shape

torch.Size([51, 12, 1])