# model

In [1]:
import torch
# Prefer the GPU when one is visible, but fall back to the CPU so that every
# later `.to(device)` call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is scaled and added to the input.

    A table of shape (max_len, 1, d_model) is pre-computed and stored as the
    buffer ``pe``: even feature columns hold ``sin(pos * freq)`` and odd columns
    hold ``cos(pos * freq)``.

    Args:
        d_model: size of the feature (embedding) dimension. Both even and odd
            values are supported.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions pre-computed; the longest sequence
            ``forward`` accepts.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Singleton middle axis so the table broadcasts over the batch dimension.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x, rat):
        """Add ``rat`` times the positional table to ``x`` and apply dropout.

        Args:
            x: tensor whose first axis is the sequence position and whose last
                axis has size ``d_model`` (the table is sliced by ``x.size(0)``).
            rat: scalar weight multiplied into the positional encoding.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if the sequence is longer than the pre-computed table.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                'sequence length {} exceeds max_len {}'.format(seq_len, self.pe.size(0)))
        x = x + self.pe[:seq_len] * rat
        return self.dropout(x)

class TFModule(nn.Module):
    """Causally-masked Transformer-encoder language model.

    Pipeline: token embedding -> scaled sinusoidal positions -> stack of
    ``nn.TransformerEncoderLayer`` with a subsequent-position mask -> linear
    projection to vocabulary logits.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        embed_size: embedding / model width (``d_model``).
        num_layers: number of stacked encoder layers.
        num_hiddens: width of each layer's feed-forward sub-network.
        num_heads: number of attention heads.
        embeddropout_rate: dropout applied after adding positional encodings.
        dropout_rate: dropout used inside the encoder layers.
        pos_ratio: scalar weight applied to the positional encoding.
    """

    def __init__(self, vocab_size, embed_size, num_layers, num_hiddens, num_heads, embeddropout_rate, dropout_rate, pos_ratio):
        super(TFModule, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # 1025 is the number of pre-computed positions, i.e. the longest input supported.
        self.pos_embed = PositionalEncoding(embed_size, embeddropout_rate, 1025)
        self.pos_ratio = pos_ratio

        # NOTE: nn.TransformerEncoder works on copies of this layer. The attribute
        # is kept so the module's state_dict keys stay as they were.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            dim_feedforward=num_hiddens,
            nhead=num_heads,
            dropout=dropout_rate,
            activation='relu')
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # Projects the hidden state of every position to vocabulary logits.
        self.decoder = nn.Linear(embed_size, vocab_size)

    def _generate_square_subsequent_mask(self, sz, device=None):
        """Return an (sz, sz) additive attention mask that hides future positions.

        Entries on and below the diagonal are 0.0; entries above it are -inf.

        Args:
            sz: sequence length.
            device: where to allocate the mask. Defaults to the device of the
                embedding weights, so no notebook-level global is needed.
        """
        if device is None:
            device = self.embedding.weight.device
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, inputs):
        """Compute next-token logits for every position.

        Args:
            inputs: 2-D tensor of token ids; its two axes are swapped before the
                embedding so that axis 0 becomes the sequence axis
                (i.e. ``inputs`` is laid out as (batch, length)).

        Returns:
            Tensor of shape (batch * length, vocab_size), flattened batch-major.
        """
        word_emb = self.embedding(inputs.permute(1, 0))
        embeddings = self.pos_embed(word_emb, self.pos_ratio)

        # Build the mask on the same device as the activations it is applied to.
        causal_mask = self._generate_square_subsequent_mask(embeddings.shape[0], embeddings.device)
        # outputs has shape (length, batch size, embedding size)
        outputs = self.transformer_encoder(embeddings, mask=causal_mask)

        output = outputs.permute(1, 0, 2)
        outs = output.reshape(output.size(0) * output.size(1), output.size(2))
        return self.decoder(outs)

In [3]:
# import torch
# import torch.nn as nn

In [4]:
# encoder_layer = nn.TransformerEncoderLayer(d_model=1, nhead=1, dropout=0, activation='relu')
# transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)

In [5]:
# src = torch.ones(3,1,1)
# out = encoder_layer(src)
# print(src.shape,out.shape)
# print(src)
# print(out)

In [6]:
# # dir(encoder_layer)
# vos = 53
# emb = 22
# numl = 3
# numh = 233
# numhea = 11
# dror = 0
# model = TFModule(vos, emb, numl, numh, numhea, dror)
# src = torch.ones((2,3),dtype=int)
# print(src.shape)
# out = model(src)
# print(out)

# data

In [7]:
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

In [8]:
class Corpus(object):
    """WikiText-2 loader: builds a vocabulary and pre-batched train/val/test tensors.

    Args:
        train_batch_size: number of parallel token streams for the training split.
        eval_batch_size: number of parallel token streams for validation and test.
        bptt: maximum window length returned by ``get_batch``.
    """

    def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
        self.bptt = bptt
        self.tokenizer = get_tokenizer('basic_english')

        # First pass over the training split only counts tokens for the vocabulary.
        token_counts = Counter()
        for line in WikiText2(split='train'):
            token_counts.update(self.tokenizer(line))
        self.vocab = Vocab(token_counts)

        # Fresh iterators for all three splits, each turned into one long id tensor.
        train_iter, val_iter, test_iter = WikiText2()
        train_ids = self.data_process(train_iter)
        val_ids = self.data_process(val_iter)
        test_ids = self.data_process(test_iter)

        self.train_data = self.batchify(train_ids, train_batch_size)
        self.val_data = self.batchify(val_ids, eval_batch_size)
        self.test_data = self.batchify(test_ids, eval_batch_size)

    def data_process(self, raw_text_iter):
        """Tokenize every item, map tokens to ids, and concatenate into one 1-D tensor.

        Items that produce no tokens are dropped before concatenation.
        """
        pieces = []
        for item in raw_text_iter:
            ids = torch.tensor([self.vocab[token] for token in self.tokenizer(item)],
                               dtype=torch.long)
            if ids.numel() > 0:
                pieces.append(ids)
        return torch.cat(tuple(pieces))

    def batchify(self, data, batch_size):
        """Reshape a 1-D id tensor into ``batch_size`` columns and move it to ``device``.

        Trailing elements that do not fill a complete row across all columns are
        discarded. The result has one column per stream, shape (n, batch_size).
        """
        usable = (data.size(0) // batch_size) * batch_size
        trimmed = data.narrow(0, 0, usable)
        columns = trimmed.view(batch_size, -1).t().contiguous()
        return columns.to(device)

    def get_batch(self, source, i):
        """Return an input window starting at row ``i`` and its one-step-shifted targets.

        The window has at most ``self.bptt`` rows and is shortened near the end
        of ``source`` so that the targets still exist. Targets are flattened.
        """
        start = i
        stop = start + min(self.bptt, len(source) - 1 - i)
        data = source[start:stop]
        target = source[start + 1:stop + 1].reshape(-1)
        return data, target

    def get_ntokens(self):
        """Number of entries in the vocabulary's token-to-index mapping."""
        return len(self.vocab.stoi)

# main

In [9]:
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable

import data
import model
class args:
    """Run configuration, used as a plain namespace (never instantiated)."""

    # --- model ---
    emsize = 100          # embedding / model width
    nhid = 200            # feed-forward width inside each encoder layer
    nlayers = 3           # number of encoder layers
    nhead = 20            # attention heads
    pos_ratio = 0.1       # weight applied to the positional encoding
    embeddropout = 0.1    # dropout after adding positional encodings
    dropout = 0.5         # dropout inside the encoder layers
    tied = False          # not read by any cell shown in this notebook

    # --- optimisation / data ---
    lr = 0.5
    epochs = 1000
    batch_size = 32
    bptt = 256            # tokens per training window

    # --- bookkeeping ---
    seed = 1234           # NOTE(review): no cell shown here applies this seed
    save = 'model.pt'

In [10]:
# Build the corpus once; the same batch size is used for training and evaluation.
data_loader = Corpus(args.batch_size, args.batch_size, args.bptt)

In [11]:
# Short module-level aliases for the three pre-batched splits.
train_data, val_data, test_data = (
    data_loader.train_data,
    data_loader.val_data,
    data_loader.test_data,
)

In [12]:
def get_batch(source, i, evaluation=False, bptt=None):
    """Slice one batch-first input window and its next-token targets.

    Args:
        source: 2-D tensor indexed as (position, stream).
        i: first position of the window.
        evaluation: unused; kept so existing call sites remain valid.
        bptt: maximum window length. Defaults to ``args.bptt`` when omitted.

    Returns:
        ``(data, target)`` where ``data`` is the window transposed to
        (stream, position) and ``target`` is the window shifted by one position,
        transposed the same way and flattened to 1-D.
    """
    if bptt is None:
        bptt = args.bptt
    # Shorten the last window so that every input position still has a target.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len].t()
    target = source[i + 1:i + 1 + seq_len].t().reshape(-1)
    return data, target

In [13]:
# import torch
# torch.__version__
# print(train_data.size(0))
# for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
#     data, targets = get_batch(train_data, i)
#     print(data.shape,targets.shape,targets[-1])
#     if (batch>=2): break

In [14]:
# Show the vocabulary size (length of the vocab's `itos` list) as the cell output.
len(data_loader.vocab.itos)

28783

In [15]:
# The vocabulary size sets both the embedding table and the output projection width.
ntokens = len(data_loader.vocab.itos)
model = TFModule(
    vocab_size=ntokens,
    embed_size=args.emsize,
    num_layers=args.nlayers,
    num_hiddens=args.nhid,
    num_heads=args.nhead,
    embeddropout_rate=args.embeddropout,
    dropout_rate=args.dropout,
    pos_ratio=args.pos_ratio,
)

In [16]:
# Move the model's parameters and buffers to the selected device.
model = model.to(device)

In [17]:
# Bare expression: displays the module tree as the cell output.
model

TFModule(
  (embedding): Embedding(28783, 100)
  (pos_embed): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
    )
    (linear1): Linear(in_features=100, out_features=200, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (linear2): Linear(in_features=200, out_features=100, bias=True)
    (norm1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.5, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=100, out_features=100, bias=True)
        )
        (linear1): Linear(in_features=100, out_fe

In [18]:
# Plain SGD over every model parameter, scored with cross-entropy over the logits.
optimizer = torch.optim.SGD(params=model.parameters(), lr=args.lr)
criterion = nn.CrossEntropyLoss()

In [19]:
def evaluate(data_source):
    """Score the global ``model`` on ``data_source`` without updating it.

    Walks the data in windows of ``args.bptt`` positions, accumulates the
    token-weighted loss, and reports the per-token mean.

    Args:
        data_source: 2-D tensor accepted by ``get_batch``.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats, where
        ``perplexity = exp(mean_loss)``.

    Raises:
        ValueError: if ``data_source`` yields no target tokens.
    """
    model.eval()
    total_loss = 0.0
    total_words = 0
    # No gradients are needed here; skipping autograd saves memory and time.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)

            output = model(data)
            loss = criterion(output, targets)

            # criterion returns a mean over the batch; re-weight by token count so
            # a shorter final window does not get the same weight as a full one.
            n_words = targets.shape[0]
            total_loss += loss.item() * n_words
            total_words += n_words

    if total_words == 0:
        raise ValueError('data_source is too short to produce any targets')

    mean_loss = total_loss / total_words
    return mean_loss, math.exp(mean_loss)

def train():
    """Run one optimisation pass over the global ``train_data``.

    Each window of ``args.bptt`` positions is one SGD step on the global
    ``model`` using the global ``optimizer`` and ``criterion``. A one-line
    summary is printed at the end of the pass.

    Returns:
        ``(mean_loss, perplexity)`` as Python floats (the same form as
        ``evaluate``), where the mean is taken per target token.

    Raises:
        ValueError: if ``train_data`` yields no target tokens.
    """
    model.train()
    total_loss = 0.0
    total_words = 0
    for i in range(0, train_data.size(0) - 1, args.bptt):
        data, targets = get_batch(train_data, i)

        # The optimizer was built from model.parameters(), so clearing gradients
        # through it covers every parameter of the model.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # Accumulate as plain floats so no device tensor is carried across steps.
        n_words = targets.shape[0]
        total_loss += loss.item() * n_words
        total_words += n_words

    if total_words == 0:
        raise ValueError('train_data is too short to produce any targets')

    cur_loss = total_loss / total_words
    ppl = math.exp(cur_loss)
    print('train {:10d} words | loss {:5.5f} | ppl {:5.5}'.format(total_words, cur_loss, ppl))
    return cur_loss, ppl

In [20]:
import numpy as np
# Per-epoch perplexity histories for the train, validation and test splits.
Losstrain, Lossval, Losstest = [], [], []

In [21]:
import time
for T in range(args.epochs):
    print('Round : ', T, "  ", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    # One optimisation pass, then score the validation and test splits in that order.
    trainl, trainp = train()
    (vall, valp), (testl, testp) = evaluate(val_data), evaluate(test_data)

    # Extend the perplexity histories and rewrite the .npy snapshots every epoch,
    # so an interrupted run still leaves the curves on disk.
    Losstrain += [trainp]
    Lossval += [valp]
    Losstest += [testp]
    np.save('model-2-train', np.asarray(Losstrain))
    np.save('model-2-val', np.asarray(Lossval))
    np.save('model-2-test', np.asarray(Losstest))

    print('  valid {:5.5f} | test {:5.5f}'.format(valp, testp))
    print('-' * 80)

Round :  0    2021-05-19 10:43:08
train    2049952 words | loss 7.38624 | ppl 1613.6
  valid 880.83301 | test 826.87295
--------------------------------------------------------------------------------
Round :  1    2021-05-19 10:43:51
train    2049952 words | loss 6.88344 | ppl 975.98
  valid 718.86088 | test 679.99207
--------------------------------------------------------------------------------
Round :  2    2021-05-19 10:44:34
train    2049952 words | loss 6.73509 | ppl 841.42
  valid 658.93953 | test 621.13847
--------------------------------------------------------------------------------
Round :  3    2021-05-19 10:45:18
train    2049952 words | loss 6.64241 | ppl 766.94
  valid 586.01895 | test 550.02547
--------------------------------------------------------------------------------
Round :  4    2021-05-19 10:46:02
train    2049952 words | loss 6.57421 | ppl 716.38
  valid 520.05998 | test 489.42283
----------------------------------------------------------------------------

train    2049952 words | loss 5.92424 | ppl 373.99
  valid 336.30012 | test 305.48685
--------------------------------------------------------------------------------
Round :  42    2021-05-19 11:13:53
train    2049952 words | loss 5.91509 | ppl 370.59
  valid 336.37680 | test 305.55966
--------------------------------------------------------------------------------
Round :  43    2021-05-19 11:14:37
train    2049952 words | loss 5.90610 | ppl 367.27
  valid 337.02313 | test 306.00792
--------------------------------------------------------------------------------
Round :  44    2021-05-19 11:15:21
train    2049952 words | loss 5.89756 | ppl 364.15
  valid 334.50863 | test 303.43654
--------------------------------------------------------------------------------
Round :  45    2021-05-19 11:16:05
train    2049952 words | loss 5.88824 | ppl 360.77
  valid 331.31377 | test 300.51187
--------------------------------------------------------------------------------
Round :  46    2021-05-19

train    2049952 words | loss 5.63041 | ppl 278.78
  valid 291.88392 | test 264.26985
--------------------------------------------------------------------------------
Round :  86    2021-05-19 11:46:01
train    2049952 words | loss 5.62466 | ppl 277.18
  valid 293.67360 | test 266.33222
--------------------------------------------------------------------------------
Round :  87    2021-05-19 11:46:45
train    2049952 words | loss 5.62005 | ppl 275.9
  valid 292.27786 | test 264.72883
--------------------------------------------------------------------------------
Round :  88    2021-05-19 11:47:29
train    2049952 words | loss 5.61580 | ppl 274.73
  valid 290.81091 | test 263.30652
--------------------------------------------------------------------------------
Round :  89    2021-05-19 11:48:13
train    2049952 words | loss 5.61022 | ppl 273.2
  valid 278.60335 | test 251.89643
--------------------------------------------------------------------------------
Round :  90    2021-05-19 1

train    2049952 words | loss 5.46202 | ppl 235.57
  valid 290.50364 | test 263.17140
--------------------------------------------------------------------------------
Round :  127    2021-05-19 12:16:00
train    2049952 words | loss 5.46009 | ppl 235.12
  valid 277.96935 | test 250.45307
--------------------------------------------------------------------------------
Round :  128    2021-05-19 12:16:44
train    2049952 words | loss 5.45658 | ppl 234.29
  valid 291.26845 | test 264.69257
--------------------------------------------------------------------------------
Round :  129    2021-05-19 12:17:28
train    2049952 words | loss 5.45224 | ppl 233.28
  valid 277.57123 | test 249.92311
--------------------------------------------------------------------------------
Round :  130    2021-05-19 12:18:12
train    2049952 words | loss 5.44905 | ppl 232.54
  valid 278.03176 | test 250.68971
--------------------------------------------------------------------------------
Round :  131    2021-

train    2049952 words | loss 5.34700 | ppl 209.98
  valid 279.63800 | test 252.63918
--------------------------------------------------------------------------------
Round :  168    2021-05-19 12:46:02
train    2049952 words | loss 5.34530 | ppl 209.62
  valid 278.16603 | test 251.84904
--------------------------------------------------------------------------------
Round :  169    2021-05-19 12:46:46
train    2049952 words | loss 5.34337 | ppl 209.22
  valid 280.00953 | test 252.96887
--------------------------------------------------------------------------------
Round :  170    2021-05-19 12:47:30
train    2049952 words | loss 5.34062 | ppl 208.64
  valid 290.03327 | test 263.15263
--------------------------------------------------------------------------------
Round :  171    2021-05-19 12:48:13
train    2049952 words | loss 5.34002 | ppl 208.52
  valid 314.75838 | test 285.72331
--------------------------------------------------------------------------------
Round :  172    2021-

KeyboardInterrupt: 

In [None]:
import time
# Same loop as the previous training cell. Nothing is re-initialised here, so it
# continues from the current model weights and appends to the same histories.
for T in range(args.epochs):
    print('Round : ', T, "  ", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    # One optimisation pass, then score the validation and test splits in that order.
    trainl, trainp = train()
    (vall, valp), (testl, testp) = evaluate(val_data), evaluate(test_data)

    # Extend the perplexity histories and rewrite the .npy snapshots every epoch.
    Losstrain += [trainp]
    Lossval += [valp]
    Losstest += [testp]
    np.save('model-2-train', np.asarray(Losstrain))
    np.save('model-2-val', np.asarray(Lossval))
    np.save('model-2-test', np.asarray(Losstest))

    print('  valid {:5.5f} | test {:5.5f}'.format(valp, testp))
    print('-' * 80)

Round :  0    2021-05-19 12:54:38
train    2049952 words | loss 5.32538 | ppl 205.49
  valid 292.01432 | test 264.80040
--------------------------------------------------------------------------------
Round :  1    2021-05-19 12:55:20
train    2049952 words | loss 5.32445 | ppl 205.3
  valid 285.69211 | test 257.80214
--------------------------------------------------------------------------------
Round :  2    2021-05-19 12:56:05
train    2049952 words | loss 5.32149 | ppl 204.69
  valid 301.99057 | test 273.66528
--------------------------------------------------------------------------------
Round :  3    2021-05-19 12:56:48
train    2049952 words | loss 5.31878 | ppl 204.13
  valid 281.61849 | test 254.25411
--------------------------------------------------------------------------------
Round :  4    2021-05-19 12:57:32
train    2049952 words | loss 5.31764 | ppl 203.9
  valid 283.88700 | test 256.45986
------------------------------------------------------------------------------

train    2049952 words | loss 5.24971 | ppl 190.51
  valid 327.24731 | test 297.82730
--------------------------------------------------------------------------------
Round :  42    2021-05-19 13:25:22
train    2049952 words | loss 5.24656 | ppl 189.91
  valid 301.21517 | test 274.60022
--------------------------------------------------------------------------------
Round :  43    2021-05-19 13:26:06
train    2049952 words | loss 5.24645 | ppl 189.89
  valid 330.33727 | test 300.15946
--------------------------------------------------------------------------------
Round :  44    2021-05-19 13:26:50
train    2049952 words | loss 5.24329 | ppl 189.29
  valid 302.29149 | test 275.53294
--------------------------------------------------------------------------------
Round :  45    2021-05-19 13:27:34
train    2049952 words | loss 5.24216 | ppl 189.08
  valid 302.36359 | test 274.88639
--------------------------------------------------------------------------------
Round :  46    2021-05-19

train    2049952 words | loss 5.18968 | ppl 179.41
  valid 291.88801 | test 265.05553
--------------------------------------------------------------------------------
Round :  83    2021-05-19 13:55:25
train    2049952 words | loss 5.18917 | ppl 179.32
  valid 293.73063 | test 266.63416
--------------------------------------------------------------------------------
Round :  84    2021-05-19 13:56:09
train    2049952 words | loss 5.18769 | ppl 179.05
  valid 303.66934 | test 276.45611
--------------------------------------------------------------------------------
Round :  85    2021-05-19 13:56:53
train    2049952 words | loss 5.18749 | ppl 179.02
  valid 323.74176 | test 292.71216
--------------------------------------------------------------------------------
Round :  86    2021-05-19 13:57:36
train    2049952 words | loss 5.18515 | ppl 178.6
  valid 301.16933 | test 274.12952
--------------------------------------------------------------------------------
Round :  87    2021-05-19 

train    2049952 words | loss 5.14528 | ppl 171.62
  valid 288.79592 | test 262.60950
--------------------------------------------------------------------------------
Round :  124    2021-05-19 14:25:26
train    2049952 words | loss 5.14541 | ppl 171.64
  valid 325.34162 | test 296.26055
--------------------------------------------------------------------------------
Round :  125    2021-05-19 14:26:10
train    2049952 words | loss 5.14398 | ppl 171.4
  valid 322.93367 | test 294.17330
--------------------------------------------------------------------------------
Round :  126    2021-05-19 14:26:54
train    2049952 words | loss 5.14255 | ppl 171.15
  valid 288.11555 | test 261.91189
--------------------------------------------------------------------------------
Round :  127    2021-05-19 14:27:38
train    2049952 words | loss 5.14064 | ppl 170.82
  valid 299.09390 | test 272.05557
--------------------------------------------------------------------------------
Round :  128    2021-0

train    2049952 words | loss 5.10951 | ppl 165.59
  valid 291.84410 | test 265.84931
--------------------------------------------------------------------------------
Round :  165    2021-05-19 14:55:28
train    2049952 words | loss 5.10938 | ppl 165.57
  valid 319.17734 | test 291.37451
--------------------------------------------------------------------------------
Round :  166    2021-05-19 14:56:12
train    2049952 words | loss 5.10708 | ppl 165.19
  valid 292.62677 | test 266.13529
--------------------------------------------------------------------------------
Round :  167    2021-05-19 14:56:55
train    2049952 words | loss 5.10810 | ppl 165.36
  valid 326.76604 | test 297.39364
--------------------------------------------------------------------------------
Round :  168    2021-05-19 14:57:39
train    2049952 words | loss 5.10658 | ppl 165.11
  valid 288.51735 | test 262.09485
--------------------------------------------------------------------------------
Round :  169    2021-

train    2049952 words | loss 5.08075 | ppl 160.89
  valid 291.31838 | test 264.36918
--------------------------------------------------------------------------------
Round :  206    2021-05-19 15:25:29
train    2049952 words | loss 5.07766 | ppl 160.4
  valid 301.63243 | test 275.57055
--------------------------------------------------------------------------------
Round :  207    2021-05-19 15:26:13
train    2049952 words | loss 5.07818 | ppl 160.48
  valid 292.57695 | test 265.67731
--------------------------------------------------------------------------------
Round :  208    2021-05-19 15:26:57
train    2049952 words | loss 5.07756 | ppl 160.38
  valid 287.71128 | test 261.14714
--------------------------------------------------------------------------------
Round :  209    2021-05-19 15:27:41
train    2049952 words | loss 5.07710 | ppl 160.31
  valid 323.40950 | test 295.98579
--------------------------------------------------------------------------------
Round :  210    2021-0

In [23]:
# torch.save(model.state_dict(), 'parameter-2.pkl')
# evaluate(test_data)

(5.516509941114199, 248.76531478395296)

In [None]:
# Print the embedding vector currently stored for token id 1.
# The index tensor is allocated on the embedding's own device: the model was
# moved with `model.to(device)`, and a CPU index tensor would raise a
# device-mismatch error during the lookup.
x = torch.ones((1, 1), dtype=torch.long, device=model.embedding.weight.device)
print(model.embedding(x))

In [None]:
# import time
# for T in range(100):
#     print('Round : ',T,"  ",time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
#     train()
#     print('  valid ',evaluate(val_data))
#     print('  test  ',evaluate(test_data))
#     print('-'*80)

In [None]:
# if 1:
#     x = torch.tensor([[1000,1],[100,1],[1000,1]],dtype=float)
#     y = torch.tensor([0,1,0])
#     print(criterion(x,y))

In [None]:
# -math.log(0.1)