In [2]:
%matplotlib inline


Language Modeling with nn.Transformer and TorchText
===============================================================

This is a tutorial on training a sequence-to-sequence model that uses the
`nn.Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ module.

The PyTorch 1.2 release includes a standard transformer module based on the
paper `Attention is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`__.
Compared to Recurrent Neural Networks (RNNs), the transformer model has proven
to be superior in quality for many sequence-to-sequence tasks while being more
parallelizable. The ``nn.Transformer`` module relies entirely on an attention
mechanism (implemented as
`nn.MultiheadAttention <https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html>`__)
to draw global dependencies between input and output. The ``nn.Transformer``
module is highly modularized such that a single component (e.g.,
`nn.TransformerEncoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html>`__)
can be easily adapted/composed.

![](../_static/img/transformer_architecture.jpg)



Define the model
----------------




In this tutorial, we train a ``nn.TransformerEncoder`` model on a
language modeling task. The language modeling task is to assign a
probability for the likelihood of a given word (or a sequence of words)
to follow a sequence of words. A sequence of tokens are passed to the embedding
layer first, followed by a positional encoding layer to account for the order
of the word (see the next paragraph for more details). The
``nn.TransformerEncoder`` consists of multiple layers of
`nn.TransformerEncoderLayer <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html>`__.
Along with the input sequence, a square attention mask is required because the
self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
the earlier positions in the sequence. For the language modeling task, any
tokens on the future positions should be masked. To produce a probability
distribution over output words, the output of the ``nn.TransformerEncoder``
model is passed through a linear layer followed by a log-softmax function.




In [4]:
import math
from typing import Tuple
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int, 
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout) #NOTE d_model is the embedding size
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model) #! the input is 35(just consider it to be # of batches) by 20, the output is 35 by 20 by 200. The embedding turns the indices into vectors of size 200.
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask) #! the output is the output of the final fully-connected layer of 200 dimension. The dimension here is still the same as 35 by 20 by 200
        output = self.decoder(output) #! The linear layer in the decoder maps the input from 35 by 20 by 200 to 35 by 20 by ntoken (好像是两万多)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [5]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000): # max_len means the maximum time steps or word length
        super().__init__()
        self.dropout = nn.Dropout(p=dropout) # do not understand why you need dropout here

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) #1/10000^(2i/dim_model)) #! exp(ln(x))=x, therefore, exp(ln(1/10000^(2i/dim_model)))) = exp(2i/dim_model)*(-ln(10000))
        pe = torch.zeros(max_len, 1, d_model) #NOTE: Row always means the 
        pe[:, 0, 0::2] = torch.sin(position * div_term) #PE(pos, 2i) = sin(pos/10000^(2i/dim_model))
        pe[:, 0, 1::2] = torch.cos(position * div_term) #PE(pos, 2i) = cos(pos/10000^(2i/dim_model))
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

Load and batch data
-------------------




This tutorial uses ``torchtext`` to generate Wikitext-2 dataset.
To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. 

The vocab object is built based on the train dataset and is used to numericalize
tokens into tensors. Wikitext-2 represents rare tokens as `<unk>`.

Given a 1-D vector of sequential data, ``batchify()`` arranges the data
into ``batch_size`` columns. If the data does not divide evenly into
``batch_size`` columns, then the data is trimmed to fit. For instance, with
the alphabet as the data (total length of 26) and ``batch_size=4``, we would
divide the alphabet into 4 sequences of length 6:

\begin{align}\begin{bmatrix}
  \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
  \end{bmatrix}
  \Rightarrow
  \begin{bmatrix}
  \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
  \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
  \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
  \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
  \end{bmatrix}\end{align}

Batching enables more parallelizable processing. However, batching means that
the model treats each column independently; for example, the dependence of
``G`` and ``F`` can not be learned in the example above.




In [1]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>']) # tokenize the sentences into words. The specail word '<unk>' will be inserted in the start of the vocab.
vocab.set_default_index(vocab['<unk>']) # If there is an unknown word that is not included in the vocab, then the default index that corresponds to <unk> will be returned, which is 0. 
# vocab.get_itos() # List mapping indices to tokens.



In [6]:
def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] # vocab(tokens) will return the corresponding indices of the tokens in the generated vocabulary
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter) # put all the corresponding indices of all the words in the training dataset into one tensor list
val_data = data_process(val_iter)
test_data = data_process(test_iter)


In [7]:
train_data.shape

torch.Size([2049990])

In [8]:
# ## test (to delate)！
# for num, item in enumerate(train_iter):
#     print(item)
#     tok = tokenizer(item)
#     print(tok)
#     inddd = vocab(tok)
#     print(inddd)
#     voc
#     print(back2token)
#     if num == 2:
#         break
    
# for num, item in enumerate(train_iter):
#     data = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
#     print(data)
#     if num == 2:
#         break

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Load and batch data
NOTE: batch data can enable parallel processing. However, the data in different batches will not be trained together. The model treats each batch independently. The data in different batches will not be considered as the context information. 

In the case of IPS, do we need batches？？

For example, if we have 10 trajectories. Each trajectory has 8000 data. Suppose for each trajectory, we define 400 points. Then each point will correspond to 8000/400 = 20 CSI. We can convert the problem of regression down to classification problem, which states as follows. Given a test data with 400 CSI, it can be considered as having passed 400/20 = 20 positions. Then the task will be to predict the next position, which corresponds to the task of predicting the next word in NLP. 400 positions meas we have 400 words. Here, 400 words probably correspond to ``bptt=35`` in this tutorial. 

I guess 400 words can be handled by the system without resorting to batching. However, if the the data points is way larger than that. For example, if we have 10 trajectories to cover. Then the data points will be 8000 * 10 = 80000. The positions will be 400 * 10 = 4000. !!NOTE: 突然意识到一个很不错的折中的想法，batch的个数我们就可以想象为是trajectory的个数！那么一个batch，即一个trajectory中，我们就有8000的数据点，以及定义的400个位置，每个位置相差20个CSI. 每个轨迹之间不需要通过tranformer去学习关联。当然了，这个还是需要实验去进行验证，IPS的准确度和trajectory的长度的关系，假如我们让transformer每次只学1个轨迹，或者是一次学2个，3个等等时，准确度是否会增高，又或者是假如增高的话，训练难度是不是也是会异常增大！

We may consider using batches to help us with the long sequences. It also makes perfect sense to batch the long sequence. Imagine that maybe the CSI of some of the positions in the trajectory CSI is affected by multipath. But it is mostly likely the neighboring CSI can have a good channel condition. Then the transformer may attend to those positions more. But since the CSI of a very far apart locations will be totally different, so the transformer probably will not attend to much to the CSI of a very far away location. Hence, it is reasonable to divide the longer trajectory CSI into batches. Then transformer can learn all the batches in parallel considering the connections between batches.

In [10]:
def batchify(data: Tensor, bsz: int) -> Tensor: # bsz is the number of batches
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz 
    data = data[:seq_len * bsz] # removing the extra elements 
    data = data.view(bsz,seq_len).t().contiguous() # t() means transpose! The view changes from (bsz,seq_len) to (seq_len, bsz)

    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

In [11]:
train_data.shape

torch.Size([102499, 20])

In [11]:
# # to delete
# m = nn.LogSoftmax(dim=1)
# loss = nn.NLLLoss()
# loss_cr = nn.CrossEntropyLoss()

# # input is of size N x C = 3 x 5
# input = torch.randn(3, 5, requires_grad=True)
# # each element in target has to have 0 <= value < C
# target = torch.tensor([1, 0, 4])

# print(input)
# print(m(input))
# print(torch.exp(m(input)))
# print(loss(m(input), target))
# print(loss_cr(input,target))
# output = loss(m(input), target)

#### Functions to generate input and target sequence

``get_batch()`` generates a pair of input-target sequences for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the model needs the
following words as ``Target``. For example, with a ``bptt`` value of 2,
we’d get the following two Variables for ``i`` = 0:

It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.

Here in this example, the shape of the source (``train_data``) is [102499, 20]. Therefore, the ``full_seq_len`` is 102499, and the ``batch_size`` is 20.
Here 20 is the result of function batchify(). What it does is divide the whole 1D sequence into 20 batches and each batch has 102499 tokens. The division is carried out in a sequential order. The first 102499 of the whole 1D sequence will be the first batch, then from 102500 to 204999 will be the second batch and so on and so forth. Here, the token has already been numberized!

Here what get_batch() does is to further get the small chunk from ``tran_data`` of [102499, 20]. For example, the first chunk will be the data from 0 to 34，along with the batch dimension 20. The result will be of size [35,20]. The second chunk will be from 35 to 69. The result will still be of size [35, 20]. So how many chunks are there? 102499 / 35 = 2928 chunks. 

In summary, the chunk here has different meaning of batch although the name of the function is named as get_batch... So for, the chunk size (bptt) gives the impression that it is equivalent to the length of one sentence as used in the context of transformer for NLP. Usually, the length of the transformer could be around 1024. The reason why it is 35, I guess, is that to make it as a small system. The larger bptt, obviously the larger of the input matrix (longer the first dimension). It turns out that if I set the value of bptt 1000, then GPU will run out of memory, which indicates that bptt is, in a sense, exactly the same as the length of words.

这里在通过IPS进行思考的话，假如我们已经确定了先暂时让每个轨迹的CSI作为一个batch，那么每个轨迹CSI有8000个数据点，对应400个点，每个点之间相距20个CSI的距离，那么这里的get_batch就是指的将这400个数据点在进行切割，不去一次性计算这400个点的attention，相互的关系等，而是定义了一个量35，即每次只看其中的35个，计算35的self-attention. 得出35个数的结果之后，进行取loss，然后再去判定下一个35个，直到400个全部学习完了。当然这里的35我们也有待去探究，到底是什么数能有最优解，又或者是我们不需要这里的分成小块，而是将400直接放进去，让transformer去计算所有的400个position的CSI之间的attention，（论文）我们甚至可以说可以将不同参数(35,或者是别的数，或者不要chunk)的attenion给打印出来，用图像的形势看看具体的位置之间CSI的attention关系，将这图放入到论文当中去！

NOTE: 这里呢，有一个catch，那就是计算的时候不只是单纯的计算35个，而是35 x #batch个，即假如我们有10个轨迹的话，那么第一次学习的时候呢，就是先学习35 * 10，10个应该是同时进行的，相互之间没有关联。

#NOTE: ##! The data and target has the same shape. The only difference is that for example, if the data is taken from index 0 to 35, then the target will be from 1 to 36 because target should be the prediction of the previous words. Since the data also has a batch dimension, e.g. 20 in this tutorial. For every batch, they all have 35 words, and the objective is the same for all the batches which is to predict the next word. 
So (I think !!!) for the first iteration, the first word of all the 20 batches are put in the model.
Then the model should have 20 predictions for the next word. So our 20 targets are exactly the next 20 words. The loss will be calculated and then use GD to update the weights. Then For the second iteration, the output of the first word of 20 batches will be as the input to the model, and then the model will have 20 predictions for the third word. So our 20 target will be the next 20 true words. Then loss ... The iteration keeps on until 35 words are all learned! I am not so sure whether the real process takes place just as I wrote above in a manner of one word at a time. Or, more likely, the 35 words will be put in the model directly. I tend to believe the second scenario is true because that is exactly what a transformer do, only focus on the attentions!

``NOTE!`` 这里的bptt绝对就是transformer论文中一个句子中的单词数的概念！！

In [12]:
bptt = 35 #NOTE: batch dimension N, the length of each chunk
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:  #NOTE: The source data has not 
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)  #! The actual value for i is [0 35 70 105,...]
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1) #! reshape(-1) will unfold the matrix from the higher dimension to the lower dimension
    return data, target

In [27]:
# ## test (to delate)
# from einops import rearrange
# i = 0
# seq_len = min(bptt, len(train_data) - 1 - i) 
# data = train_data[i:i+seq_len]
# # target = train_data[i+1:i+1+seq_len].reshape(-1)
# target = train_data[i+1:i+1+seq_len]

# print(train_data.shape)
# print(data.shape)
# print(target.shape)

# print(data[1:3].reshape(-1))
# print(target[:2].reshape(-1))
# # print(target[:10].reshape(-1))


torch.Size([102499, 20])
torch.Size([35, 20])
torch.Size([35, 20])
tensor([ 3849,    12,   300,  6302,  3989,  1930, 10559,   451,     4,     7,
            2,  1511, 10115,   942,  2439,   572,     1,    47,    30,  1990,
         3869,   315,    19,    29,   939,     2,    10,  2139,  4916, 16615,
          235,     3,    13,     7,    24,    17, 13737,    97,  7720,     4],
       device='cuda:0')
tensor([ 3849,    12,   300,  6302,  3989,  1930, 10559,   451,     4,     7,
            2,  1511, 10115,   942,  2439,   572,     1,    47,    30,  1990,
         3869,   315,    19,    29,   939,     2,    10,  2139,  4916, 16615,
          235,     3,    13,     7,    24,    17, 13737,    97,  7720,     4],
       device='cuda:0')


Initiate an instance
--------------------




The model hyperparameters are defined below. The vocab size is
equal to the length of the vocab object.




IPS：我们还得考虑假如每个点对应20个CSI的话，那么我们还需不需要将这20长度的CSI进行embedding，map到高纬度去~！ 这个需要做试验试一试才能知道！

In [14]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [15]:
# ## test (to delate)
# # Example of target with class indices
# loss = nn.CrossEntropyLoss()
# input = torch.randn(3, 5, requires_grad=True) #! Here: 5 dimensional input
# # The following code will generate class indices from 0 to 5-1
# target = torch.empty(3, dtype=torch.long).random_(5) #NOTE .random_(5): Fills self tensor with numbers sampled from the discrete uniform distribution 
# output = loss(input, target) 

# print(input)
# print(target)
# print(output)


Run the model
-------------




We use `CrossEntropyLoss <https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html>`__
with the `SGD <https://pytorch.org/docs/stable/generated/torch.optim.SGD.html>`__
(stochastic gradient descent) optimizer. The learning rate is initially set to
5.0 and follows a `StepLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html>`__
schedule. During training, we use `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html>`__
to prevent gradients from exploding.




In [16]:
import copy
import time

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5.0, gamma=0.95) # after one epoch, the LR becomes 95% of the original LR

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt 
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): #(0 35 70 ...)
        data, targets = get_batch(train_data, i) # i = 0, 35,70, ... len(train_data) #! The size of data is 35 by 20, 20 is the batch size
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size] 
        output = model(data, src_mask) #! The shape of the output is (35, 20, 28782)
        loss = criterion(output.view(-1, ntokens), targets) #! out.view(-1,ntokens) will make the shape (35,20,28782) to (700,28782)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.6f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [17]:
# print(torch.cuda.memory_allocated()/1024**2)
# print(torch.cuda.memory_cached()/1024**2)
# print(torch.cuda.memory_reserved())
# print(torch.cuda.memory_summary())

In [23]:
best_val_loss = float('inf')
epochs = 20
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

| epoch   1 |   200/ 2928 batches | lr 0.001000 | ms/batch 14.73 | loss  5.37 | ppl   213.85
| epoch   1 |   400/ 2928 batches | lr 0.001000 | ms/batch 12.36 | loss  5.40 | ppl   221.25
| epoch   1 |   600/ 2928 batches | lr 0.001000 | ms/batch 12.45 | loss  5.24 | ppl   189.53
| epoch   1 |   800/ 2928 batches | lr 0.001000 | ms/batch 12.53 | loss  5.30 | ppl   200.49
| epoch   1 |  1000/ 2928 batches | lr 0.001000 | ms/batch 12.49 | loss  5.25 | ppl   190.65
| epoch   1 |  1200/ 2928 batches | lr 0.001000 | ms/batch 12.85 | loss  5.31 | ppl   202.44
| epoch   1 |  1400/ 2928 batches | lr 0.001000 | ms/batch 12.51 | loss  5.36 | ppl   212.29
| epoch   1 |  1600/ 2928 batches | lr 0.001000 | ms/batch 12.41 | loss  5.37 | ppl   214.67
| epoch   1 |  1800/ 2928 batches | lr 0.001000 | ms/batch 12.65 | loss  5.28 | ppl   197.02
| epoch   1 |  2000/ 2928 batches | lr 0.001000 | ms/batch 12.98 | loss  5.34 | ppl   207.79
| epoch   1 |  2200/ 2928 batches | lr 0.001000 | ms/batch 12.69 | los

Evaluate the best model on the test dataset
-------------------------------------------




In [19]:
test_loss = evaluate(best_model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

| End of training | test loss  5.42 | test ppl   226.74


In [20]:
# # to delete
# best_model.eval()  # turn on evaluation mode
# total_loss = 0.
# src_mask = generate_square_subsequent_mask(bptt).to(device)
# with torch.no_grad():
#     for i in range(0, val_data.size(0) - 1, bptt):
#         data, targets = get_batch(val_data, i)
#         batch_size = data.size(0)
#         if batch_size != bptt:
#             src_mask = src_mask[:batch_size, :batch_size]
#         output = model(data, src_mask)
#         print(output.shape)
#         output_flat = output.view(-1, ntokens)
#         print(output_flat.shape)
#         print(output_flat.argmax(1)[:10])
#         print(f'target shape is {targets.shape}')
#         print(targets[:10])
#         break
#         # total_loss += batch_size * criterion(output_flat, targets).item()

In [21]:
# train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
# val_data = batchify(val_data, eval_batch_size)
# test_data = batchify(test_data, eval_batch_size)


In [22]:
# to delete from the Internet for future reference
# def evaluate(dataloader):
#     model.eval()
#     total_acc, total_count = 0, 0

#     with torch.no_grad():
#         for idx, (label, text, offsets) in enumerate(dataloader):
#             predicted_label = model(text, offsets)
#             loss = criterion(predicted_label, label)
#             total_acc += (predicted_label.argmax(1) == label).sum().item()
#             total_count += label.size(0)
#     return total_acc/total_count