In [None]:
%matplotlib inline

Define the model
----------------




In this tutorial, we train a ``nn.TransformerEncoder`` model on a
language modeling task. The language modeling task is to assign a
probability for the likelihood of a given word (or a sequence of words)
to follow a sequence of words. A sequence of tokens is passed to the embedding
layer first, followed by a positional encoding layer to account for the order
of the words (see the next paragraph for more details). The
``nn.TransformerEncoder`` consists of multiple layers of
`nn.TransformerEncoderLayer <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html>`__.
Along with the input sequence, a square attention mask is required because the
self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend
to the earlier positions in the sequence. For the language modeling task, any
tokens in future positions should be masked. To produce a probability
distribution over output words, the output of the ``nn.TransformerEncoder``
model is passed through a linear layer followed by a log-softmax function
(in the code below the log-softmax is applied implicitly by ``nn.CrossEntropyLoss``).




In [None]:
import math
from typing import Tuple
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

class TransformerModel(nn.Module):
    """Transformer encoder followed by a linear classification head.

    Compared with the language-model tutorial this class is derived from, the
    ``nn.Embedding`` layer is left out: the input is expected to already carry
    a ``d_model``-sized feature vector per step, so it is fed directly into
    the positional encoding.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Args:
            ntoken: number of output classes produced by the final linear layer
            d_model: size of the per-step feature vector (model width)
            nhead: number of attention heads in every encoder layer
            d_hid: width of the feed-forward sub-layer in every encoder layer
            nlayers: number of stacked encoder layers
            dropout: dropout probability used by the positional encoding and
                by the encoder layers
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        single_layer = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_hid, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(
            encoder_layer=single_layer, num_layers=nlayers)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Zero the head's bias and draw its weights from U(-0.1, 0.1)."""
        bound = 0.1
        with torch.no_grad():
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size, d_model]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        with_position = self.pos_encoder(src)
        # The encoder keeps the [seq_len, batch_size, d_model] layout.
        hidden = self.transformer_encoder(with_position, src_mask)
        # The linear head maps every step from d_model features to ntoken scores.
        return self.decoder(hidden)


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build the causal attention mask for a sequence of ``sz`` steps.

    Entries strictly above the main diagonal are ``-inf`` (future positions
    are hidden); the diagonal and everything below it are ``0``.
    """
    blocked = torch.full((sz, sz), float('-inf'))
    return blocked.triu(diagonal=1)

``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence, then applies dropout.

    The table follows
    ``PE(pos, 2i) = sin(pos / 10000^(2i / d_model))`` and
    ``PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))``.
    It is stored as the buffer ``pe`` with shape ``[max_len, 1, d_model]`` so it
    broadcasts over the batch dimension.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Args:
            d_model: size of the per-step feature vector (even or odd)
            dropout: dropout probability applied after the encoding is added
            max_len: largest sequence length the table is pre-computed for
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)  # [max_len, 1]
        # exp(2i * (-ln(10000) / d_model)) == 1 / 10000^(2i / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns, so
        # the cosine term is trimmed to fit (a no-op when d_model is even).
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x`` with the first ``seq_len`` rows
            of the table added, passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

Load and batch data
-------------------




In [None]:
import scipy.io as sio
from scipy.io import loadmat, savemat
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
from einops import rearrange,reduce,repeat

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # echo the selected device as the cell output

In [None]:
# Load the training features/labels (SNR = 20) from MATLAB .mat files and wrap
# them as torch tensors. The files are resolved relative to the working directory.
dataPath = 'features_1d_snr20'
labelPath = 'labels_1d_snr20'
input_data = sio.loadmat(dataPath)
input_data = torch.from_numpy(input_data['features'])
labels = sio.loadmat(labelPath)
labels = torch.from_numpy(labels['labels'] )

# Load the test features/labels (SNR = 10) the same way.
# NOTE(review): the test files are named '..._10snr' while the training files are
# named '..._snr20', and the label key is 'label' here but 'labels' above --
# confirm both against the actual .mat files.
dataPath = 'features_1d_10snr'
labelPath = 'labels_1d_10snr'
input_data_test = sio.loadmat(dataPath)
input_data_test = torch.from_numpy(input_data_test['features'])
labels_test = sio.loadmat(labelPath)
labels_test = torch.from_numpy(labels_test['label'] )


In [None]:
# Sanity check: shape of the second entry of the test labels.
labels_test[1].shape

### Define trajectories!
1. There are 49 trajectories. From x being 0.1 to x being 4.9, step size is 0.1cm. The number of trajectories is equivalent to the variable `batch_size` in the original transformer code.
2. There are 79 points in one trajectory! In other words, there are 79 words in one sentence. 
NOTE: The original transformer code has a parameter `bptt`, which indicates the number of words put into the model at one time. Here, we can definitely play with this variable; I think we can start with values from 10 to 40. I think `bptt` is more similar to the concept of a batch. The number of batches, in the IPS case, is 79/`bptt`. If 79 is not divisible by `bptt`, then the last batch is shorter than `bptt`.
NOTE: `bptt` should not be chosen too large: a larger `bptt` means we have to travel much further before the IPS can start working, because the model needs a sufficiently long input.
4. For each point, the data size is (24, 4, 4). 24: the number of time-domain samples in the CIR. 4 is the number of antenna pairs. 4 is the number of APs. Since each AP has only one antenna while each STA has 4 antennas, so there are 4 antenna pairs.
5. In the original code, the data with `bptt` x `batch_size` is embedded and becomes the size of `bptt` x `batch_size` x `embedding size` (e.g., 35 x 20 x 200). However, since for each point, we have the data of size (24, 4, 4). Then we might consider flattening it to 24 x 4 x 4 = 384. Then, in analogy, our data should have a similar structure with the dimension being `bptt` x `# of trajectory` x `data dimension` (e.g., 20 x 49 x 384).


NOTE: Initially, we do not consider standardizing or normalizing. If the performance is poor, we then consider doing the scaling. 


##### Load and batch data (NOTE: HERE THE BATCH IS NOT THE NORMAL BATCH AS WE LEARNED. IT IS MORE OF A SEGMENTATION.)
NOTE: batch data can enable parallel processing. However, the data in different batches will not be trained together. The model treats each batch independently. `The data in different batches will not be considered as the context information`. 

In the case of IPS, do we need batches？？
`Conclusion: the batch here is considered as the trajectory. Different trajectories have no connections!`

For example, suppose we have 10 trajectories and each trajectory has 8000 data samples. Suppose that for each trajectory we define 400 points; then each point corresponds to 8000/400 = 20 CSI samples. We can convert the regression problem into a classification problem, stated as follows. Given test data with 400 CSI samples, it can be considered as having passed 400/20 = 20 positions. The task is then to predict the next position, which corresponds to the task of predicting the next word in NLP. 400 positions means we have 400 words. Here, 400 words probably corresponds to ``bptt=35`` in this tutorial. 

I guess 400 words can be handled by the system without resorting to batching. However, the number of data points may be much larger than that. For example, if we have 10 trajectories to cover, then the number of data points will be 8000 * 10 = 80000 and the number of positions will be 400 * 10 = 4000. !!
NOTE: 突然意识到一个很不错的折中的想法，batch的个数我们就可以想象为是trajectory的个数！那么一个batch，即一个trajectory中，我们就有8000的数据点，以及定义的400个位置，每个位置相差20个CSI. 每个轨迹之间不需要通过transformer去学习关联。当然了，这个还是需要实验去进行验证，IPS的准确度和trajectory的长度的关系，假如我们让transformer每次只学1个轨迹，或者是一次学2个，3个等等时，准确度是否会增高，又或者是假如增高的话，训练难度是不是也是会异常增大！

We may consider using batches to help us with long sequences. It also makes perfect sense to batch a long sequence. Imagine that the CSI at some of the positions in a trajectory is affected by multi-path, while the neighboring CSI most likely has a good channel condition. Then the transformer may attend to those positions more. But since the CSI of locations that are very far apart will be totally different, the transformer probably will not attend much to the CSI of a very distant location. Hence, it is reasonable to divide the CSI of a longer trajectory into batches. Then the transformer can learn all the batches in parallel without considering the connections between batches.

In [None]:
# n_pts_once=20 # bptt
train_test_radio = 1
n_trajectories = 49
n_pts_trajectory = 79

# Reverse the axis order of the raw data, split the leading axis into
# n_pts_trajectory equal chunks and stack them along a new leading axis
# (the printed shape is expected to be [79 x 49 x 4 x 4 x 24]).
input_data_reshape = rearrange(input_data, 'a b c d -> d c b a')
data_traj = torch.chunk(input_data_reshape, n_pts_trajectory)  # tuple of chunks
data_stack = torch.stack(data_traj)
print(data_stack.shape)

# Same layout change for the test set.
input_data_reshape_test = rearrange(input_data_test, 'a b c d -> d c b a')
data_traj_test = torch.chunk(input_data_reshape_test, n_pts_trajectory)  # tuple of chunks
data_stack_test = torch.stack(data_traj_test)
print(data_stack_test.shape)

In [None]:
# One class ID per grid point: 0 .. n_trajectories * n_pts_trajectory - 1
# (0 .. 3870 for the 49 x 79 grid configured above).
labels_ips = torch.arange(n_trajectories * n_pts_trajectory)
# Arrange the IDs as [n_pts_trajectory, n_trajectories]: entry [p, t] holds
# t * n_pts_trajectory + p, i.e. point p of trajectory t.
labels_ips = labels_ips.reshape(n_trajectories, n_pts_trajectory).t().to(device)
labels_ips.shape

##### Split the training data and test data


In [None]:
# Keep the first `train_test_radio` fraction of the leading axis for training
# (with train_test_radio = 1 every step is used for training).
train_size = int(np.floor(len(data_stack)*train_test_radio))
train_data = data_stack[:train_size]
# test_data = data_stack[train_size:].to(device)
# The test set comes from the separately loaded test file instead of a split.
test_data = data_stack_test

print(f'the number of traing and test dataset is {len(train_data)} and {len(test_data)}')
print(f'the size of the traning data is {train_data.shape}')
# Flatten the last three axes into one feature axis, e.g.
# [79 x 49 x 4 x 4 x 24] -> [79 x 49 x 384], and move the data to `device`.
train_data = rearrange(train_data, 'a b c d e -> a b (c d e)').to(device)
test_data = rearrange(test_data, 'a b c d e -> a b (c d e)').to(device)
print(f'the size of the new (similar to embedded) traning data is {train_data.shape}')

In [None]:
# #NOTE Consider deleting this part; however, it's to worth considering reusing some of the codes.
# def batchify(raw_ips_data, n_pts_once) -> Tensor:
#     """ Convert raw IPS data with size being (24, 4, 4, 3871) into training dataset and test dataset. 
#     """
#     ## number of batches

#     print(data_stack.shape)
    
#     combine_dim = rearrange(raw_ips_data, 'a b c d -> a (b c d)')
#     print(combine_dim.shape)
#     # Divide the data into # of segments, which is the same as the number of trajectories (49).
#     data_traj = combine_dim.chunk(n_trajectories) #! -1 means the last data will only the the target value 
#     print(len(data_traj))
#     data_stack = torch.stack([item for item in data_traj]) # vocab(tokens) will return the corresponding indices of the tokens in the generated vocabulary

#     print(len(data_stack))
#     print(data_stack.shape)
    
#     data_traj = rearrange(data_traj, 'a b c -> b a c')
    
#     # data_traj is of size (79 x 49 x 384)

#     length = len(data_traj)

#     if np.remainder(length - 1, n_pts_once): #! -1 means the last data will only the the target value 
#         n_batch = length // n_pts_once + 1
#     else:
#         n_batch = length // n_pts_once
        
#     # prepare the data and target (using chunk to divide the data into n_batch as one tuple)
#     #  (the target should be the next 'bptt' locations)
#     data = combine_dim[:-1].chunk(n_batch) #! -1 means the last data will only the the target value 
#     target = combine_dim[1:].chunk(n_batch) #! 1 means the first data will only the data, not the target value    
    
#     print(f'the number of batches is {n_batch}')
#     print(f'the size of each batch is {n_pts_once}')    
#     print(f'the size of the last batch is',len(data[-1]))
    
#     return data, target

In [None]:
n_pts_once = 40  # NOTE: window length, i.e. how many consecutive points are fed to the model at once
def get_batch(source: Tensor, label: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """Cut one input window and its next-step targets out of a sequence.

    Args:
        source: Tensor whose first axis is the step axis, e.g.
            [num_pts, batch_size, ...] (batch_size = number of trajectories)
        label: Tensor whose first axis is the same step axis, e.g.
            [num_pts, batch_size]
        i: index of the first step of the window

    Returns:
        tuple (data, target): ``data`` is ``source[i:i + seq_len]`` and
        ``target`` is the flattened ``label[i + 1:i + 1 + seq_len]``, where
        ``seq_len`` is ``n_pts_once`` capped so that the shifted targets still
        fit inside the sequence.
    """
    steps_left = len(source) - 1 - i  # the last step can only serve as a target
    seq_len = min(n_pts_once, steps_left)
    stop = i + seq_len
    window = source[i:stop]
    # Targets are the labels shifted one step ahead; reshape(-1) flattens them
    # row by row so they line up with output.view(-1, ntokens).
    shifted = label[i + 1:stop + 1]
    return window, shifted.reshape(-1)

#### Functions to generate input and target sequence

``get_batch()`` generates a pair of input-target sequences for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the model needs the
following words as ``Target``. For example, with a ``bptt`` value of 2,
we’d get the following two Variables for ``i`` = 0:

It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.

Here in this example, the shape of the source (``train_data``) is [102499, 20]. Therefore, the ``full_seq_len`` is 102499, and the ``batch_size`` is 20.
Here 20 is the result of function batchify(). What it does is divide the whole 1D sequence into 20 batches and each batch has 102499 tokens. The division is carried out in a sequential order. The first 102499 of the whole 1D sequence will be the first batch, then from 102500 to 204999 will be the second batch and so on and so forth. Here, the token has already been numberized!

What get_batch() does here is to take a smaller chunk from ``train_data`` of shape [102499, 20]. For example, the first chunk will be the data from 0 to 34, along with the batch dimension of 20. The result will be of size [35, 20]. The second chunk will be from 35 to 69. The result will still be of size [35, 20]. So how many chunks are there? 102499 / 35 ≈ 2928 chunks. 

In summary, the chunk here has a different meaning from a batch, although the function is named get_batch... So far, the chunk size (bptt) gives the impression that it is equivalent to the length of one sentence as used in the context of transformers for NLP. Usually, the sequence length of a transformer could be around 1024. The reason why it is 35 here, I guess, is to keep the system small. The larger bptt is, obviously the larger the input matrix (the longer the first dimension). It turns out that if I set bptt to 1000, the GPU runs out of memory, which indicates that bptt is, in a sense, exactly the same as the number of words in a sentence.

这里在通过IPS进行思考的话，假如我们已经确定了先暂时让每个轨迹的CSI作为一个batch，那么每个轨迹CSI有8000个数据点，对应400个点，每个点之间相距20个CSI的距离，那么这里的get_batch就是指的将这400个数据点在进行切割，不去一次性计算这400个点的attention，相互的关系等，而是定义了一个量35，即每次只看其中的35个，计算35的self-attention. 得出35个数的结果之后，进行取loss，然后再去判定下一个35个，直到400个全部学习完了。当然这里的35我们也有待去探究，到底是什么数能有最优解，又或者是我们不需要这里的分成小块，而是将400直接放进去，让transformer去计算所有的400个position的CSI之间的attention，（论文）我们甚至可以说可以将不同参数(35,或者是别的数，或者不要chunk)的attenion给打印出来，用图像的形势看看具体的位置之间CSI的attention关系，将这图放入到论文当中去！

NOTE: 这里呢，有一个catch，那就是计算的时候不只是单纯的计算35个，而是35 x #batch个，即假如我们有10个轨迹的话，那么第一次学习的时候呢，就是先学习35 * 10，10个应该是同时进行的，相互之间没有关联。

#NOTE: ##! The data and target have the same shape. The only difference is that, for example, if the data is taken from index 0 to 35, then the target will be from 1 to 36, because the target should be the prediction that follows the previous words. The data also has a batch dimension, e.g. 20 in this tutorial. Every batch has 35 words, and the objective is the same for all the batches: to predict the next word. 
So (I think !!!) for the first iteration, the first word of all the 20 batches is put into the model.
Then the model should have 20 predictions for the next word, so our 20 targets are exactly the next 20 words. The loss will be calculated and then GD is used to update the weights. Then, for the second iteration, the output of the first word of the 20 batches will be the input to the model, and the model will have 20 predictions for the third word, so our 20 targets will be the next 20 true words. Then the loss ... The iteration keeps going until all 35 words are learned! I am not so sure whether the real process takes place just as I wrote above, one word at a time. Or, more likely, the 35 words are put into the model directly. I tend to believe the second scenario is true because that is exactly what a transformer does: it only focuses on the attention!

``NOTE!`` 这里的bptt绝对就是transformer论文中一个句子中的单词数的概念！！

Initiate an instance
--------------------




The model hyperparameters are defined below. The vocab size is
equal to the length of the vocab object.




IPS：我们还得考虑假如每个点对应20个CSI的话，那么我们还需不需要将这20长度的CSI进行embedding，map到高纬度去~！ 这个需要做试验试一试才能知道！

In [None]:
# Model hyperparameters.
ntokens = 3871  # number of output classes: one per grid point (there are 3871 points)
emsize = 384  # per-point feature size (plays the role of the embedding dimension)
d_hid = 1024  # dimension of the feedforward network model in nn.TransformerEncoder (NOTE: library default is 2048)
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4  # number of heads in nn.MultiheadAttention
dropout = 0.1  # dropout probability (0.2 was also tried)
# Build the model and move its parameters to the selected device.
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

Run the model
-------------




In [None]:
import copy
import time

# Cross-entropy over the ntokens position classes (expects raw logits).
criterion = nn.CrossEntropyLoss()
# Initial learning rate. It is passed to Adam below so that `lr` always matches
# the optimizer; the training loop later overwrites it with the scheduled value
# and embeds it in checkpoint filenames.
lr = 1e-4
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

# Multiply the learning rate by 0.95 once every 100 scheduler steps
# (the training loop calls scheduler.step() once per epoch).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.95)


In [None]:

def train(model: nn.Module) -> None:
    """Run one training pass over the module-level ``train_data``.

    The window start advances one step at a time, so consecutive windows
    overlap (a sliding window of at most ``n_pts_once`` steps). Every window
    triggers one optimizer update, with gradients clipped to a norm of 0.5.

    Uses the module-level ``train_data``, ``labels_ips``, ``criterion``,
    ``optimizer``, ``ntokens``, ``n_pts_once`` and ``device``.

    Args:
        model: the network to optimise; it is switched to train mode.
    """
    model.train()  # turn on train mode
    full_mask = generate_square_subsequent_mask(n_pts_once).to(device)

    # The last step has no successor to predict, hence the `- 1`.
    for i in range(train_data.size(0) - 1):
        data, targets = get_batch(train_data, labels_ips, i)
        seq_len = data.size(0)
        # Windows near the end of the sequence are shorter than n_pts_once.
        # Slice into a local so the full-size mask is never overwritten.
        src_mask = full_mask[:seq_len, :seq_len]
        output = model(data, src_mask)  # [seq_len, batch_size, ntokens]
        # Flatten to [seq_len * batch_size, ntokens] to match the flat targets.
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

def evaluate(model: nn.Module, eval_data: Tensor, labels: Tensor) -> float:
    """Compute the average next-step loss of ``model`` on ``eval_data``.

    The data is walked in non-overlapping windows of at most ``n_pts_once``
    steps. Each window's mean loss is weighted by its length and the total is
    divided by ``len(eval_data) - 1`` (the number of steps that have a target).

    Args:
        model: the network to evaluate; it is switched to eval mode.
        eval_data: input tensor whose first axis is the step axis.
        labels: class IDs aligned with ``eval_data`` along the step axis.

    Returns:
        The length-weighted average loss as a Python float.
    """
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    full_mask = generate_square_subsequent_mask(n_pts_once).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, n_pts_once):
            data, targets = get_batch(eval_data, labels, i)
            seq_len = data.size(0)
            # Slice into a local so the full-size mask is never overwritten.
            src_mask = full_mask[:seq_len, :seq_len]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [None]:
# print(torch.cuda.memory_allocated()/1024**2)
# print(torch.cuda.memory_cached()/1024**2)
# print(torch.cuda.memory_reserved())
# print(torch.cuda.memory_summary())


In [None]:
import glob
# A checkpoint is written only when the loss drops below this starting
# threshold (use float('inf') to accept the first epoch unconditionally).
best_val_loss = 1.8
epochs = 3000
best_model = None  # stays None until some epoch beats best_val_loss
PATH = 'model'  # directory that receives the checkpoints

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    # NOTE(review): the "validation" loss is computed on train_data itself, so it
    # measures fit to the training set rather than generalisation.
    val_loss = evaluate(model, train_data, labels_ips)  # val_data
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time

    # Report every 20th epoch (epochs 1, 21, 41, ...).
    if epoch % 20 == 1:
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)
        lr = scheduler.get_last_lr()[0]

        # Save the improved model; every improvement gets its own file.
        fileName = f'model_numLayer_{nlayers}_numHead_{nhead}_dropout_{dropout}_batchsize_{n_pts_once}_minLoss_{val_loss:.3f}_lr_{lr:.6f}_epoch_{epoch}.pth'
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(PATH, exist_ok=True)
        fullPath = os.path.join(PATH, fileName)
        torch.save(best_model.state_dict(), fullPath)

    scheduler.step()

In [None]:
# Write one more checkpoint after training has finished. The filename carries
# the loss, learning rate and epoch number of the LAST epoch.
fileName = f'model_numLayer_{nlayers}_numHead_{nhead}_dropout_{dropout}_batchsize_{n_pts_once}_minLoss_{val_loss:.3f}_lr_{lr:.6f}_epoch_{epoch}.pth'
PATH = 'model'
os.makedirs(PATH, exist_ok=True)
fullPath = os.path.join(PATH, fileName)
# best_model is still None when no epoch ever beat the initial best_val_loss
# threshold; fall back to the current model instead of failing on None.
# NOTE(review): when best_model exists, its weights belong to the best epoch
# while the filename describes the last epoch -- confirm which one is intended.
model_to_save = best_model if best_model is not None else model
torch.save(model_to_save.state_dict(), fullPath)

In [None]:
# Rebuild the architecture and load previously saved weights into it.
new_model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)
# os.path.join builds the path portably; the previous literal 'model\model_...'
# contained the invalid escape sequence '\m' and only worked on Windows.
checkpoint_path = os.path.join('model', 'model_numLayer_2_numHead_4_dropout_0.1_batchsize_40_minLoss_0.014_lr_0.000060.pth')
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): this filename has no '_epoch_<n>' suffix, unlike the names produced
# by the saving cells above -- confirm the file exists under this exact name.
new_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
new_model.eval() # to turn off the dropout layer ..

Evaluate the best model on the test dataset
-------------------------------------------




In [None]:
# test_loss = evaluate(best_model, test_data)
# test_ppl = math.exp(test_loss)
# print('=' * 89)
# print(f'| End of training | test loss {test_loss:5.2f} | '
#       f'test ppl {test_ppl:8.2f}')
# print('=' * 89)
def predict1(model: nn.Module, start_point: int, eval_data: Tensor) -> Tensor:
    """Score one window of ``eval_data`` beginning at ``start_point``.

    The window length is ``n_pts_once``, capped at
    ``len(eval_data) - 1 - start_point`` (the same rule ``get_batch`` uses).

    Args:
        model: the network to run; it is switched to eval mode.
        start_point: index of the first step of the window.
        eval_data: input tensor whose first axis is the step axis.

    Returns:
        Tensor of shape [window_len * batch_size, ntokens] holding the class
        scores for every step of the window.
    """
    model.eval()  # turn on evaluation mode
    seq_len = min(n_pts_once, len(eval_data) - 1 - start_point)
    data = eval_data[start_point:start_point + seq_len]
    steps = data.size(0)
    # Causal mask trimmed to the actual window length.
    src_mask = generate_square_subsequent_mask(n_pts_once).to(device)[:steps, :steps]
    with torch.no_grad():
        output = model(data, src_mask)
    return output.view(-1, ntokens)


#### Estimation Error Table !
1. First, I need to create a 49 x 79 table.
2. 49 is along the x axis, and 79 is along the y axis.
3. The spacing between each point is 0.1cm. 
4. When calculating the error, we just need to calculate the distance between the estimated position and the true position in the table, and multiply it by the spacing.

In [None]:
def calculate_error(table, trajectory_ind, start_point, data_length, predicted, unit_spacing=None):
    """Average distance between predicted and true grid positions.

    The true class IDs are ``table[trajectory_ind][start_point + 1 :
    start_point + data_length + 1]``, i.e. the points that follow each input
    step. Every ID is located in ``table`` and the Euclidean distance between
    the predicted and the true (row, column) cell is scaled by the spacing.

    Args:
        table: 2-D numpy array of unique class IDs.
        trajectory_ind: row of ``table`` holding the true trajectory.
        start_point: index of the first input step inside that trajectory.
        data_length: number of predictions requested.
        predicted: 1-D torch tensor of predicted class IDs.
        unit_spacing: distance between neighbouring grid cells; when omitted,
            the module-level ``spacing`` is used.

    Returns:
        The mean distance over the pairs that were actually compared, as
        produced by numpy arithmetic (a length-1 array when at least one pair
        was compared, ``0.0`` when none was).
    """
    step = spacing if unit_spacing is None else unit_spacing
    true_ids = table[trajectory_ind][start_point + 1:start_point + data_length + 1]
    predicted_ind = predicted.cpu().numpy()
    dis_sum = 0
    n_pairs = 0
    for pred_id, true_id in zip(predicted_ind, true_ids):
        pred_pos = np.where(table == pred_id)  # (row, col) of the predicted ID
        true_pos = np.where(table == true_id)  # (row, col) of the true ID
        dis = ((pred_pos[0] - true_pos[0])**2 + (pred_pos[1] - true_pos[1])**2)**0.5
        dis_sum = dis_sum + dis * step  # convert grid cells to real distance
        n_pairs += 1
    # The label slice is cut short near the end of a trajectory, so average over
    # the pairs that were really compared instead of the requested data_length.
    ave = dis_sum / max(n_pairs, 1)
    print(f'the averaged value over {n_pairs} data points is {ave} cm')

    return ave

#### Test Scenario 1: 
1. the test data is the same data as the training data.
2. the batch size is the same as the training data. (batch_size <==> n_pts_once)

`Result`: The result is 100% accurate if the same data is put into the model. 

In [None]:
# Scenario 1: predict along one trajectory, starting at its first point.
trajectory_ind = 0 #NOTE: the index of the trajectory from all 49 trajectories.
start_point = 0  # 0, n_pts_once, 2*n_pts_once... #NOTE: the start point in a given trajectory

# Swap the first two axes so that the trajectory axis comes first.
# NOTE(review): this cell reads test_data, while the scenario text says the
# training data is reused -- confirm which input is intended.
test_input = rearrange(test_data, 'a b c -> b a c') # train_data test_data
print(test_input.shape)
# Insert a batch axis of size 1 so the single trajectory matches the model input layout.
one_trajectory = rearrange(test_input[trajectory_ind], 'a b -> a 1 b') 
# print(one_trajectory.shape)

predicted = predict1(new_model, start_point, one_trajectory) # new_model best_model 
# test_ppl = math.exp(test_loss)
print('=' * 89)
# print(predicted.shape)
# The class with the highest score is the predicted location ID for each step.
pre_ind = torch.argmax(predicted,axis=1)
# print(pre_ind.reshape(n_pts_once,-1))
print(f'the predicted locations are {pre_ind.reshape(len(predicted),-1).squeeze()}')
# print(labels_ips.)

print('=' * 89)

In [None]:
# Compare the shapes of the training and test tensors.
print(train_data.shape)
print(test_data.shape)

#### Test Scenario 2: 
1. the test data is different from the training data. 
2. For example, the batch_size is 20, one of the 20 data, only the first one is valid, the remaining ones will be 0, just to conform with the batch_size during the training.

`Result`: The result is 100% accurate if the same data is put into the model. 

In [None]:
def predict2(model: nn.Module, start_point: int, data_length: int, eval_data: Tensor) -> Tensor:
    """Score ``data_length`` consecutive steps of ``eval_data`` from ``start_point``.

    Args:
        model: the network to run; it is switched to eval mode.
        start_point: index of the first step of the window.
        data_length: number of steps to feed; must not exceed ``n_pts_once``.
        eval_data: input tensor whose first axis is the step axis.

    Returns:
        Tensor of shape [window_len * batch_size, ntokens] holding the class
        scores for every step of the window (the window is shorter than
        ``data_length`` when it runs past the end of ``eval_data``).

    Raises:
        ValueError: if ``data_length`` is greater than ``n_pts_once``.
    """
    model.eval()  # turn on evaluation mode
    if data_length > n_pts_once:
        raise ValueError(f"the data_length should not be greater than the batch_size {n_pts_once}")
    data = eval_data[start_point:start_point + data_length]
    steps = data.size(0)
    # Causal mask trimmed to the actual window length.
    src_mask = generate_square_subsequent_mask(n_pts_once).to(device)[:steps, :steps]
    with torch.no_grad():
        output = model(data, src_mask)
    return output.view(-1, ntokens)

In [None]:
# Scenario 2: predict a window that starts in the middle of a trajectory.
trajectory_ind = 30 #NOTE: the index of the trajectory from all 49 trajectories.
start_point = 20 # 0, n_pts_once, 2*n_pts_once... #NOTE: the start point in a given trajectory
data_length = 40 # number of consecutive points fed to the model (at most n_pts_once)

# Swap the first two axes so that the trajectory axis comes first.
test_input = rearrange(test_data, 'a b c -> b a c') # train_data test_data
# print(test_input.shape)
# Insert a batch axis of size 1 so the single trajectory matches the model input layout.
one_trajectory = rearrange(test_input[trajectory_ind], 'a b -> a 1 b') 
# print(one_trajectory.shape)

predicted = predict2(new_model, start_point, data_length, one_trajectory)
# test_ppl = math.exp(test_loss)
print('=' * 89)
# print(predicted.shape)
# The class with the highest score is the predicted location ID for each step.
pre_ind = torch.argmax(predicted,axis=1)
# print(pre_ind.reshape(n_pts_once,-1))
print(f'the predicted locations are {pre_ind.reshape(len(predicted),-1).squeeze()}')
# print(labels_ips.)

print('=' * 89)

In [None]:
# Lookup table of class IDs: row = trajectory (49), column = point within the
# trajectory (79), so table[t][p] == t * 79 + p.
table = np.arange(0, 79 * 49)
table = table.reshape(49, 79)
# NOTE(review): the markdown above states a spacing of 0.1cm while this value
# is described as 10 cm -- confirm the intended unit.
spacing = 10 # 10 cm between each point

# Mean position error for the trajectory/window selected in the previous cell.
calculate_error(table, trajectory_ind, start_point, data_length, pre_ind)


#### Test Scenario 3: 
1. the test data is different from the training data. The test data is generated with three different SNR. 
2. For example, the batch_size is 20, one of the 20 data, only the first one is valid, the remaining ones will be 0, just to conform with the batch_size during the training.

`Result`: The result is 100% accurate if the same data is put into the model. 