# Basic tutorial: translation
#### Author: Matteo Caorsi

This short tutorial provides you with the basic functioning of *giotto-deep* API.

The example described in this tutorial is the one of translation.

The main steps of the tutorial are the following:
 1. creation of a dataset
 2. creation of a model
 3. define metrics and losses
 4. train the model
 5. extract some features of the network

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np

import torch
from torch import nn

from gdeep.models import FFNet

from gdeep.visualisation import  persistence_diagrams_of_activations

from torch.utils.tensorboard import SummaryWriter
from gdeep.data import TorchDataLoader
from gdeep.pipeline import Pipeline

from gtda.diagrams import BettiCurve

from gtda.plotting import plot_betti_surfaces

No TPUs...


# Initialize the tensorboard writer

In order to analyse the reuslts of your models, you need to start tensorboard.
On the terminal, move inside the `/example` folder. There run the following command:

```
tensorboard --logdir=runs
```

Then go [here](http://localhost:6006/) after the training to see all the visualisation results.

In [2]:
writer = SummaryWriter()

# Create your dataset

In [3]:
from torch.utils.data.sampler import SubsetRandomSampler

# the only part of the training set we are interested in
train_indices = list(range(32*10))

dl = TorchDataLoader(name="Multi30k", convert_to_map_dataset=True)
dl_tr_str, dl_ts_str = dl.build_dataloaders(sampler=SubsetRandomSampler(train_indices))


The dataset contains a list of pairs of sentences: the English sentecen and its German translation.

In [4]:
next(iter(dl_tr_str))

[('Ein Mann, der eine reflektierende Weste und einen Schutzhelm trägt, hält eine Flagge in die Straße.\n',),
 ('A man wearing a reflective vest and a hard hat holds a flag in the road\n',)]

## Required preprocessing

Neural networks cannot direcly deal with strings. We have first to preprocess the dataset in three main ways:
 1. Tokenise the string into its words
 2. Build a vocabulary out of these words
 3. Embed each word into a vector, so that each sentence becomes a list of vectors

The first two steps are performed by the `PreprocessTextTranslation`. The embedding will be added directly to the model.

In [5]:
from gdeep.data import PreprocessText, PreprocessTextTranslation

prec = PreprocessTextTranslation((dl_tr_str, dl_ts_str))

(dl_tr, dl_ts) = prec.build_dataloaders(batch_size=1)


## Define and train your model

In [6]:
from torch.nn import Transformer
from torch.optim import Adam, SparseAdam, SGD

# my simple transformer model
class TranslationTransformer(nn.Module):

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim):
        super(TranslationTransformer, self).__init__()
        self.transformer = Transformer(d_model=embed_dim,
                                       nhead=2,
                                       num_encoder_layers=1,
                                       num_decoder_layers=1,
                                       dim_feedforward=512,
                                       dropout=0.1)
        self.embedding_src = nn.Embedding(src_vocab_size, embed_dim, sparse=True)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, embed_dim, sparse=True)
        self.generator = nn.Linear(embed_dim, tgt_vocab_size)
        
    def forward(self, X):
        #print(X.shape)
        src = X[:,0,:]
        tgt = X[:,1,:]
        #print(src.shape, tgt.shape)
        src_emb = self.embedding_src(src)
        tgt_emb = self.embedding_tgt(tgt)
        #print(src_emb.shape, tgt_emb.shape)
        self.outs = self.transformer(src_emb, tgt_emb)
        #print(outs.shape)
        logits = self.generator(self.outs)
        #print(logits.shape)
        #out = torch.topk(logits, k=1, dim=2).indices.reshape(-1,44)
        #print(out, out.shape)
        return logits
    
    def encode(self, src, src_mask):
        """this method is used only at the inference step"""
        return self.transformer.encoder(
                            self.embedding_src(src), src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """this method is used only at the inference step"""
        return self.transformer.decoder(
                          self.embedding_tgt(tgt), memory,
                          tgt_mask)

In [7]:
vocab_size = 1500 # to be discussed

src_vocab_size = vocab_size # prec.vocabulary["."]  # len(prec.vocabulary)
tgt_vocab_size = vocab_size # prec.vocabulary_lab["."]  # len(prec.vocabulary_lab)
emb_size = 64

model = TranslationTransformer(src_vocab_size, tgt_vocab_size, emb_size)
print(model)

TranslationTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0): TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
          

## Define the loss function

This loss function is a adapted version of the Cross Entropy for the trnasformer architecture.

In [8]:

def loss_fn(logits, tgt_out):
    cel = nn.CrossEntropyLoss()
    return cel(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))


In [9]:
# prepare a pipeline class with the model, dataloaders loss_fn and tensorboard writer
pipe = Pipeline(model, (dl_tr, dl_ts), loss_fn, writer)

# train the model
pipe.train(SGD, 3, False, {"lr":0.01}, {"batch_size":16})

Epoch 1
-------------------------------
No TPUs
Batch training loss:  4.0121496468782425  	Batch training accuracy:  2710.9375  	[ 16 / 16 ]                             
Time taken for this epoch: 1.00s
No TPUs



Cannot store data in the PR curve



Validation results: 
 Accuracy: 3151.562500%,                 Avg loss: 2.284808 

Epoch 2
-------------------------------
No TPUs
Batch training loss:  2.1167310625314713  	Batch training accuracy:  3201.171875  	[ 16 / 16 ]                            
Time taken for this epoch: 1.00s
No TPUs
Validation results: 
 Accuracy: 3162.500000%,                 Avg loss: 2.084198 

Epoch 3
-------------------------------
No TPUs
Batch training loss:  1.9731857255101204  	Batch training accuracy:  3247.265625  	[ 16 / 16 ]                            
Time taken for this epoch: 1.00s
No TPUs
Validation results: 
 Accuracy: 3307.812500%,                 Avg loss: 1.992500 



(1.9924995303153992, 3307.8125)

## Translation!

In [10]:
de, en = next(iter(dl_tr_str))
print(de, "\n", en)

('Ein Mann in Sandalen und weißer Jacke sitzt auf einer grünen Bank uns spricht am Handy.\n',) 
 ('A man in sandals and white cardigan sits on a green bench while talking on his cellphone.\n',)



Get the vocabulary and numericize the German sentence

In [11]:
voc = prec.vocabulary_lab
sent = str.lower(de[0]).split()
de_sentence = list(map(voc.__getitem__,sent))
de_sentence

[236, 88, 139, 1, 104, 7, 6, 25, 111, 81, 7, 7, 1, 2, 14, 0]

In [12]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """function to generate output sequence using greedy algorithm"""
    memory = model.encode(src, src_mask)
    out = model.decode(src, memory, None)
    prob = model.generator(out)
    greedy_out = torch.max(prob, dim=2).indices
    return greedy_out
    

def translate(model: torch.nn.Module, dl_ts_item):
    """actual function to translate input sentence into target language"""
    model.eval()
    voc = prec.vocabulary
    src = dl_ts_item
    num_tokens = src.shape[1]
    src_mask = None
    tgt_tokens_raw = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=111).flatten()
    tgt_tokens = [token for token in tgt_tokens_raw if token < len(list(voc.vocab))]
    return " ".join(list(map(list(voc.vocab).__getitem__, tgt_tokens)))
    
# translation!
print("German sentence: ", de[0])
print("English translation: ", translate(pipe.model, torch.tensor([de_sentence])))

German sentence:  Ein Mann in Sandalen und weißer Jacke sitzt auf einer grünen Bank uns spricht am Handy.

English translation:  young packing jump tropical packing packing young trees made


# Extract inner data from your models

In [13]:
from gdeep.models import ModelExtractor

me = ModelExtractor(pipe.model, loss_fn)

lista = me.get_layers_param()

for k, item in lista.items():
    print(k,item.shape)


transformer.encoder.layers.0.self_attn.in_proj_weight torch.Size([192, 64])
transformer.encoder.layers.0.self_attn.in_proj_bias torch.Size([192])
transformer.encoder.layers.0.self_attn.out_proj.weight torch.Size([64, 64])
transformer.encoder.layers.0.self_attn.out_proj.bias torch.Size([64])
transformer.encoder.layers.0.linear1.weight torch.Size([512, 64])
transformer.encoder.layers.0.linear1.bias torch.Size([512])
transformer.encoder.layers.0.linear2.weight torch.Size([64, 512])
transformer.encoder.layers.0.linear2.bias torch.Size([64])
transformer.encoder.layers.0.norm1.weight torch.Size([64])
transformer.encoder.layers.0.norm1.bias torch.Size([64])
transformer.encoder.layers.0.norm2.weight torch.Size([64])
transformer.encoder.layers.0.norm2.bias torch.Size([64])
transformer.encoder.norm.weight torch.Size([64])
transformer.encoder.norm.bias torch.Size([64])
transformer.decoder.layers.0.self_attn.in_proj_weight torch.Size([192, 64])
transformer.decoder.layers.0.self_attn.in_proj_bias t

In [14]:
DEVICE = torch.device("cpu")
x = next(iter(dl_tr))[0]
pipe.model.eval()
pipe.model(x.to(DEVICE))

list_activations = me.get_activations(x)
len(list_activations)


30

In [15]:
x = next(iter(dl_tr))[0][0]
if x.dtype is not torch.int64:
    res = me.get_decision_boundary(x, n_epochs=1)
    res.shape

In [16]:
x, target = next(iter(dl_tr))
if x.dtype is torch.float:
    for gradient in me.get_gradients(x, target=target)[1]:
        print(gradient.shape)

# Visualise activations and other topological aspects of your model

In [17]:
from gdeep.visualisation import Visualiser

vs = Visualiser(pipe)

vs.plot_data_model()
#vs.plot_activations(x)
#vs.plot_persistence_diagrams(x)
