CUSTOMIZABLE TRANSFORMER-BASED MODEL

Input Embeddings
Positional Encodings
Layer Normalization
Feed Forward
Multi-Head Attention
Residual Connection
Encoder
Decoder
Linear Layer
Transformer
Task overview
Tokenizer
Dataset
Training loop
Validation loop
Attention visualization

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import pandas as pd


In [3]:
ENlines = open("English100.csv").read().splitlines()

In [4]:
DElines = open("Deutsch100.csv").read().splitlines()

In [5]:
DF_EN=pd.DataFrame(ENlines)

INPUT EMBEDDING: process of representing words and sentences as tensors suitable as inputs to the model

In [6]:
DF_DE= pd.DataFrame(DElines)

In [7]:
import torchtext

In [8]:
from torchtext.data import get_tokenizer

In [9]:
tokenizer= get_tokenizer("basic_english")

In [10]:
tokensmatrix= [tokenizer(''.join(x)) for x in DF_EN[0]]

In [11]:
tokensmatrixDE = [tokenizer(''.join(x)) for x in DF_DE[0]]

In [12]:
tokensmatrixDF= pd.DataFrame(tokensmatrix)

In [13]:
tokensmatrixDEDF= pd.DataFrame(tokensmatrixDE)

In [13]:
tokensmatrixDEDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,iron,cement,ist,eine,gebrauchs-fertige,paste,",",die,mit,einem,...,,,,,,,,,,
1,nach,der,aushärtung,schützt,iron,cement,die,kokille,gegen,den,...,,,,,,,,,,
2,feuerfester,reparaturkitt,für,feuerungsanlagen,",",öfen,",",offene,feuerstellen,etc,...,,,,,,,,,,
3,der,bau,und,die,reparatur,der,autostraßen,.,.,.,...,,,,,,,,,,
4,die,mitteilungen,sollen,den,geschäftlichen,kommerziellen,charakter,tragen,.,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,die,im,jahre,1990,gegründete,kanzlei,haidegger,&amp,partner,bietet,...,,,,,,,,,,
96,budapest,ist,nicht,nur,das,politische,",",wirtschaftliche,",",industrielle,...,,,,,,,,,,
97,diese,statistik,basiert,auf,den,teilnehmenden,e-commerceseiten,6819,(,e-shops,...,,,,,,,,,,
98,nur,sponsormarken,sind,in,der,kostenfreien,open,icecat,content,verteilung,...,,,,,,,,,,


In [14]:
tokensmatrixflat= []
for i in range(len(tokensmatrix)):
    tokensmatrixflat+=tokensmatrix[i]

In [15]:
len(tokensmatrixflat)

2578

In [16]:
tokensmatrixflatDE= []
for i in range(len(tokensmatrixDE)):
    tokensmatrixflatDE+=tokensmatrixDE[i]

len(tokensmatrixflatDE)

2533

In [17]:
vocabulary= dict(zip(tokensmatrixflat, range(len(tokensmatrixflat))))

In [18]:
vocabularyDE= dict(zip(tokensmatrixflatDE, range(len(tokensmatrixflatDE))))

In [19]:
em= [[vocabulary[x] for x in y] for y in tokensmatrix]

In [20]:
emDE= [[vocabularyDE[x] for x in y] for y in tokensmatrixDE]

In [21]:
emDF= pd.DataFrame(em)

In [22]:
emDEDF = pd.DataFrame(emDE)

In [23]:
emDF.shape

(100, 57)

In [24]:
emDF= emDF.fillna(0)

In [25]:
emDEDF= emDEDF.fillna(0)

In [26]:
inputTensor = torch.tensor(emDF.values,dtype=int)

In [27]:
desiredOutputTensor = torch.tensor(emDEDF.values,dtype=int)

In [30]:
print(inputTensor.shape, desiredOutputTensor.shape)

torch.Size([100, 63]) torch.Size([100, 63])


In [29]:
import torch.nn.functional as F

diff= inputTensor.shape[1]-desiredOutputTensor.shape[1]
if diff >0:
    desiredOutputTensor = F.pad(input=desiredOutputTensor, pad=(0,diff), mode='constant', value=0)
else:
    if diff<0:
        inputTensor = F.pad(input=inputTensor, pad=(0,-diff), mode='constant', value=0)




splitting tensors into train and validation sets

In [31]:
train_set_input,val_set_input= torch.split(inputTensor,5*inputTensor.shape[0]//10)

In [32]:
train_set_output,val_set_output= torch.split(desiredOutputTensor,5*desiredOutputTensor.shape[0]//10)

In [33]:
from torch.nn import Transformer
from torch import Tensor


In [34]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model: int, dropout: float, max_len: int):
        super().__init__()
        self.dropout= nn.Dropout(p= dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [35]:
class MyTransformer(nn.Module):
    def __init__(self, dim_input:int,d_model: int, num_heads: int, num_encod_layers: int,
                 num_decod_layers: int,dim_ffn: int, dropout: float ):
        super().__init__()
        self.embeddingLayer = nn.Embedding(dim_input, d_model)
        self.pos_enc_layer = PositionalEncoder(d_model, dropout,max_len=5000)
        self.transformer = nn.Transformer(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True)
        self.feedforward = nn.Linear(d_model,dim_input)
    
    def forward(self, input, output, output_mask):
        myInput= self.embeddingLayer(input)
        myOutput= self.embeddingLayer(output)

        myInput= self.pos_enc_layer(myInput)
        myOutput= self.pos_enc_layer(myOutput)

        trOutput = self.transformer(src= myInput, tgt= myOutput, tgt_mask= output_mask )
        ffOutput  = self.feedforward(trOutput)
        return ffOutput



In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

THIS SECTION IS FOR SETTING THE MODEL PARAMETERS

In [37]:
TK=  len(tokensmatrixflat)+1 # NUMBER OF TOKENS - vocabulary size
NH=   2    # NUMBER OF HEADS OF THE MULTIHEAD LAYERS OF THE TRANSFORMER
DM=   TK*NH  # MODEL DIMENSION
NEL=  1    # NUMBER OF ENCODER LAYERS IN THE TRANSFORMER
NDL=  1    # NUMBER OF DECODER LAYERS IN THE TRANSFORMER
DFF=  512  # DIMENSION OF THE FEED FORWARD LAYER
DROP = 0.1  # DROPOUT PROBABILITY



In [38]:
TK

2579

In [39]:
device

device(type='cpu')

In [53]:
My_T_Model = MyTransformer(dim_input=TK, d_model=DM, num_heads=NH, num_encod_layers= NEL, num_decod_layers=NDL, dim_ffn=DFF, dropout=DROP).to(device)

In [54]:
learning_rate= 0.001
optimizer= torch.optim.Adam(My_T_Model.parameters(), lr= learning_rate)

In [67]:
masksize = train_set_output.shape[1]
masksize

63

In [68]:
torch.triu(torch.ones(masksize,masksize))

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [0., 1., 1.,  ..., 1., 1., 1.],
        [0., 0., 1.,  ..., 1., 1., 1.],
        ...,
        [0., 0., 0.,  ..., 1., 1., 1.],
        [0., 0., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.]])

In [74]:
def train(model, optimizer, loss_fn, train_src_data, train_target_data, val_src_data,val_target_data, epochs=20, device="cpu"):
    for epoch in range(epochs):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()
        for batch in train_src_data:
            optimizer.zero_grad()
            inputs, targets = train_src_data, train_target_data
            inputs = inputs.to(device)
            targets = targets.to(device)

            shifted_target = torch.roll(targets,-1,1)
            output_mask = torch.triu(torch.ones(targets.shape[1],targets.shape[1]))
            
            output = model(inputs,shifted_target,output_mask)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item()
        training_loss /= len(train_src_data)
        model.eval()
        num_correct = 0
        num_examples = 0
        for batch in val_src_data:
            inputs, targets = val_src_data, val_target_data
            inputs = inputs.to(device)

            shifted_target = torch.roll(targets,-1,1)
            output_mask = torch.triu(torch.ones(targets.shape[1],targets.shape[1]))
            output = model(inputs, shifted_target,output_mask)
            targets = targets.to(device)
            loss = loss_fn(output,targets)
            valid_loss += loss.data.item()
            correct = torch.eq(output,targets)
            num_correct += torch.sum(correct).item()
            num_examples += correct.shape[0]            
        valid_loss /= len(val_src_data)
        print('Epoch: {} - Training Loss: {:.2f} - Validation Loss: {:.2f} - accuracy : {:.2f}'.format(epoch, training_loss,valid_loss, num_correct / num_examples))

In [58]:
train_set_input

tensor([[  32,   50, 2503,  ...,    0,    0,    0],
        [  32,   50,   34,  ...,    0,    0,    0],
        [2460,   52,   48,  ...,    0,    0,    0],
        ...,
        [2570, 1136, 1137,  ...,    0,    0,    0],
        [2570, 1157, 2503,  ...,    0,    0,    0],
        [2243, 2570, 1275,  ...,    0,    0,    0]])

In [70]:
print(train_set_input.shape) 
print(train_set_output.shape) 
print(val_set_input.shape) 
print(val_set_output.shape) 

torch.Size([50, 63])
torch.Size([50, 63])
torch.Size([50, 63])
torch.Size([50, 63])


In [60]:
inputs, targets = train_set_input, train_set_output

In [61]:
inputs = inputs.to(device)
targets = targets.to(device)

In [62]:
shifted_target = torch.roll(targets,-1,1)
shifted_target.shape

torch.Size([50, 63])

In [71]:
output_mask = torch.triu(torch.ones(masksize,masksize))

In [72]:
output_mask.shape

torch.Size([63, 63])

In [75]:
train(My_T_Model,optimizer=optimizer,loss_fn= nn.MSELoss(), train_src_data=train_set_input, train_target_data= train_set_input,val_src_data= val_set_input, val_target_data=val_set_input,device=device)

: 

In [314]:
train(My_T_Model,optimizer=optimizer,loss_fn= nn.MSELoss(), train_src_data=train_set_input, train_target_data= train_set_output,val_src_data= val_set_input, val_target_data=val_set_output,device=device)

IndexError: index out of range in self

Using the Trained Model

In [None]:
src = input
tgt_in = torch.tensor([[1]], dtype=torch.int64).to(device)
t_mask = nn.Transformer.generate_square_subsequent_mask(1)

with torch.no_grad():
  preds = My_T_Model(src, tgt_in, tgt_mask=t_mask)
