In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math,copy,time
import numpy as np
import string
import re
from torch.utils.data import DataLoader, Dataset
%matplotlib inline

#from google.colab import files
from sklearn.model_selection import train_test_split

# **Helper Class**

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# uploaded = files.upload()

In [3]:
class Lang:
    """
    Helper class for creating dictionaries for languages
    """
    def __init__(self,name):
        self.name = name
        self.word2index = {'SOS':0,'EOS':1,'PAD':2}
        self.index2word = {0:'SOS',1:'EOS',2:'PAD'}
        self.word2count = {}
        self.n_words = 3

    def add_sentence(self,sentence):
        for word in sentence.split(' '):
            self.add_word(word)

            
    def add_word(self,word):
        # check if the word has appeared before
        if word not in self.word2index:
            # word to index at index (n_words)
            self.word2index[word] = self.n_words
            
            # index (n_words) to word
            self.index2word[self.n_words] = word
            
            # word count is set to 1
            self.word2count[word] = 1
            
            # numbers of unique words increases by 1
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
def read_Lang(reverse=False):
    # open the file in the directory
    # This is the only class that is language specific
    
    # Reads chinese to english files, the first line is English and second line is Chinese
    f = open("cmn-eng.txt",encoding='utf-8')
    
    # read lines in the file
    lines = f.readlines()
    lines = [line.rstrip().split('\t')[0:2] for line in lines]
    
    # add all the lower case alphabetical letters into Chinese dictionary
    alph = string.ascii_lowercase
    alph = [s for s in alph]
    alph = " ".join(alph)

    # close the file
    f.close()
    
    if reverse:
        lines = [list(reversed(line)) for line in lines]
        
        # Input is Chinese
        input_ = [re.sub(r" ","",line[0]) for line in lines]
        input_ = [[word for word in line] for line in input_]
        input_ = [" ".join(line) for line in input_]
        input_ = [line.lower() for line in input_]
        input_.append(alph)
        
        # Output is English
        output_ = [re.sub(r"([.!?])",r" \1",line[1]) for line in lines]
        output_ =  [re.sub(r"[^a-zA-Z.!?]+", r" ", line) for line in output_]
        output_ = [s.lower() for s in output_]
        
        # Combine the pairs
        pairs = [[i,o] for i,o in zip(input_,output_)]
        input_class = Lang('cmn')
        output_class = Lang('eng')
    else:
        # input is English
        input_ = [re.sub(r"([.!?])",r" \1",line[0]) for line in lines]
        input_ =  [re.sub(r"[^a-zA-Z.!?]+", r" ", line) for line in input_]
        input_ = [s.lower() for s in input_]
        
        # output is Chinese
        output_ = [re.sub(r" ","",line[1]) for line in lines]
        output_ = [[word for word in line] for line in output_]
        output_ = [" ".join(line) for line in output_]
        output_ = [line.lower() for line in output_]
        output_.append(alph)
        
        # Combine the pairs
        pairs = [[i,o] for i,o in zip(input_,output_)]
        input_class = Lang('eng')
        output_class = Lang('cmn')
    
    
    for i in range(len(input_)):
        input_class.add_sentence(input_[i])

    for j in range(len(output_)):
        output_class.add_sentence(output_[j])
        
    return input_class,output_class,pairs

In [5]:
class language_loader(Dataset):
    def __init__(self,pairs,input_lang,output_lang,device):
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        length_in = [len(pair[0]) for pair in self.pairs]
        length_out = [len(pair[1]) for pair in self.pairs]

        self.in_max = max(length_in)
        self.out_max = max(length_out)
        
        self.device = device
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self,idx):
        pair = self.pairs[idx]
        
        input_s = pair[0].split(" ")
        input_s = input_s + ['EOS']
        input_length = torch.LongTensor([len(input_s)])
        
        output_s = pair[1].split(" ")
        output_s = ['SOS'] + output_s + ['EOS']
        output_length = torch.LongTensor([len(output_s)])
        
        src_pad_idx = self.input_lang.word2index['PAD']
        trg_pad_idx = self.input_lang.word2index['PAD']
        
        input_tensor = torch.ones(self.in_max)*src_pad_idx
        output_tensor = torch.ones(self.out_max)*trg_pad_idx
        
        for i in range(len(input_s)):
            word = input_s[i]
            input_tensor[i] = self.input_lang.word2index[word]
        
        for j in range(len(output_s)):
            word = output_s[j]
            output_tensor[j] = self.output_lang.word2index[word]

        return input_tensor.long().to(self.device),output_tensor.long().to(self.device),input_length.to(self.device),output_length.to(self.device)

In [6]:
in_,out_,p = read_Lang()
l = language_loader(p,in_,out_,device)

# **Encoder**

In [7]:
def clones(layer,N):
    """
    Function that produces N identical layers and stores them in 
    a modulelist 
    
    layer: The type of layer being passed in 
    N: the number of times that it is going to be repeated
    """
    return nn.ModuleList([layer for _ in range(N)])

**Attention Layer**

In [8]:
class Attention(nn.Module):
    def __init__(self,num_head,embed,device,drop=0.1):
        """
        num_head: number of heads of the attention layer
        embed: the embedded dimension
        """
        super().__init__()
        self.num_head = num_head
        self.embed = embed
        # the dimension of each head
        self.head_dim = int(self.embed / self.num_head)
        
        self.fc_q = nn.Linear(self.embed,self.embed)
        self.fc_k = nn.Linear(self.embed,self.embed)
        self.fc_v = nn.Linear(self.embed,self.embed)
        
        self.fc = nn.Linear(self.embed,self.embed)
        self.device = device
        self.dropout = nn.Dropout(drop)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(self.device)
        
    def forward(self,query,key,value,mask=None):
        """
        query: of shape (batch_size,query_len,embed)
        key: of shape (batch_size,key_len,embed)
        value: of shape (batch_size,value_len,embed)
        """
        src_len = query.shape[1]
        batch_size = query.shape[0]

        # Q is of shape (batch_size, query_len,embed)
        # K is of shape (batch_size, key_len,embed)
        # V is of shape (batch_size, value_len,embed)
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        
        # Q will have shape (batch_size,num_heads, query_len,head_dim)
        # K will have shape (batch_size,num_heads, key_len,head_dim)
        # V will have shape (batch_size,num_heads, value_len,head_dim)
        Q = Q.view(batch_size,-1,self.num_head,self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size,-1,self.num_head,self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size,-1,self.num_head,self.head_dim).permute(0, 2, 1, 3) 
        
        #score has shape (batch_size, num_heads,query_len,key_len)
        score = torch.matmul(Q,K.permute(0,1,3,2)) / self.scale
        
        
        # check if masked is applied
        if mask is not None:
            score = score.masked_fill(mask == 0,value = torch.tensor(-1e10))
        # attention is of shape (batch_size, num_heads,query_len,key_len)
        attention = F.softmax(score,dim = -1)
        
        # key_len has to be equal to query_len
        # output at each layer, x of shape (batch_size, num_heads,query_len,head_dim)
        x = torch.matmul(self.dropout(attention),V)
        
        # x is of shape (batch_size,query_len,num_heads,head_dim)
        x = x.permute(0,2,1,3)
        
        # x is of shape (batch_size,query_len,embed)
        x = x.reshape(batch_size,src_len,self.embed)
        
        # x is of shape (batch_size,query_len,embed)
        x = self.fc(x)

        return x,attention

In [9]:
a = Attention(2,20,'cpu')
x = torch.randn(10,2,20)
y = torch.randn(10,5,20)
# should give the same shape as x
p,q = a(x,y,y)
q.shape

torch.Size([10, 2, 2, 5])

**Feed forward neural network**

In [10]:
class ffnn(nn.Module):
    def __init__(self,embed,hid_dim,device,drop=0.1):
        super().__init__()
        self.embed = embed
        self.hid_dim = hid_dim
        self.device = device
        self.fc1 = nn.Linear(self.embed,self.hid_dim)
        self.fc2 = nn.Linear(self.hid_dim,self.embed)
        self.dropout = nn.Dropout(drop)
    def forward(self,x):
        """
        x : of shape (batch_size, src_len,embed)
        """
        output1 = self.fc1(x)
        output1 = output1.clamp(min=0)
        output1 = self.dropout(output1)
        
        output2 = self.fc2(output1)
        
        return output2

In [11]:
fn = ffnn(20,100,'cpu')
a = torch.randn(1,10,20)
fn(a).shape

torch.Size([1, 10, 20])

 **single encoder layer**

In [12]:
class EncoderLayer(nn.Module):
    def __init__(self,num_head,embed,hid_dim,device,drop=0.1):
        """
        num_head: number of heads in the multi-head self attention mechanism that we are using
        head_dim: the dimension of each head
        embed: the embedded size
        """
        super().__init__()
        self.num_head = num_head
        self.embed = embed
        self.hid_dim = hid_dim
        self.drop = drop
        self.device = device
        # each layer has two sub-layers. The first is a multi-head 
        # self attention mechanism, the second is a simple, position-
        # wise FC feed-forward network
        
        # attention network
        self.attn = Attention(self.num_head,self.embed,self.device,drop=self.drop)
        self.norm1 = nn.LayerNorm(self.embed)
        
        # feed forward neural network
        self.ff = ffnn(self.embed,self.hid_dim,self.device,drop=self.drop)
        self.norm2 = nn.LayerNorm(self.embed)
        
        self.dropout = nn.Dropout(drop)
    def forward(self,x,mask=None):
        # z1 of shape (batch_size,src_len,embed_dim)
        z1,attention = self.attn(x,x,x,mask)

        # sum and normalize, has shape (batch_size,src_len,embed_dim)
        output1 = self.norm1(z1+self.dropout(x))

        # feed into fully connected layer
        output2 = self.ff(output1)
        
        # sum and normalize, has shape (batch_size,src_len,embed_dim)
        output2 = self.norm2(output2+self.dropout(output1))
        
        return output2,attention

In [13]:
# input of shape num_samples,embed_dim
NUM_HEAD = 2
HID_DIM = 150
EMBED = 20
SRC_LEN = 10
E = EncoderLayer(NUM_HEAD,EMBED,HID_DIM,'cpu',drop=0.2)
_in = torch.randn(100,SRC_LEN,EMBED)
E(_in)[0].shape

torch.Size([100, 10, 20])

# **Encoder compiled**

In [14]:
class Encoder(nn.Module):
    def __init__(self,n_layers,input_size,num_head,embed,hid_dim,device,drop=0.1):
        """
        n_layers: number of layers to repeat
        input_size: the size of the dictionary being passed in
        num_head: number of heads present in each multi-head self attention mechanism
        head_dim: dimension of the each head 
        embed: embedded size of the dictionary
        """
        super().__init__()
        self.n_layers = n_layers
        self.input_size = input_size
        self.num_head = num_head
        self.embed = embed
        self.hid_dim = hid_dim
        self.max_size = 200
        self.device = device
        self.drop = drop

        # dictionary embedding and a positional embedding
        self.embeddings = nn.Embedding(self.input_size,self.embed)
        self.pos_embeddings = nn.Embedding(self.max_size,self.embed)
        
        self.layer = EncoderLayer(self.num_head,self.embed,self.hid_dim,self.device,self.drop)
        self.layers = clones(self.layer,self.n_layers)
        self.dropout = nn.Dropout(drop)
        self.scale = torch.sqrt(torch.FloatTensor([self.embed])).to(self.device)
        
    def forward(self,input_,mask=None):
        """
        input_: of shape (batch_size,src_len) to indicate the position of words in the dictionary
        """
        src_len = input_.shape[1]
        batch_size = input_.shape[0]
        
        pos_idx = torch.arange(src_len).unsqueeze(0).repeat(batch_size,1).to(self.device)
        pos_embed = self.pos_embeddings(pos_idx)

        x = self.embeddings(input_)
        x = self.dropout(x*self.scale+pos_embed)
        
        
        for layer in self.layers:
            x,attention = layer(x,mask)       
        
        return x,attention

In [15]:
# input of shape num_samples,embed_dim
N = 6
INPUT_SIZE = 200
NUM_HEAD = 10
HEAD_DIM = 30
EMBED = 20

x = torch.LongTensor([[0,2,3,4],[3,4,5,6]])
E = Encoder(N,INPUT_SIZE,NUM_HEAD,HEAD_DIM,EMBED,'cpu')
E(x)[0].shape

torch.Size([2, 4, 30])

# **Decoder**

In [16]:
class Decoder_layer(nn.Module):
    def __init__(self,num_head,embed,hid_dim,device,drop=0.1):
        super().__init__()
        self.num_head = num_head
        self.embed = embed
        self.hid_dim = hid_dim
        self.drop = drop
        self.device = device

        self.attention1 = Attention(self.num_head,self.embed,device,drop=self.drop)
        self.norm1 = nn.LayerNorm(self.embed)
        
        # This will also take into account the output from encoder
        self.attention2 = Attention(self.num_head,self.embed,device,drop=self.drop)
        self.norm2 = nn.LayerNorm(self.embed)
        
        self.ff = ffnn(self.embed,self.hid_dim,device,drop=self.drop)
        self.norm3 = nn.LayerNorm(self.embed)
        self.dropout = nn.Dropout(drop)
        
    def forward(self,trg,src,trg_mask=None,src_mask=None):
        """
        trg: of shape (batch_size,trg_len,embed)
        encoder_input: of shape (batch_size,src_len,embed) => Q,V
        """
        # First pass through the self attention layer of decoder
        # selfattn is of shape (batch_size,trg_len,embed)
        output1,_ = self.attention1(trg,trg,trg,trg_mask)
        
        # first layer normalization
        output1 = self.norm1(trg+self.dropout(output1))
        
        # encoder_decoder attention layer
        # attention is of shape (batch_size,num_head,trg_len,src_Len)
        output2,attention = self.attention2(output1,src,src,src_mask)
        
        # second layer normalization
        output2 = self.norm2(output1 + self.dropout(output2))
        
        # feed forward neural network
        ff = self.ff(output2)
        
        output3 = self.norm3(output2+ self.dropout(ff))
        
        return output3,attention

In [17]:
NUM_HEAD = 2
SRC_LEN = 5
TRG_LEN = 1
HID_DIM = 120
EMBED = 50
ENCODER_INPUT = torch.randn(15,SRC_LEN,EMBED)
x = torch.randn(15,TRG_LEN,EMBED)
d = Decoder_layer(NUM_HEAD,EMBED,HID_DIM,'cpu',drop=0.5)
d(x,ENCODER_INPUT)[1].shape

torch.Size([15, 2, 1, 5])

In [18]:
class Decoder(nn.Module):
    def __init__(self,n_layers,output_size,num_head,embed,hid_dim,device,drop=0.1):
        super().__init__()
        self.n_layers = n_layers
        self.num_head = num_head
        self.embed = embed
        self.hid_dim = hid_dim
        self.output_size = output_size
        self.device = device
        self.drop = drop

        layer = Decoder_layer(self.num_head,self.embed,self.hid_dim,device,drop=self.drop)
        self.layers = clones(layer,self.n_layers)
        self.max_size = 200
        
        self.embedding = nn.Embedding(self.output_size,self.embed)
        self.pos_embedding = nn.Embedding(self.max_size,self.embed)
        
        self.fc_out = nn.Linear(embed,output_size)
        self.dropout = nn.Dropout(drop)
        self.scale = torch.sqrt(torch.FloatTensor([self.embed])).to(self.device)
        
    def forward(self,input_,encoder_input,trg_mask=None,src_mask=None):
        """
        input_: (batch_size,trg_len)
        encoder_input: (batch_size,trg_len,embed)
        """
        batch_size = input_.shape[0]
        trg_len = input_.shape[1]

        x = self.embedding(input_)
        
        pos_idx = torch.arange(trg_len).unsqueeze(0).repeat(batch_size,1).to(self.device)
        pos_embedding = self.pos_embedding(pos_idx)

        x = self.dropout(x*self.scale + pos_embedding)
        
        for layer in self.layers:
            x,attention = layer(x,encoder_input,trg_mask,src_mask)
            
        # x is of shape (1,trg_len,output)
        x = self.fc_out(x)
        return x,attention

In [19]:
NUM_HEAD = 2
SRC_LEN = 5
TRG_LEN = 2
HID_DIM = 120
EMBED = 50
OUTPUT_SIZE = 100
N_LAYER = 5
ENCODER_INPUT = torch.randn(2,SRC_LEN,EMBED)
x = torch.LongTensor([[1,2],[2,3]])
de = Decoder(N_LAYER,OUTPUT_SIZE,NUM_HEAD,EMBED,HID_DIM,'cpu',drop=0.3)
de(x,ENCODER_INPUT)[0].shape

torch.Size([2, 2, 100])

# **Full Model**

In [20]:
class self_attn(nn.Module):
    def __init__(self,input_lang,output_lang,n_layers,num_head,embed,hid_dim,device,drop=0.1):
        super().__init__()
        self.input_lang = input_lang
        self.output_lang = output_lang
        
        self.num_head = num_head
        self.embed = embed
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.input_size = self.input_lang.n_words
        self.output_size = self.output_lang.n_words
        self.drop = drop
        self.device = device

        self.SRC_PAD_IDX = self.input_lang.word2index['PAD']
        self.TRG_PAD_IDX = self.output_lang.word2index['PAD']

        self.encoder = Encoder(self.n_layers,self.input_size,self.num_head,self.embed,self.hid_dim,self.device,drop=self.drop)
        self.decoder = Decoder(self.n_layers,self.output_size,self.num_head,self.embed,self.hid_dim,self.device,drop=self.drop)

        self.loss = nn.CrossEntropyLoss(ignore_index=self.TRG_PAD_IDX)

    def make_src_mask(self,input_t):
        # input_t = [batch_size, src_len]
        src_mask = (input_t != self.SRC_PAD_IDX).unsqueeze(1).unsqueeze(2)
        # src_mask = [batch_size,1,1,src_len]

        return src_mask

    def make_trg_mask(self,output_t):
         # output_t = [batch_size,trg len]
        trg_mask = (output_t !=  self.TRG_PAD_IDX).unsqueeze(1).unsqueeze(2)
        # trg_mask = [batch_size , 1, 1,trg len]

        trg_len = output_t.shape[1]

        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        trg_mask = trg_mask & trg_sub_mask
        return trg_mask
    
    def forward(self,data):
        # input_t = [batch_size, in_max_len]
        # output_t = [batch_size, out_max_len]
        input_t,output_t,input_len,output_len = data
        
        largest_in_len = torch.max(input_len)
        largest_out_len = torch.max(output_len)

        input_t = input_t[:,:largest_in_len]
        output_t = output_t[:,:largest_out_len]
        
        src_mask = self.make_src_mask(input_t)
        trg_mask = self.make_trg_mask(output_t[:,:-1])

        # src_output of shape (batch_size, src_len,embed)
        src_output,enc_attn = self.encoder(input_t,src_mask)
        
        # trg_output of shape (batch_size, trg_len,output_size)
        trg_output,dec_attn = self.decoder(output_t[:,:-1],src_output,trg_mask,src_mask)
        
        trg_output = trg_output.contiguous().view(-1,self.output_size)
        output_t = output_t[:,1:].contiguous().view(-1)

        loss = self.loss(trg_output,output_t)
        return loss,(enc_attn,dec_attn)

In [21]:
INPUT_LANG,OUTPUT_LANG, pairs= read_Lang()
from sklearn.model_selection import KFold

kf = KFold(n_splits=10,shuffle=True)
pairs = np.array(pairs)

# **Training**

In [22]:
def train(model,trainloader,optimizer):
    clip = 1
    epoch_loss = 0
    
    model.train()
    for data in trainloader:
        # zero out gradient descent
        optimizer.zero_grad()
        # forward pass

        # trg_output of shape (batch_size, trg_len,output_size)   
        loss,_ = model(data)            

        # call backward on loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
                
        # perform gradient descent
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss/len(trainloader)
    
    return avg_loss

# **Evaluation**

In [23]:
def evaluation(model,testloader):
    model.eval()
    with torch.no_grad():
        running_loss = 0
        for data in testloader:
            loss,_ = model(data)

            running_loss += loss.item()
                
            
        return running_loss/len(testloader)

# **Translation**

In [24]:
def translate(model,input_,device,maxIter=50):
    input_ = input_.lower()
    input_ = re.sub(r"([.!?])",r" \1",input_)
    input_ =  re.sub(r"[^a-zA-Z.!?]+", r" ",input_)
    
    input_s = input_.split(" ")
    input_s = input_s + ['EOS']    
    input_t = torch.zeros(1,len(input_s)).to(device)
    for i in range(len(input_s)):
        idx_i = model.input_lang.word2index[input_s[i]]
        input_t[:,i] = idx_i
    input_t = input_t.long()

    src_mask = model.make_src_mask(input_t)
    
    with torch.no_grad():
        model.eval()
        src_outputs,src_attention = model.encoder(input_t,src_mask)    
        
        # the first word passed into the decoder network is the SOS
        decoder_input = model.output_lang.word2index['SOS']
        decoder_input = [decoder_input]
            
        iter_ = 0
        attention = []
        word = []
        
        for i in range(maxIter):
            trg_tensor = torch.LongTensor(decoder_input).unsqueeze(0).to(device)
            
            trg_mask = model.make_trg_mask(trg_tensor)

            # trg_output = [batch_size, trg_len, output_size]
            trg_output,trg_att = model.decoder(trg_tensor,src_outputs,trg_mask,src_mask)
            
            attention.append(trg_att.flatten())
                
            # find the top scoring candidate
            top1 = torch.argmax(trg_output[0],dim=1)
            
            # find the current token corresponding to the top scoring candiate
            curr_token = model.output_lang.index2word[top1[-1].item()]

            
            if curr_token == 'EOS':
                break
            word.append(curr_token)

            # update decoder output to the top candidate
            decoder_input.append(top1[-1].item())
            
            iter_ +=1
        if model.output_lang.name == 'eng':
            word = " ".join(word)
        else:
            word = "".join(word)
    return word,attention

In [25]:
HIDDEN_SIZE = 512
EMBEDDED_SIZE = 256
N_LAYERS = 3
NUM_HEAD = 8
DROP_P = 0.1

In [173]:
EPOCH = 20
best_val_loss = float('inf')
num_val = 1

for train_idx, test_idx in kf.split(pairs):
  train_pairs,test_pairs = pairs[train_idx],pairs[test_idx]
  trainloader = language_loader(train_pairs,INPUT_LANG,OUTPUT_LANG,device)
  trainloader = DataLoader(trainloader,shuffle=True,batch_size=128)

  testloader = language_loader(test_pairs,INPUT_LANG,OUTPUT_LANG,device)
  testloader = DataLoader(testloader,batch_size=128)
  

  train_loss_vec  = []
  test_loss_vec = []

  model = self_attn(INPUT_LANG,OUTPUT_LANG,N_LAYERS,NUM_HEAD,EMBEDDED_SIZE,HIDDEN_SIZE,device,DROP_P)
  model.to(device)
  optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
  for e in range(EPOCH):
    start = time.time()

    train_loss = train(model,trainloader,optimizer)
    test_loss = evaluation(model,testloader)

    train_loss_vec.append(train_loss)
    test_loss_vec.append(test_loss)

    end = time.time()
    elapsed = end-start
    print("At validation {0:d}, iteration {1:d}, the train loss is {2:.3f} and test loss is {3:.3f},time it takes is {4:.3f}".format(num_val,e+1,train_loss,test_loss,elapsed))
  if test_loss < best_val_loss:
    best_val_loss = test_loss
    torch.save(model.state_dict(), "Transformer_{}.pt".format(num_val))

    best_train_loss = train_loss_vec
    best_test_loss = test_loss_vec
  num_val += 1

At validation 1, iteration 1, the train loss is 4.496 and test loss is 3.647,time it takes is 11.741
At validation 1, iteration 2, the train loss is 3.420 and test loss is 3.130,time it takes is 11.743
At validation 1, iteration 3, the train loss is 2.963 and test loss is 2.874,time it takes is 11.738
At validation 1, iteration 4, the train loss is 2.654 and test loss is 2.694,time it takes is 11.610
At validation 1, iteration 5, the train loss is 2.426 and test loss is 2.571,time it takes is 11.662
At validation 1, iteration 6, the train loss is 2.240 and test loss is 2.521,time it takes is 11.612
At validation 1, iteration 7, the train loss is 2.092 and test loss is 2.467,time it takes is 11.680
At validation 1, iteration 8, the train loss is 1.962 and test loss is 2.412,time it takes is 11.686
At validation 1, iteration 9, the train loss is 1.857 and test loss is 2.374,time it takes is 11.701
At validation 1, iteration 10, the train loss is 1.775 and test loss is 2.353,time it takes

In [26]:
model = self_attn(INPUT_LANG,OUTPUT_LANG,N_LAYERS,NUM_HEAD,EMBEDDED_SIZE,HIDDEN_SIZE,device,DROP_P).to(device)
model.load_state_dict(torch.load("Transformer_6.pt",map_location=torch.device('cpu')))

<All keys matched successfully>

# **BLEU evaluation**

In [0]:
from nltk.translate.bleu_score import sentence_bleu
def bleu_score(ref,can,n_gram=4):    
    if n_gram==4:    # 4-gram
        weights=(0.25,0.25,0.25,0.25)
    elif n_gram==3:  # 3-gram
        weights=(0.33,0.33,0.33,0)
    elif n_gram==2:  # 2-gram
        weights=(0.5,0.5,0,0)        
    elif n_gram==1:  # 1-gram
        weights=(1,0,0,0)    
    else:
        print("wrong n_gram")
        return 0
    return sentence_bleu(ref, can, weights)*100

def ref_transform(ref):    
    ref=re.sub(r"([，；:。？、！])",r" ",ref)
    ref=ref.split()

    n_blank=0
    for i in range(len(ref)):
        if ref[i]=='':
            n_blank=n_blank+1
    for i in range(n_blank):
        ref.remove('')
    return ref

def can_transform(result):
    result=re.sub(r"([，；:。？、！])",r" ",result)
    can=[]
    for i in range(len(result)):
        if result[i]!=' ':
            can.append(result[i])
    return can

In [178]:
ref_can_transformer=[]
for i in range(len(pairs)):
    ref=pairs[i][1]
    ref=ref_transform(ref)
    result=translate(model,pairs[i][0],device)[0]
    can=can_transform(result)
    ref_can_transformer.append([ref,can])
    if i%999==0:
      print(i)

0


KeyboardInterrupt: ignored