In [1]:
import sys
import numpy as np
import torch
import math
import pickle
import time

from nltk.translate.bleu_score import corpus_bleu

In [2]:
#params
sourceFileName = '/content/drive/MyDrive/TII_PROJECT/en_bg_data/train.bg'
targetFileName = '/content/drive/MyDrive/TII_PROJECT/en_bg_data/train.en'
sourceDevFileName = '/content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.bg'
targetDevFileName = '/content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.en'

corpusDataFileName = '/content/drive/MyDrive/TII_PROJECT/corpusData'
wordsDataFileName = '/content/drive/MyDrive/TII_PROJECT/wordsData'
modelFileName = '/content/drive/MyDrive/TII_PROJECT/NMTmodel'

device = torch.device("cuda:0")
# device = torch.device("cpu")


# Nx = 6
# n_head = 8
# d_model = 256
# dropout = 0.3

# # params for nmtmodel
#began training with 0.3 dropout, 0.001 lr -> 0.2 dropout, 0.0005 lr -> 0.1 dropout, 0.0003 lr // each step trained for 10 epochs
Nx = 4
n_head = 4
d_model = 128
dropout = 0.3

# Nx = 3
# n_head = 8
# d_model = 256
# dropout = 0.3

learning_rate = 0.001
clip_grad = 5.0
learning_rate_decay = 0.5

batchSize = 32

maxEpochs = 10
log_every = 10
test_every = 2000

max_patience = 5
max_trials = 5


In [3]:
#############################################################################
### Търсене и извличане на информация. Приложение на дълбоко машинно обучение
### Стоян Михов
### Зимен семестър 2022/2023
##########################################################################
###
### Невронен машинен превод
###
#############################################################################

import sys
import random
import nltk
from nltk.translate.bleu_score import corpus_bleu
nltk.download('punkt')

class progressBar:
    def __init__(self ,barWidth = 50):
        self.barWidth = barWidth
        self.period = None
    def start(self, count):
        self.item=0
        self.period = int(count / self.barWidth)
        sys.stdout.write("["+(" " * self.barWidth)+"]")
        sys.stdout.flush()
        sys.stdout.write("\b" * (self.barWidth+1))
    def tick(self):
        if self.item>0 and self.item % self.period == 0:
            sys.stdout.write("-")
            sys.stdout.flush()
        self.item += 1
    def stop(self):
        sys.stdout.write("]\n")

def readCorpus(fileName):
    ### Чете файл от изречения разделени с нов ред `\n`.
    ### fileName е името на файла, съдържащ корпуса
    ### връща списък от изречения, като всяко изречение е списък от думи
    print('Loading file:',fileName)
    return [ nltk.word_tokenize(line) for line in open(fileName,encoding="utf-8") ]

def getDictionary(corpus, startToken, endToken, unkToken, padToken, wordCountThreshold = 2):
    dictionary={}
    for s in corpus:
        for w in s:
            if w in dictionary: dictionary[w] += 1
            else: dictionary[w]=1

    words = [startToken, endToken, unkToken, padToken] + [w for w in sorted(dictionary) if dictionary[w] > wordCountThreshold]
    return { w:i for i,w in enumerate(words)}


def prepareData(sourceFileName, targetFileName, sourceDevFileName, targetDevFileName, startToken, endToken, unkToken, padToken):

    sourceCorpus = readCorpus(sourceFileName)
    targetCorpus = readCorpus(targetFileName)
    sourceWord2ind = getDictionary(sourceCorpus, startToken, endToken, unkToken, padToken)
    targetWord2ind = getDictionary(targetCorpus, startToken, endToken, unkToken, padToken)

    targetCorpus = [ [startToken] + s + [endToken] for s in targetCorpus]

    sourceDev = readCorpus(sourceDevFileName)
    targetDev = readCorpus(targetDevFileName)

    targetDev = [ [startToken] + s + [endToken] for s in targetDev]

    print('Corpus loading completed.')
    return sourceCorpus, sourceWord2ind, targetCorpus, targetWord2ind, sourceDev, targetDev



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
class MultiHeadAttentionLayer(torch.nn.Module):
    def __init__(self, hidden_size, heads, dropout, device):
        super().__init__()
                
        self.hidden_size = hidden_size
        self.heads = heads
        self.head_size = hidden_size // heads
        
        self.fc_q = torch.nn.Linear(hidden_size, hidden_size, bias=False)
        self.fc_k = torch.nn.Linear(hidden_size, hidden_size, bias=False)
        self.fc_v = torch.nn.Linear(hidden_size, hidden_size, bias=False)
        
        self.fc_out = torch.nn.Linear(hidden_size, hidden_size)
        
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_size])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        # query = [batch size, query len, hid dim]
        # key = [batch size, key len, hid dim]
        # value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)  # [batch size, query len, hid dim]
        K = self.fc_k(key)    # [batch size, key len, hid dim]  
        V = self.fc_v(value)  # [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.heads, self.head_size).permute(0, 2, 1, 3)  # [batch size, n heads, query len, head dim]
        K = K.view(batch_size, -1, self.heads, self.head_size).permute(0, 2, 3, 1)  # [batch size, n heads, head dim, key len]  
        V = V.view(batch_size, -1, self.heads, self.head_size).permute(0, 2, 1, 3)  # [batch size, n heads, value len, head dim]
                
        attn_score = torch.matmul(Q, K) / self.scale  # [batch size, n heads, query len, key len]
        
        if mask is not None:
            attn_score = attn_score.masked_fill(mask == 0, -float('inf'))
        
        attention = torch.nn.functional.softmax(attn_score, dim = 3)  # [batch size, n heads, query len, key len]
                
        x = torch.matmul(attention, V)  # [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).flatten(2,3)  # [batch size, query len, n heads, head dim] -> [batch size, query len * n heads, head dim]

                
        x = self.fc_out(x)  # [batch size, query len, hid dim]
        
        return x


In [5]:
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size, layers, heads, pf_size, dropout, device, max_length = 1000):
        super().__init__()

        self.device = device
        
        self.word_embedding = torch.nn.Embedding(input_size, hidden_size)
        self.pos_embedding = torch.nn.Embedding(max_length, hidden_size)
        
        self.layers = torch.nn.ModuleList([EncoderLayer(hidden_size, heads, pf_size, dropout, device) for _ in range(layers)])
        
        self.dropout = torch.nn.Dropout(dropout)
        
        
    def forward(self, src, src_mask):
        
        # src = [batch size, src len]
        # src_mask = [batch size, 1, 1, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)   # [batch size, src len]
        
        src = self.dropout((self.word_embedding(src)) + self.pos_embedding(pos))  # [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask) # [batch size, src len, hid dim]
            
        return src

class EncoderLayer(torch.nn.Module):
    def __init__(self, hidden_size, heads, pf_size, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm = torch.nn.LayerNorm(hidden_size)
        self.ff_layer_norm = torch.nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttentionLayer(hidden_size, heads, dropout, device)
        self.positionwise_feedforward = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, pf_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(pf_size, hidden_size), 
        )
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        # src = [batch size, src len, hid dim]
        # src_mask = [batch size, 1, 1, src len] 
                
        _src = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))  # [batch size, src len, hid dim]

        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))  # [batch size, src len, hid dim]
        
        return src



In [6]:
class Decoder(torch.nn.Module):
    def __init__(self, output_size, hidden_size, layers, heads, pf_size, dropout, device, max_length = 1000):
        super().__init__()
        
        self.device = device
        
        self.word_embedding = torch.nn.Embedding(output_size, hidden_size)
        self.pos_embedding = torch.nn.Embedding(max_length, hidden_size)
        
        self.layers = torch.nn.ModuleList([DecoderLayer(hidden_size, heads, pf_size, dropout, device) for _ in range(layers)])
        
        self.fc_out = torch.nn.Linear(hidden_size, output_size)
        
        self.dropout = torch.nn.Dropout(dropout)
        
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        # trg = [batch size, trg len]
        # enc_src = [batch size, src len, hid dim]
        # trg_mask = [batch size, 1, trg len, trg len]
        # src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)  # [batch size, trg len] 
            
        trg = self.dropout((self.word_embedding(trg)) + self.pos_embedding(pos))  # [batch size, trg len, hid dim] 
        
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask)  # [batch size, trg len, hid dim]
        
        output = self.fc_out(trg)  # [batch size, trg len, output dim] 
            
        return output

class DecoderLayer(torch.nn.Module):
    def __init__(self, hidden_size, heads, pf_size, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm = torch.nn.LayerNorm(hidden_size)
        self.enc_attn_layer_norm = torch.nn.LayerNorm(hidden_size)
        self.ff_layer_norm = torch.nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttentionLayer(hidden_size, heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hidden_size, heads, dropout, device)
        self.positionwise_feedforward = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, pf_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(pf_size, hidden_size), 
        )
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        # trg = [batch size, trg len, hid dim]
        # enc_src = [batch size, src len, hid dim]
        # trg_mask = [batch size, 1, trg len, trg len]
        # src_mask = [batch size, 1, 1, src len]
        
        _trg = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))  # [batch size, trg len, hid dim]

        _trg = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))  # [batch size, trg len, hid dim]
        
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))  # [batch size, trg len, hid dim]
        
        
        return trg

In [7]:
class NMTmodel(torch.nn.Module):
    def preparePaddedBatch(self, source, word2ind):
        m = max(len(s) for s in source)
        sents = [[word2ind.get(w,self.unkTokenIdx) for w in s] for s in source]
        sents_padded = [ s+(m-len(s))*[self.padTokenIdx] for s in sents]
        return torch.tensor(sents_padded, dtype=torch.long, device=self.device)
    
    def save(self,fileName):
        torch.save(self.state_dict(), fileName)
    
    def load(self,fileName,device):
        self.load_state_dict(torch.load(fileName, map_location = device))

    def make_src_mask(self, src):
        return (src != self.padTokenIdx).unsqueeze(1).unsqueeze(2)   # [batch size, 1, 1, src len]
    
    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.padTokenIdx).unsqueeze(1).unsqueeze(2)  # [batch size, 1, 1, trg len]
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()  # [trg len, trg len]
        return trg_pad_mask & trg_sub_mask  # [batch size, 1, trg len, trg len]

    def get_topk(self, k, candidates, weights, alpha):
        lenghts = torch.count_nonzero(weights, dim = 1)
        norm_sum = torch.sum(weights, dim = 1) / torch.pow(lenghts, alpha)
        topk = norm_sum.topk(k).indices
        return candidates[topk], weights[topk]
    
    def __init__(self, d_model,n_head,Nx,dropout, device, sourceWord2ind, targetWord2ind, startToken, unkToken, padToken, endToken):
        super(NMTmodel, self).__init__()
        self.device = device
        self.sourceWord2ind = sourceWord2ind
        self.targetWord2ind = targetWord2ind
        self.startTokenIdx = sourceWord2ind[startToken]
        self.unkTokenIdx = sourceWord2ind[unkToken]
        self.padTokenIdx = sourceWord2ind[padToken]
        self.endTokenIdx = sourceWord2ind[endToken]
        self.encoder = Encoder(len(sourceWord2ind),d_model,Nx,n_head,d_model*2,dropout,device)
        self.decoder = Decoder(len(targetWord2ind),d_model,Nx,n_head,d_model*2,dropout,device)

    def forward(self, src, trg):
        src_padded = self.preparePaddedBatch(src, self.sourceWord2ind)    # [batch size, src len]
        trg_padded = self.preparePaddedBatch(trg, self.targetWord2ind)    # [batch size, trg len]
        
        src_mask = self.make_src_mask(src_padded)               # [batch size, 1, 1, src len]      
        trg_mask = self.make_trg_mask(trg_padded[:, :-1])       # [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src_padded, src_mask)                                           # [batch size, src len, hid dim]
        output = self.decoder(trg_padded[:, :-1], enc_src, trg_mask, src_mask)      # [batch size, trg len, output dim]

        output_dim = output.shape[-1]
            
        output = output.flatten(0,1)
        trg_padded = trg_padded[:,1:].flatten(0,1)

        H = torch.nn.functional.cross_entropy(output, trg_padded, ignore_index=self.padTokenIdx)

        return H

    def translateSentence(self, sentence, limit=1000):
        ind2word = dict(enumerate(self.targetWord2ind))

        tokens = [self.sourceWord2ind[w] if w in self.sourceWord2ind.keys() else self.unkTokenIdx for w in sentence]
        src = torch.tensor(tokens, dtype=torch.long, device=self.device).unsqueeze(0)
        src_mask = self.make_src_mask(src)
        result = [self.startTokenIdx]

        with torch.no_grad():
            encoder_outputs = self.encoder(src, src_mask)

            for i in range(limit):
                trg = torch.tensor(result, dtype=torch.long, device=self.device).unsqueeze(0)

                trg_mask = self.make_trg_mask(trg)

                output = self.decoder(trg, encoder_outputs, trg_mask, src_mask)
                output = output[:, -1, :].squeeze()

                output = torch.nn.functional.softmax(output,dim=0)
                
                
                topk = output.topk(2).indices.tolist()

                pred_token = topk[0] if topk[0] != self.unkTokenIdx else topk[1]
                result.append(pred_token)

                if pred_token == self.endTokenIdx:
                    break

        return [ind2word[i] for i in result[1:] if i != self.endTokenIdx]

    


In [8]:
def perplexity(nmt, sourceTest, targetTest, batchSize):
    testSize = len(sourceTest)
    H = 0.
    c = 0
    for b in range(0,testSize,batchSize):
        sourceBatch = sourceTest[b:min(b+batchSize, testSize)]
        targetBatch = targetTest[b:min(b+batchSize, testSize)]
        l = sum(len(s)-1 for s in targetBatch)
        c += l
        with torch.no_grad():
            H += l * nmt(sourceBatch,targetBatch)
    return math.exp(H/c)


In [71]:
(sourceCorpus,targetCorpus,sourceDev,targetDev) = pickle.load(open(corpusDataFileName, 'rb'))
(sourceWord2ind,targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))
startToken = '<S>'
endToken = '</S>'
unkToken = '<UNK>'
padToken = '<PAD>'
modelFileName = '/content/drive/MyDrive/TII_PROJECT/NMTmodel' +"_"+ str(d_model)+"_"+ str(n_head)+"_"+ str(Nx)+"_"+ str(dropout)
nmt = NMTmodel(d_model,n_head,Nx,dropout,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
optimizer = torch.optim.Adam(nmt.parameters(), lr=learning_rate)

idx = np.arange(len(sourceCorpus), dtype='int32')
nmt.train()
trial = 0
patience = 0
iter = 0
beginTime = time.time()
bestPerplexity = math.inf

for epoch in range(maxEpochs):
    np.random.shuffle(idx)
    targetWords = 0
    trainTime = time.time()
    for b in range(0, len(idx), batchSize):
        iter += 1
        sourceBatch = [sourceCorpus[i]
                       for i in idx[b:min(b+batchSize, len(idx))]]
        targetBatch = [targetCorpus[i]
                       for i in idx[b:min(b+batchSize, len(idx))]]

        st = sorted(list(zip(sourceBatch, targetBatch)),
                    key=lambda e: len(e[0]), reverse=True)
        (sourceBatch, targetBatch) = tuple(zip(*st))
        targetWords += sum(len(s)-1 for s in targetBatch)
        H = nmt(sourceBatch, targetBatch)
        optimizer.zero_grad()
        H.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(nmt.parameters(), clip_grad)
        optimizer.step()
        if iter % log_every == 0:
            print("Iteration:", iter, "Epoch:", epoch+1, '/', maxEpochs, ", Batch:", b//batchSize+1, '/', len(idx) // batchSize+1,
                  ", loss: ", H.item(), "words/sec:", targetWords / (time.time() - trainTime), "time elapsed:", (time.time() - beginTime))
            trainTime = time.time()
            targetWords = 0
        if iter % test_every == 0:
            nmt.eval()
            currentPerplexity = perplexity(
                nmt, sourceDev, targetDev, batchSize)
            nmt.train()
            print('Current model perplexity: ', currentPerplexity)

            if currentPerplexity < bestPerplexity:
                patience = 0
                bestPerplexity = currentPerplexity
                print('Saving new best model.')
                nmt.save(modelFileName)
                torch.save((bestPerplexity, learning_rate,
                           optimizer.state_dict()), modelFileName + '.optim')
            else:
                patience += 1
                if patience == max_patience:

                    trial += 1
                    if trial == max_trials:
                        print('early stop!')
                        exit(0)
                    learning_rate *= learning_rate_decay
                    print(
                        'load previously best model and decay learning rate to:', learning_rate)
                    nmt.load(modelFileName)
                    (bestPerplexity, _, osd) = torch.load(
                        modelFileName + '.optim')
                    optimizer.load_state_dict(osd)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
                    patience = 0


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration: 6750 Epoch: 2 / 10 , Batch: 1125 / 5626 , loss:  3.537274122238159 words/sec: 17182.867898061446 time elapsed: 334.9596493244171
Iteration: 6760 Epoch: 2 / 10 , Batch: 1135 / 5626 , loss:  3.886357307434082 words/sec: 19123.870738365298 time elapsed: 335.4298167228699
Iteration: 6770 Epoch: 2 / 10 , Batch: 1145 / 5626 , loss:  3.6000452041625977 words/sec: 19316.102135454712 time elapsed: 335.88523268699646
Iteration: 6780 Epoch: 2 / 10 , Batch: 1155 / 5626 , loss:  3.582972288131714 words/sec: 17327.310070028074 time elapsed: 336.3552598953247
Iteration: 6790 Epoch: 2 / 10 , Batch: 1165 / 5626 , loss:  3.649428367614746 words/sec: 16311.669374030906 time elapsed: 336.90468645095825
Iteration: 6800 Epoch: 2 / 10 , Batch: 1175 / 5626 , loss:  3.554438829421997 words/sec: 18474.471681165458 time elapsed: 337.3708801269531
Iteration: 6810 Epoch: 2 / 10 , Batch: 1185 / 5626 , loss:  3.5235605239868164 words/sec: 18

In [9]:
#extra train
(sourceCorpus,targetCorpus,sourceDev,targetDev) = pickle.load(open(corpusDataFileName, 'rb'))
(sourceWord2ind,targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))
startToken = '<S>'
endToken = '</S>'
unkToken = '<UNK>'
padToken = '<PAD>'

# dropout = 0.2
# learning_rate = 0.0005




modelFileName = '/content/drive/MyDrive/TII_PROJECT/NMTmodel' +"_"+ str(d_model)+"_"+ str(n_head)+"_"+ str(Nx)+"_"+ str(dropout)
nmt = NMTmodel(d_model,n_head,Nx,0.1,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
optimizer = torch.optim.Adam(nmt.parameters(), lr=0.0003)

idx = np.arange(len(sourceCorpus), dtype='int32')
nmt.train()
trial = 0
patience = 0
iter = 0

nmt.load(modelFileName,device)
(bestPerplexity,learning_rate,osd) = torch.load(modelFileName + '.optim')
optimizer.load_state_dict(osd)
for param_group in optimizer.param_groups:
  param_group['lr'] = learning_rate

beginTime = time.time()
for epoch in range(maxEpochs):
      np.random.shuffle(idx)
      targetWords = 0
      trainTime = time.time()
      for b in range(0, len(idx), batchSize):
          iter += 1
          sourceBatch = [ sourceCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
          targetBatch = [ targetCorpus[i] for i in idx[b:min(b+batchSize, len(idx))] ]
            
          st = sorted(list(zip(sourceBatch,targetBatch)),key=lambda e: len(e[0]), reverse=True)
          (sourceBatch,targetBatch) = tuple(zip(*st))
          targetWords += sum( len(s)-1 for s in targetBatch )
          H = nmt(sourceBatch,targetBatch)
          optimizer.zero_grad()
          H.backward()
          grad_norm = torch.nn.utils.clip_grad_norm_(nmt.parameters(), clip_grad)
          optimizer.step()
          if iter % log_every == 0:
            print("Iteration:",iter,"Epoch:",epoch+1,'/',maxEpochs,", Batch:",b//batchSize+1, '/', len(idx) // batchSize+1, ", loss: ",H.item(), "words/sec:",targetWords / (time.time() - trainTime), "time elapsed:", (time.time() - beginTime) )
            trainTime = time.time()
            targetWords = 0
          if iter % test_every == 0:
            nmt.eval()
            currentPerplexity = perplexity(nmt, sourceDev, targetDev, batchSize)
            nmt.train()
            print('Current model perplexity: ',currentPerplexity)

            if currentPerplexity < bestPerplexity:
              patience = 0
              bestPerplexity = currentPerplexity
              print('Saving new best model.')
              nmt.save(modelFileName)
              torch.save((bestPerplexity,learning_rate,optimizer.state_dict()), modelFileName + '.optim')
            else:
              patience += 1
              if patience == max_patience:
                    
                trial += 1
                if trial == max_trials:
                  print('early stop!')
                  exit(0)
                learning_rate *= learning_rate_decay
                print('load previously best model and decay learning rate to:', learning_rate)
                nmt.load(modelFileName,device)
                (bestPerplexity,_,osd) = torch.load(modelFileName + '.optim')
                optimizer.load_state_dict(osd)
                for param_group in optimizer.param_groups:
                  param_group['lr'] = learning_rate
                patience = 0

print('reached maximum number of epochs!')
nmt.eval()
currentPerplexity = perplexity(nmt, sourceDev, targetDev, batchSize)
print('Last model perplexity: ',currentPerplexity)

if currentPerplexity < bestPerplexity:
  bestPerplexity = currentPerplexity
  print('Saving last model.')
  nmt.save(modelFileName)
  torch.save((bestPerplexity,learning_rate,optimizer.state_dict()), modelFileName + '.optim')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration: 6640 Epoch: 2 / 10 , Batch: 1015 / 5626 , loss:  1.1052511930465698 words/sec: 19274.760034761144 time elapsed: 330.19881081581116
Iteration: 6650 Epoch: 2 / 10 , Batch: 1025 / 5626 , loss:  1.0806127786636353 words/sec: 18238.481275528553 time elapsed: 330.65656757354736
Iteration: 6660 Epoch: 2 / 10 , Batch: 1035 / 5626 , loss:  1.063866376876831 words/sec: 19041.09335435853 time elapsed: 331.12155532836914
Iteration: 6670 Epoch: 2 / 10 , Batch: 1045 / 5626 , loss:  1.0134670734405518 words/sec: 19185.358754464713 time elapsed: 331.5821805000305
Iteration: 6680 Epoch: 2 / 10 , Batch: 1055 / 5626 , loss:  1.111252784729004 words/sec: 17798.46023070855 time elapsed: 332.08073925971985
Iteration: 6690 Epoch: 2 / 10 , Batch: 1065 / 5626 , loss:  1.0310975313186646 words/sec: 19117.06375980239 time elapsed: 332.53933691978455
Iteration: 6700 Epoch: 2 / 10 , Batch: 1075 / 5626 , loss:  1.1447144746780396 words/sec:

In [None]:
(sourceWord2ind,targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))

nmt = NMTmodel(d_model,n_head,Nx,dropout,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
nmt.load(modelFileName,device)

sourceTest = readCorpus("/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg")
targetTest = readCorpus("/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.en")
targetTest = [ [startToken] + s + [endToken] for s in targetTest]

nmt.eval()
print('Model perplexity on test: ', perplexity(nmt, sourceTest, targetTest, batchSize))
print('Model perplexity on dev: ', perplexity(nmt, sourceDev, targetDev, batchSize))

In [97]:
#test translation
(sourceWord2ind, targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))

sourceTest = readCorpus("/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg")

nmt = NMTmodel(d_model,n_head,Nx,dropout,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
nmt.load(modelFileName,device)

nmt.eval()
file = open("testTranslated.en", 'w')
input = {}
output = {}
pb = progressBar()
pb.start(len(sourceTest))
i = 0
for s in sourceTest:
    input[i] = s
    temp = ' '.join(nmt.translateSentence(s))+"\n"
    output[i] = temp
    file.write(temp)
    pb.tick()
    i+=1
pb.stop()
print(i)

Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg
[                                                  ]-------------------------------------------------]
6000


In [101]:
ref = [[s] for s in readCorpus("/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.en")]
hyp = readCorpus("testTranslated.en")
print(len(ref))
print(len(hyp))
bleu_score = corpus_bleu(ref[:len(hyp)], hyp)
print('Corpus BLEU: ', (bleu_score * 100))

Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/test.en
Loading file: testTranslated.en
6000
5945
Corpus BLEU:  40.184890058370705


In [139]:
def translate_and_rate(model,sourceFile,sourceTranslated,targetFile):
    sourceText = readCorpus(sourceFile)
    model.eval()
    file = open(targetFile,'w',encoding="utf-8")
    input = {}
    output = {} 
    pb = progressBar()
    pb.start(len(sourceText))
    i = 0
    for s in sourceText:
        input[i] = s
        temp = ' '.join(nmt.translateSentence(s))+"\n"
        output[i] = temp
        file.write(temp)
        pb.tick()
        i+=1
    file.close()
    pb.stop()
    ref = [[s] for s in readCorpus(sourceTranslated)]
    hyp = readCorpus(targetFile)
    print(len(ref))
    print(len(hyp))
    bleu_score = corpus_bleu(ref[:len(hyp)], hyp)
    print(f"Corpus BLEU score for {sourceFile}: {bleu_score*100}")
    return (output,bleu_score)

In [122]:
#test translation
(sourceWord2ind, targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))
nmt = NMTmodel(d_model,n_head,Nx,dropout,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
nmt.load(modelFileName,device)

sourceFile = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg"
sourceFileTranslated = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/test.en"
targetFile = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/testTranslated.bg"

(output,bleu_score) = translate_and_rate(nmt,sourceFile,sourceFileTranslated,targetFile)

Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg
[                                                  ]-------------------------------------------------]
Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/test.en
Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/testTranslated.bg
6000
6000
Corpus BLEU score for /content/drive/MyDrive/TII_PROJECT/en_bg_data/test.bg: 40.182905557530646


In [127]:
#dev translation
(sourceWord2ind, targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))
nmt = NMTmodel(d_model,n_head,Nx,dropout,device,sourceWord2ind,targetWord2ind,startToken,unkToken,padToken,endToken).to(device)
nmt.load(modelFileName,device)

sourceFile = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.bg"
sourceFileTranslated = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.en"
targetFile = "/content/drive/MyDrive/TII_PROJECT/en_bg_data/devTranslated.bg"

(output,bleu_score) = translate_and_rate(nmt,sourceFile,sourceFileTranslated,targetFile)

Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.bg
[                                                  ]-------------------------------------------------]
Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.en
Loading file: /content/drive/MyDrive/TII_PROJECT/en_bg_data/devTranslated.bg
1000
1000
Corpus BLEU score for /content/drive/MyDrive/TII_PROJECT/en_bg_data/dev.bg: 42.17799306804753
