# Get P100 GPU

In [1]:
!nvidia-smi

Fri May  8 02:20:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Get the data

## Load all the required Libraries

In [0]:
import torchtext
from torchtext.data import Field, BucketIterator, Iterator, TabularDataset

from torchtext.datasets import TranslationDataset
from collections import defaultdict

from datetime import datetime
import pytz
import os
import torch
import copy
import time
import math

import torch.nn as nn 
import torch.nn.functional as F
import dill as pickle
import numpy as np
from torch.autograd import Variable

import numpy as np

## Mount Drive

In [2]:
# link to google drive
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

## Read data into memory

In [0]:
dir_path = '/content/drive/My Drive/Spring-20/11-747/11-747 Project/weight_data/duolingo-sharedtask-2020/data/courses/en-hu'
src = Field(fix_length=100)
trg = Field(init_token = "<sos>", eos_token = "<eos>", fix_length=100)
wgt = Field(use_vocab=False, 
            dtype=torch.float64,
            sequential=False)

In [0]:
# def merge_files(mode):
#     source_path = os.path.join(dir_path, f"{mode}-sents.{source_lan}")
#     target_path = os.path.join(dir_path, f"{mode}-sents.{target_lan}")
#     merge_path = os.path.join(dir_path, f"{mode}_merge.txt")

#     with open (source_path, 'r') as f1, open(target_path, 'r') as f2, open(merge_path, 'w') as f3:
#         for rows in zip(f1, f2):
#             source, target = rows
#             source = source.rstrip()

#             f3.write(source + " | " + target)

# merge_files("train")
# merge_files("dev")
# merge_files("test")

In [0]:
source_lan = "en"
target_lan = "hu"

def data_loader_weight(mode):
    path = os.path.join(dir_path, f"{mode}_merge.txt")

    # source_data = TabularDataset(
    #         path=source_path, # the file path
    #         format='csv',
    #         fields=[('Text', src)])

    iterator = None
    if mode == "train" or mode == "dev":
        data = TabularDataset(path=path, # the file path
                        format='csv',
                        csv_reader_params={"delimiter": "|"},
                        fields=[('Source', src), ('Target', trg), ('Weight', wgt)])
        
        if mode == "train":
          src.build_vocab(data.Source)
          trg.build_vocab(data.Target)


        iterator = BucketIterator(dataset=data, batch_size=128,
            sort_key=lambda x: data.interleave_keys(len(x.Source), len(x.Target)))
    
    else:
        data = TabularDataset(path=path, # the file path
                        format='csv',
                        csv_reader_params={"delimiter": "|"},
                        fields=[('Source', src), ('Target', trg)])
        
        iterator = Iterator(dataset=data, batch_size=64, train=False, 
                            shuffle=False, sort=False)


    return iterator

In [0]:
# Train dataloader
train_iter = data_loader_weight("train")

# Validation dataloader
val_iter = data_loader_weight("dev")

# Test dataloader
test_iter = data_loader_weight("test")

## Model Parameters



In [0]:
class Configuration(object):
  def __init__(self, source, target):
    self.src_data = source
    self.trg_data = target
    self.src_lang = 'de'
    self.trg_lang = 'en'
    self.epochs = 50
    self.n_layers = 6
    self.heads = 8
    self.dropout = 0.2
    self.printevery = 10
    self.lr = 5e-4
    self.emb_dim = 512
    self.ff_hsize = 1024
    self.max_strlen = 100
    self.checkpoint = 0
    self.device = 0
    self.clip_norm = 0.0
    self.src_pad = src.vocab.stoi['<pad>']
    self.trg_pad = trg.vocab.stoi['<pad>']
    self.k = 5
    self.max_len = 100


opt = Configuration(src, trg)

# My Transformer Model

## Embedder

In [0]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
    def forward(self, x):
        return self.embed(x)

In [0]:
class PositionalEncoder(nn.Module):
    def __init__(self, opt, max_seq_len=100):
        super().__init__()
        self.dropout = nn.Dropout(p=opt.dropout)
        self.dim = opt.emb_dim
        pe = torch.zeros(max_seq_len, self.dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)

        # 1000 ^ (2i / dmodel) = e ^ (2i) * -log(1000)
        div_term = torch.exp(torch.arange(0, self.dim, 2).float() \
                            * (-math.log(10000.0) / self.dim))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)
 
    
    def forward(self, x):
        x = x * math.sqrt(self.dim)
        pe = Variable(self.pe[:,:x.size(1)], requires_grad=False)

        if x.is_cuda:
            pe.cuda()
        x = x + pe
        return self.dropout(x)

## Sublayers

In [0]:
class Norm(nn.Module):
    def __init__(self, d_model, eps=1e-5):
        super().__init__()
    
        self.size = d_model
        
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        
        self.eps = eps
    
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

def attention(q, k, v, d_k, mask=None, dropout=None):
    
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)
    
    scores = F.softmax(scores, dim=-1)
    
    if dropout is not None:
        scores = dropout(scores)
        
    output = torch.matmul(scores, v)
    return output

In [0]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    
    def forward(self, q, k, v, mask=None):
        
        bs = q.size(0)
        
        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        
        # transpose to get dimensions bs * N * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        

        # calculate attention using function we will define next
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
        .view(bs, -1, self.d_model)
        output = self.out(concat)
    
        return output

In [0]:
class FeedForward(nn.Module):
    def __init__(self, opt):
        super().__init__() 

        linear_1 = nn.Linear(opt.emb_dim, opt.ff_hsize)
        dropout = nn.Dropout(opt.dropout)
        linear_2 = nn.Linear(opt.ff_hsize, opt.emb_dim)

        self.layers = nn.Sequential(linear_1, nn.ReLU(), dropout, linear_2)
    
    def forward(self, x):
        self.layers(x)
        return x

## Layers

In [0]:
class EncoderLayer(nn.Module):
    def __init__(self, opt):
        super().__init__()
        self.norm_1 = Norm(opt.emb_dim)
        self.norm_2 = Norm(opt.emb_dim)

        self.dropout_1 = nn.Dropout(opt.dropout)
        self.dropout_2 = nn.Dropout(opt.dropout)

        self.attn = MultiHeadAttention(opt.heads, 
                                       opt.emb_dim, 
                                       dropout=opt.dropout)
        
        self.ff = FeedForward(opt)
        
        
    def forward(self, x, mask):
        '''
        This implementation follows the Tensor2Tensor implementation
        instead of the original paper "Attention is all you need"
        The Norm is applied to the input first, then self attention
        is applied to the sub-layer.
        '''

        x = self.norm_1(x)
        x1 = x + self.dropout_1(self.attn(x, x, x, mask))

        x1 = self.norm_2(x1)
        x2 = x1 + self.dropout_2(self.ff(x1))

        return x2

In [0]:
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, opt):
        super().__init__()
        self.norm_1 = Norm(opt.emb_dim)
        self.norm_2 = Norm(opt.emb_dim)
        self.norm_3 = Norm(opt.emb_dim)
        
        self.dropout_1 = nn.Dropout(opt.dropout)
        self.dropout_2 = nn.Dropout(opt.dropout)
        self.dropout_3 = nn.Dropout(opt.dropout)
        
        self.attn_1 = MultiHeadAttention(opt.heads, 
                                         opt.emb_dim, 
                                         dropout=opt.dropout)
        
        self.attn_2 = MultiHeadAttention(opt.heads, 
                                         opt.emb_dim, 
                                         dropout=opt.dropout)
        
        self.ff = FeedForward(opt)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        '''
        This implementation follows the Tensor2Tensor implementation
        instead of the original paper "Attention is all you need"
        The Norm is applied to the input first, then self attention
        is applied to the sub-layer.
        '''
        x = self.norm_1(x)
        x1 = x + self.dropout_1(self.attn_1(x, x, x, trg_mask))

        x1 = self.norm_2(x1)
        x2 = x1 + self.dropout_2(self.attn_2(x1, 
                                             e_outputs, 
                                             e_outputs,
                                             src_mask))

        x2 = self.norm_3(x2)
        x3 = x2 + self.dropout_3(self.ff(x2))

        return x3

## Transformer Model

In [0]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class Encoder(nn.Module):
    def __init__(self, vocab_size, opt):
        super().__init__()
        self.N = opt.n_layers
        self.embed = Embedder(vocab_size, opt.emb_dim)
        self.pe = PositionalEncoder(opt)
        self.layers = get_clones(EncoderLayer(opt), self.N)
        self.norm = Norm(opt.emb_dim)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)

        for i in range(self.N):
            x = self.layers[i](x, mask)

        return self.norm(x)
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, opt):
        super().__init__()
        self.N = opt.n_layers
        self.embed = Embedder(vocab_size, opt.emb_dim)
        self.pe = PositionalEncoder(opt)
        self.layers = get_clones(DecoderLayer(opt), self.N)
        self.norm = Norm(opt.emb_dim)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)

        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)

        return self.norm(x)

class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, opt):
        super().__init__()
        self.encoder = Encoder(src_vocab, opt)
        self.decoder = Decoder(trg_vocab, opt)
        self.out = nn.Linear(opt.emb_dim, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

def get_model(opt, src_vocab, trg_vocab):
    
    assert opt.emb_dim % opt.heads == 0
    assert opt.dropout < 1

    model = Transformer(src_vocab, trg_vocab, opt)
       
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p) 
    
    if opt.device == 0:
        model = model.cuda()
    
    return model

# Optimizer

In [0]:
class NoamOpt():
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        #print(f"Learning rate: {rate}")
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(512, 1, 4000,
            torch.optim.Adam(model.parameters(), lr=0, 
                             betas=(0.9, 0.98), weight_decay=0.0001, eps=1e-9))

# Train the model

In [0]:
def nopeak_mask(size, opt):
    np_mask = np.triu(np.ones((1, size, size)),
    k=1).astype('uint8')
    np_mask =  Variable(torch.from_numpy(np_mask) == 0)
    if opt.device == 0:
      np_mask = np_mask.cuda()
    return np_mask

def create_masks(src, trg, opt):
    
    src_mask = (src != opt.src_pad).unsqueeze(-2)

    if trg is not None:
        trg_mask = (trg != opt.trg_pad).unsqueeze(-2)
        size = trg.size(1) # get seq_len for matrix
        np_mask = nopeak_mask(size, opt)
        if trg.is_cuda:
            np_mask.cuda()
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask

In [0]:
def train_model(model, opt, criterion, check_path):
    print("training model...")
    model.train()
    start = time.time()

    best_acc = []
                 
    for epoch in range(opt.epochs):  
        total_loss = 0

        for i, batch in enumerate(train_iter):
            src = batch.Source.transpose(0,1)
            trg = batch.Target.transpose(0,1)
            wgt = batch.Weight

            batch_size = wgt.shape[0]

            wgt = torch.ones(batch_size) - wgt
            wgt = wgt.unsqueeze(1).to(device)

            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            
            src_mask, trg_mask = create_masks(src, trg_input.cuda(), opt)
            
            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)
            opt.optimizer.optimizer.zero_grad()

            loss = criterion(preds.view(-1, preds.size(-1)), ys)

            if loss.shape[0] % batch_size == 0:
                loss = loss.contiguous().view(batch_size, -1)
                loss = wgt * loss
                loss = loss.mean()
                loss.backward()
            
                #torch.nn.utils.clip_grad_norm_(model.parameters(), opt.clip_norm)
                opt.optimizer.step() 
                total_loss += loss.item()
                
                if (i + 1) % opt.printevery == 0:
                    p = int(100 * (i + 1) / len(train_iter))
                    avg_loss = total_loss/opt.printevery

                    print(" %dm: epoch %d [%s%s]  %d%%  loss = %.3f" %\
                    ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss))
                    total_loss = 0
            else: 
                print(f"loss dimension fucked...")

                print(f"src shape: {src.shape}")
                print(f"trg shape: {trg.shape}")
                print(f"wgt shape: {wgt.shape}")

            del src
            del trg
            del src_mask
            del trg_mask
   
        val_loss = evaluate(model, criterion)
        best_checkpoints(best_acc, val_loss, epoch, check_path)
        save_checkpoint(epoch, model, opt, check_path)

        print("%dm: epoch %d [%s%s]  %d%%  loss = %.3f\nepoch %d complete, loss = %.03f, val_loss = %0.3f" %\
        ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), "".join(' '*(20-(100//5))), 100, avg_loss, epoch + 1, avg_loss, val_loss))


In [0]:
def s_key(lst):
  return lst[1]

In [0]:
def best_checkpoints(best_acc, val_loss, epoch, check_path):
    best_acc.append((epoch, val_loss))

    with open(os.path.join(check_path, "stats.txt"), "a+") as fl:
        size = 5 if len(best_acc) > 4 else len(best_acc)
        best = sorted(best_acc, key=s_key)[:size]
        indices = [str(a[0]) for a in best]
        pt = " ".join(indices)
        fl.write(f"epoch_num: {epoch}, val_loss: {val_loss}, top 5 checkpoints: {pt}\n")
        fl.write("====\n")
        fl.close()

    return

In [0]:
def make_checkpoint_dir(path):
    d = datetime.now()
    EST = pytz.timezone('US/Eastern')
    d = d.astimezone(EST)
    fd = str(d.strftime("afternorm-%d-%H_%M_%S"))

    check_path = os.path.join(path, fd)

    try:
        os.mkdir(check_path)
    except OSError:
        print("Creation of the directory %s failed" % check_path)
    else:
        print("Successfully created the directory %s " % check_path)
    return check_path

In [0]:
def save_checkpoint(epoch, model, opt, check_path):
    torch.save({
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': opt.optimizer.optimizer.state_dict(),
        }, os.path.join(check_path, 'transformer_'  + str(epoch) + '_model.pth'))

In [0]:
def evaluate(model, criterion):
    model.eval()
    total_loss = 0

    for i, batch in enumerate(val_iter):
        with torch.no_grad():

            src = batch.Source.transpose(0,1)
            trg = batch.Target.transpose(0,1)
            wgt = batch.Weight

            wgt = torch.ones(wgt.shape) - wgt
            wgt = wgt.unsqueeze(1).to(device)

            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]

            src_mask, trg_mask = create_masks(src, trg_input.cuda(), opt)

            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)

            loss = criterion(preds.view(-1, preds.size(-1)), ys)
            loss = wgt * loss
            loss = loss.mean()

            total_loss += loss.item()

            del src
            del trg
            del src_mask
            del trg_mask

    val_loss = total_loss / len(val_iter)
    return val_loss


## Declare the Training Model

In [0]:
model = get_model(opt, len(src.vocab), len(trg.vocab))
#opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), weight_decay=0.0001, eps=1e-9)
opt.optimizer = get_std_opt(model)

In [0]:
# check_path = make_checkpoint_dir(
#    '/content/drive/My Drive/Spring-20/11-747/11-747 Project/checkpoints')
# criterion = nn.CrossEntropyLoss(ignore_index=1, reduction="none")
# train_model(model, opt, criterion, check_path)

In [38]:
checkpoint = torch.load('/content/drive/My Drive/Spring-20/11-747/11-747 Project/checkpoints/afternorm-07-19_29_10/transformer_8_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
opt.optimizer.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
device = torch.device("cuda")
model.to(device)

Transformer(
  (encoder): Encoder(
    (embed): Embedder(
      (embed): Embedding(2450, 512)
    )
    (pe): PositionalEncoder(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (norm_1): Norm()
        (norm_2): Norm()
        (dropout_1): Dropout(p=0.2, inplace=False)
        (dropout_2): Dropout(p=0.2, inplace=False)
        (attn): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ff): FeedForward(
          (layers): Sequential(
            (0): Linear(in_features=512, out_features=1024, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.2, inplace=False)
            (3): Linea

# Generate Translation and Evaluate Performance

## Decoder and Beam Search

In [0]:
SRC = src
TRG = trg
hypothesis = []

def init_vars(test_sent, model, SRC, TRG, opt):    
    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (test_sent != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    # print(f"test_sent init_vars: {test_sent}")
    e_output = model.encoder(test_sent, src_mask)
    
    outputs = torch.LongTensor([[init_tok]])
    if opt.device == 0:
        outputs = outputs.cuda()

    # To Cuda
    e_output = e_output.to(device)
    src_mask = src_mask.to(device)
    trg_mask = nopeak_mask(1, opt).to(device)

    out = model.out(model.decoder(outputs, e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)
    
    probs, ix = out[:, -1].data.topk(opt.k)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
    
    outputs = torch.zeros(opt.k, opt.max_len).long().to(device)
    
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]
    
    e_outputs = torch.zeros(opt.k, e_output.size(-2),e_output.size(-1))

    if opt.device == 0:
        e_outputs = e_outputs.cuda()
    e_outputs[:, :] = e_output[0]

    return outputs, e_outputs, log_scores


def k_best_outputs(outputs, out, log_scores, i, k): 
    probs, ix = out[:, -1].data.topk(k)
    log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0,1)
    k_probs, k_ix = log_probs.view(-1).topk(k)
    
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)
    
    return outputs, log_scores


def beam_search(test_sent, model, SRC, TRG, opt):
    outputs, e_outputs, log_scores = init_vars(test_sent, model, SRC, TRG, opt)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (test_sent != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None

    for i in range(2, opt.max_len):
    
        trg_mask = nopeak_mask(i, opt)
        out = model.out(model.decoder(outputs[:,:i],
        e_outputs, src_mask, trg_mask))

        print(f"Source mask: {src_mask.shape}, target mask: {trg_mask.shape}")
        
        print(f"outputs shape: {outputs.shape}")
        print(f"out shape: {out.shape}")
        print(f"encoder output shape: {e_outputs.shape}")

        out = F.softmax(out, dim=-1)
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)
        
        ones = (outputs==eos_tok).nonzero() # Occurrences of end symbols for all input sentences.
        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long).cuda()
        for vec in ones:
            i = vec[0]
            if sentence_lengths[i]==0: # First end symbol has not been found yet
                sentence_lengths[i] = vec[1] # Position of first end symbol

        num_finished_sentences = len([s for s in sentence_lengths if s > 0])

        if num_finished_sentences == opt.k:
            alpha = 0.7
            div = 1/(sentence_lengths.type_as(log_scores)**alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break
  
    if ind is None:
        length = (outputs[0]==eos_tok).nonzero()   
        return [TRG.vocab.itos[tok] for tok in outputs[0]]
    else:
        length = (outputs[ind]==eos_tok).nonzero()[0]
        return [TRG.vocab.itos[tok] for tok in outputs[ind][1:length]]

In [0]:
def translate_sentence(test_batch, model, opt, SRC, TRG): 
    final_output = []
    for i, sentence in enumerate(test_batch):
        pad_size = sentence.size()[0]
        sentence = F.pad(sentence, pad=(0, 100 - pad_size), mode='constant', value=1)
        sentence = beam_search(sentence, model, SRC, TRG, opt)
        final_output.append(sentence)
    return final_output

In [0]:
import nltk
from nltk.translate.bleu_score import SmoothingFunction
chencherry = SmoothingFunction()

def calculate_bleu_scores(output, gold_output):
    for out, gout in zip(output, gold_output):
        total.append(nltk.translate.bleu_score.sentence_bleu([gout], out, smoothing_function=chencherry.method0))
    print(np.mean(total))


def append_to_list(output):
    print(output)
    for out in output:
        line = ' '.join(out)\
                .replace('@@', '')\
                .replace('<sos>', '')\
                .replace('<eos>', '')\
                .replace('<unk>', '')
        hypothesis.append(line)


def test_model(model, opt):
    print("testing model...")
    model.eval() 
              
    for i, batch in enumerate(test_iter):
        test_batch = batch.Source.transpose(0,1).to(device)
        output = translate_sentence(test_batch, model, opt, SRC, TRG)
        append_to_list(output)
        print(f"Finished batch: {i}")

In [42]:
hypothesis = []
test_model(model, opt)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
outputs shape: torch.Size([5, 100])
out shape: torch.Size([5, 3, 21388])
encoder output shape: torch.Size([5, 100, 512])
Source mask: torch.Size([1, 100]), target mask: torch.Size([1, 4, 4])
outputs shape: torch.Size([5, 100])
out shape: torch.Size([5, 4, 21388])
encoder output shape: torch.Size([5, 100, 512])
Source mask: torch.Size([1, 100]), target mask: torch.Size([1, 5, 5])
outputs shape: torch.Size([5, 100])
out shape: torch.Size([5, 5, 21388])
encoder output shape: torch.Size([5, 100, 512])
Source mask: torch.Size([1, 100]), target mask: torch.Size([1, 6, 6])
outputs shape: torch.Size([5, 100])
out shape: torch.Size([5, 6, 21388])
encoder output shape: torch.Size([5, 100, 512])
Source mask: torch.Size([1, 100]), target mask: torch.Size([1, 7, 7])
outputs shape: torch.Size([5, 100])
out shape: torch.Size([5, 7, 21388])
encoder output shape: torch.Size([5, 100, 512])
Source mask: torch.Size([1, 100]), target mask: to

In [43]:
len(hypothesis)

381

In [0]:
pkl_path = '/content/drive/My Drive/Spring-20/11-747/11-747 Project/checkpoints/afternorm-07-19_29_10'

with open(pkl_path + '/hypothesis_fq.txt', 'w') as hypothesis_writer:
    for x in hypothesis:
        hypothesis_writer.write('%s\n' % x)

In [45]:
hypothesis = open("hypothesis_fq.txt").readlines()
hypothesis = [x.strip('\n').split(' ') for x in hypothesis]

reference = open("test.en").readlines()
reference = [[x.strip('\n').split(' ')] for x in reference]

FileNotFoundError: ignored

In [0]:
.import nltk
from nltk.translate.bleu_score import corpus_bleu

score = corpus_bleu(reference, hypothesis)
print(score)

0.29801597219681486


In [0]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu

class Score(object):
    def __init__(self, reference=None, hypothesis=None, style=None):
        self.hypothesis = hypothesis
        self.reference = reference
        self.style = style

    def calculate_score(self):
        if self.style is None or self.style is "corpus":
            return corpus_bleu(self.reference, self.hypothesis)
        elif self.style is "sentence":
            return sentence_bleu(self.reference, self.hypothesis)

In [0]:
a = [1, 2, 3, 4]

a = torch.Tensor(a)
a = a.contiguous().view(4, 1)

b = [[1, 2], [1, 2], [1, 2], [1, 2]]
b = torch.Tensor(b)

c = a * b

In [40]:
c

tensor([[1., 2.],
        [2., 4.],
        [3., 6.],
        [4., 8.]])