In [238]:

from spacy.lang.en import English
import torch
import os
import random
import numpy as np
from torch.utils.data import DataLoader
import time
import math
from torch import nn, optim
from torch.utils.data import Dataset
import os
from tqdm import tqdm
import pickle
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
import numpy as np

In [239]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [240]:
def saveDataToCache(data_path_cache,fileName):
  
  with open(data_path_cache, "wb") as f:
    pickle.dump(fileName, f)

In [241]:
def loadCachedFile(data_path_cache_file):
  with open(data_path_cache_file, "rb") as f:
    return pickle.load(f)

In [242]:
class DataProcessor(Dataset):
    def __init__(self, data_dir, set_name, tokenizer = None, use_cache_file = True, save_to_cache = True):

        super().__init__()
        self.src_data_path = os.path.join(data_dir, f"{set_name}.txt.src")
        self.target_data_path = os.path.join(data_dir, f"{set_name}.txt.tgt")
        self.src_data_path_cache = os.path.join(data_dir, f"{set_name}.src.cached")
        self.target_data_path_cache = os.path.join(data_dir, f"{set_name}.tgt.cached")
        self.src_idx = None
        self.tgt_idx = None

        if use_cache_file and os.path.isfile(self.src_data_path_cache):
          self.src = loadCachedFile(self.src_data_path_cache)
          print("Using cached file to load source paragraphs")
        else:
          print("Loading sources from original file using tokenizer...")
          with open(self.src_data_path, "r") as f:
            self.sourceFile = f.readlines()

          self.sourceFile = np.array([tokenizer(text.strip()) for text in (tqdm(self.sourceFile) )], dtype=object)

          if save_to_cache:
            saveDataToCache(self.src_data_path_cache,self.sourceFile)


        if use_cache_file and os.path.isfile(self.target_data_path_cache):
          print("Using cached file to load targets")
          self.tgt = loadCachedFile(self.target_data_path_cache)
        else:
          print("Loading targets from original file using tokenizer...")
          with open(self.target_data_path, "r") as f:
            self.targetFile = f.readlines()

          self.targetFile = np.array([tokenizer(text.strip()) for text in (tqdm(self.targetFile) )], dtype=object)

          if save_to_cache:
            saveDataToCache(self.target_data_path_cache,self.targetFile)

    def to_indexes(self, vocab):
        self.src_idx = np.array([np.array(vocab(text)) for text in self.sourceFile], dtype=object)
        self.tgt_idx = np.array([np.array(vocab(text)) for text in self.targetFile], dtype=object)

    def __len__(self):
        return len(self.sourceFile)

    def __getitem__(self, idx):
        if self.src_idx is not None:
            return self.sourceFile[idx], self.src_idx[idx], self.targetFile[idx], self.tgt_idx[idx]
        else:
            return self.sourceFile[idx], self.targetFile[idx]

In [243]:
tokenizer = English().tokenizer
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return text.split()


In [244]:
print("Test 1")
train_set = DataProcessor("data", "train", tokenize_en)
val_set = DataProcessor("data", "val", tokenize_en)
test_set = DataProcessor("data", "test", tokenize_en)

Test 1
Loading sources from original file using tokenizer...


100%|██████████| 28722/28722 [00:01<00:00, 14467.35it/s]


Loading targets from original file using tokenizer...


100%|██████████| 28722/28722 [00:02<00:00, 12751.53it/s]


Loading sources from original file using tokenizer...


100%|██████████| 1336/1336 [00:00<00:00, 16938.74it/s]


Loading targets from original file using tokenizer...


100%|██████████| 1336/1336 [00:00<00:00, 38997.23it/s]


Loading sources from original file using tokenizer...


100%|██████████| 11490/11490 [00:00<00:00, 13774.95it/s]


Loading targets from original file using tokenizer...


100%|██████████| 11490/11490 [00:00<00:00, 92685.29it/s]


In [245]:

def construct_vocab(dataset, specials = [], min_freq = 0):
    tokens = Counter([tok for example in tqdm(dataset.src + dataset.tgt) for tok in example ])
    sorted_by_freq_tuples = [(item, count) for item, count in sorted(tokens.items(), key=lambda x: x[1], reverse=True) if count >= min_freq]
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    for special in specials:
        ordered_dict[special] = min_freq
    return vocab(ordered_dict)

UNK_TAG = "<UNK>"
PAD_TAG = "<PAD>"
START_TAG = "<SOS>"
END_TAG = "<EOS>"
MAX_UTTERANCE_LENGTH = 400
MIN_VOCAB_FREQ = 320
if os.path.isfile(f"vocab.{MIN_VOCAB_FREQ}.pth"):
    vocab = torch.load(f"vocab.{MIN_VOCAB_FREQ}.pth")
else:
    vocab = construct_vocab(train_set, specials = [UNK_TAG, PAD_TAG, START_TAG, END_TAG], min_freq=MIN_VOCAB_FREQ)
    torch.save(vocab, f"vocab.{MIN_VOCAB_FREQ}.pth")
vocab.set_default_index(vocab[UNK_TAG])
PAD_IDX = vocab[PAD_TAG]
END_IDX = vocab[END_TAG]
START_IDX = vocab[START_TAG]
UNK_IDX = vocab[UNK_TAG]
print("Number tokens in vocab: ", len(vocab))

Number tokens in vocab:  12277


In [246]:
import torch.nn.functional as F
class Attention(nn.Module):
  
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs, mask):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        attention = self.v(energy).squeeze(2)
        attention = attention.masked_fill(mask == 0, -torch.inf)
        return F.softmax(attention, dim = 1)

In [247]:
class Encoder(nn.Module):
    def __init__(self, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_len):
      
        embedded = self.dropout(src)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len, enforce_sorted=False)
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        return outputs, hidden

In [248]:
import torch.nn.functional as F
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, pointer_generation = False):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.pointer_generation = pointer_generation
        if self.pointer_generation:
            self.pg_linear = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, 1)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear(dec_hid_dim + (enc_hid_dim * 2), dec_hid_dim)
        self.out2 = nn.Linear(dec_hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs, mask, src_extended_vocab = None, max_oov_vocab_size = 0):
        
        embedded = self.dropout(input.unsqueeze(0))
        a = self.attention(hidden, encoder_outputs, mask)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a.unsqueeze(1), encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        hidden = self.fc_out(torch.cat((output, weighted), dim = 1)) # <- probability distribution of words
        word_dist = self.out2(hidden)

        if self.pointer_generation:
            src_extended_vocab = src_extended_vocab
            prob_gen = torch.sigmoid(self.pg_linear(torch.cat((output, weighted, embedded), dim = 1))) # <- probability of generation
            word_dist = (1 - prob_gen) * F.softmax(word_dist, dim = 1)
            attn_dist = prob_gen * a

            if max_oov_vocab_size > 0:
                full_word_dist = torch.cat((word_dist, word_dist.new_zeros((word_dist.size(0), max_oov_vocab_size))), dim = 1)
            final_word_dist = full_word_dist.scatter_add(1, src_extended_vocab, attn_dist)

        else:
            final_word_dist = F.softmax(word_dist, dim = 1)
  
        return final_word_dist, hidden.squeeze(0), a

In [249]:
import torch.nn.functional as F
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, input_dim, emb_dim, src_pad_idx, device, unk_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device
        self.unk_idx = unk_idx
        
    def create_mask(self, src):
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask
        
    def forward(self, src, src_len, trg_forcing, teacher_forcing_ratio = 0.5, src_extended_vocab = None, max_oov_vocab_size = 0):
        
        batch_size = src.shape[1]
        trg_len = trg_forcing.shape[0]
        trg_vocab_size = self.decoder.output_dim
        mask = self.create_mask(src)
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size + max_oov_vocab_size).to(self.device)
        emb_src = self.embedding(src)
        encoder_outputs, hidden = self.encoder(emb_src, src_len)
        del src
        input = trg_forcing[0,:]
        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(self.embedding(input), hidden, encoder_outputs, mask, src_extended_vocab, max_oov_vocab_size)
            outputs[t] = output
            top1 = output.argmax(1) 
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg_forcing[t] if teacher_force or (top1 >= trg_vocab_size).sum() > 0 else top1
        return outputs

In [250]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train(model, iterator, optimizer, criterion, clip, device='cpu', pointer_generation = False):
    print("Test 1")
    model.train()
    print("Use pointer generator inside train : ",pointer_generation)
    epoch_loss = 0
    
    for i, batch in enumerate(tqdm(iterator)):
        
        src, trg, trg_forcing, src_len, src_extended_vocab, oov_vocabs, max_oov_vocab_len, tgt_lens = batch
        src = src.to(device)
        trg = trg.to(device)
        if pointer_generation:
            src_extended_vocab = src_extended_vocab.to(device)
            trg_forcing = trg_forcing.to(device)
        else:
            trg_forcing = trg
        optimizer.zero_grad()
        output = model(src, src_len, trg_forcing, 0.5, src_extended_vocab, max_oov_vocab_len)
        output_dim = output.shape[-1]
        gold_probs = torch.gather(output[:-1], 2, trg[1:].unsqueeze(2)).squeeze()
        losses = -torch.log(gold_probs + 1e-15)
        trg_mask = trg[1:] != model.src_pad_idx
        sum_losses = torch.sum(losses[trg_mask], 0)
        batch_avg_loss = sum_losses/tgt_lens.to(device)
        loss = torch.mean(batch_avg_loss)
        loss.backward()
        epoch_loss += loss.item()
        del loss, trg, output, output_dim, src, tgt_lens, sum_losses, batch_avg_loss, trg_mask
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion, device='cpu', pointer_generation = False):
    
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            src, trg, trg_forcing, src_len, src_extended_vocab, oov_vocabs, max_oov_vocab_len, tgt_lens = batch
            src = src.to(device)
            trg = trg.to(device)
            if pointer_generation:
                src_extended_vocab = src_extended_vocab.to(device)
                trg_forcing = trg_forcing.to(device)
            else:
                trg_forcing = trg
            
            output = model(src, src_len, trg_forcing, 0, src_extended_vocab, max_oov_vocab_len)
            gold_probs = torch.gather(output[:-1], 2, trg[1:].unsqueeze(2)).squeeze()
            losses = -torch.log(gold_probs + 1e-15)
            trg_mask = trg[1:] != model.src_pad_idx
            sum_losses = torch.sum(losses[trg_mask], 0)
            batch_avg_loss = sum_losses/tgt_lens.to(device)
            loss = torch.mean(batch_avg_loss)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [251]:

train_set.to_indexes(vocab)
val_set.to_indexes(vocab)
test_set.to_indexes(vocab)

BATCH_SIZE = 32
pointer_generator_flag = True
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

def collate_batch(batch):
     src_lens, tgt_lens = [], []
     out_of_vocab_words = []
     for (source, source_idxs, tgt, tgt_idxs) in batch:
         tgt_lens.append(len(tgt) + 2)
         src_lens.append(min(len(source) + 2, MAX_UTTERANCE_LENGTH + 2))
         if pointer_generator_flag:
          oov_word_set = {}
          for idx in np.where(source_idxs == UNK_IDX)[0]:
               if idx < MAX_UTTERANCE_LENGTH and source[idx] not in oov_word_set:
                    oov_word_set[source[idx]] = len(vocab) + len(oov_word_set)
          out_of_vocab_words.append(oov_word_set)


     max_src_len = min(max(src_lens), MAX_UTTERANCE_LENGTH + 2)
     max_tgt_len = max(tgt_lens)
     src_tensor = torch.full(size=(max_src_len, len(batch)), fill_value=PAD_IDX)
     tgt_tensor = torch.full(size=(max_tgt_len, len(batch)), fill_value=PAD_IDX)

     if pointer_generator_flag:
          
          src_extended_tensor = torch.full(size=(max_src_len, len(batch)), fill_value=PAD_IDX)
          tgt_forcing_tensor = tgt_tensor.clone()
     
     for (i, (source, source_idxs, tgt, tgt_idxs)) in enumerate(batch):
          tgt_tensor[1:tgt_lens[i] - 1, i] = torch.tensor(tgt_idxs, dtype=torch.int32)
          tgt_tensor[0, i] = START_IDX
          tgt_tensor[tgt_lens[i] - 1, i] = END_IDX
          src_tensor[1:src_lens[i] - 1, i] = torch.tensor(source_idxs[:max_src_len - 2], dtype=torch.int32)
          src_tensor[0, i] = START_IDX
          src_tensor[src_lens[i] - 1, i] = END_IDX
          if pointer_generator_flag:
               tgt_forcing_tensor[:, i] = tgt_tensor[:, i]
               oov_tgt_word_idxs = np.where(tgt_idxs == UNK_IDX)[0]
               for idx in oov_tgt_word_idxs:
                    if tgt[idx] in out_of_vocab_words[i]:
                         tgt_tensor[1 + idx, i] = out_of_vocab_words[i][tgt[idx]]
               oov_src_word_idxs = np.where(source_idxs == UNK_IDX)[0]
               src_extended_tensor[:, i] = src_tensor[:, i]
               for idx in oov_src_word_idxs:
                    if idx < MAX_UTTERANCE_LENGTH:
                         src_extended_tensor[1 + idx, i] = out_of_vocab_words[i][source[idx]]


     src_lens = torch.tensor(src_lens, dtype=torch.int16)
     tgt_lens = torch.tensor(tgt_lens, dtype=torch.int16)

     return src_tensor, tgt_tensor, tgt_forcing_tensor if pointer_generator_flag else tgt_tensor, src_lens, src_extended_tensor.permute(1, 0) if pointer_generator_flag else None, out_of_vocab_words if pointer_generator_flag else None, max([len(s) for s in out_of_vocab_words]) if pointer_generator_flag else 0, tgt_lens


NUM_WORKERS = 4
train_iterator = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch, num_workers = NUM_WORKERS)
val_iterator = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch, num_workers = NUM_WORKERS)
test_iterator = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch, num_workers = 0)

VOCAB_DIM = len(vocab)
EMB_DIM = 64 #128 #256
ENC_HID_DIM = 128 #256 #512
DEC_HID_DIM = 128 #256 #512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
encoder = Encoder(EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
decoder = Decoder(VOCAB_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn, pointer_generation = pointer_generator_flag)
model = Seq2Seq(encoder, decoder, VOCAB_DIM, EMB_DIM, PAD_IDX, device, UNK_IDX).to(device)


def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

N_EPOCHS = 5
CLIP = 1
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    end_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, device = device, pointer_generation = pointer_generator_flag)
    valid_loss = evaluate(model, val_iterator, criterion, device = device, pointer_generation = pointer_generator_flag)
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model-best-new.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

if pointer_generator_flag:
  torch.save(model.state_dict(), 'model-final-new-pg.pt')
else:
  torch.save(model.state_dict(), 'model-final-new.pt')


The model has 2,823,286 trainable parameters
Test 1
Use pointer generator inside train :  True


100%|██████████| 898/898 [05:17<00:00,  2.83it/s]


Epoch: 01 | Time: 0m 0s
	Train Loss: 203.030 | Train PPL: 14962403532066137111443559386240591928212717857102004331316958890141541429284492609060864.000
	 Val. Loss: 212.783 |  Val. PPL: 257339671337641771067015668402907794313894424541255406567160160680427343186454488763351433216.000
Test 1
Use pointer generator inside train :  True


100%|██████████| 898/898 [05:18<00:00,  2.82it/s]


Epoch: 02 | Time: 0m 0s
	Train Loss: 198.664 | Train PPL: 189895082985984034268392440619665873586224457715139700492808304005415946649549215367168.000
	 Val. Loss: 212.633 |  Val. PPL: 221504592060201843962448871826281350668098214011471265593261920676313390255544912072759312384.000
Test 1
Use pointer generator inside train :  True


100%|██████████| 898/898 [05:15<00:00,  2.84it/s]


Epoch: 03 | Time: 0m 0s
	Train Loss: 195.907 | Train PPL: 12065449847512422070783597971640897230608755913630207465322821000785725678732259098624.000
	 Val. Loss: 211.989 |  Val. PPL: 116313083139314735669303506396628743304724077775753963246058958505713064933265083511982784512.000
Test 1
Use pointer generator inside train :  True


100%|██████████| 898/898 [05:17<00:00,  2.83it/s]


Epoch: 04 | Time: 0m 0s
	Train Loss: 193.475 | Train PPL: 1060078005163197011497185591304962071612426794236700913003486559475769761745531305984.000
	 Val. Loss: 211.114 |  Val. PPL: 48471219025263342475040332550698066314837924105565221813479037330803458538463212899777118208.000
Test 1
Use pointer generator inside train :  True


100%|██████████| 898/898 [05:16<00:00,  2.84it/s]


Epoch: 05 | Time: 0m 0s
	Train Loss: 191.341 | Train PPL: 125419273385522831965501985131584530627153123911311429833231283766027303028061634560.000
	 Val. Loss: 211.572 |  Val. PPL: 76620861583019635312225815117075244085589961900074225287589232971415038047814089048944803840.000


In [252]:
from tqdm import tqdm
model.eval()
epoch_loss = 0
trgs = []
outputs = []
srcs = []
all_oov_vocabs = []
with torch.no_grad():

    
    for i, batch in enumerate(tqdm(val_iterator)):
        src, trg, trg_forcing, src_len, src_extended_vocab, oov_vocabs, max_oov_vocab_len, tgt_lens = batch
        src = src.to(device)
        trg = trg.to(device)
        if pointer_generator_flag:
            src_extended_vocab = src_extended_vocab.to(device)
            trg_forcing = trg_forcing.to(device)
        else:
            trg_forcing = trg

        output = model(src, src_len, trg_forcing, 0, src_extended_vocab, max_oov_vocab_len) #turn off teacher forcing
        trg = trg.permute(1, 0)
        trgs.extend(trg.cpu().numpy())
        output = torch.argmax(output, dim = 2).permute(1, 0)
        outputs.extend(output.cpu().numpy())
        srcs.extend(src.permute(1, 0).cpu().numpy())
        if oov_vocabs:
            all_oov_vocabs.extend(oov_vocabs)

100%|██████████| 42/42 [00:07<00:00,  5.84it/s]


In [253]:
def decode(seq, vocab, oov_vocab):
    inverted_oov = {idx: word for word, idx in oov_vocab.items()}
    if len(oov_vocab) == 0:
        return vocab.lookup_tokens(seq)
    else:
        return [vocab.lookup_token(idx) if idx < len(vocab) else inverted_oov[idx] for idx in seq]

In [259]:
idx = 41
print("Original Text:")
print(" ".join(vocab.lookup_tokens(srcs[idx])))
print("Target Summary Text:")
print(" ".join(decode(trgs[idx], vocab, all_oov_vocabs[idx] if idx < len(all_oov_vocabs) else {})))
print("Generated Summary Text:")
print(" ".join(decode(outputs[idx], vocab, all_oov_vocabs[idx] if idx < len(all_oov_vocabs) else {})))

Original Text:
<SOS> -lrb- cnn -rrb- a facebook post by actor ashton kutcher <UNK> the lack of <UNK> changing tables in public men 's rooms has parents talking . the new father -- he and partner <UNK> <UNK> welcomed baby <UNK> <UNK> in october -- complained to his followers that he had yet to encounter a changing table in the public <UNK> he visits . he offered to give a social media <UNK> to the first business where he found a <UNK> table in the men 's room . the post had <UNK> more than <UNK> comments as of wednesday morning . lots of folks offered up places kutcher should <UNK> , such as walmart and <UNK> barrel , where they say changing tables <UNK> . some dads said they did n't have a problem finding changing tables , but it may be because they 're <UNK> more <UNK> <UNK> than the hollywood star . other posters said `` family <UNK> '' would take care of the problem altogether . do modern dads get enough credit ? many praised kutcher for raising the issue : `` thank you for doing th

In [255]:
trialOutput = [" ".join(decode(outputs[i], vocab, all_oov_vocabs[i] if idx < len(all_oov_vocabs) else {})) for i in tqdm(range(len(outputs)))]
referenceOutput = [" ".join(decode(trgs[i], vocab, all_oov_vocabs[i] if idx < len(all_oov_vocabs) else {})) for i in tqdm(range(len(trgs)))]

100%|██████████| 1336/1336 [00:00<00:00, 7551.80it/s]
100%|██████████| 1336/1336 [00:00<00:00, 6936.80it/s]


In [256]:
!pip install rouge
from rouge import Rouge
rouge = Rouge()
rouge.get_scores(trialOutput, referenceOutput, avg = True)

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


{'rouge-1': {'f': 0.1780476423461404,
  'p': 0.5225782527445748,
  'r': 0.10958464429582805},
 'rouge-2': {'f': 0.009309292957345512,
  'p': 0.018953538970449713,
  'r': 0.006407892480837956},
 'rouge-l': {'f': 0.1643511496129068,
  'p': 0.4828030028364653,
  'r': 0.10115495334657565}}

In [257]:
with open("predictions.baseline.txt", "w") as f:
    f.writelines([h + "\n" for h in trialOutput])