In [1]:
import os, pickle, time, random, logging, json, gc, warnings
from datetime import datetime
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW, BertConfig, BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import EncoderDecoderModel
from transformers import BertModel

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate import bleu
import spacy

warnings.simplefilter("ignore")

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Name of the pretrained BERT variant; also used as a directory-name suffix below.
bert_model = 'bert-large-uncased'
# Root folders, relative to the notebook's working directory.
model_path, data_path = Path('./models'), Path('.')
# Folder under model_path that holds the BERT files.
bert_path = model_path.joinpath(bert_model)

In [4]:
# List every run directory named "*-<bert_model>" that contains a saved best model.
# sorted() keeps the menu numbering stable: glob order depends on the filesystem.
model_paths = []
for x in sorted(model_path.glob('*-{}'.format(bert_model))):
    if (x / 'best_model.pt').exists():
        model_paths.append(x)
        print('[{}]: {}'.format(len(model_paths), x))

# The menu is 1-based. Reject anything outside it explicitly: 0 or a negative
# number would otherwise become a negative index and silently pick a model
# from the end of the list.
choice = int(input('Which Model do you want?'))
if not 1 <= choice <= len(model_paths):
    raise IndexError(
        'Model number {} is not in the listed range 1..{}'.format(choice, len(model_paths))
    )
idx = choice - 1
model_save_dir = model_paths[idx]
print("Chosen {}".format(model_save_dir))

# Show the hyper-parameters stored with the chosen run; the context manager
# closes the file handle once the JSON has been read.
with open(model_save_dir / 'hp.json', 'r') as hp_file:
    saved_hp = json.load(hp_file)
saved_hp

[1]: models/21-05-10-1316-bert-large-uncased
[2]: models/21-05-10-1610-bert-large-uncased


Which Model do you want? 2


Chosen models/21-05-10-1610-bert-large-uncased


{'num_workers': '0',
 'pin_memory': 'False',
 'batch_size': '4',
 'weight_decay': '0.001',
 'lr': '0.05',
 'momentum': '0.9',
 'decoder_hidden_size': '512',
 'decoder_input_size': '512',
 'attention_hidden_size': '512',
 'max_input_length': '512',
 'n_layers': '1',
 'clip': '1',
 'dropout': '1',
 'n_epochs': '4',
 'max_epochs': '32',
 'patience': '2',
 'mb': '32',
 'checkpoint': 'None',
 'encoder_trained': 'False',
 'pw_criterion': 'CrossEntropyLoss()',
 'bert_model': 'bert-large-uncased',
 'tokenizer': "PreTrainedTokenizer(name_or_path='bert-large-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})",
 'bert_path': 'models/bert-large-uncased',
 'bert_hidden_size': '1024',
 'bert_vocab_size': '30522'}

In [5]:
class HP:
    """Hyper-parameter container for the notebook.

    Every value is a plain attribute assigned in ``__init__``. The BERT hidden
    size and vocabulary size are read from ``config.json`` inside the
    module-level ``bert_path`` directory.

    NOTE: the following cell defines ``HP`` again and rebinds ``hp``, so this
    definition is overwritten when the notebook runs top to bottom.

    Raises:
        FileNotFoundError: if ``bert_path / 'config.json'`` does not exist.
    """

    def __init__(self):
        # DataLoader settings.
        self.num_workers = 0
        self.pin_memory = False
        self.batch_size = 4

        # The values below are not read by the cells shown in this notebook;
        # presumably they are consumed by the training code.
        self.weight_decay = 0.001
        self.lr = 0.05
        self.adam_lr = 0.001
        self.momentum = 0.9
        self.decoder_hidden_size = 512
        self.decoder_input_size = 512
        self.attention_hidden_size = 512
        self.max_input_length = 512
        self.n_layers = 1
        self.clip = 1
        self.dropout = 1
        self.n_epochs = 4
        self.max_epochs = 32
        self.patience = 3
        self.mb = 32
        self.checkpoint = None
        self.encoder_trained = False

        # Read the BERT dimensions from its config; the context manager closes
        # the file handle instead of leaving it to the garbage collector.
        with open(bert_path / 'config.json', 'r') as config_file:
            conf_file = json.load(config_file)
        self.bert_hidden_size = conf_file['hidden_size']
        self.bert_vocab_size = conf_file['vocab_size']


hp = HP()

In [6]:
class HP:
    """Hyper-parameter container used by the rest of the notebook.

    Besides plain numeric settings it holds the point-wise loss
    (``pw_criterion``), a ``BertTokenizer`` and the local directory of the
    BERT files (``bert_path``). If that directory has no ``config.json`` yet,
    the pretrained BERT model is fetched and saved there first; the hidden
    size and vocabulary size are then read from that config file.
    """

    def __init__(self):
        # DataLoader settings.
        self.num_workers = 0
        self.pin_memory = False
        self.batch_size = 4

        self.weight_decay = 0.001
        self.lr = 0.05
        self.momentum = 0.9
        self.decoder_hidden_size = 1024
        self.decoder_input_size = 1024
        self.attention_hidden_size = 1024
        # Read by BertDataset to drop examples whose tokenized input is longer.
        # NOTE(review): the hp.json printed for the saved run lists 512 here — confirm 1024 is intended.
        self.max_input_length = 1024
        self.n_layers = 1
        self.clip = 1
        # NOTE(review): a value of 1 looks unusually high for a dropout probability — confirm.
        self.dropout = 1
        self.n_epochs = 4
        self.max_epochs = 32
        self.patience = 3
        self.mb = 32
        self.checkpoint = None
        self.encoder_trained = False

        # Mean cross-entropy that skips targets equal to 0.
        self.pw_criterion = nn.CrossEntropyLoss(ignore_index=0)

        self.bert_model = 'bert-large-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

        self.bert_path = model_path / self.bert_model
        config_path = self.bert_path / 'config.json'

        # Save a local copy of the pretrained model the first time so that its
        # config file is available on disk.
        if not config_path.is_file():
            BertModel.from_pretrained(self.bert_model).save_pretrained(self.bert_path)

        # The context manager closes the file handle once the JSON is parsed.
        with open(config_path, 'r') as config_file:
            conf_file = json.load(config_file)
        self.bert_hidden_size = conf_file['hidden_size']
        self.bert_vocab_size = conf_file['vocab_size']


hp = HP()

In [17]:
# Module-level tokenizer; bleu_score() and eval() below use it to turn token ids back into tokens.
tokenizer = BertTokenizer.from_pretrained(bert_model)

def bleu_score(prediction, ground_truth):
    """Return the mean sentence-level BLEU of a batch.

    Args:
        prediction: tensor of per-token scores; the argmax over dim 2 gives
            the predicted token ids and dim 0 indexes the batch.
        ground_truth: tensor of reference token ids, iterated row by row
            together with the predictions.

    Returns:
        The sum of the per-sentence BLEU scores divided by the batch size.
    """
    predicted_ids = prediction.max(2)[1]
    smoothing = SmoothingFunction().method4
    acc_bleu = 0

    for ref_ids, hyp_ids in zip(ground_truth, predicted_ids):
        reference = tokenizer.convert_ids_to_tokens(ref_ids.tolist())
        hypothesis = tokenizer.convert_ids_to_tokens(hyp_ids.tolist())

        # Reference: assumes the layout [CLS] w1 .. wn [SEP] [PAD]..., so the
        # token just before the first [PAD] (or the last token) is the
        # separator and is left out together with position 0.
        ref_end = reference.index('[PAD]') if '[PAD]' in reference else len(reference)

        # Hypothesis: keep everything after position 0 up to, but excluding,
        # the first predicted [SEP]. The previous slice ended at the [SEP]
        # index minus one and therefore dropped the last predicted token.
        hyp_end = hypothesis.index('[SEP]') if '[SEP]' in hypothesis else len(hypothesis)

        acc_bleu += bleu(
            [reference[1:ref_end - 1]],
            hypothesis[1:hyp_end],
            smoothing_function=smoothing,
        )
    return acc_bleu / predicted_ids.size(0)

def eval(model, dataloader, criterion):
    """Run one evaluation pass and return the mean word loss and mean BLEU.

    Args:
        model: called as ``model(inputs, targets, 0)``; the trailing 0 turns
            teacher forcing off.
        dataloader: yields ``((input_tensors, input_len), (output_ids, output_len))``.
        criterion: loss returning one value per target token (it is reshaped
            to ``(batch, trg_len - 1)`` below, so it must not reduce).

    Returns:
        ``(mean of hp.pw_criterion over batches, mean BLEU over batches)``.
    """
    model.eval()

    n_batches = len(dataloader)
    # Report about ten times per pass. max(1, ...) keeps the modulo below
    # valid when the loader has fewer than ten batches.
    report_every = max(1, int(n_batches * 0.1))

    epoch_loss = 0
    epoch_bleu = 0

    with torch.no_grad():

        for i, (input_, output_) in tqdm(enumerate(dataloader), total=n_batches):

            input_data, _ = input_
            output_data, _ = output_
            # Move the targets once; they are reused by the model, BLEU and both losses.
            output_data = output_data.to(device)

            prediction = model([x.to(device) for x in input_data], output_data, 0)  # turn off teacher forcing

            bleu_i = bleu_score(prediction, output_data)

            trg_sent_len = prediction.size(1)
            # output_data = [batch size, trg sent len]
            # prediction  = [batch size, trg sent len, output dim]

            # Position 0 is never predicted, so it is excluded from the loss.
            flat_prediction = prediction[:, 1:].contiguous().view(-1, prediction.shape[-1])
            flat_target = output_data[:, 1:].contiguous().view(-1)

            # flat_target     = [batch size * (trg sent len - 1)]
            # flat_prediction = [batch size * (trg sent len - 1), output dim]

            pw_loss = hp.pw_criterion(flat_prediction, flat_target)

            # Per-token losses -> summed per sentence -> averaged over the batch.
            loss = criterion(flat_prediction, flat_target)
            loss = loss.view(-1, trg_sent_len - 1).sum(1).mean(0)

            if i % report_every == report_every - 1:
                print('Batch {}: Sentence Loss: {:.3f}, Word Loss: {:.3f}, BLEU score {:.4f}'.format(i, loss.item(), pw_loss.item(), bleu_i))

                # Decode the first example of the batch for a qualitative check.
                sample_t = tokenizer.convert_ids_to_tokens(output_data[0].tolist())
                sample_p = tokenizer.convert_ids_to_tokens(prediction[0].max(1)[1].tolist())
                idx1 = sample_t.index('[PAD]') if '[PAD]' in sample_t else len(sample_t)
                idx2 = sample_p.index('[SEP]') if '[SEP]' in sample_p else len(sample_p)

                # idx1 points one past the target's separator, idx2 at the
                # predicted separator itself, hence the different end offsets.
                print(
                    'Target -> {}\nPrediction -> {}\n\n'.format(
                        sample_t[1:idx1 - 1], sample_p[1:idx2]
                    ))
            epoch_loss += pw_loss.item()
            epoch_bleu += bleu_i

        return epoch_loss / n_batches, epoch_bleu / n_batches

In [18]:
class BertDataset(Dataset):
    """Tokenized (passage, response) -> clue pairs read from a JSON file.

    Each item is
    ``((input_ids, attention_mask, token_type_ids), input_len), (output_ids, output_len)``.
    """

    def __init__(self, json_path, bert_model):
        """Load ``json_path`` and tokenize it with the ``bert_model`` tokenizer."""
        # The context manager closes the file as soon as the JSON is parsed.
        with open(json_path, 'r') as json_file:
            all_data = json.load(json_file)
        input, output = self._extract_data(all_data)
        self.data = self._tokenize_data(input, output, bert_model)

    def _extract_data(self, all_data):
        """Split records into (passages, responses) inputs and clue outputs."""
        input = [(data['passages'], data['responses']) for data in all_data]
        output = [data['clues'] for data in all_data]
        # NOTE(review): the last record is always discarded — confirm this is intended.
        return input[:-1], output[:-1]

    def _tokenize_data(self, input, output, bert_model):
        """Tokenize inputs and outputs, drop over-long inputs and trim padding.

        Raises:
            ValueError: if no example has at most ``hp.max_input_length`` input tokens.
        """
        tokenizer = BertTokenizer.from_pretrained(bert_model)

        data = tokenizer.batch_encode_plus(input, pad_to_max_length=True, return_tensors='pt')
        out_dict = tokenizer.batch_encode_plus(output, pad_to_max_length=True, return_tensors='pt')

        data['output_ids'] = out_dict['input_ids']
        # Counting the 1s of the attention mask gives each row's unpadded length.
        data['output_len'] = out_dict['attention_mask'].sum(dim=1)
        data['input_len'] = data['attention_mask'].sum(dim=1)

        # Keep only the examples whose input fits the configured maximum.
        idx = (data['input_len'] <= hp.max_input_length)
        if not bool(idx.any()):
            raise ValueError(
                'No example has an input length <= {}'.format(hp.max_input_length)
            )

        # Longest remaining input / output; columns beyond these are cut off.
        in_m = int(data['input_len'][idx].max())
        out_m = int(data['output_len'][idx].max())

        data['input_ids'] = data['input_ids'][idx, :in_m]
        data['attention_mask'] = data['attention_mask'][idx, :in_m]
        data['token_type_ids'] = data['token_type_ids'][idx, :in_m]
        data['input_len'] = data['input_len'][idx]

        data['output_ids'] = data['output_ids'][idx, :out_m]
        data['output_len'] = data['output_len'][idx]

        return data

    def __len__(self):
        """Number of examples kept after filtering."""
        return self.data['input_ids'].shape[0]

    def __getitem__(self, idx):
        """Return ``((ids, attention_mask, token_type_ids), input_len), (output_ids, output_len)``."""
        return (((self.data['input_ids'][idx],
                self.data['attention_mask'][idx],
                self.data['token_type_ids'][idx]),
                self.data['input_len'][idx]),
                (self.data['output_ids'][idx],
                self.data['output_len'][idx])
                )

In [19]:
# Development split; located through data_path so the data folder is configured in one place.
dev_set = BertDataset(data_path / 'data_dev.json', bert_model)

# No shuffling for evaluation: the reported loss and BLEU are averages of
# per-batch values, so a fixed batch composition keeps them reproducible.
dev_loader = DataLoader(dev_set, batch_size=hp.batch_size, shuffle=False,
                        num_workers=hp.num_workers, pin_memory=hp.pin_memory
                       )

In [20]:
class Attention(nn.Module):
    """Concat-and-tanh attention scorer.

    ``key`` is paired with every step of ``queries``, passed through a linear
    layer and ``tanh``, projected onto the learned vector ``v`` and
    normalised with a softmax over the steps.
    """

    def __init__(self, n_h_enc, n_h_dec, n_h_attention):
        super().__init__()
        # Scores the concatenation of key and query features
        # (n_h_enc + n_h_dec features in total).
        self.attn = nn.Linear(n_h_enc + n_h_dec, n_h_attention)
        self.v = nn.Parameter(torch.rand(n_h_attention), requires_grad=True)

    def forward(self, key, queries):
        """Return attention weights of shape ``(batch, steps)``.

        Args:
            key: 2-D tensor ``(batch, features)``.
            queries: 3-D tensor ``(batch, steps, features)``.
        """
        n_batch = queries.shape[0]
        n_steps = queries.shape[1]

        # Copy the key along a new step axis so it lines up with the queries.
        tiled_key = key.unsqueeze(1).repeat(1, n_steps, 1)
        shapes_agree = (
            tiled_key.size(0) == queries.size(0)
            and tiled_key.size(1) == queries.size(1)
        )
        if not shapes_agree:
            print(tiled_key.shape, queries.shape)

        energy = torch.tanh(self.attn(torch.cat((tiled_key, queries), dim=2)))

        # One copy of v per batch element, shaped as a column for bmm.
        v_column = self.v.repeat(n_batch, 1).unsqueeze(2)
        scores = torch.bmm(energy, v_column).squeeze(2)

        return F.softmax(scores, dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder states.

    Args:
        out_dims: size of the output vocabulary (also the embedding table size).
        emb_dims: embedding width fed to the GRU.
        n_h_enc: feature width of the encoder states attended over.
        n_h_dec: GRU hidden size.
        n_layers: number of GRU layers.
        dropout: dropout probability for the embeddings and between GRU layers.
        attn: module called as ``attn(decoder_output, encoder_states)`` that
            returns weights of shape ``(batch, src_len)``.
    """

    def __init__(self, out_dims, emb_dims, n_h_enc, n_h_dec, n_layers, dropout, attn):
        super().__init__()
        self.out_dims = out_dims
        self.emb_dims = emb_dims
        self.n_h_enc = n_h_enc
        self.n_h_dec = n_h_dec
        self.n_layers = n_layers
        self.attn = attn

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(out_dims, emb_dims)

        self.rnn = nn.GRU(emb_dims, n_h_dec, batch_first=True, num_layers=n_layers, dropout=dropout)

        # Projects [decoder output ; attention context] onto the vocabulary.
        self.out_dense = nn.Linear(n_h_enc + n_h_dec, out_dims)

    def forward(self, src, queries, hidden):
        """Decode one step.

        Args:
            src: token ids for the current step, shape ``(batch,)``.
            queries: encoder states, shape ``(batch, src_len, n_h_enc)``.
            hidden: GRU state, shape ``(n_layers, batch, n_h_dec)``.

        Returns:
            ``(scores, hidden)`` with ``scores`` of shape ``(batch, out_dims)``.
        """
        # (batch,) -> (batch, 1): a sequence of length one for the batch-first GRU.
        embedded = self.dropout(self.embedding(src.unsqueeze(1)))

        out, hidden = self.rnn(embedded, hidden)
        # Remove only the length-1 time axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single example.
        out = out.squeeze(1)

        # Attention weights -> context vector as a weighted sum of encoder states.
        a = self.attn(out, queries).unsqueeze(1)
        weighted = torch.bmm(a, queries).squeeze(1)

        out = self.out_dense(torch.cat([out, weighted], dim=1))

        return out, hidden

class Seq2Seq(nn.Module):
    """Encoder plus step-wise decoder with optional teacher forcing.

    Args:
        encoder: module called with ``(input_ids, token_type_ids=..., attention_mask=...)``;
            element 0 of its result is used as the encoder states.
        decoder: module exposing ``out_dims``, ``n_layers`` and ``n_h_dec`` and
            called as ``decoder(tokens, encoder_states, hidden)``.
        encoder_trained: when False the encoder runs under ``torch.no_grad()``.
    """

    def __init__(self, encoder, decoder, encoder_trained):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_trained = encoder_trained

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps and return the per-step scores.

        Args:
            src: three tensors, unpacked as (input_ids, token_type_ids, attention_mask).
            trg: target token ids, shape ``(batch, trg_len)``; column 0 seeds the decoder.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the true token instead of the argmax.

        Returns:
            Tensor ``(batch, trg_len, out_dims)``; position 0 stays all zeros.
        """
        # NOTE(review): BertDataset.__getitem__ yields
        # (input_ids, attention_mask, token_type_ids), so with that dataset the
        # attention mask and the token type ids arrive here swapped. Left
        # unchanged because a saved checkpoint may have been trained with this
        # ordering — confirm against the training code before correcting it.
        input_ids, token_ids, attn_mask = src

        if self.encoder_trained:
            bert_hs = self.encoder(input_ids, token_type_ids=token_ids, attention_mask=attn_mask)
        else:
            with torch.no_grad():
                bert_hs = self.encoder(input_ids, token_type_ids=token_ids, attention_mask=attn_mask)

        bert_encodings = bert_hs[0]

        batch_size, max_len = trg.shape[:2]
        trg_vocab_size = self.decoder.out_dims

        # Allocate directly on the targets' device rather than on CPU followed
        # by a copy to a notebook-level global.
        outputs = torch.zeros(batch_size, max_len, trg_vocab_size, device=trg.device)
        hidden = torch.zeros(self.decoder.n_layers, batch_size, self.decoder.n_h_dec, device=trg.device)

        # The first target column is the decoder's initial input.
        out = trg[:, 0]

        for t in range(1, max_len):
            out, hidden = self.decoder(out, bert_encodings, hidden)
            outputs[:, t] = out
            teacher_force = random.random() < teacher_forcing_ratio
            out = (trg[:, t] if teacher_force else out.max(1)[1])

        return outputs

In [22]:
# SECURITY: torch.load unpickles arbitrary Python objects — only load checkpoints you trust.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
model = torch.load(model_save_dir / 'best_model.pt', map_location=device)

model.to(device);

# Per-token losses (no reduction) so eval() can sum them per sentence;
# targets equal to 0 are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='none')

dev_loss, bleu_score_i = eval(model, dev_loader, criterion)
print('Dev loss: {:.3f}, BLEU: {:.4f}'.format(dev_loss, bleu_score_i))