In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install rouge_score bert_score sacrebleu




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Workaround cell for certificate errors when fetching remote resources.
# NOTE(review): setting CURL_CA_BUNDLE to an empty string presumably turns off
# TLS certificate verification for the HTTP client used by the downloads below
# — confirm this is acceptable on the network this runs on, and remove it otherwise.
import os
os.environ['CURL_CA_BUNDLE'] = ''

import urllib3
# Silence the InsecureRequestWarning category that urllib3 would otherwise emit.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_polynomial_decay_schedule_with_warmup
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import random
from tqdm import tqdm
import sys
import math
import pickle
# from google.colab import files
import evaluate



In [5]:
dataset_name = 'empathetic_dialogues' #'daily_dialog' #

In [6]:
# Load the raw train/validation/test splits for the corpus chosen in `dataset_name`.
# daily_dialog exposes each dialogue as a list of utterances under the 'dialog'
# column; empathetic_dialogues is kept as whole splits and grouped into
# dialogues later by load_empathetic.
if dataset_name == 'daily_dialog':
    print('Loading ', dataset_name)
    dataset = load_dataset('daily_dialog')
    train_dialogues = dataset['train']['dialog']
    valid_dialogues = dataset['validation']['dialog']
    test_dialogues = dataset['test']['dialog']
elif dataset_name == 'empathetic_dialogues':
    print('Loading ', dataset_name)
    dataset = load_dataset('empathetic_dialogues')
    train_dialogues = dataset['train']
    valid_dialogues = dataset['validation']
    test_dialogues = dataset['test']
else:
    # Fail here instead of letting later cells hit a NameError on train_dialogues.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; "
        "expected 'daily_dialog' or 'empathetic_dialogues'"
    )

Loading  empathetic_dialogues


Found cached dataset empathetic_dialogues (C:/Users/aramosvela/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf)
100%|██████████| 3/3 [00:00<00:00, 58.64it/s]


In [7]:
space = 'Ġ'
pre_quote = '’'
end_marks = ['.', ',', '?', '!', '...']
quotes = ['"', '\'']
abbreviations = ['s', 'd', 't', 'm', 're', 'll', 've', 'S', 'D', 'T', 'M', 'Re', 'Ll', 'Ve']

# For empathetic dialogues
exclude_symbol = "_conv"
comma_symbol = "_comma_"

def process_token_list(token_list):
    """Normalise spacing and capitalisation of a tokenised utterance.

    Tokens are expected to mark a preceding space with the `space` prefix
    character. The function:
      * capitalises the first token and every token that follows an end mark,
      * removes the space prefix in front of end marks, abbreviations
        (e.g. the "m" of "I'm") and apostrophes that introduce an abbreviation,
      * attaches opening quotes to the following token and closing quotes to
        the preceding one,
      * drops empty / bare-space tokens and appends "." when the utterance does
        not already end with an end mark.

    Args:
        token_list: list of token strings. The list is not modified.

    Returns:
        A new list of tokens; an empty list when nothing remains after cleaning.
    """
    # Work on a copy so the caller's list is left untouched.
    token_list = list(token_list)
    if not token_list:
        return []

    token_list[0] = token_list[0].capitalize()

    quote_count = 0
    # NOTE: `token` is the value at position i when the iteration reaches it;
    # the checks below deliberately keep using it even after token_list[i]
    # has been rewritten.
    for i, token in enumerate(token_list):
        has_next = i < len(token_list) - 1

        if space in token:
            if token[1:] in end_marks or token[1:] in abbreviations:
                token_list[i] = token[1:]

            if token[1:] == quotes[1] and has_next:
                following = token_list[i+1]
                if following in abbreviations or (following.startswith(space) and following[1:] in abbreviations):
                    token_list[i] = token[1:]

        # startswith() instead of token[0]: a previous step may have reduced
        # this token to an empty string (bare space token after an opening quote).
        if token.startswith(space) and token[1:] in quotes:
            if quote_count % 2 == 1:
                # Closing quote: glue it to the preceding token.
                token_list[i] = token[1:]
                quote_count = 0
            else:
                # Opening quote: glue the following token to it.
                if has_next and token_list[i+1].startswith(space):
                    token_list[i+1] = token_list[i+1][1:]
                quote_count += 1

        if token in end_marks or token[1:] in end_marks:
            if has_next:
                if not token_list[i+1].startswith(space):
                    token_list[i+1] = space + token_list[i+1].capitalize()
                else:
                    token_list[i+1] = space + token_list[i+1][1:].capitalize()

    new_token_list = [token for token in token_list if token != space and len(token)>0]
    # Guard against utterances that consisted only of whitespace tokens.
    if new_token_list and new_token_list[-1] not in end_marks:
        new_token_list.append(end_marks[0])

    return new_token_list

In [8]:
def load_empathetic(dataset, tokenizer):
    """Group the rows of an empathetic_dialogues split into cleaned dialogues.

    Args:
        dataset: a split exposing the parallel columns 'utterance', 'conv_id'
            and 'speaker_idx'.
        tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_string`.

    Returns:
        (dialogues, utter_num): `dialogues` is a list of dialogues (each a list
        of utterance strings, in first-seen conversation order) and `utter_num`
        is the total number of utterances across all dialogues.
    """
    total_utters = dataset['utterance']
    total_conv_ids = dataset['conv_id']
    total_speaker_ids = dataset['speaker_idx']

    assert len(total_utters) == len(total_conv_ids) == len(total_speaker_ids)

    conv_dict = {}
    cur_speaker_idx = -1
    rows = zip(total_utters, total_conv_ids, total_speaker_ids)
    for utter, conv_id, speaker_idx in tqdm(rows, total=len(total_utters)):
        # Skip excluded rows before tokenising: the cleaned text would be
        # thrown away anyway.
        if exclude_symbol in utter:
            continue

        utter_modified = utter.strip().replace(comma_symbol, ',')
        new_token_list = process_token_list(tokenizer.tokenize(utter_modified))
        text = tokenizer.convert_tokens_to_string(new_token_list)

        if conv_id not in conv_dict:
            conv_dict[conv_id] = []
            cur_speaker_idx = -1  # new conversation: forget the previous speaker

        if cur_speaker_idx != speaker_idx:
            conv_dict[conv_id].append(text)
            cur_speaker_idx = speaker_idx
        else:
            # Consecutive rows from the same speaker are merged into one turn.
            conv_dict[conv_id][-1] += f" {text}"

    dialogues = list(conv_dict.values())
    utter_num = sum(len(utter_list) for utter_list in dialogues)

    return dialogues, utter_num

def load_daily(dataset, tokenizer):
    """Clean every utterance of a daily_dialog split.

    Args:
        dataset: iterable of dialogues, each an iterable of utterance strings.
            It is not modified.
        tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_string`.

    Returns:
        (dialogues, utter_num): a new list of cleaned dialogues and the total
        number of utterances they contain.
    """
    # Build a fresh list instead of assigning into `dataset`: the input may not
    # support item assignment, and the caller's raw data should stay intact.
    dialogues = []
    for dialogue in tqdm(dataset):
        new_dialogue = []
        for utter in dialogue:
            token_list = tokenizer.tokenize(utter.strip().replace(pre_quote, quotes[1]))
            token_list = process_token_list(token_list)
            new_dialogue.append(tokenizer.convert_tokens_to_string(token_list))
        dialogues.append(new_dialogue)

    utter_num = sum(len(dialogue) for dialogue in dialogues)

    return dialogues, utter_num

In [9]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [10]:
# Clean all three splits with the loader that matches the selected corpus.
# An unrecognised dataset_name leaves the splits untouched.
dialogue_loaders = {
    'daily_dialog': load_daily,
    'empathetic_dialogues': load_empathetic,
}
load_split = dialogue_loaders.get(dataset_name)
if load_split is not None:
    train_dialogues, num_train = load_split(train_dialogues, tokenizer)
    valid_dialogues, num_valid = load_split(valid_dialogues, tokenizer)
    test_dialogues, num_test = load_split(test_dialogues, tokenizer)

100%|██████████| 76673/76673 [00:08<00:00, 8533.63it/s] 
100%|██████████| 12030/12030 [00:01<00:00, 9221.09it/s]
100%|██████████| 10943/10943 [00:01<00:00, 9059.97it/s]


In [48]:
print(f"The number of train dialogues: {len(train_dialogues)}")
print(f"The number of valid dialogues: {len(valid_dialogues)}")
print(f"The number of test dialogues: {len(test_dialogues)}")

print(f"The number of train utterances: {num_train}")
print(f"The number of valid utterances: {num_valid}")
print(f"The number of test utterances: {num_test}")

The number of train dialogues: 11118
The number of valid dialogues: 1000
The number of test dialogues: 1000
The number of train utterances: 87170
The number of valid utterances: 8069
The number of test utterances: 7740


In [10]:
# Speaker-role markers prepended to utterances when model inputs are built.
sp1_token = '<sp1>'
sp2_token = '<sp2>'
# bos_token = '<bos>'
# eos_token = '<eos>'
# Token budget checked in preprocess_dialog and passed to generate() as max_length.
max_len = 1024
# Value handed to fix_seed.
seed = 0
# CUDA device index used when a GPU is available.
gpu = 0

#Tokeniser
special_tokens = {#'bos_token': bos_token,
                'additional_special_tokens': [sp1_token, sp2_token]}

# eos_token = tokenizer.eos_token
# Register the speaker markers with the tokenizer; the model's embedding matrix
# is resized to vocab_size after the model is created.
num_new_tokens = tokenizer.add_special_tokens(special_tokens)

vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
# bos_id = vocab[bos_token]
# eos_id = vocab[eos_token]
sp1_id = vocab[sp1_token]
sp2_id = vocab[sp2_token]

# Optimisation settings.
lr = 2e-5
batch_size = 8
# NOTE(review): num_workers is not passed to any DataLoader in the visible cells.
num_workers = 0
num_epochs = 6
# Fraction of the total training steps used for learning-rate warmup.
warmup_ratio = 0.1
# NOTE(review): train() defines its own local last_epoch, so this value is not read there.
last_epoch = 0
# NOTE(review): end_command is not referenced in the visible cells.
end_command = 'Quit!'
# Nucleus-sampling threshold passed to model.generate in infer().
top_p = 0.8


In [11]:
def fix_seed(seed):
    """Seed NumPy, PyTorch (CPU and all CUDA devices) and `random` with `seed`."""
    seeders = (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

fix_seed(seed)

In [12]:
# Directory that train() writes checkpoints into.
# os.makedirs(..., exist_ok=True) is used instead of a shell `mkdir` so the cell
# can be re-run without an error and behaves the same on every platform.
ckpt_dir = 'saved_models'
os.makedirs(ckpt_dir, exist_ok=True)

In [13]:
def preprocess_dialog(dialog, window_size=5):
    """Turn one dialogue into (context, response) training instances.

    A window of `window_size` consecutive utterances is slid over the dialogue
    in steps of 2. Utterances inside the window are tagged alternately with
    `sp1_token` / `sp2_token` and joined into the encoder input; the utterance
    right after the window, prefixed with the tag of the speaker whose turn it
    is, becomes the label sequence.

    Args:
        dialog: list of utterance strings.
        window_size: number of context utterances per instance.

    Returns:
        List of dicts with 1-D tensors under "input_ids", "attention_mask" and
        "labels". Instances whose context and response together exceed
        `max_len` tokens (not counting the response's bos/eos) are dropped.
        Dialogues with at most `window_size` utterances yield an empty list.
    """
    instances = []

    # The window always starts on an sp1 turn, so the turn after it belongs to
    # sp1 for an even window and to sp2 for an odd one (sp2 for the default 5).
    response_speaker = sp1_token if window_size % 2 == 0 else sp2_token

    for i in range(0, len(dialog) - window_size, 2):
        window = dialog[i:i+window_size]
        window_context = ' '.join(
            (sp1_token if j % 2 == 0 else sp2_token) + " " + utterance
            for j, utterance in enumerate(window)
        )
        response = response_speaker + dialog[i+window_size]

        input_ids = tokenizer.encode_plus(window_context, add_special_tokens=True, return_tensors="pt")
        decoder_input_ids = tokenizer.encode_plus(response, add_special_tokens=True, return_tensors="pt")

        # With return_tensors="pt" the ids have a leading batch dimension of 1
        # (removed by squeeze(0) below), so len() would always be 1; the token
        # count is the size of the last dimension.
        context_len = input_ids['input_ids'].size(-1)
        response_len = decoder_input_ids['input_ids'].size(-1)

        if (context_len + response_len - 2) <= max_len: # 2 to ignore eos and bos tokens of decoder
            instances.append({
                "input_ids": input_ids["input_ids"].squeeze(0),
                "attention_mask": input_ids["attention_mask"].squeeze(0),
                "labels": decoder_input_ids["input_ids"].squeeze(0)
            })

    return instances

In [14]:
train_dialogues[0]

['I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, We felt like the only people in the world.',
 'Was this a friend you were in love with, Or just a best friend?',
 'This was a best friend. I miss her.',
 'Where has she gone?',
 'We no longer talk.',
 'Oh was this something that happened because of an argument?']

In [15]:
# Flatten the per-dialogue instance lists of each split into one list per split.
train_instances = [
    instance
    for dialog in tqdm(train_dialogues)
    for instance in preprocess_dialog(dialog)
]

val_instances = [
    instance
    for dialog in tqdm(valid_dialogues)
    for instance in preprocess_dialog(dialog)
]

#dummy
test_instances = [
    instance
    for dialog in tqdm(test_dialogues)
    for instance in preprocess_dialog(dialog)
]

100%|██████████| 17793/17793 [00:01<00:00, 13841.30it/s]
100%|██████████| 2759/2759 [00:00<00:00, 12713.88it/s]
100%|██████████| 2540/2540 [00:00<00:00, 12055.60it/s]


In [17]:
train_instances[0]['input_ids']

tensor([    0, 50265,   100,  2145,   164,     7,   192,     5, 10756,    19,
          127,   275,  1441,     4,    85,    21,     5,    78,    86,    52,
          655,  1240,    86,  1937,   561,     4,  2223,    89,    21,    10,
          319,     9,    82,     6,   166,  1299,   101,     5,   129,    82,
           11,     5,   232,     4, 50266, 32112,    42,    10,  1441,    47,
           58,    11,   657,    19,     6,  1793,    95,    10,   275,  1441,
          116, 50265,   713,    21,    10,   275,  1441,     4,    38,  2649,
           69,     4, 50266, 13841,    34,    79,  1613,   116, 50265,   170,
          117,  1181,  1067,     4,     2])

In [18]:
train_instances[0]['attention_mask']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [19]:
# train_instances[0]['decoder_input_ids']

In [20]:
train_instances[0]['labels']

tensor([    0, 50266,  7516,    21,    42,   402,    14,  1102,   142,     9,
           41,  4795,   116,     2])

In [21]:
class DialogueDataset(Dataset):
    """Thin Dataset wrapper around a pre-built sequence of instances."""

    def __init__(self, instances):
        """Store `instances`; items are returned exactly as stored."""
        self.instances = instances

    def __len__(self):
        """Return the number of stored instances."""
        return len(self.instances)

    def __getitem__(self, idx):
        """Return the instance at position `idx`."""
        return self.instances[idx]

class PadCollate():
    """Collate helper that right-pads variable-length examples into a batch."""

    def __init__(self, pad_id):
        # Fill value used for the padded positions of "input_ids".
        self.pad_id = pad_id

    def pad_collate(self, batch):
        """Pad every field of `batch` to the longest sequence in that field.

        Args:
            batch: list of examples, each holding sequences under
                "input_ids", "attention_mask" and "labels".

        Returns:
            Dict with the same three keys mapped to (batch, max_len) LongTensors.
            "input_ids" is padded with `pad_id`, "attention_mask" with 0 and
            "labels" with -100.
        """
        fill_values = {
            "input_ids": self.pad_id,
            "attention_mask": 0,
            "labels": -100,
        }

        padded = {}
        for field, fill in fill_values.items():
            sequences = [torch.LongTensor(example[field]) for example in batch]
            padded[field] = torch.nn.utils.rnn.pad_sequence(
                sequences, batch_first=True, padding_value=fill
            )
        return padded


In [22]:
#Create data

# Collate helper that pads input_ids with the tokenizer's <pad> id.
ppd = PadCollate(pad_id=vocab['<pad>'])


# Wrap the pre-tokenised instances of each split.
train_dataset = DialogueDataset(train_instances)
val_dataset = DialogueDataset(val_instances)
test_dataset =  DialogueDataset(test_instances)

# Only the training loader shuffles; validation and test keep instance order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=ppd.pad_collate)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=ppd.pad_collate)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=ppd.pad_collate)


In [23]:
print(train_dataset.__getitem__(0)['input_ids'].shape)
print(train_dataset.__getitem__(0)['attention_mask'].shape)

# print(train_dataset.__getitem__(0)['decoder_input_ids'].shape)
# print(train_dataset.__getitem__(0)['decoder_attention_mask'].shape)


torch.Size([85])
torch.Size([85])


In [24]:
print(train_dataset.__getitem__(1)['input_ids'].shape)
print(train_dataset.__getitem__(1)['attention_mask'].shape)

# print(train_dataset.__getitem__(1)['decoder_input_ids'].shape)
# print(train_dataset.__getitem__(1)['decoder_attention_mask'].shape)

torch.Size([68])
torch.Size([68])


In [25]:
# Use the configured CUDA device when one is available, otherwise the CPU.
device = torch.device(f"cuda:{gpu}") if torch.cuda.is_available() else torch.device("cpu")
print('Using GPU' if device.type == 'cuda' else 'Using CPU')

Using GPU


In [26]:
#Define the BART model and optimizer
# Load the pretrained facebook/bart-base weights onto the selected device.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)
# vocab_size was measured after the speaker tokens were added to the tokenizer,
# so the embedding matrix is resized to match it.
model.resize_token_embeddings(vocab_size)

Embedding(50267, 768)

In [27]:
#Load from checkpoint
# ckpt = torch.load("/content/saved_models/best_ckpt_epoch=5_valid_loss=2.1621.ckpt", map_location=device)
# model.load_state_dict(ckpt['model_state_dict'])

In [28]:
print("Loading the optimizer...")
optim = torch.optim.AdamW(model.parameters(), lr=lr)


Loading the optimizer...


In [29]:
# Calculate total training steps
# One optimiser/scheduler step is taken per batch in train(), so the schedule
# spans num_epochs * (batches per epoch) steps.
num_batches = len(train_dataloader)
total_train_steps = num_epochs * num_batches
warmup_steps = int(warmup_ratio * total_train_steps)

# Warmup for the first warmup_steps steps, then polynomial decay with power 2.
sched = get_polynomial_decay_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_train_steps,
    power=2
)

# TensorBoard writer; no log directory is given, so the library default is used.
writer = SummaryWriter()


In [30]:
def validation():
    """Evaluate the global `model` on `val_dataloader`.

    Returns:
        (valid_loss, valid_ppl): the mean of the per-batch losses and the mean
        of the per-batch perplexities exp(loss). Infinite batch perplexities
        and a NaN mean perplexity are both replaced by 1e+8.
    """
    print("Validation processing...")
    model.eval()

    batch_losses = []
    batch_ppls = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )

            batch_loss = outputs.loss.detach()
            batch_losses.append(batch_loss)
            batch_ppls.append(torch.exp(batch_loss))

        # Convert to Python floats only after the loop has finished.
        loss_values = [t.item() for t in batch_losses]
        ppl_values = [1e+8 if math.isinf(t.item()) else t.item() for t in batch_ppls]

        valid_loss = np.mean(loss_values)
        valid_ppl = np.mean(ppl_values)
        if math.isnan(valid_ppl):
            valid_ppl = 1e+8

    return valid_loss, valid_ppl

In [31]:
def train():
    """Fine-tune the global `model` for `num_epochs` epochs.

    Each epoch runs one pass over `train_dataloader` (one optimiser and one
    scheduler step per batch), logs the mean batch loss and mean batch
    perplexity to the TensorBoard `writer`, runs `validation()`, and saves a
    checkpoint into `ckpt_dir` whenever the validation loss improves.
    Epochs are numbered from 1.
    """
    print('Number of epochs: ', num_epochs)
    fix_seed(seed)  # Fix seed before training
    print("Training starts.")

    best_loss = sys.float_info.max
    banner = "#"*50

    for epoch in range(1, num_epochs + 1):
        model.train()
        print(banner + f"Epoch: {epoch}" + banner)

        loss_history = []
        ppl_history = []

        for batch in tqdm(train_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optim.zero_grad()
            loss = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            ).loss
            loss.backward()
            optim.step()
            sched.step()

            step_loss = loss.detach()
            loss_history.append(step_loss)
            ppl_history.append(torch.exp(step_loss))

        # Convert to Python floats once per epoch; infinite perplexities are capped.
        train_loss = np.mean([t.item() for t in loss_history])
        train_ppl = np.mean([1e+8 if math.isinf(t.item()) else t.item() for t in ppl_history])
        print(f"Train loss: {train_loss} || Train perplexity: {train_ppl}")

        writer.add_scalar("Loss/train", train_loss, epoch)
        writer.add_scalar("PPL/train", train_ppl, epoch)

        valid_loss, valid_ppl = validation()

        if valid_loss < best_loss:
            best_loss = valid_loss
            ckpt_path = f"{ckpt_dir}/best_ckpt_epoch={epoch}_valid_loss={round(best_loss, 4)}.ckpt"
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optim_state_dict': optim.state_dict(),
                'sched_state_dict': sched.state_dict(),
                'loss': best_loss,
                'epoch': epoch
            }
            torch.save(checkpoint, ckpt_path)
            print("*"*10 + "Current best checkpoint is saved." + "*"*10)
            print(ckpt_path)

        print(f"Best valid loss: {best_loss}")
        print(f"Valid loss: {valid_loss} || Valid perplexity: {valid_ppl}")

        writer.add_scalar("Loss/valid", valid_loss, epoch)
        writer.add_scalar("PPL/valid", valid_ppl, epoch)

        writer.add_scalars("Losses", {'train': train_loss, 'valid': valid_loss}, epoch)
        writer.add_scalars("PPLs", {'train': train_ppl, 'valid': valid_ppl}, epoch)

    print("Training finished!")

In [32]:
train()

Number of epochs:  6
Training starts.
##################################################Epoch: 1##################################################


100%|██████████| 140/140 [00:30<00:00,  4.54it/s]


Train loss: 3.6560330969946726 || Train perplexity: 48.30674480029515
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 16.75it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=1_valid_loss=2.7374.ckpt
Best valid loss: 2.7374249839782716
Valid loss: 2.7374249839782716 || Valid perplexity: 16.16849609375
##################################################Epoch: 2##################################################


100%|██████████| 140/140 [00:31<00:00,  4.49it/s]


Train loss: 2.9316333379064288 || Train perplexity: 19.51716752052307
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 17.68it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=2_valid_loss=2.7122.ckpt
Best valid loss: 2.712186040878296
Valid loss: 2.712186040878296 || Valid perplexity: 15.737069683074951
##################################################Epoch: 3##################################################


100%|██████████| 140/140 [00:30<00:00,  4.54it/s]


Train loss: 2.749595161846706 || Train perplexity: 16.336497116088868
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 17.01it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=3_valid_loss=2.7066.ckpt
Best valid loss: 2.706581687927246
Valid loss: 2.706581687927246 || Valid perplexity: 15.684145278930664
##################################################Epoch: 4##################################################


100%|██████████| 140/140 [00:31<00:00,  4.51it/s]


Train loss: 2.6402110712868825 || Train perplexity: 14.527891867501395
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 17.36it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=4_valid_loss=2.6919.ckpt
Best valid loss: 2.691890058517456
Valid loss: 2.691890058517456 || Valid perplexity: 15.470690650939941
##################################################Epoch: 5##################################################


100%|██████████| 140/140 [00:31<00:00,  4.50it/s]


Train loss: 2.5806405544281006 || Train perplexity: 13.635267148699079
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 17.21it/s]


Best valid loss: 2.691890058517456
Valid loss: 2.6968470096588133 || Valid perplexity: 15.550805473327637
##################################################Epoch: 6##################################################


100%|██████████| 140/140 [00:30<00:00,  4.56it/s]


Train loss: 2.5696304883275713 || Train perplexity: 13.566252786772592
Validation processing...


100%|██████████| 25/25 [00:01<00:00, 17.64it/s]

Best valid loss: 2.691890058517456
Valid loss: 2.6971686553955077 || Valid perplexity: 15.557815475463867
Training finished!





In [33]:
def infer(window_size=5, max_input_len=512):
    """Generate a response for every context window of the test dialogues.

    Windows are built exactly as in training: `window_size` consecutive
    utterances, tagged alternately with `sp1_token` / `sp2_token`, taken in
    steps of 2. The utterance following each window is the reference response.

    Args:
        window_size: number of context utterances per window.
        max_input_len: contexts longer than this many tokens are truncated.

    Returns:
        (generated_responses, actual_responses): two equally long lists of
        strings, aligned by position.
    """
    model.eval()
    fix_seed(seed)  # sampling is stochastic; reseed so a re-run starts from the same state

    generated_responses = []
    actual_responses = []

    with torch.no_grad():
        for dialog in tqdm(test_dialogues):
            for i in range(0, len(dialog) - window_size, 2): #In steps of 2
                window = dialog[i:i+window_size]
                window_context = ' '.join(
                    (sp1_token if j % 2 == 0 else sp2_token) + " " + utterance
                    for j, utterance in enumerate(window)
                )

                # One example per forward pass, so no padding is needed;
                # padding to a fixed length only added masked-out positions.
                encodings = tokenizer.encode_plus(
                    window_context,
                    add_special_tokens=True,
                    max_length=max_input_len,
                    truncation=True,
                    return_tensors="pt",
                )
                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)

                output_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=max_len,
                    do_sample=True,
                    top_p=top_p,
                ).squeeze(0)

                response = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

                generated_responses.append(response)
                actual_responses.append(dialog[i+window_size])

    return generated_responses, actual_responses

In [34]:
generated_responses, actual_responses = infer()

100%|██████████| 2540/2540 [00:41<00:00, 61.58it/s]


In [35]:
assert len(generated_responses) == len(actual_responses)
print(len(generated_responses))
print(len(actual_responses))

174
174


In [36]:


selected_model = 'bart'

# Output file names encode the model, the number of epochs and the dataset.
file_generated = f"{selected_model}_epochs_{num_epochs}_generated_responses_{dataset_name}"
file_actual = f"{selected_model}_epochs_{num_epochs}_actual_responses_{dataset_name}"

with open(file_generated, "wb") as fp:
    pickle.dump(generated_responses, fp)

with open(file_actual, "wb") as fp:
    pickle.dump(actual_responses, fp)

# The browser download helper only exists on Google Colab; elsewhere the files
# simply stay in the working directory.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {file_generated} and {file_actual} locally (not running on Colab).")
else:
    files.download(file_generated)
    files.download(file_actual)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:


sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
chrf = evaluate.load("chrf")

In [38]:
# The metrics expect references as a list of lists of str (one list per prediction).
# Only bare strings are wrapped, so re-running this cell does not nest them again.
actual_responses = [res if isinstance(res, list) else [res] for res in actual_responses]

print(generated_responses[:5])
print(actual_responses[:5])

["I'm sorry you had to go through that.", "That's great! I hope you find a new job soon!", "I'm glad you enjoyed it!", "That's a shame. You should learn from your mistakes.", "I haven't heard of that one yet."]
[["Wow, So your going to take being a bad person to the grave. Maybe you'll see her in the next life?"], ["Well I've been in the business all my life and have worked for some great people. So I pull from what I learned from them."], ['Oh, Wow. Not only to be able to do all the running but to view the scenery!'], ['I know.'], ['Sounds interesting!']]


In [39]:
# Corpus-level overlap metrics.
bleu_score = sacrebleu.compute(predictions=generated_responses, references=actual_responses)
rouge_score = rouge.compute(predictions=generated_responses, references=actual_responses)
chrf_score = chrf.compute(predictions=generated_responses, references=actual_responses)

# BERTScore returns one precision/recall/f1 value per prediction; average each list.
bert_score = bertscore.compute(predictions=generated_responses, references=actual_responses, lang='en')
precision, recall, f1 = (bert_score[key] for key in ('precision', 'recall', 'f1'))
avg_precision_bert = sum(precision) / len(precision)
avg_recall_bert = sum(recall) / len(recall)
avg_f1_bert = sum(f1) / len(f1)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
print('Bleu score: \n', bleu_score) #Range from 0 to 100
print('Rouge score: \n', rouge_score)
print('Bert score: \n', bert_score)
print('Avg precision Bert score: ', avg_precision_bert)
print('Avg recall Bert score: ', avg_recall_bert)
print('Avg f1 Bert score: ', avg_f1_bert)
print('chrf score: \n', chrf_score)


Bleu score: 
 {'score': 0.7777648426184913, 'counts': [303, 20, 5, 3], 'totals': [1552, 1378, 1204, 1030], 'precisions': [19.52319587628866, 1.4513788098693758, 0.4152823920265781, 0.2912621359223301], 'bp': 0.5716214253476395, 'sys_len': 1552, 'ref_len': 2420}
Rouge score: 
 {'rouge1': 0.12538659964780177, 'rouge2': 0.012817914276200135, 'rougeL': 0.11483889014572904, 'rougeLsum': 0.11476143910821986}
Bert score: 
 {'precision': [0.8749120235443115, 0.8499920964241028, 0.8318235874176025, 0.8739137649536133, 0.8369052410125732, 0.8467947244644165, 0.8660605549812317, 0.9032583832740784, 0.8504906892776489, 0.8751018047332764, 0.8603325486183167, 0.875321626663208, 0.8782193660736084, 0.8926254510879517, 0.8968781232833862, 0.8962040543556213, 0.8577842712402344, 0.906535804271698, 0.883204996585846, 0.8991708159446716, 0.8784940242767334, 0.9107457995414734, 0.8642939329147339, 0.8715198636054993, 0.8566023111343384, 0.8650470972061157, 0.8715588450431824, 0.8851257562637329, 0.858521

In [43]:
# Play a short sound when the run finishes. This relies on the Colab front end,
# so it is skipped (with a message) everywhere else instead of raising.
try:
    from google.colab import output
except ImportError:
    print("Run finished (audio notification is only available on Colab).")
else:
    output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [42]:
# def generate_response(context):
#     model.eval()
#     input_ids = tokenizer.encode(context, padding='max_length', max_length=512, truncation=True , return_tensors="pt").to(device)
#     input_ids = input_ids.to(device)
#     output_ids = model.generate(input_ids=input_ids, max_length=512, do_sample=True, top_p=top_p)
#     response = tokenizer.decode(output_ids.squeeze(), skip_special_tokens=True)

#     return response

# context = "<sp1> How are you? <sp2> I'm doing well. How about you? <sp1> I am good too. Thanks for asking. <sp2>"
# context = "<sp1> What date is today?"

# response = generate_response(context)
# print("Generated Response:", response)

### Qualitative evaluation

In [11]:
dataset_name

'empathetic_dialogues'

In [28]:
#Load generated responses from gpt2, dialogpt and bart

models = ['gpt2', 'dialoGPT-small', 'bart']

responses = {}

for m in models:
    # os.path.join keeps the same Results/SecondExperiment/<model>/<dataset>/ layout
    # but works on every platform, not only with Windows backslashes.
    file_name = os.path.join(
        'Results', 'SecondExperiment', m, dataset_name,
        m + '_epochs_6_generated_responses_' + dataset_name,
    )
    # file_name = m + '_epochs_6_generated_responses_' + dataset_name
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # your own runs of the training notebooks.
    with open(file_name, "rb") as fp:
        responses[m] = pickle.load(fp)

assert len(responses['bart']) == len(responses['gpt2']) == len(responses['dialoGPT-small'])
print(len(responses['bart']))

174


In [29]:
#Go through testing dialogues to check context
def context(window_size=5):
    """Collect every context window of the test dialogues with its reference reply.

    Windows of `window_size` consecutive utterances are taken in steps of 2;
    the utterance right after each window is its reference response.

    Returns:
        (windows, targets): a list of utterance lists and the aligned list of
        reference responses.
    """
    windows = []
    targets = []

    for dialog in test_dialogues:
        for start in range(0, len(dialog) - window_size, 2): #In steps of 2
            stop = start + window_size
            windows.append(dialog[start:stop])
            targets.append(dialog[stop])

    return windows, targets

In [30]:
contexts, actual_responses = context()

assert len(contexts) == len(actual_responses)
print(len(contexts))

174


In [31]:
len(contexts)

174

In [32]:
# Draw n_samples distinct positions and pick the same positions from every
# model's responses, the reference responses and the contexts.
# NOTE(review): np.random.choice is not seeded in this cell, so the selected
# examples depend on the global NumPy random state at the time it runs.
n_samples = 30
idx = np.random.choice(len(contexts), size=n_samples, replace=False)

gpt2_responses = [responses['gpt2'][i] for i in idx]
dialogpt_responses = [responses['dialoGPT-small'][i] for i in idx]
bart_responses = [responses['bart'][i] for i in idx]
# NOTE(review): the two lines below rebind actual_responses and contexts to the
# sampled subset, so re-running this cell without re-running context() first
# samples from the already reduced lists while `responses` is still full-size.
actual_responses = [actual_responses[i] for i in idx]
contexts = [contexts[i] for i in idx]




In [33]:
#Load responses to double check that all models have the same actual responses
for m in models:
    # Platform-independent equivalent of Results\SecondExperiment\<model>\<dataset>\...
    file_name = os.path.join(
        'Results', 'SecondExperiment', m, dataset_name,
        m + '_epochs_6_actual_responses_' + dataset_name,
    )
    # NOTE: pickle.load can execute arbitrary code; only load files produced by
    # your own runs of the training notebooks.
    with open(file_name, "rb") as fp:
        responses[m + '_res'] = pickle.load(fp)

gpt2_actual_res = [responses['gpt2_res'][i] for i in idx]
dialogpt_actual_res = [responses['dialoGPT-small_res'][i] for i in idx]
# Named *_actual_res like its siblings so the loop variable below does not
# shadow the list it is iterating over.
bart_actual_res = [responses['bart_res'][i] for i in idx]

# zip() stops at the shortest input, so check the lengths explicitly first.
assert len(gpt2_actual_res) == len(dialogpt_actual_res) == len(bart_actual_res) == len(actual_responses)

for gpt2_res, dialogpt_res, bart_res, actual_res in zip(gpt2_actual_res, dialogpt_actual_res, bart_actual_res, actual_responses):
    assert gpt2_res == dialogpt_res == bart_res == actual_res


In [34]:
for gpt_res, dialogpt_res, bart_res, actual_res, ctx in zip(gpt2_responses, dialogpt_responses, bart_responses, actual_responses, contexts):
    print()
    print('Context: ')
    for c in ctx:
        print(c)
    print()
    print('Actual res: ', actual_res)
    print('Gpt2 res: ', gpt_res)
    print('dialoGPT res: ', dialogpt_res)
    print('bart res: ', bart_res)


Context: 
I found out yesterday at work that I'm one of the top performers for my district! I figured I was doing good, I didn't know I was doing *that* well!
That's great.
Usually at the end of the day it's never anything nice when my supervisor wants to talk to me but she broke the news and I was surprised! Been walking on sunshine all weekend!
Congrats. I want a promotion at my job.
How long have you been there?

Actual res:  Three years.
Gpt2 res:  I hope you're ok!
dialoGPT res:  Yeah that is the best part of being an adult with a job. I hope you are still there.
bart res:  That's great! I wish you all the best!

Context: 
Yes, I am always better off with them as my parents, They want to see us succeed.
Do you find the home buying experience worth it?
Yes, It was well worth it! I love being independant. Teaches you a lot.
How long do you plan on staying in that house?
Maybe 5 more years then I am ready for something bigger. Probably when I get married.

Actual res:  Well that's a