# Research Project

## Import required libraries

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install rouge_score bert_score sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
from datasets import list_datasets, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer, GPT2LMHeadModel, get_polynomial_decay_schedule_with_warmup
from torch.nn import functional as F
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from itertools import chain
import torch
import math
import numpy as np
import random
import datasets

#For networking purposes
import os, sys
os.environ['CURL_CA_BUNDLE'] = ''

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


In [None]:
selected_model =  'dialoGPT-small' #'gpt2'
dataset_name = 'empathetic_dialogues'# 'daily_dialog'

## Tokenizer

In [None]:
if selected_model == 'dialoGPT-small':
    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
elif selected_model == 'gpt2':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
else:
    print('No tokenizer')

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
#Parameters

sp1_token = '<sp1>'
sp2_token = '<sp2>'
bos_token = '<bos>'
max_turns = 5
max_len = 1024
seed = 0
gpu = 0

#Tokeniser
special_tokens = {'bos_token': bos_token,
                'additional_special_tokens': [sp1_token, sp2_token]}

eos_token = tokenizer.eos_token

#Add pad token
pad_token = eos_token
tokenizer.add_special_tokens({'pad_token':pad_token})

num_new_tokens = tokenizer.add_special_tokens(special_tokens)


vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
bos_id = vocab[bos_token]
eos_id = vocab[eos_token]
sp1_id = vocab[sp1_token]
sp2_id = vocab[sp2_token]

lr = 2e-5
batch_size = 8
num_workers = 0
num_epochs = 6
warmup_ratio = 0.1
last_epoch = 0
end_command = 'Quit!'
top_p = 0.8


In [None]:
def fix_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)

fix_seed(seed)

In [None]:
space = 'Ġ'
pre_quote = '’'
end_marks = ['.', ',', '?', '!', '...']
quotes = ['"', '\'']
abbreviations = ['s', 'd', 't', 'm', 're', 'll', 've', 'S', 'D', 'T', 'M', 'Re', 'Ll', 'Ve']

# For empathetic dialogues
exclude_symbol = "_conv"
comma_symbol = "_comma_"

def process_token_list(token_list):
    token_list[0] = token_list[0].capitalize()

    quote_count = 0
    for i, token in enumerate(token_list):
        if space in token:
            if token[1:] in end_marks or token[1:] in abbreviations:
                token_list[i] = token[1:]

            if token[1:] == quotes[1]:
                if i<len(token_list)-1:
                    if token_list[i+1] in abbreviations or (token_list[i+1][0] == space and token_list[i+1][1:] in abbreviations):
                        token_list[i] = token[1:]

        if token[0] == space and token[1:] in quotes:
            if quote_count % 2 == 1:
                token_list[i] = token[1:]
                quote_count = 0
            else:
                if i<len(token_list)-1 and token_list[i+1][0] == space:
                    token_list[i+1] = token_list[i+1][1:]
                quote_count += 1

        if token in end_marks or token[1:] in end_marks:
            if i<len(token_list)-1:
                if token_list[i+1][0] != space:
                    token_list[i+1] = space + token_list[i+1].capitalize()
                else:
                    token_list[i+1] = space + token_list[i+1][1:].capitalize()

    new_token_list = [token for token in token_list if token != space and len(token)>0]
    if new_token_list[-1] not in end_marks:
        new_token_list.append(end_marks[0])

    return new_token_list

## Load dataset

In [None]:
if dataset_name == 'daily_dialog':
    print('Loading ', dataset_name)
    dataset = load_dataset('daily_dialog')
    train_dialogues = dataset['train']['dialog']
    valid_dialogues = dataset['validation']['dialog']
    test_dialogues = dataset['test']['dialog']
elif dataset_name == 'empathetic_dialogues':
    print('Loading ', dataset_name)
    dataset = load_dataset('empathetic_dialogues')
    train_dialogues = dataset['train']
    valid_dialogues = dataset['validation']
    test_dialogues = dataset['test']
else:
    print('No dataset selected')

Loading  empathetic_dialogues


Downloading builder script:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.15k [00:00<?, ?B/s]

Downloading and preparing dataset empathetic_dialogues/default to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf...


Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76673 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10943 [00:00<?, ? examples/s]

Dataset empathetic_dialogues downloaded and prepared to /root/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def load_empathetic(dataset, tokenizer):

    total_utters = dataset['utterance']
    total_conv_ids = dataset['conv_id']
    total_speaker_ids = dataset['speaker_idx']

    assert len(total_utters) == len(total_conv_ids) and len(total_conv_ids) == len(total_speaker_ids)

    num = 0

    conv_dict = {}
    cur_speaker_idx = -1
    for i, utter in enumerate(tqdm(total_utters)):
        conv_id = total_conv_ids[i]
        speaker_idx = total_speaker_ids[i]

        utter_modified = utter.strip().replace(comma_symbol, ',')
        new_token_list = process_token_list(tokenizer.tokenize(utter_modified))
        text = tokenizer.convert_tokens_to_string(new_token_list)

        if exclude_symbol in utter:
            continue

        if conv_id not in conv_dict:
            conv_dict[conv_id] = []
            cur_speaker_idx = -1

        if cur_speaker_idx != speaker_idx:
            conv_dict[conv_id].append(text)
            cur_speaker_idx = speaker_idx
        else:
            conv_dict[conv_id][-1] += f" {text}"

    utter_num = 0
    dialogues = []

    for i, (conv_id, utter_list) in enumerate(conv_dict.items()):
        utter_num += len(utter_list)
        dialogues.append(utter_list)

    return dialogues, utter_num

def load_daily(dataset, tokenizer):

    for i, dialogue in enumerate(tqdm(dataset)):
        new_dialogue = []
        for utter in dialogue:
            token_list = tokenizer.tokenize(utter.strip().replace(pre_quote, quotes[1]))
            token_list = process_token_list(token_list)
            text = tokenizer.convert_tokens_to_string(token_list)
            new_dialogue.append(text)

        dataset[i] = new_dialogue

    utter_num = 0

    for dialogue in dataset:
        utter_num += len(dialogue)

    return dataset, utter_num

In [None]:
if dataset_name == 'daily_dialog':
    train_dialogues, num_train = load_daily(train_dialogues, tokenizer)
    valid_dialogues, num_valid = load_daily(valid_dialogues, tokenizer)
    test_dialogues, num_test = load_daily(test_dialogues, tokenizer)
elif dataset_name == 'empathetic_dialogues':
    train_dialogues, num_train = load_empathetic(train_dialogues, tokenizer)
    valid_dialogues, num_valid = load_empathetic(valid_dialogues, tokenizer)
    test_dialogues, num_test = load_empathetic(test_dialogues, tokenizer)

  2%|▏         | 1480/76673 [00:00<00:43, 1713.24it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (6234 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 76673/76673 [00:17<00:00, 4450.10it/s]
100%|██████████| 12030/12030 [00:01<00:00, 6783.14it/s]
100%|██████████| 10943/10943 [00:01<00:00, 6850.41it/s]


In [None]:
print(f"The number of train dialogues: {len(train_dialogues)}")
print(f"The number of valid dialogues: {len(valid_dialogues)}")
print(f"The number of test dialogues: {len(test_dialogues)}")

print(f"The number of train utterances: {num_train}")
print(f"The number of valid utterances: {num_valid}")
print(f"The number of test utterances: {num_test}")



The number of train dialogues: 17793
The number of valid dialogues: 2759
The number of test dialogues: 2540
The number of train utterances: 76622
The number of valid utterances: 12025
The number of test utterances: 10939


In [None]:
all_len = [d for d in train_dialogues if len(d)>= 6]

In [None]:
len(all_len)

879

In [None]:
!mkdir 'saved_models'
ckpt_dir = 'saved_models'

In [None]:
def preprocess_dialog(dialog, window_size=5 + 1): #Context is 5, 6th is response
  instances = []

  # response = dialog["dialog"][-1]  # Last utterance as the response

  for i in range(0, len(dialog) - window_size + 1, 2): #Need the +1 to consider last response
    window = dialog[i:i+window_size]
    window_context = []
    for j, utterance in enumerate(window):
      sp_id = sp1_id if j % 2 == 0 else sp2_id
      input_ids = [sp_id] + tokenizer.encode(utterance)
      window_context.append(input_ids)

    input_ids = [bos_id] + list(chain.from_iterable(window_context)) + [eos_id]

    if len(input_ids) <= max_len:
      token_type_ids = [[sp1_id] * len(ctx) if c % 2 == 0 else [sp2_id] * len(ctx) for c, ctx in enumerate(window_context)]
      token_type_ids = [sp1_id] + list(chain.from_iterable(token_type_ids)) + [sp2_id]

      assert len(input_ids) == len(token_type_ids)

      labels = [[-100] * len(ctx) if c < len(window_context)-1 else [-100] + ctx[1:] for c, ctx in enumerate(window_context)]
      assert labels[-1][1:] == window_context[-1][1:]
      labels = [-100] + list(chain.from_iterable(labels)) + [eos_id]
      assert len(input_ids) == len(labels)

      instance = {
          "input_ids": torch.LongTensor(input_ids),
          "token_type_ids": torch.LongTensor(token_type_ids),
          "labels": torch.LongTensor(labels)
      }

      instances.append(instance)
    else:
      print('xd')


  return instances

In [None]:
train_instances = []
val_instances = []

# dummy
# test_instances = []
debug=0
for dialog in tqdm(train_dialogues):
    train_instances.extend(preprocess_dialog(dialog))

for dialog in tqdm(valid_dialogues):
    val_instances.extend(preprocess_dialog(dialog))

# for dialog in tqdm(test_dialogues):
#     test_instances.extend(preprocess_dialog(dialog))

100%|██████████| 17793/17793 [00:00<00:00, 18387.95it/s]
100%|██████████| 2759/2759 [00:00<00:00, 22023.44it/s]


In [None]:
print('bos_id: ', bos_id, 'eos_id: ', eos_id, 'sp1_id: ', sp1_id, 'sp2_id: ', sp2_id )

bos_id:  50257 eos_id:  50256 sp1_id:  50258 sp2_id:  50259


In [None]:
train_instances[0]['input_ids'].shape

torch.Size([97])

In [None]:
train_instances[0]['token_type_ids'].shape

torch.Size([97])

In [None]:
train_instances[0]['labels'].shape

torch.Size([97])

In [None]:
class DialogueDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        return self.instances[idx]

class PadCollate():
    def __init__(self, eos_id):
        self.eos_id = eos_id

    def pad_collate(self, batch):
        input_ids, token_type_ids, labels =[], [], []
        for idx, seqs in enumerate(batch):
            input_ids.append(torch.LongTensor(seqs['input_ids']))
            token_type_ids.append(torch.LongTensor(seqs['token_type_ids']))
            labels.append(torch.LongTensor(seqs['labels']))

        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.eos_id)
        token_type_ids = torch.nn.utils.rnn.pad_sequence(token_type_ids, batch_first=True, padding_value=self.eos_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

        return {
                "input_ids": input_ids,
                "token_type_ids": token_type_ids,
                "labels": labels
            }

In [None]:
train_dialogues[0]

['I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, We felt like the only people in the world.',
 'Was this a friend you were in love with, Or just a best friend?',
 'This was a best friend. I miss her.',
 'Where has she gone?',
 'We no longer talk.',
 'Oh was this something that happened because of an argument?']

In [None]:
#Load Model
if torch.cuda.is_available():
    device = torch.device(f"cuda:{gpu}")
    print('Using GPU')
else:
    device = torch.device("cpu")
    print('Using CPU')

Using GPU


In [None]:
print("Loading the model: ", selected_model)


if selected_model == 'dialoGPT-small':
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small").to(device)
elif selected_model == 'gpt2':
    model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
else:
    print('No model')

model.resize_token_embeddings(vocab_size)
max_len = min(max_len, model.config.n_ctx)

Loading the model:  dialoGPT-small


Downloading pytorch_model.bin:   0%|          | 0.00/351M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
#Load from checkpoint
# ckpt = torch.load("/content/saved_models/best_ckpt_epoch=5_valid_loss=2.583.ckpt", map_location=device)
# model.load_state_dict(ckpt['model_state_dict'])

In [None]:
# Load optimizer
print("Loading the optimizer...")
optim = torch.optim.AdamW(model.parameters(), lr=lr)

Loading the optimizer...


In [None]:
#Create data loaders

ppd = PadCollate(eos_id=eos_id)

train_dataset = DialogueDataset(train_instances)
val_dataset = DialogueDataset(val_instances)
# test_dataset =  DialogueDataset(test_instances)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=ppd.pad_collate)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=ppd.pad_collate)
# test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=ppd.pad_collate)

In [None]:
# Calculate total training steps
num_batches = len(train_dataloader)
total_train_steps = num_epochs * num_batches
warmup_steps = int(warmup_ratio * total_train_steps)

sched = get_polynomial_decay_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_train_steps,
    power=2
)

writer = SummaryWriter()

In [None]:
def validation():

    print("Validation processing...")
    model.eval()

    valid_losses = []
    valid_ppls = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(val_dataloader)):

            input_ids = batch["input_ids"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                token_type_ids = token_type_ids,
                labels = labels
            )

            loss, logits = outputs[0], outputs[1]

            valid_losses.append(loss.detach())
            ppl = torch.exp(loss.detach())
            valid_ppls.append(ppl)

        valid_losses = [loss.item() for loss in valid_losses]
        valid_ppls = [ppl.item() if not math.isinf(ppl.item()) else 1e+8 for ppl in valid_ppls]
        valid_loss = np.mean(valid_losses)
        valid_ppl = np.mean(valid_ppls)

        if math.isnan(valid_ppl):
            valid_ppl = 1e+8

    return valid_loss, valid_ppl

In [None]:
def train():

    fix_seed(seed)  # Fix seed before training
    print("Training starts.")

    best_loss = sys.float_info.max
    last_epoch= 0

    start_epoch = last_epoch +1
    for epoch in range(start_epoch, start_epoch+num_epochs):
        model.train()

        print(f"#"*50 + f"Epoch: {epoch}" + "#"*50)
        train_losses = []
        train_ppls = []
        for i, batch in enumerate(tqdm(train_dataloader)):

            input_ids = batch["input_ids"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            optim.zero_grad()

            outputs = model(
                input_ids=input_ids,
                token_type_ids = token_type_ids,
                labels = labels
            )

            loss = outputs.loss
            loss.backward()
            optim.step()
            sched.step()

            train_losses.append(loss.detach())
            ppl = torch.exp(loss.detach())
            train_ppls.append(ppl)

        train_losses = [loss.item() for loss in train_losses]
        train_ppls = [ppl.item() if not math.isinf(ppl.item()) else 1e+8 for ppl in train_ppls]
        train_loss = np.mean(train_losses)
        train_ppl = np.mean(train_ppls)
        print(f"Train loss: {train_loss} || Train perplexity: {train_ppl}")

        writer.add_scalar("Loss/train", train_loss, epoch)
        writer.add_scalar("PPL/train", train_ppl, epoch)

        last_epoch += 1

        valid_loss, valid_ppl = validation()

        if valid_loss < best_loss:
            best_loss = valid_loss
            state_dict = {
                'model_state_dict': model.state_dict(),
                'optim_state_dict': optim.state_dict(),
                'sched_state_dict': sched.state_dict(),
                'loss': best_loss,
                'epoch': last_epoch
            }

            torch.save(state_dict, f"{ckpt_dir}/best_ckpt_epoch={epoch}_valid_loss={round(best_loss, 4)}.ckpt")
            print("*"*10 + "Current best checkpoint is saved." + "*"*10)
            print(f"{ckpt_dir}/best_ckpt_epoch={epoch}_valid_loss={round(best_loss, 4)}.ckpt")

        print(f"Best valid loss: {best_loss}")
        print(f"Valid loss: {valid_loss} || Valid perplexity: {valid_ppl}")

        writer.add_scalar("Loss/valid", valid_loss, epoch)
        writer.add_scalar("PPL/valid", valid_ppl, epoch)

        writer.add_scalars("Losses", {
            'train': train_loss,
            'valid': valid_loss,
        }, epoch)
        writer.add_scalars("PPLs", {
            'train': train_ppl,
            'valid': valid_ppl,
        }, epoch)

    print("Training finished!")

In [None]:
train()

Training starts.
##################################################Epoch: 1##################################################


100%|██████████| 140/140 [00:53<00:00,  2.60it/s]


Train loss: 5.769222550732749 || Train perplexity: 1502.231432723999
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  8.79it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=1_valid_loss=3.6922.ckpt
Best valid loss: 3.6921842288970947
Valid loss: 3.6921842288970947 || Valid perplexity: 46.90516944885254
##################################################Epoch: 2##################################################


100%|██████████| 140/140 [00:52<00:00,  2.66it/s]


Train loss: 3.972916485582079 || Train perplexity: 60.077066108158654
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  9.53it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=2_valid_loss=3.4413.ckpt
Best valid loss: 3.441296157836914
Valid loss: 3.441296157836914 || Valid perplexity: 35.02263153076172
##################################################Epoch: 3##################################################


100%|██████████| 140/140 [00:54<00:00,  2.57it/s]


Train loss: 3.7062791415623257 || Train perplexity: 44.828785269601006
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  9.25it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=3_valid_loss=3.3749.ckpt
Best valid loss: 3.374907922744751
Valid loss: 3.374907922744751 || Valid perplexity: 32.57033107757568
##################################################Epoch: 4##################################################


100%|██████████| 140/140 [00:54<00:00,  2.55it/s]


Train loss: 3.610293972492218 || Train perplexity: 41.89447454043797
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  9.17it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=4_valid_loss=3.3398.ckpt
Best valid loss: 3.339781379699707
Valid loss: 3.339781379699707 || Valid perplexity: 31.37452323913574
##################################################Epoch: 5##################################################


100%|██████████| 140/140 [00:56<00:00,  2.48it/s]


Train loss: 3.5513374328613283 || Train perplexity: 37.77904007094247
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  9.08it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=5_valid_loss=3.3313.ckpt
Best valid loss: 3.331313171386719
Valid loss: 3.331313171386719 || Valid perplexity: 31.09738697052002
##################################################Epoch: 6##################################################


100%|██████████| 140/140 [00:55<00:00,  2.53it/s]


Train loss: 3.5321727275848387 || Train perplexity: 37.294756058284214
Validation processing...


100%|██████████| 25/25 [00:02<00:00,  8.95it/s]


**********Current best checkpoint is saved.**********
saved_models/best_ckpt_epoch=6_valid_loss=3.3311.ckpt
Best valid loss: 3.3310992431640627
Valid loss: 3.3310992431640627 || Valid perplexity: 31.098675384521485
Training finished!


In [None]:
def infer(window_size=5):
    model.eval()
    fix_seed(seed)
    generated_responses = []
    actual_responses = []

    with torch.no_grad():

        for dialog in tqdm(test_dialogues):

            for i in range(0, len(dialog) - window_size, 2): #In steps of 2

                window = dialog[i:i+window_size]
                window_context = []

                for j, utterance in enumerate(window):
                    #Set speaker 1 or speaker 2
                    sp_id = sp1_id if j % 2 == 0 else sp2_id
                    input_ids = [sp_id] + tokenizer.encode(utterance)
                    window_context.append(input_ids)
                    # context.append(utterance)

                # print()
                # for c in window_context:
                #     print(tokenizer.decode(c))
                start_sp_id = window_context[0][0]
                next_sp_id = sp1_id if start_sp_id == sp2_id else sp2_id
                assert start_sp_id != next_sp_id

                input_ids = [bos_id] + list(chain.from_iterable(window_context)) + [next_sp_id] #Because window is 5, so 6th utter is = sp2


                token_type_ids = [[start_sp_id] * len(hist) if h % 2 == 0 else [next_sp_id] * len(hist) for h, hist in enumerate(window_context)]
                assert len(token_type_ids) == len(window_context)
                token_type_ids = [start_sp_id] + list(chain.from_iterable(token_type_ids)) + [next_sp_id]

                assert len(input_ids) == len(token_type_ids)
                input_len = len(input_ids)

                input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)
                token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(device)

                output_ids = model.generate(input_ids=input_ids, token_type_ids=token_type_ids, pad_token_id=eos_id,
                                            max_length=max_len, do_sample=True, top_p=top_p).squeeze(0)

                output_ids = output_ids.tolist()[input_len:]


                # output_ids = nucleus_sampling(input_ids, token_type_ids, input_len, next_sp_id)


                res = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)



                actual_res = dialog[i+window_size]

                # print(f"Bot response: {res}")
                # print(f"Actual response: {actual_res}")


                generated_responses.append(res)
                actual_responses.append(actual_res)

    return generated_responses, actual_responses



In [None]:
generated_responses, actual_responses = infer()

100%|██████████| 2540/2540 [00:31<00:00, 79.41it/s]


In [None]:
assert len(generated_responses) == len(actual_responses)
print(len(generated_responses))
print(len(actual_responses))

174
174


### Store responses

In [None]:
import pickle
from google.colab import files

file_generated = "" + selected_model + "_epochs_" + str(num_epochs) + "_generated_responses_" + dataset_name
file_actual = "" + selected_model + "_epochs_" + str(num_epochs) + "_actual_responses_"  + dataset_name

with open(file_generated, "wb") as fp:
    pickle.dump(generated_responses, fp)

with open(file_actual, "wb") as fp:
    pickle.dump(actual_responses, fp)

files.download(file_generated)
files.download(file_actual)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Loading example
# with open("gpt2_batch8_generated_responses", "rb") as fp:   # Unpickling
#     dummy = pickle.load(fp)

### Compute metrics

In [None]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
chrf = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [None]:
actual_responses = [[res] for res in actual_responses] #Refs must be in a list of list of str

print(generated_responses[:5])
print(actual_responses[:5])

["I've been friends with that girl who just left her with a girl friend. She said he liked her for like I'd been gone for like that.", 'Oh that is a good.', 'It really was a great experience!', "Don't be so serious, It is a good learning experience,", "Yes it's one of those I've seen too."]
[["Wow, So your going to take being a bad person to the grave. Maybe you'll see her in the next life?"], ["Well I've been in the business all my life and have worked for some great people. So I pull from what I learned from them."], ['Oh, Wow. Not only to be able to do all the running but to view the scenery!'], ['I know.'], ['Sounds interesting!']]


In [None]:
bleu_score = sacrebleu.compute(predictions=generated_responses, references=actual_responses)

rouge_score = rouge.compute(predictions=generated_responses, references=actual_responses)

bert_score = bertscore.compute(predictions=generated_responses, references=actual_responses, lang='en')
precision = bert_score['precision']
recall = bert_score['recall']
f1 = bert_score['f1']
avg_precision_bert = sum(precision) / len(precision)
avg_recall_bert = sum(recall) / len(recall)
avg_f1_bert = sum(f1) / len(f1)

chrf_score = chrf.compute(predictions=generated_responses, references=actual_responses)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print('Bleu score: \n', bleu_score) #Range from 0 to 100
print('Rouge score: \n', rouge_score)
print('Bert score: \n', bert_score)
print('Avg precision Bert score: ', avg_precision_bert)
print('Avg recall Bert score: ', avg_recall_bert)
print('Avg f1 Bert score: ', avg_f1_bert)
print('chrf score: \n', chrf_score)


Bleu score: 
 {'score': 0.38246335370461976, 'counts': [301, 15, 2, 0], 'totals': [1902, 1728, 1555, 1389], 'precisions': [15.825446898002102, 0.8680555555555556, 0.12861736334405144, 0.03599712023038157], 'bp': 0.7615915427834669, 'sys_len': 1902, 'ref_len': 2420}
Rouge score: 
 {'rouge1': 0.10312247688964572, 'rouge2': 0.009068764006068996, 'rougeL': 0.09272396500307499, 'rougeLsum': 0.09246614254961924}
Bert score: 
 {'precision': [0.8298017382621765, 0.8669712543487549, 0.8919275999069214, 0.829082727432251, 0.8196436166763306, 0.8261119723320007, 0.8424946069717407, 0.8212094306945801, 0.8592186570167542, 0.9028390645980835, 0.8465954065322876, 0.8388491272926331, 0.8440271615982056, 0.8657508492469788, 0.881737232208252, 0.881360650062561, 0.8594015836715698, 0.8412224650382996, 0.8377891778945923, 0.8754595518112183, 0.9249991178512573, 0.9048036336898804, 0.837327778339386, 0.8665248155593872, 0.8531848192214966, 0.7778225541114807, 0.8397326469421387, 0.8370019197463989, 0.873

In [None]:
# Play an audio beep. Any audio URL will do.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
# predictions = ["hello there general kenobi", "foo bar foobar"]
# references = [["hello there general kenobi", "hello there !"],
#                  ["foo bar foobar", "foo bar foobar"]]
# sacrebleu = evaluate.load("sacrebleu")
# results = sacrebleu.compute(predictions=predictions,
#                              references=references)
# print(results)

# results = rouge.compute(predictions=predictions,
#                              references=references)
# print(results)

# results = bertscore.compute(predictions=predictions,
#                              references=references, lang='eng')
# print(results)

# results = chrf.compute(predictions=predictions,
#                              references=references)
# print(results)