# <center> Human-Robot Interaction: Dialogue System (Fine-Tuning and Evaluation)

# 1. Set-up

In [None]:
# set this to True if you want to train a new bot
# (the training cell in section 3 only runs its body when this flag is True)
RUN_TRAINING = False

Use the [DialoGPT](https://github.com/microsoft/DialoGPT) small version (117M)

In [None]:
# import requirements
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# download pretrained models (once)
# NOTE: `tokenizer` and `model` are notebook-global names; the evaluation and
# chat cells further down re-assign both before using them.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

Use the [EmpatheticDialogues](https://github.com/facebookresearch/EmpatheticDialogues) dataset

Here we've already downloaded and preprocessed the dataset for you (stored in "Practical11_Support/Data/empatheticdialogues").

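**Download the fine-tuned model [here](https://drive.google.com/file/d/1ZebPQikUkiJ5Lwa0DydlkPnQS4wnWQou/view?usp=sharing) (~440MB) and unzip it under your "Practical11_Support" folder, so that the model files are located at "Practical11_Support/fine-tuned" (the path this notebook loads the model from)**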

In [None]:
# accessing support files and dataset
import sys, os
from Practical11_Support.helper import *

# folder holding the processed CSV splits; the trailing '/' matters to code that
# appends file names to this string
data_dir = 'Practical11_Support/Data/empatheticdialogues/'
# location of the fine-tuned model: used as Args.output_dir and as the path the
# evaluation / chat cells load the fine-tuned weights from
model_dir = 'Practical11_Support/fine-tuned'

# presumably provided by the `helper` star-import above and expected to make the
# fine-tuned model available in model_dir -- TODO confirm against helper.py
download_fine_tuned(model_dir)

# 2. Configuring the model

Training functions are inside the helper script

In [None]:
import glob, logging, os, pickle, random, re, torch, pandas as pd, numpy as np
from typing import Dict, List, Tuple
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm, trange
from pathlib import Path
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

# Fine-tuning configuration.
# Background on these options: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
class Args:
    """Plain container of the settings used for fine-tuning and evaluation.

    All values are set as instance attributes when the object is created.
    Groups marked "fixed" should be left as they are; groups marked "tunable"
    may be adjusted.
    """

    def __init__(self):
        # where to save the fine-tuned model
        self.output_dir = model_dir

        # --- fixed: base model identity (don't change) ---
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'

        # --- tunable: cache location (defaults to the working directory) ---
        self.cache_dir = 'cached'

        # --- fixed (don't change) ---
        self.block_size = 16

        # --- tunable: optimisation settings ---
        self.per_gpu_train_batch_size = 1  # speed-related
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 5

        # --- fixed: schedule, logging and reproducibility (don't change) ---
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_total_limit = None
        self.seed = 42
        self.local_rank = -1

# shared settings object: passed to main() by the training cell and read
# (args.output_dir) by the cells that load the fine-tuned model
args = Args()

# 3. Training an empathic chatbot
Fine-tuning the DialoGPT-small model with the Empathic Dialogues dataset


In [None]:
# data segments
# os.path.join yields the same paths as plain concatenation while data_dir ends
# with a separator, and still builds valid paths if that trailing '/' is dropped
data_trn = os.path.join(data_dir, 'train_processed.csv') # training set for training the model
data_val = os.path.join(data_dir, 'valid_processed.csv') # validation set for parameter tuning
data_tst = os.path.join(data_dir, 'test_processed.csv') # testing set for evaluating the model

Take a look at the data

In [None]:
# Load the full training split just to preview it; the last expression renders
# the first five rows as the cell output.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 in favour of
# `on_bad_lines='skip'` and is no longer accepted by pandas 2.x -- switch once the
# course environment's pandas version allows it.
df = pd.read_csv(data_trn, error_bad_lines=False)
df.head(5)

Fine-tuning the DialoGPT-small model on the training set. After training is finished the fine-tuned model will be saved as "fine-tuned"

In [None]:
# Fine-tuning is opt-in: nothing below runs unless RUN_TRAINING is True.
if RUN_TRAINING:
    # Only the first 3000 utterances are used: training with more data takes
    # longer and might lead to an out-of-memory error.
    df_trn = pd.read_csv(
        data_trn,
        usecols=['utterance'],
        nrows=3000,
        error_bad_lines=False,
    )
    main(df_trn, args)

# 4. Evaluating the chatbot
## 4.1 Quantitative and objective evaluation

We define two metrics to evaluate the generated responses:
- Metric 1: BLEU score, which measures the lexical similarity between the generated responses and the human responses (ground-truth)
- Metric 2: sentiment alignment score, which measures the emotional similarity between the generated responses and the human responses (ground-truth). **Fill in the TODO to compute the sentiment label (1-5 stars) of the ground-truth and generated responses (sent_true and sent_pred).**

In [None]:
# compute BLEU score and alignment of sentiment between ground-truth responses and generated responses
# import requirements
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torchtext.data.metrics import bleu_score

# download pretrained models for sentiment classification
# (this is the classifier meant to supply the 1-5 star sentiment labels for the
# sentiment alignment metric computed in eval())
sent_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sent_model = AutoModelForSequenceClassification.from_pretrained(sent_model_name)
sent_tokenizer = AutoTokenizer.from_pretrained(sent_model_name)
# wrap model + tokenizer in a ready-to-call 'sentiment-analysis' pipeline
sent_classifier = pipeline('sentiment-analysis', model=sent_model, tokenizer=sent_tokenizer)

def eval(groundtruth, pred_prompt, tokenizer, model, verbose=False):
    """Generate a response per prompt and score it against the human response.

    NOTE: this function shadows Python's builtin ``eval``; the name is kept
    because the evaluation cells of this notebook call it by this name.

    Args:
        groundtruth: DataFrame with an ``utterance`` column holding the human
            (ground-truth) responses. It is not modified; a copy is tokenised.
        pred_prompt: DataFrame with a ``prompt`` column holding the prompts fed
            to the model, one per ground-truth row.
        tokenizer: tokenizer used to encode the prompts and decode the output.
        model: model whose ``generate`` method produces the responses.
        verbose: when True, print prompts, responses and sentiment labels.

    Returns:
        Tuple ``(bleu, sent_score)``: the unigram BLEU score over all rows and
        the fraction of rows whose ground-truth and generated sentiment labels
        are equal (0.0 when there are no rows). Until the TODO below is
        completed both labels are empty strings, so every row counts as aligned.
    """
    args = Args()
    set_seed(args.seed) # Set seed

    # ground-truth responses, split on single spaces into token lists.
    # Tokenise a copy so the DataFrame passed in by the caller stays untouched.
    groundtruth = groundtruth.copy()
    groundtruth.utterance = groundtruth.utterance.str.split(' ').tolist()
    references = groundtruth.utterance.tolist()

    # generated responses
    pred = []
    for prom_input in tqdm(pred_prompt.prompt):
        # encoding input
        new_prom_input_ids = tokenizer.encode(prom_input + tokenizer.eos_token, return_tensors='pt')
        bot_input_ids = new_prom_input_ids

        # generate a response with beam-search sampling (num_beams > 1 combined
        # with do_sample=True samples within the beams rather than decoding greedily)
        # more about different generation methods: https://huggingface.co/blog/how-to-generate
        chat_history_ids = model.generate(
            bot_input_ids,
            do_sample=True,
            max_length=200,
            num_beams=5,
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # save generated response (only the tokens that follow the prompt)
        response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
        pred.append(response.split(' '))

        # set verbose to true if you want to look at the responses for debugging
        if verbose:
            print("prompt: ", prom_input)
            print("generated response: ", response)

    # compute BLEU score (unigram-only)
    # bleu_score expects, for every candidate, a LIST of reference sentences, each
    # of them a list of tokens. Every row has exactly one reference, so it is
    # wrapped in a one-element list; without the wrapping each reference word
    # would be treated as a sentence and compared character by character.
    bleu = bleu_score(pred, [[ref_tokens] for ref_tokens in references], max_n=1, weights=[1.0])

    # compute sentiment alignment between ground-truth and generated response
    sent_align = 0

    for i in tqdm(range(len(references))):
        # plain-text versions of both responses (input for the sentiment
        # classifier and for the debug output below)
        human_response = ' '.join(references[i])
        bot_response = ' '.join(pred[i])

        #TODO: compute sentiment label of the ground-truth and generated response-------------
        sent_true = ''
        sent_pred = ''
        #ENDTODO -----------------------------------------------------------------------------

        if sent_true == sent_pred:
            sent_align += 1

        # set verbose to true if you want to look at the sentiments for debugging
        if verbose:
            print("sentiment of generated response: ", sent_pred)
            print("ground truth response: ", human_response)
            print("sentiment of ground truth response: ", sent_true)

    # guard against an empty evaluation set instead of dividing by zero
    sent_score = sent_align / len(references) if references else 0.0

    return bleu, sent_score

Test the performance of the original DialoGPT-small model on the Empathic Dialogues dataset before fine-tuning

In [None]:
# baseline: the stock DialoGPT-small weights with their own tokenizer
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-small')
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')

# evaluate on the first 10 rows of the validation set
nrows = 10
pred_prompt = pd.read_csv(
    data_val,
    usecols=['prompt'],
    nrows=nrows,
    error_bad_lines=False,
)
groundtruth = pd.read_csv(
    data_val,
    usecols=['utterance'],
    nrows=nrows,
    error_bad_lines=False,
)
bleu_val, sent_score_val = eval(
    groundtruth,
    pred_prompt,
    tokenizer,
    model,
)

# report both metrics with three decimals
print("Original DialoGPT-small model:")
print("BLEU score on the validation set is {:.3f}".format(bleu_val))
print("Sentiment alignment score on the validation set is {:.3f}".format(sent_score_val))

Test the fine-tuned model on the Empathic Dialogues dataset. 

In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained(args.output_dir) # use fine-tuned model

# evaluate on the first 10 rows of the validation set
nrows=10
groundtruth = pd.read_csv(data_val, error_bad_lines=False, usecols=['utterance'], nrows=nrows)
pred_prompt = pd.read_csv(data_val, error_bad_lines=False, usecols=['prompt'], nrows=nrows)

# If you want to evaluate your model on a different segment (e.g., rows 100-199),
# use the code below instead of the two read_csv calls above. The end bound of
# iloc is exclusive, so 100:200 selects rows 100..199. reset_index makes the
# selected rows start at index 0 again.
#
# groundtruth = pd.read_csv(data_val, error_bad_lines=False, usecols=['utterance'])
# groundtruth = groundtruth.iloc[100:200]
# groundtruth = groundtruth.reset_index(drop=True)
# pred_prompt = pd.read_csv(data_val, error_bad_lines=False, usecols=['prompt'])
# pred_prompt = pred_prompt.iloc[100:200]
# pred_prompt = pred_prompt.reset_index(drop=True)

bleu_val, sent_score_val = eval(groundtruth, pred_prompt, tokenizer, model, verbose=False)

print("Fine-tuned model:")
print("BLEU score on the validation set is {:.3f}".format(bleu_val))
print("Sentiment alignment score on the validation set is {:.3f}".format(sent_score_val))

In [None]:
# post-lecture quiz 4
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained(args.output_dir) # use fine-tuned model

# evaluate on the first 10 rows of the test set
nrows=10
groundtruth = pd.read_csv(data_tst, error_bad_lines=False, usecols=['utterance'], nrows=nrows)
pred_prompt = pd.read_csv(data_tst, error_bad_lines=False, usecols=['prompt'], nrows=nrows)
bleu_tst, sent_score_tst = eval(groundtruth, pred_prompt, tokenizer, model)
print("Fine-tuned model:")
print("BLEU score on the test set is {:.3f}".format(bleu_tst))
# report the score computed on the TEST set (sent_score_tst), not the value left
# over from the validation cells (sent_score_val)
print("Sentiment alignment score on the test set is {:.3f}".format(sent_score_tst))

## 4.2 Qualitative and subjective evaluation

Chat with the fine-tuned chatbot to see how the responses differ from the original DialoGPT.

In [None]:
# stock DialoGPT-small tokenizer paired with the fine-tuned weights
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained(args.output_dir)

# conversation settings
history = True  # feed the whole dialogue so far to the bot (True) or only the latest input (False)
fixed = True    # seed the random generator before chatting so a run can be repeated
n_turns = 5     # number of user inputs (turns)

# seeding happens once, before the first turn
if fixed:
    set_seed(1234)

# Let's chat
for step in range(n_turns):
    # read one line from the user and encode it, terminated by the end-of-sequence token
    user_input = input("You: ")
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    if history and step > 0:
        # later turns with history enabled: previous exchange + new input
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        # first turn, or history disabled: only the current input
        bot_input_ids = new_user_input_ids

    # sample a continuation (top-k / top-p sampling); the result holds the model
    # input followed by the newly generated tokens
    chat_history_ids = model.generate(
        bot_input_ids,
        pad_token_id=tokenizer.eos_token_id,
        max_length=1000,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )

    # print bot response: decode only what comes after the model input
    prompt_len = bot_input_ids.shape[-1]
    reply_ids = chat_history_ids[:, prompt_len:][0]
    print("Bot: {}".format(tokenizer.decode(reply_ids, skip_special_tokens=True)))