# Training / Fine-tuning a Dialogue model

We are going to look at model fine-tuning by taking a general-purpose language model (BART) and fine-tuning it to generate dialogue in the style of the 1990s TV series *Friends*.

In [None]:
!pip install accelerate -U
!pip install transformers -U
!pip install datasets
!pip install py7zr
!pip install tiktoken
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

In [None]:
import transformers
from transformers import pipeline, set_seed
import datasets
from datasets import load_dataset
import py7zr
import accelerate
import pandas as pd
import torch
import numpy as np

In [None]:
# Fetch the Friends utterance dataset from the Hugging Face Hub.
dataset_friends = load_dataset(path="michellejieli/friends_dataset")

In [None]:
# Display the shape of the loaded dataset (presumably one (rows, columns) entry per split — verify in the output).
dataset_friends.shape

In [None]:
# Build (context, response) pairs: each utterance is the response to the utterance
# that precedes it, and the very first utterance gets the placeholder context "BEGIN".
# NOTE(review): the following cell repeats this preparation from scratch, so the
# variables produced here are overwritten when the notebook is run top to bottom.
dataset_friends = load_dataset("michellejieli/friends_dataset")
context=["BEGIN"]
# Contexts are the utterances shifted by one position (the last utterance is never a context).
context.extend(dataset_friends["train"][0:14502]["text"])
# Stack the two equal-length lists and transpose so that each row is [context, response].
dataset_friends=datasets.Dataset.from_pandas(pd.DataFrame(np.array([context,dataset_friends["train"][0:14503]["text"]]).T.tolist(),columns=["context","response"]))
# Hold out roughly 500 pairs as the final test set (fixed seed for reproducibility).
dataset_friends=dataset_friends.train_test_split(test_size=500/dataset_friends.shape[0],seed=99)
dataset_friends_test=dataset_friends["test"]
dataset_friends=dataset_friends["train"]
# Hold out roughly 500 more pairs for validation. `dataset_friends` is already the
# post-test remainder here, so its own row count is the correct denominator;
# subtracting 500 again (as before) made the validation split larger than intended.
dataset_friends=dataset_friends.train_test_split(test_size=500/dataset_friends.shape[0],seed=99)


In [None]:
# Rebuild the (context, response) pairs from a fresh copy of the raw dataset and
# split them into train / validation / test.
friends_raw = load_dataset("michellejieli/friends_dataset")

# Responses are the utterances themselves; contexts are the same utterances shifted
# down by one, with "BEGIN" standing in as the context of the first utterance.
responses = friends_raw["train"][0:14503]["text"]
context = ["BEGIN"] + friends_raw["train"][0:14502]["text"]

# Transpose the two parallel lists into rows of [context, response].
paired_rows = np.array([context, responses]).T.tolist()
pairs_frame = pd.DataFrame(paired_rows, columns=["context", "response"])
pairs_dataset = datasets.Dataset.from_pandas(pairs_frame)

# First split: carve the test set out of all pairs.
test_fraction = 500 / pairs_dataset.shape[0]
outer_split = pairs_dataset.train_test_split(test_size=test_fraction, seed=99)
dataset_friends_test = outer_split["test"]

# Second split: carve the validation set (stored under "test") out of the remainder.
remaining_pairs = outer_split["train"]
validation_fraction = 500 / (remaining_pairs.shape[0])
dataset_friends = remaining_pairs.train_test_split(test_size=validation_fraction, seed=99)

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration
# Load the un-tuned BART base checkpoint and its tokenizer as the baseline model.
# Prefer the GPU, but fall back to the CPU so the cell does not crash on machines without CUDA.
device="cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

In [None]:
# Baseline: generate one response per test context with the model currently loaded
# in `model` and collect the decoded strings in `vanilla_predictions`.
torch.cuda.empty_cache()
vanilla_predictions=[]
for i in range(dataset_friends_test.shape[0]):
  # The i:i+1 slice yields a one-element list, i.e. a batch of size 1.
  # The deprecated `pad_to_max_length=True` flag was dropped: it contradicted the
  # explicit padding='longest' strategy, which is the one that is kept.
  input_ = tokenizer(dataset_friends_test[i:i+1]["context"], max_length=1024, truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  # Send the inputs to whichever device the model lives on.
  responses = model.generate(input_ids=input_ids.to(model.device),
                         attention_mask=input_mask.to(model.device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  vanilla_predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))


In [None]:
import evaluate
# BLEU of the baseline predictions against the reference responses.
references=dataset_friends_test[:]["response"]
bleu = evaluate.load("bleu")
# Pass the lists themselves so each prediction is scored against its own reference.
# Wrapping them in str() (as before) collapsed the whole corpus into a single
# "['...', '...']" string and scored that one bogus example instead.
assert len(vanilla_predictions) == len(references), "predictions and references must align one-to-one"
results = bleu.compute(predictions=vanilla_predictions, references=[[ref] for ref in references])
print(results)

In [None]:
import evaluate
# ROUGE of the baseline predictions against the reference responses.
references=dataset_friends_test[:]["response"]
rouge = evaluate.load("rouge")
# Pass the lists themselves so each prediction is scored against its own reference.
# Wrapping them in str() (as before) collapsed the whole corpus into a single
# "['...', '...']" string and scored that one bogus example instead.
assert len(vanilla_predictions) == len(references), "predictions and references must align one-to-one"
results = rouge.compute(predictions=vanilla_predictions, references=references)
print(results)

### Fine-Tuning

To fine-tune the model yourself, uncomment the next five code cells and run them. Note, though, that training will take a good few hours.

In [None]:
# (Disabled by default — uncomment to fine-tune.)
# Tokenises a batch of (context, response) pairs: contexts become the encoder inputs
# and responses become the labels, each truncated to 1024 tokens. The mapped dataset
# is then exposed as torch tensors for the three columns the model consumes.
# NOTE(review): `tokenizer.as_target_tokenizer()` is deprecated in recent transformers
# releases in favour of `tokenizer(..., text_target=...)` — confirm against the installed version.
#def convert_examples_to_features(example_batch):
#    input_encodings = tokenizer(example_batch["context"], max_length=1024,
#                                truncation=True)
#
#    with tokenizer.as_target_tokenizer():
#        target_encodings = tokenizer(example_batch["response"], max_length=1024,
#                                     truncation=True)
#
#    return {"input_ids": input_encodings["input_ids"],
#            "attention_mask": input_encodings["attention_mask"],
#            "labels": target_encodings["input_ids"]}
#
#dataset_friends_pt = dataset_friends.map(convert_examples_to_features,
#                                       batched=True)
#columns = ["input_ids", "labels", "attention_mask"]
#dataset_friends_pt.set_format(type="torch", columns=columns)

In [None]:
# (Disabled by default — uncomment to fine-tune.)
# Configures a Hugging Face Trainer that trains on dataset_friends_pt["train"] and
# evaluates on dataset_friends_pt["test"] every 250 steps.
#from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# The collator pads each batch dynamically; it is given the model as well as the tokenizer.
#seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Per-device batch size 1 with 128 gradient-accumulation steps, i.e. 128 examples per optimiser step per device.
# save_steps=1e6 presumably exists to suppress intermediate checkpoints — TODO confirm.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`
# (and Trainer's `tokenizer=` to `processing_class=`) — confirm against the installed version.
#training_args = TrainingArguments(
#    output_dir='dialogue-friends', num_train_epochs=6, warmup_steps=500,
#    per_device_train_batch_size=1, per_device_eval_batch_size=1,
#    weight_decay=0.01, logging_steps=10, push_to_hub=False,
#    evaluation_strategy='steps', eval_steps=250, save_steps=1e6,gradient_accumulation_steps=128)

#trainer = Trainer(model=model, args=training_args,
#                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
#                  train_dataset=dataset_friends_pt["train"],
#                  eval_dataset=dataset_friends_pt["test"])

In [None]:
# (Disabled by default — uncomment to fine-tune.) Installs wandb, which the next cell imports.
#!pip install wandb

In [None]:
# (Disabled by default — uncomment to fine-tune.)
# Logs in to the Hugging Face Hub and initialises wandb in "disabled" mode.
# NOTE(review): the training arguments set push_to_hub=False, so the Hub login may be unnecessary — confirm.
#import wandb
#from huggingface_hub import notebook_login

#notebook_login()
#wandb.init(mode="disabled")

In [None]:
# hide_output
# (Disabled by default — uncomment to fine-tune.) Runs the training loop configured above.
#torch.cuda.empty_cache()
#trainer.train()
# To save your fine-tuned model:
# (the directory name below matches the checkpoint path loaded later in the notebook)
#trainer.save_model("dialogue-summ-model-bart")

To use a pre-tuned model instead, run the following cells.

In [None]:
!gdown 1V4JaqrDANpsxEU-IOt61Bj8GQ0FOzwLq
!gunzip dialogue-summ-model-bart.tar.gz
!tar xf dialogue-summ-model-bart.tar

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration
# Replace the baseline `tokenizer` / `model` with the fine-tuned checkpoint on disk.
model_ckpt="./dialogue-summ-model-bart"
# Prefer the GPU, but fall back to the CPU so the cell does not crash on machines without CUDA.
device="cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)

### Generating And Evaluating Dialogue

In [None]:
# Generate one response per test context with the model currently loaded in `model`
# and collect the decoded strings in `predictions`.
torch.cuda.empty_cache()
predictions=[]
for i in range(dataset_friends_test.shape[0]):
  # The i:i+1 slice yields a one-element list, i.e. a batch of size 1.
  # The deprecated `pad_to_max_length=True` flag was dropped: it contradicted the
  # explicit padding='longest' strategy, which is the one that is kept.
  input_ = tokenizer(dataset_friends_test[i:i+1]["context"], max_length=1024, truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  # Send the inputs to whichever device the model lives on.
  responses = model.generate(input_ids=input_ids.to(model.device),
                         attention_mask=input_mask.to(model.device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))

In [None]:
import evaluate
# BLEU of the generated predictions against the reference responses.
references=dataset_friends_test[:]["response"]
bleu = evaluate.load("bleu")
# Pass the lists themselves so each prediction is scored against its own reference.
# Wrapping them in str() (as before) collapsed the whole corpus into a single
# "['...', '...']" string and scored that one bogus example instead.
assert len(predictions) == len(references), "predictions and references must align one-to-one"
results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
print(results)

In [None]:
import evaluate
# ROUGE of the generated predictions against the reference responses.
references=dataset_friends_test[:]["response"]
rouge = evaluate.load("rouge")
# Pass the lists themselves so each prediction is scored against its own reference.
# Wrapping them in str() (as before) collapsed the whole corpus into a single
# "['...', '...']" string and scored that one bogus example instead.
assert len(predictions) == len(references), "predictions and references must align one-to-one"
results = rouge.compute(predictions=predictions, references=references)
print(results)