# Training / Fine-tuning a Dialogue model

We are going to look at model fine-tuning by taking a general purpose language model and fine-tuning it to perform dialogue in the style of the 1990s TV series Friends

In [1]:
!pip install accelerate -U
!pip install transformers -U
!pip install datasets
!pip install py7zr
!pip install tiktoken
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0

In [2]:
import transformers
from transformers import pipeline, set_seed
import datasets
from datasets import load_dataset
import py7zr
import accelerate
import pandas as pd
import torch
import numpy as np

In [3]:
dataset_friends = load_dataset("michellejieli/friends_dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.88k [00:00<?, ?B/s]

friends_cleaned.csv:   0%|          | 0.00/732k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14503 [00:00<?, ? examples/s]

In [4]:
dataset_friends.shape

{'train': (14503, 2)}

In [5]:
dataset_friends = load_dataset("michellejieli/friends_dataset")
context=["BEGIN"]
context.extend(dataset_friends["train"][0:14502]["text"])
dataset_friends=datasets.Dataset.from_pandas(pd.DataFrame(np.array([context,dataset_friends["train"][0:14503]["text"]]).T.tolist(),columns=["context","response"]))
dataset_friends=dataset_friends.train_test_split(test_size=500/dataset_friends.shape[0],seed=99)
dataset_friends_test=dataset_friends["test"]
dataset_friends=dataset_friends["train"]
dataset_friends=dataset_friends.train_test_split(test_size=500/(dataset_friends.shape[0]-500),seed=99)


In [6]:
dataset_friends = load_dataset("michellejieli/friends_dataset")
context=["BEGIN"]
context.extend(dataset_friends["train"][0:14502]["text"])
dataset_friends=datasets.Dataset.from_pandas(pd.DataFrame(np.array([context,dataset_friends["train"][0:14503]["text"]]).T.tolist(),columns=["context","response"]))
dataset_friends=dataset_friends.train_test_split(test_size=500/dataset_friends.shape[0],seed=99)
dataset_friends_test=dataset_friends["test"]
dataset_friends=dataset_friends["train"]
dataset_friends=dataset_friends.train_test_split(test_size=500/(dataset_friends.shape[0]),seed=99)

In [7]:
from transformers import AutoTokenizer, BartForConditionalGeneration
device="cuda"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [8]:
torch.cuda.empty_cache()
vanilla_predictions=[]
for i in range(dataset_friends_test.shape[0]):
  input_ = tokenizer.batch_encode_plus(dataset_friends_test[i:i+1]["context"], max_length=1024, pad_to_max_length=True,truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  responses = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  vanilla_predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))


In [9]:
import evaluate
references=dataset_friends_test[:]["response"]
bleu = evaluate.load("bleu")
bleu.add(predictions=str(vanilla_predictions), references=str(references))
results = bleu.compute()
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.33774851625597013, 'precisions': [0.7739602169981917, 0.45395629238884705, 0.25670786855592403, 0.14427860696517414], 'brevity_penalty': 1.0, 'length_ratio': 1.0381727158948686, 'translation_length': 6636, 'reference_length': 6392}


In [10]:
import evaluate
references=dataset_friends_test[:]["response"]
rouge = evaluate.load("rouge")
rouge.add(predictions=str(vanilla_predictions), references=str(references))
results = rouge.compute()
print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': np.float64(0.7359277275886033), 'rouge2': np.float64(0.2752548656163114), 'rougeL': np.float64(0.16029650220060226), 'rougeLsum': np.float64(0.16029650220060226)}


### Fine-Tuning

To fine tune model uncomment the next five blocks of code and run. Note though that it will take a good few hours to run.

In [11]:
#def convert_examples_to_features(example_batch):
#    input_encodings = tokenizer(example_batch["context"], max_length=1024,
#                                truncation=True)
#
#    with tokenizer.as_target_tokenizer():
#        target_encodings = tokenizer(example_batch["response"], max_length=1024,
#                                     truncation=True)
#
#    return {"input_ids": input_encodings["input_ids"],
#            "attention_mask": input_encodings["attention_mask"],
#            "labels": target_encodings["input_ids"]}
#
# dataset_friends_pt = dataset_friends.map(convert_examples_to_features,
#                                       batched=True)
#columns = ["input_ids", "labels", "attention_mask"]
#dataset_friends_pt.set_format(type="torch", columns=columns)

In [12]:
#from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

#seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#training_args = TrainingArguments(
#    output_dir='dialogue-friends', num_train_epochs=6, warmup_steps=500,
#    per_device_train_batch_size=1, per_device_eval_batch_size=1,
#    weight_decay=0.01, logging_steps=10, push_to_hub=False,
#    evaluation_strategy='steps', eval_steps=250, save_steps=1e6,gradient_accumulation_steps=128)

#trainer = Trainer(model=model, args=training_args,
#                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
#                  train_dataset=dataset_friends_pt["train"],
#                  eval_dataset=dataset_friends_pt["test"])

In [13]:
#!pip install wandb

In [14]:
#import wandb
#from huggingface_hub import notebook_login

#notebook_login()
#wandb.init(mode="disabled")

In [15]:
# hide_output
#torch.cuda.empty_cache()
#trainer.train()
# To save your fine-tuned model:
#trainer.save_model("dialogue-summ-model-bart")

To use a pre-tuned model run the following

In [18]:
!gdown 1V4JaqrDANpsxEU-IOt61Bj8GQ0FOzwLq
!gunzip dialogue-summ-model-bart.tar.gz
!tar xf dialogue-summ-model-bart.tar

Downloading...
From (original): https://drive.google.com/uc?id=1V4JaqrDANpsxEU-IOt61Bj8GQ0FOzwLq
From (redirected): https://drive.google.com/uc?id=1V4JaqrDANpsxEU-IOt61Bj8GQ0FOzwLq&confirm=t&uuid=1fbf5456-bc4a-47eb-b5dc-2a9622ddacbd
To: /content/dialogue-summ-model-bart.tar.gz
100% 517M/517M [00:02<00:00, 196MB/s]


In [20]:
from transformers import AutoTokenizer, BartForConditionalGeneration
model_ckpt="./dialogue-summ-model-bart"
device="cuda"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)

### Generating And Evaluating Dialogue

In [21]:
torch.cuda.empty_cache()
predictions=[]
for i in range(dataset_friends_test.shape[0]):
  input_ = tokenizer.batch_encode_plus(dataset_friends_test[i:i+1]["context"], max_length=1024, pad_to_max_length=True,truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  responses = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))

In [24]:
import evaluate
references=dataset_friends_test[:]["response"]
bleu = evaluate.load("bleu")
bleu.add(predictions=str(predictions), references=str(references))
results = bleu.compute()
print(results)

{'bleu': 0.13271856523562672, 'precisions': [0.8135761589403974, 0.5339516396157669, 0.34095427435387676, 0.18230029830957906], 'brevity_penalty': 0.3274053440806567, 'length_ratio': 0.47246558197747185, 'translation_length': 3020, 'reference_length': 6392}


In [23]:
import evaluate
references=dataset_friends_test[:]["response"]
rouge = evaluate.load("rouge")
rouge.add(predictions=str(predictions), references=str(references))
results = rouge.compute()
print(results)

{'rouge1': np.float64(0.37010548900411233), 'rouge2': np.float64(0.13843677338579863), 'rougeL': np.float64(0.14696942606829969), 'rougeLsum': np.float64(0.14696942606829969)}
