# Training / Fine-tuning a Dialogue model based on a toddler

We are going to look at model fine-tuning by taking a general-purpose language model and fine-tuning it to perform dialogue in the style of a toddler interacting with their caregiver.

In [None]:
# Install the libraries this notebook relies on via shell-escaped pip calls.
# accelerate and transformers are upgraded to their newest release (-U); the
# remaining packages are installed without a forced upgrade. No versions are pinned.
!pip install accelerate -U
!pip install transformers -U
!pip install datasets
!pip install py7zr
!pip install tiktoken
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

In [None]:
import transformers
from transformers import pipeline, set_seed
import py7zr
import accelerate
import pandas as pd
import torch
import numpy as np

Download training and test data - selected and preprocessed pairs of caregiver utterances with child responses from this corpus:

https://childes.talkbank.org/access/Eng-UK/Thomas.html

In [None]:
# Download the preprocessed corpus file by id with the gdown command-line tool.
# NOTE(review): the next cell reads "thomas-clean.csv", so this download is
# presumably that file — confirm. gdown itself is not installed by the pip cell
# above; presumably it is already present in the runtime (e.g. Colab) — confirm.
!gdown 1iY6xKKp455CCtoMBsONIQKtdarxchB-J

In [None]:
# Read the utterance/response table and discard every row that has a missing value.
df = pd.read_csv("thomas-clean.csv").dropna()

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Wrap the cleaned frame as a Hugging Face Dataset and carve off a train/test
# split in one chained call. test_size=0.001 holds out 0.1% of the rows, and the
# fixed seed keeps the split identical across runs.
ds = Dataset.from_pandas(df).train_test_split(test_size=0.001, seed=99)


In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained, untuned BART-base tokenizer and seq2seq model.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

Update the model vocabulary to include words from the child-caregiver speech corpus

In [None]:
# Gather every caregiver utterance (CONTEXT) and child response (RESPONSE)
# from both splits into one flat list of strings.
text = []
for column in ("CONTEXT", "RESPONSE"):
    for split in ("train", "test"):
        text.extend(ds[split][:][column])

# Unique whitespace-separated words in the corpus. They are sorted because set
# iteration order for strings changes between interpreter sessions, and
# add_tokens assigns ids in the order given; sorting keeps the ids reproducible.
tokenset = sorted(set(' '.join(text).split()))

print(len(tokenizer))  # vocabulary size before adding corpus words
tokenizer.add_tokens(tokenset)
print(len(tokenizer))  # vocabulary size after adding corpus words

# Grow the model's embedding matrix to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

Examine the performance of the untuned BART model in producing responses to the caregiver utterances in the test data

In [None]:
torch.cuda.empty_cache()

# Generate one response per held-out caregiver utterance with the untuned model.
vanilla_predictions = []
for context in ds["test"][:]["CONTEXT"]:
  # Tokenize a single-example batch. Only one padding option is passed: the
  # deprecated pad_to_max_length flag contradicted padding='longest'.
  input_ = tokenizer([context], max_length=1024, truncation=True,
                     padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  # Beam search with 100 beams; no_repeat_ngram_size=2 forbids repeated bigrams.
  responses = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  vanilla_predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))


In [None]:
# Display the responses generated by the untuned model for the test contexts.
vanilla_predictions

In [None]:
import evaluate

# Gold child responses for the held-out contexts.
references = ds["test"][:]["RESPONSE"]
bleu = evaluate.load("bleu")

# Score each prediction against its own reference. The lists are passed as-is:
# wrapping them in str() would collapse the whole test set into one string
# (brackets and quotes included) and score that as a single example.
# BLEU takes a list of acceptable references per prediction, hence the nesting.
results = bleu.compute(predictions=vanilla_predictions,
                       references=[[ref] for ref in references])
print(results)


In [None]:
import evaluate

# Gold child responses for the held-out contexts.
references = ds["test"][:]["RESPONSE"]
rouge = evaluate.load("rouge")

# Score each prediction against its own reference. The lists are passed as-is:
# wrapping them in str() would collapse the whole test set into one string
# (brackets and quotes included) and score that as a single example.
results = rouge.compute(predictions=vanilla_predictions, references=references)
print(results)

### Fine-Tuning

To fine-tune the model yourself, uncomment the next five code cells and run them. Note that training will take a good few hours.

In [None]:
# (Disabled by default: remove the leading '#' from each code line to enable.)
# Tokenizes CONTEXT as the model input and RESPONSE as the labels, both
# truncated to 1024 tokens, maps that over every split of ds, and exposes the
# three resulting columns as torch tensors.
# NOTE(review): as_target_tokenizer() is reported as deprecated in recent
# transformers releases in favour of tokenizer(text_target=...) — confirm
# against the installed version before enabling.
#def convert_examples_to_features(example_batch):
#    input_encodings = tokenizer(example_batch["CONTEXT"], max_length=1024,
#                                truncation=True)

#    with tokenizer.as_target_tokenizer():
#        target_encodings = tokenizer(example_batch["RESPONSE"], max_length=1024,
#                                     truncation=True)

#    return {"input_ids": input_encodings["input_ids"],
#            "attention_mask": input_encodings["attention_mask"],
#            "labels": target_encodings["input_ids"]}

#dataset_pt = ds.map(convert_examples_to_features,
#                                       batched=True)
#columns = ["input_ids", "labels", "attention_mask"]
#dataset_pt.set_format(type="torch", columns=columns)

In [None]:
# (Disabled by default: remove the leading '#' from each code line to enable.)
# Builds the seq2seq data collator, the training arguments and the Trainer.
# Per-device batch size is 1 with 128 gradient-accumulation steps, for 6 epochs.
# NOTE(review): eval_steps=250 is set but no evaluation-strategy argument is
# passed; presumably periodic evaluation never runs — confirm.
# NOTE(review): save_steps=1e6 is a float; presumably it is meant to suppress
# intermediate checkpoints — confirm it is accepted by the installed version.
#from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

#seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#training_args = TrainingArguments(
#    output_dir='dialogue-thomas', num_train_epochs=6, warmup_steps=500,
#    per_device_train_batch_size=1, per_device_eval_batch_size=1,
#    weight_decay=0.01, logging_steps=10, push_to_hub=False,
#    eval_steps=250, save_steps=1e6,gradient_accumulation_steps=128)

#trainer = Trainer(model=model, args=training_args,
#                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
#                  train_dataset=dataset_pt["train"],
#                  eval_dataset=dataset_pt["test"])

In [None]:
# (Disabled by default.) Installs wandb, which the following cell imports.
#!pip install wandb

In [None]:
# (Disabled by default.) Opens the Hugging Face Hub login prompt and starts
# wandb in "disabled" mode.
# NOTE(review): the training arguments set push_to_hub=False, so the Hub login
# is presumably optional — confirm.
#import wandb
#from huggingface_hub import notebook_login

#notebook_login()
#wandb.init(mode="disabled")

In [None]:
# hide_output
# (Disabled by default.) Clears the CUDA cache, then runs fine-tuning.
#torch.cuda.empty_cache()
#trainer.train()
# To save your fine-tuned model:
#trainer.save_model("dialogue-thomas-model-bart")
# NOTE(review): the checkpoint loaded further down is
# "./dialogue-thomas-model-bart-6e", whereas this saves to
# "dialogue-thomas-model-bart"; model_ckpt must be changed to load a model saved here.

To use a pre-tuned model instead, run the following cells:

In [None]:
# Download the pre-tuned checkpoint archive by id with the gdown command-line tool.
!gdown 1iYiJtoo1cM5v5oJgKZOlRCG-zM2yGKk5

# Decompress, then unpack. -f lets gunzip overwrite a .tar left behind by an
# earlier run, so re-running this cell does not stop at "already exists".
!gunzip -f dialogue-thomas-model-bart-6e.tar.gz
!tar xf dialogue-thomas-model-bart-6e.tar

In [None]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# Directory holding the fine-tuned checkpoint.
model_ckpt = "./dialogue-thomas-model-bart-6e"

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# Replace the base tokenizer/model with the ones stored in the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)

### Generating And Evaluating Dialogue

In [None]:
torch.cuda.empty_cache()

# Generate one response per held-out caregiver utterance with the current model.
predictions = []
for context in ds["test"][:]["CONTEXT"]:
  # Tokenize a single-example batch. Only one padding option is passed: the
  # deprecated pad_to_max_length flag contradicted padding='longest'.
  input_ = tokenizer([context], max_length=1024, truncation=True,
                     padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  # Beam search with 100 beams; no_repeat_ngram_size=2 forbids repeated bigrams.
  responses = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         num_beams=100,
                         no_repeat_ngram_size=2,
                         early_stopping=True,
                         num_return_sequences=1,
                         max_length=1024)
  predictions.extend(tokenizer.batch_decode(responses, skip_special_tokens=True))

In [None]:
# Display the responses generated in the previous cell for the test contexts.
predictions

In [None]:
import evaluate

# Gold child responses for the held-out contexts.
references = ds["test"][:]["RESPONSE"]
bleu = evaluate.load("bleu")

# Score each prediction against its own reference. The lists are passed as-is:
# wrapping them in str() would collapse the whole test set into one string
# (brackets and quotes included) and score that as a single example.
# BLEU takes a list of acceptable references per prediction, hence the nesting.
results = bleu.compute(predictions=predictions,
                       references=[[ref] for ref in references])
print(results)

In [None]:
import evaluate

# Gold child responses for the held-out contexts.
references = ds["test"][:]["RESPONSE"]
rouge = evaluate.load("rouge")

# Score each prediction against its own reference. The lists are passed as-is:
# wrapping them in str() would collapse the whole test set into one string
# (brackets and quotes included) and score that as a single example.
results = rouge.compute(predictions=predictions, references=references)
print(results)