In [1]:
%%bash

pip install --upgrade pip
pip install  --disable-pip-version-check \
  torch==1.13.1 \
  torchdata==0.5.1 --quiet

pip install transformers==4.27.2 --quiet
pip install datasets==2.11.0 --quiet
pip install evaluate==0.4.0 --quiet
pip install rouge_score==0.1.2 --quiet
pip install loralib==0.1.1 --quiet
pip install peft==0.3.0 --quiet





In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, GenerationConfig, Trainer
from datasets import load_dataset
import time
import evaluate

In [3]:
# data processing
data_name = 'knkarthick/dialogsum'
model_name = 'google/flan-t5-small'


dataset = load_dataset(data_name)

print(dataset)

model= AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer function
def tokenizer_function(example):
  start_prompt = 'Summarize the following conversation'
  end_prompt = '\n\nSummary: '
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]

  example['input_ids'] = tokenizer(
      prompt, padding='max_length', truncation=True, return_tensors='pt'
      ).input_ids
  example['labels'] = tokenizer(
      example['summary'], padding='max_length', truncation=True, return_tensors='pt'
      ).input_ids

  return example

tokenized_datasets = dataset.map(tokenizer_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    ['id', 'topic', 'dialogue', 'summary']
    )

# print the dataset shape
print(f'Shape of the datasets:')
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/3 [00:00<?, ?it/s]



DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})




Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Shape of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)


In [5]:
# train the model
  # full fine tuning
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16
    # max_steps=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

# train the data
st = time.time()
trainer.train()
print(f'time taken to train the model: {time.time()-st} sec')

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 14.75 GiB total capacity; 14.48 GiB already allocated; 41.06 MiB free; 14.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Compare the orignial model summaries & Fine-tuned model summaries

from tqdm import tqdm

instruct_model = trainer.model.to('cpu')

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

# dialogues = dataset['test']['dialogue']
# human_baseline_summaries = dataset['test']['summary']

original_model_summaries = []
trained_model_summaries = []

pbar = tqdm(total=len(dialogues))
for _, dialogue in enumerate(dialogues):
  prompt = f"""
SUmmarize the following conversation.
{dialogue}
Summary: """

  input_ids = tokenizer(prompt, return_tensors='pt').input_ids
  original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
  original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  trained_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
  trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)
  trained_model_summaries.append(trained_model_text_output)
  pbar.update(1)
pbar.close()

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, trained_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['huam_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df.head()

In [None]:
# evaluate the summaries using Rogue
import evaluate

rouge = evaluate.load('rouge')

test = df.loc[2:4, :]
original_model_summaries = test['original_model_summaries'].to_list()
human_baseline_summaries = test['huam_baseline_summaries'].to_list()
trained_model_summaries = test['instruct_model_summaries'].to_list()

original_model_result = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True
)

instruction_model_result = rouge.compute(
    predictions=trained_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer=True
)

print('Original Model')
print(original_model_result)
print('Instruction Tuned')
print(instruction_model_result)