<a href="https://colab.research.google.com/github/alinakhodotovych2022/Project3/blob/main/project3_dialogue_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 3 — Dialogue Summarization with BART-Large
**Author: Alina K.**

## 1. Setup
Install required libraries.

In [1]:
!pip install -q transformers datasets evaluate rouge_score sentencepiece accelerate

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


## 2. Imports

In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import evaluate


## 3. Load SAMSum Dataset

In [3]:
samsum = load_dataset('samsum')
samsum

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetNotFoundError: Dataset 'samsum' doesn't exist on the Hub or cannot be accessed.

## 4. Preprocessing & Tokenization

In [None]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

def preprocess(batch):
    inputs = tokenizer(batch['dialogue'], max_length=512, truncation=True)
    labels = tokenizer(batch['summary'], max_length=96, truncation=True)
    inputs['labels'] = labels['input_ids']
    return inputs

tokenized_samsum = samsum.map(preprocess, batched=True, remove_columns=samsum['train'].column_names)


## 5. Model Initialization

In [None]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

## 6. Training Configuration

In [None]:
batch_size = 2

args = TrainingArguments(
    output_dir='bart-samsum',
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    lr_scheduler_type='linear',
    learning_rate=3e-5,
    fp16=True,
    report_to='none'
)

collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_samsum['train'],
    eval_dataset=tokenized_samsum['validation'],
    data_collator=collator,
    tokenizer=tokenizer
)


## 7. Train Model

In [None]:
trainer.train()

## 8. Evaluation — ROUGE

In [None]:
rouge = evaluate.load('rouge')
preds = trainer.predict(tokenized_samsum['test'])

decoded_preds = tokenizer.batch_decode(preds.predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(preds.label_ids, skip_special_tokens=True)

rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
rouge_results

## 9. Example Inference

In [None]:
example = samsum['test'][0]['dialogue']
inputs = tokenizer(example, return_tensors='pt', truncation=True, max_length=512)
summary_ids = model.generate(**inputs, max_length=80, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

## 10. Save Model

In [None]:
model.save_pretrained('bart-samsum')
tokenizer.save_pretrained('bart-samsum')