# Importing Necessary Libraries

In [1]:
!pip install transformers datasets nltk evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=62c6a80554bed3f9b505e705aca4eb76e3f5f311c42ff13f386bd477fb8d82b0
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.3 rouge_score-0.1.2


In [2]:
import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer, pipeline, EarlyStoppingCallback
import nltk
from nltk.tokenize import sent_tokenize
import evaluate
import warnings
warnings.filterwarnings("ignore")

In [3]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Preparing the Data

In [4]:
dataset = load_dataset('xsum', trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [5]:
# Randomly select 10,000 rows from the train dataset
train_subset = dataset['train'].shuffle(seed=42).select(range(10000))

# Randomly select 1,000 rows from the validation and test datasets
val_set = dataset['validation'].shuffle(seed=42).select(range(1000))
test_set = dataset['test'].shuffle(seed=42).select(range(1000))

# Combine all sets into a DatasetDict
xsum_dataset = DatasetDict({
    'train': train_subset,
    'validation': val_set,
    'test': test_set
})

In [6]:
xsum_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1000
    })
})

In [7]:
xsum_dataset["train"][0]

{'document': 'In Wales, councils are responsible for funding and overseeing schools.\nBut in England, Mr Osborne\'s plan will mean local authorities will cease to have a role in providing education.\nAcademies are directly funded by central government and head teachers have more freedom over admissions and to change the way the school works.\nIt is a significant development in the continued divergence of schools systems on either side of Offa\'s Dyke.\nAnd although the Welsh Government will get extra cash to match the money for English schools to extend the school day, it can spend it on any devolved policy area.\nMinisters have no plans to follow suit.\nAt the moment, governing bodies are responsible for setting school hours and they need ministerial permission to make significant changes.\nThere are already more than 2,000 secondary academies in England and its extension to all state schools is unlikely to shake the Welsh Government\'s attachment to what they call a "community, compr

# Preprocessing

In [8]:
model_checkpoint = "facebook/bart-large"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [11]:
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        truncation=True,
    )
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], 
                           max_length=max_target_length, 
                           truncation=True,
                          )
        
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
tokenized_datasets = xsum_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=xsum_dataset["train"].column_names
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [14]:
tokenized_datasets["train"][0]

{'input_ids': [0,
  1121,
  5295,
  6,
  14751,
  32,
  2149,
  13,
  1435,
  8,
  14264,
  1304,
  4,
  50118,
  1708,
  11,
  1156,
  6,
  427,
  17093,
  18,
  563,
  40,
  1266,
  400,
  1247,
  40,
  14342,
  7,
  33,
  10,
  774,
  11,
  1976,
  1265,
  4,
  50118,
  26145,
  625,
  26804,
  32,
  2024,
  6140,
  30,
  1353,
  168,
  8,
  471,
  2948,
  33,
  55,
  3519,
  81,
  18054,
  8,
  7,
  464,
  5,
  169,
  5,
  334,
  1364,
  4,
  50118,
  243,
  16,
  10,
  1233,
  709,
  11,
  5,
  1143,
  37178,
  9,
  1304,
  1743,
  15,
  1169,
  526,
  9,
  4995,
  102,
  18,
  10179,
  1071,
  4,
  50118,
  2409,
  1712,
  5,
  12093,
  1621,
  40,
  120,
  1823,
  1055,
  7,
  914,
  5,
  418,
  13,
  2370,
  1304,
  7,
  4442,
  5,
  334,
  183,
  6,
  24,
  64,
  1930,
  24,
  15,
  143,
  8709,
  19084,
  714,
  443,
  4,
  50118,
  20086,
  16729,
  33,
  117,
  708,
  7,
  1407,
  3235,
  4,
  50118,
  3750,
  5,
  1151,
  6,
  8182,
  3738,
  32,
  2149,
  13,
  2749,
  33

# Fine-Tuning and Evaluation

In [15]:
batch_size = 8
num_train_epochs = 15

logging_steps = len(tokenized_datasets["train"])
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"./{model_name}-finetuned-xsum",
    report_to='none',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    load_best_model_at_end=True,
)

# Define the EarlyStoppingCallback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)

In [16]:
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    
    if isinstance(result, dict):
        # Extract the median scores if available
        result = {key: (value.mid.fmeasure * 100 if hasattr(value, 'mid') else value * 100) 
                  for key, value in result.items()}
    else:
        # Handle the case where the result is just a scalar
        result = {key: value * 100 for key, value in result.items()}
    
    # Return rounded results
    return {k: round(v, 4) for k, v in result.items()}

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.954776,35.0318,14.3833,28.5962,28.6004
2,No log,1.9264,35.6681,14.9305,29.3805,29.3722
3,No log,1.997019,35.3385,14.5819,28.9208,28.94
4,No log,2.10295,35.5346,14.7639,29.2719,29.2856
5,No log,2.25299,34.9008,14.4824,28.751,28.7493


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3125, training_loss=1.4599825, metrics={'train_runtime': 13421.6187, 'train_samples_per_second': 11.176, 'train_steps_per_second': 0.698, 'total_flos': 1.0470098383129805e+17, 'train_loss': 1.4599825, 'epoch': 5.0})

In [21]:
trainer.evaluate()

{'eval_loss': 1.9263995885849,
 'eval_rouge1': 35.6681,
 'eval_rouge2': 14.9305,
 'eval_rougeL': 29.3805,
 'eval_rougeLsum': 29.3722,
 'eval_runtime': 346.6377,
 'eval_samples_per_second': 2.885,
 'eval_steps_per_second': 0.182,
 'epoch': 5.0}

In [22]:
trainer.save_model(f"./{model_name}-finetuned-xsum")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


# Testing the Model

In [23]:
device = 0 if torch.cuda.is_available() else -1

In [24]:
model_checkpoint = f"./{model_name}-finetuned-xsum"
summarizer = pipeline("summarization", model=model_checkpoint, device=device)

In [25]:
def print_summary(idx):
    article = xsum_dataset["test"][idx]["document"]
    reference_summary = xsum_dataset["test"][idx]["summary"]
    generated_summary = summarizer(article)[0]["summary_text"]
    print(f"'>>> Article: {article}'")
    print(f"\n'>>> Reference_summary: {reference_summary}'")
    print(f"\n'>>> Generated_summary: {generated_summary}'")

In [26]:
print_summary(0)

'>>> Article: Sarah Johnson was one of 21 women heading to Liverpool when their minibus was hit by a lorry on the M62.
Her friend Bethany Jones, 18, was killed while Ms Johnson and several others were badly hurt.
Minibus driver James Johnson was jailed for more than six years for causing Bethany's death, in April 2013.
Ms Johnson, who broke her shoulder, back and pelvis, said the help she received from a charity while in hospital led her to want to support others.
Speaking publicly for the first time about the crash, Ms Johnson described how everyone was "excited and giddy" for the hen party.
"To me the impact was just a massive explosion," she said.  "I thought the bus had blown up.
"I remember the bus dropping on its side. The next thing, I woke up on the roadside so I'd actually come out of the window."
Ms Johnson was taken to Leeds General Infirmary where she, along with Bethany's sister Amy Firth, underwent major surgery and spent time in intensive care.
Whilst she was there she g

In [27]:
print_summary(100)

'>>> Article: In a series of Freedom of Information requests, BBC Scotland asked how many cases, concerns and complaints of child exploitation, and child sexual exploitation were recorded in the past four years.
But the force said the figures were not held on a single system.
Scotland's Children's Commissioner said the revelation was "disturbing".
It comes on the day that officers launched a new unit to help with what police acknowledge is the complex problem of child abuse and neglect across Scotland.
It will provide expertise to allow local teams to robustly investigate cases, including child sexual exploitation, online offences and other types of abuse.
In its response to the BBC's FOI requests, Police Scotland said: "Child exploitation covers a broad range of criminal activity and doesn't correlate to a specific offence, but rather can involve a range of differing offences...The level of data you have requested is not held on a single system."
It comes after the publication last su