<a href="https://colab.research.google.com/github/afylers/text-summarization/blob/master/TransformerPractice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Author : Ramandeep Singh
# Contact : afylers@gmail.com
# Project/Research : Text Summarization
# Last Updated : 04/16/2023

In [1]:
# Installation of packages

# Install Transformers for using models with pipeline API
!pip install transformers

# Install datasets for loading datasets directly from hugging face
!pip install datasets

# Install rouge package for calculating accuracy
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

# New Section

In [2]:
# Loading Libraries

# Load transformers library for text-summarization model
from transformers import pipeline

# Load datasets for dataset load utility
from datasets import list_datasets, load_dataset

# Load rouge for accuracy calculation
from rouge import Rouge



In [3]:
# Load model and dataset
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#dataset = load_dataset("cnn_dailymail", "3.0.0")
billsum = load_dataset("billsum")
dataset = billsum
# Load the Rouge metric with desired settings
rouge = Rouge(metrics=['rouge-1','rouge-2','rouge-l'])
print(dataset)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.70k [00:00<?, ?B/s]

Downloading and preparing dataset billsum/default to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc...


Downloading data:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})


In [4]:
# Generate summaries using your text summarization model
generated_summaries = []
reference_summaries = []
try:
  length=len(dataset['test'])
  length=1
  for i in range(length):
    generated_summary = summarizer(dataset['test']['text'][i] ,max_length=50, min_length=20, truncation=True)
    generated_summaries.append(generated_summary[0]['summary_text'])
    reference_summaries.append(dataset['test']['summary'][i])

except Exception as e:
  print(e)

In [5]:
print(generated_summaries)

['The Water Resources Development Act of 1992 is amended to include a $20,000,000 cap on the amount of money that can be spent on a single project. The project must be technically sound, environmentally acceptable, and economically justified.']


In [6]:
# Calculate ROUGE scores for each summary pair
scores = rouge.get_scores(generated_summaries, reference_summaries, avg=True)
print(scores)

# Print the ROUGE scores and their averages for the dataset
for metric, values in scores.items():
    print(f"ROUGE-{metric} scores:")
    print(values)
    print(f"Average ROUGE-{metric} score: {values['f']}")

{'rouge-1': {'r': 0.10989010989010989, 'p': 0.625, 'f': 0.18691588530701372}, 'rouge-2': {'r': 0.03146853146853147, 'p': 0.24324324324324326, 'f': 0.05572755215098399}, 'rouge-l': {'r': 0.1043956043956044, 'p': 0.59375, 'f': 0.17757009091449039}}
ROUGE-rouge-1 scores:
{'r': 0.10989010989010989, 'p': 0.625, 'f': 0.18691588530701372}
Average ROUGE-rouge-1 score: 0.18691588530701372
ROUGE-rouge-2 scores:
{'r': 0.03146853146853147, 'p': 0.24324324324324326, 'f': 0.05572755215098399}
Average ROUGE-rouge-2 score: 0.05572755215098399
ROUGE-rouge-l scores:
{'r': 0.1043956043956044, 'p': 0.59375, 'f': 0.17757009091449039}
Average ROUGE-rouge-l score: 0.17757009091449039


In [None]:
print(dataset['train'][0])

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [7]:
import numpy as np
from datasets import load_dataset
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Load the dataset
#dataset = load_dataset('cnn_dailymail', '3.0.0')

# Define a function to preprocess the text
def preprocess_function(examples):
    # Tokenize the inputs and targets
    inputs = examples['text']
    targets = examples['summary']
    inputs = tokenizer(inputs, padding=True, truncation=True, max_length=1024)
    targets = tokenizer(targets, padding=True, truncation=True, max_length=128)

    # Convert the input and target lists to NumPy arrays
    input_ids = np.array(inputs['input_ids'])
    attention_mask = np.array(inputs['attention_mask'])
    decoder_input_ids = np.array(targets['input_ids'])[:, :-1]
    decoder_attention_mask = np.array(targets['attention_mask'])[:, :-1]
    labels = np.array(targets['input_ids'])[:, 1:]

    # Create a new dictionary with the preprocessed inputs and targets
    result = {'input_ids': input_ids, 'attention_mask': attention_mask}
    result['decoder_input_ids'] = decoder_input_ids
    result['decoder_attention_mask'] = decoder_attention_mask
    result['labels'] = labels
    return result

# Apply the preprocess function to the dataset
dataset = dataset.map(preprocess_function, batched=True)

# Split the dataset into training and validation sets
train_dataset = dataset['train']
eval_dataset = dataset['test']


Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [1]:
import torch
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer, TrainingArguments, Trainer

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Train the model
model.train()
#model.to('cuda') # use GPU if available
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=500,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()


NameError: ignored