# Chapter 6 Text Summarization

In [1]:
# Uncomment the lines below on a fresh environment (e.g. Colab) to install the
# packages this notebook imports.
# ! pip install datasets
# ! pip install transformers
# ! pip install evaluate
# ! pip install nltk

In [2]:
from datasets import load_dataset

# Load version 3.0.0 of the CNN/DailyMail configuration.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Show the column schema of the training split.
print(f"features: {dataset['train'].features}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


features: {'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None)}


In [3]:
# Use the first training example as the running example of the notebook.
sample = dataset["train"][0]

# Article: total character count followed by a 500-character preview.
print(f"sample length: {len(sample['article'])}", sample['article'][:500], sep="\n")

# Reference summary ("highlights"): character count followed by the full text.
print("summary:", len(sample['highlights']))
print(sample['highlights'])


sample length: 2527
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s
summary: 217
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [4]:
# Work on the first 2000 characters of the article only
# (presumably to stay within the models' input limits -- TODO confirm).
sample_text = sample["article"][:2000]
print(sample_text)

# Generated summaries are collected here, keyed by the method that produced them.
summaries = {}

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

Split text into sentences with NLTK so that each summary sentence can be placed on its own line.

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer data. Older NLTK releases read the
# pickled "punkt" models, while current releases load "punkt_tab" instead,
# so download both to keep `sent_tokenize` working on either version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Abbreviations such as "U.S." and "U.N." contain periods that must not be
# mistaken for sentence boundaries.
string = "The U.S. are a country. The U.N. is an organization."

sentences = sent_tokenize(string)

print(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['The U.S. are a country.', 'The U.N. is an organization.']


## Baseline

In [6]:
def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one sentence per line.

    The text is split with NLTK's ``sent_tokenize``. If it contains fewer
    than three sentences, all of them are returned.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

# Compute the baseline once, store it, and print the stored value.
summaries["baseline"] = three_sentence_summary(sample_text)

print(summaries["baseline"])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.
Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.
"I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.


## GPT-2

In [7]:
from transformers import pipeline, set_seed

# Seed the random number generators so the generated continuation is repeatable.
set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl", tokenizer="gpt2-xl")

# GPT-2 is a plain text-generation model; appending "TL;DR:" to the article
# prompts it to continue with a summary.
gpt2_query = sample_text + "\nTL;DR:\n"
print(gpt2_query)

pipe_out = pipe(gpt2_query, truncation=True, max_length=512, clean_up_tokenization_spaces=True)

# The generated text starts with the prompt, so slice it off and split the
# continuation into sentences once, reusing the result below.
gpt2_sentences = sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query):])
print(gpt2_sentences)

# Join with a real newline ("\n", not the literal "/n") so the stored summary
# has one sentence per line, matching the baseline summary format.
summaries["gpt2"] = "\n".join(gpt2_sentences)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

In [8]:
# Inspect the raw pipeline output: the generated text begins with the prompt itself.
pipe_out[0]["generated_text"]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

## T5

In [9]:
# T5 is run through the dedicated summarization pipeline, so the article is
# passed as-is without a hand-written prompt.
pipe = pipeline("summarization", model="t5-large", tokenizer="t5-large")

pipe_out = pipe(sample_text, clean_up_tokenization_spaces=True)

# Split the generated summary into sentences once and reuse the result.
t5_sentences = sent_tokenize(pipe_out[0]["summary_text"])
print(t5_sentences)

# Join with a real newline ("\n", not the literal "/n") so the stored summary
# has one sentence per line, matching the baseline summary format.
summaries["t5"] = "\n".join(t5_sentences)

['Harry Potter star Daniel Radcliffe turns 18 on monday.', 'the young actor says he has no plans to fritter his cash away.', "details of how he'll mark his landmark birthday are under wraps."]


In [9]:
import evaluate

In [8]:
from evaluate import load

# Load the BLEU metric through the `evaluate` library.
bleu_metric = load("bleu")

# Queue one toy example. The prediction is missing one "the" compared with
# the reference, so it is shorter and incurs a brevity penalty.
bleu_metric.add(
    predictions="the cat is on mat",
    references=["the cat is on the mat"],
)

# compute() scores everything that has been added so far.
results = bleu_metric.compute()

print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.5789300674674098, 'precisions': [1.0, 0.75, 0.6666666666666666, 0.5], 'brevity_penalty': 0.8187307530779819, 'length_ratio': 0.8333333333333334, 'translation_length': 5, 'reference_length': 6}
