In [1]:
!nvidia-smi

Sun Jul  3 19:22:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
!pip install datasets
!pip install transformers
!pip install rouge-score
!pip install nltk
!pip install sentencepiece

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_dataset, load_metric
import torch, random
import nltk
from nltk import sent_tokenize

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
rouge = load_metric("rouge", seed=2022)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [7]:
model_checkpoint = 'google/pegasus-cnn_dailymail'

tokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)

model = PegasusForConditionalGeneration.from_pretrained(model_checkpoint)
model.to(device)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): Lay

In [8]:
data_test = load_dataset('cnn_dailymail', '3.0.0', split='test')

Downloading builder script:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


In [9]:
batch_size = 4

In [10]:
# map data correclty
def generate_summary(batch):
    inputs = tokenizer(batch['article'],
                       padding=True,
                       truncation=True,
                       max_length=1024,
                       add_special_tokens=False,
                       return_tensors='pt')
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # The following decoding parameters are set according to the PEGASUS's config.json file
    outputs = model.generate(input_ids,
                             attention_mask=attention_mask,
                             min_length=32,
                             max_length=128,
                             num_beams=8,
                             length_penalty=0.8
                             )

    # all special tokens will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch['pred'] = output_str

    return batch

In [11]:
results = data_test.map(generate_summary,
                        batched=True,
                        batch_size=batch_size,
                        remove_columns=['article'])

labels = results['highlights']
predictions = results['pred']



  0%|          | 0/2873 [00:00<?, ?ba/s]

In [13]:
# ROUGE expects a newline after each sentence
clean_preds = ["\n".join(sent_tokenize(pred.replace("<n>", " " ))) for pred in predictions]
clean_labels = [label.replace(" .", ".") for label in labels]

for i in range(len(predictions)):
    print(f"Item {i}:")
    print(f"Ground truth: {clean_labels[i]}")
    print(f"Prediction: {clean_preds[i]}")
    print("\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Prediction: Franck Ribery, Mehdi Benatia, David Alaba and Arjen Robben were all injured for Bayern Munich's clash with Porto.
Pep Guardiola's side raced into a 5-0 lead at half-time before claiming a 6-1 victory to win the tie 7-4 on aggregate.
Holland international Arjen Robben was unavailable for the Champions League clash with an abdominal injury.


Item 4257:
Ground truth: Cristiano Ronaldo's practice shot flies into the crowd hitting young fan.
Ronaldo shows concern but continues his warm-up until the final drill.
Portuguese icon wheels away to behind goal where the stricken fan stands.
Ballon d'Or holder takes off his training shirt and presents it to tearful boy.
CLICK HERE to see who Ronaldo will be facing in the Champions League.
Prediction: Real Madrid beat Atletico Madrid 1-0 at the Bernabeu on Wednesday night.
Cristiano Ronaldo scored the only goal of the game in the 88th minute.
Before the game the Portuguese

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
rouge_output = rouge.compute(predictions=clean_preds, references=clean_labels, use_stemmer=True)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(rouge_output[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)

print(rouge_dict)

{'rouge1': 44.14, 'rouge2': 21.39, 'rougeL': 31.09, 'rougeLsum': 41.14}


In [15]:
# ROUGE expects a newline after each sentence
clean_preds = ["\n".join(sent_tokenize(pred.replace("<n>", "\n"))) for pred in predictions]
clean_labels = [label.replace(" .", ".") for label in labels]

for i in range(len(predictions)):
    print(f"Item {i}:")
    print(f"Ground truth: {clean_labels[i]}")
    print(f"Prediction: {clean_preds[i]}")
    print("\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
They came to the fore in FP1, finishing first and second respectively.
Hamilton was faster than Rosberg in sectors one and two, but a lock-up in sector three cost him four tenths of a second.
Sebastian Vettel had problems throughout the day, and his car was clipped by Sergio Perez in FP2, ripping off his front-wing endplate on the left side.
Prediction: Nico Rosberg was marginally quicker than Mercedes team-mate Lewis Hamilton.
Hamilton suffered a lock-up in the second sector of the Bahrain International Circuit.
That allowed Rosberg to lead the way with a lap of one minute 34.647secs.
Sebastian Vettel was fourth, behind Rosberg, Hamilton and third-placed Kimi Raikkonen in Bahrain.


Item 3506:
Ground truth: Scans have revealed All Blacks flyhalf Aaron Cruden needs knee surgery.
The 26-year-old injured his knee during clash with Canterbury Crusaders.
Surgery is likely to rule him out of a minimum of six months of action.


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
rouge_output = rouge.compute(predictions=clean_preds, references=clean_labels, use_stemmer=True)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(rouge_output[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)

print(rouge_dict)

{'rouge1': 44.14, 'rouge2': 21.39, 'rougeL': 31.09, 'rougeLsum': 41.23}
