In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Thu Jun 30 16:21:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install rouge-score
!pip install nltk

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset, load_metric
import torch, random
import nltk
from nltk import sent_tokenize

In [None]:
# `sent_tokenize` needs the Punkt sentence-tokenizer data. Older NLTK releases
# read the 'punkt' package, while newer ones (3.9+) look for 'punkt_tab', so
# fetch both to keep the notebook working with whichever version is installed.
# `nltk.download` reports an unknown package instead of raising, so requesting
# both is harmless.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load the ROUGE metric with a fixed seed.
# NOTE(review): the seed presumably makes the metric's resampled aggregate
# scores repeatable between runs — confirm against the datasets Metric docs.
rouge = load_metric(
    "rouge",
    seed=2022,
)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Checkpoint identifier used to load both the tokenizer and the model.
model_checkpoint = "facebook/bart-large-cnn"

In [None]:
# Tokenizer and weights are loaded from the same checkpoint.
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

# Load the conditional-generation model and move it onto the selected device.
model = BartForConditionalGeneration.from_pretrained(model_checkpoint).to(device)

# Trailing expression so the cell still echoes the module structure.
model

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((102

In [None]:
# Only the test split of CNN/DailyMail (config '3.0.0') is loaded for evaluation.
data_test = load_dataset(
    path='cnn_dailymail',
    name='3.0.0',
    split='test',
)

Downloading builder script:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


In [None]:
# Number of examples handed to `generate_summary` per `map` call.
batch_size: int = 8

In [None]:
# Map each batch of articles to its generated summaries
def generate_summary(batch):
    """Summarize a batch of articles with the globally loaded BART model.

    Designed to be passed to ``Dataset.map(..., batched=True)``; relies on the
    module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: Mapping whose ``'article'`` entry holds the article texts of
            the current batch.

    Returns:
        The same ``batch`` object, with a ``'pred'`` entry added that holds
        one decoded summary string per article.
    """
    # Special tokens are left enabled (the tokenizer default) so every article
    # is wrapped in <s> ... </s>, the input format BART checkpoints expect.
    # Disabling them feeds the encoder sequences without BOS/EOS markers.
    # Truncation still keeps the total length within 1024 tokens.
    inputs = tokenizer(batch['article'],
                       padding=True,
                       truncation=True,
                       max_length=1024,
                       return_tensors='pt')
    input_ids = inputs.input_ids.to(device)
    # The mask lets the model ignore the padding added to shorter articles.
    attention_mask = inputs.attention_mask.to(device)

    # The following decoding parameters are set according to the BART's config.json file
    outputs = model.generate(input_ids,
                             attention_mask=attention_mask,
                             min_length=56,
                             max_length=142,
                             num_beams=4,
                             no_repeat_ngram_size=3,
                             length_penalty=2.0,
                             early_stopping=True
                             )

    # all special tokens will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch['pred'] = output_str

    return batch

In [None]:
# Generate a summary for every test article, batch by batch. The raw article
# text is dropped from the result since only summaries are compared afterwards.
results = data_test.map(
    generate_summary,
    remove_columns=['article'],
    batch_size=batch_size,
    batched=True,
)

# Generated summaries and their reference highlights, index-aligned.
predictions = results['pred']
labels = results['highlights']

  0%|          | 0/1437 [00:00<?, ?ba/s]

In [None]:
# ROUGE expects a newline after each sentence, so split every prediction into
# sentences and put each one on its own line.
clean_preds = ["\n".join(sent_tokenize(pred.replace('[X_SEP]', ' '))) for pred in predictions]
# Remove the space that precedes a period in the reference highlights.
clean_labels = [label.replace(" .", ".") for label in labels]

# Printing every test item floods the notebook (output gets truncated and the
# IOPub data rate limit is hit), so only preview the first few pairs.
num_preview = 5
for i in range(min(num_preview, len(predictions))):
    print(f"Item {i}:")
    print("Label:")
    print(clean_labels[i])
    print("\n")
    print("Prediction:")
    print(clean_preds[i])
    print("\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Blackpool went on to earn a point after a 1-1 draw with Reading.
Lee Clark admitted the club face a challenge to win back their stay-away fans.


Item 3533:
Label:
Former Brazil striker Ronaldo played for Real Madrid between 2002-07.
The two-time World Cup winner officially retired from playing in 2011.
Cristiano Ronaldo scored his 50th goal of the season against  Malaga.
Real Madrid beat Malaga 3-1 to keep up the pressure on leaders Barcelona.


Prediction:
The 38-year-old ended his illustrious playing career in 2011.
Ronaldo admits a return to top level football is past him but would pick the current Real Madrid talisman to play alongside.
'It's not possible but it would be spectacular!'
he said of the current Ballon d'Or winners Lionel Messi and Cristiano Ronaldo.


Item 3534:
Label:
Blackpool goalkeeper Joe Lewis's shirt was meant to go to a club sponsor.
Lewis tried to get another but was told they had no spares to r

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Label:
Seven years ago, insurance saleswoman Katia Apalategui lost her father.
Her grieving mother coped with loss by sniffing late husband's pillowcase.
Inspired her to come up with permanent way to capture person's scent.
But bottles of loved ones perfume will set customers back £400 a bottle.


Prediction:
Katia Apalategui was inspired after seeing her mother cope with the loss of her husband by clinging to his pillowcase.
She came up with a more permanent way to capture a person's individual scent in a bid to help others in mourning.
To help develop her idea, she teamed up with the Havre university in France, where researchers have developed a technique to reproduce the human smell.


Item 11160:
Label:
SNL castmember skewers Hillary Clinton, Obama and Republican hopefuls.
Twenty-minute speech poked fun at BuzzFeed, Brian Williams scandal.
Jokes touched on police brutality and Secret Service security lapses.


Predict

In [None]:
# Score the cleaned predictions against the cleaned references (with stemming),
# then report the mid F-measure of each ROUGE variant as a percentage rounded
# to two decimals.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_output = rouge.compute(
    predictions=clean_preds,
    references=clean_labels,
    use_stemmer=True,
)
rouge_dict = {
    name: round(rouge_output[name].mid.fmeasure * 100, 2)
    for name in rouge_names
}

print(rouge_dict)

{'rouge1': 43.84, 'rouge2': 20.94, 'rougeL': 30.28, 'rougeLsum': 40.75}
