In [None]:
!pip install datasets rouge sentencepiece Transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     

# Fine-tuning do Pegasus usando o XSum

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [None]:
from rouge import Rouge

In [None]:
"""Script for fine-tuning Pegasus
Example usage:
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]

  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()

Reference:
  https://huggingface.co/transformers/master/custom_datasets.html
"""

class PegasusDataset(Dataset):
    """Torch dataset pairing tokenized source documents with tokenized target summaries.

    Args:
        encodings: mapping of tokenizer outputs for the source texts; every value is
            indexable by example position (e.g. ``input_ids``, ``attention_mask``).
        labels: mapping of tokenizer outputs for the target summaries. It must contain
            ``input_ids``; when it also contains ``attention_mask``, padded label
            positions are replaced by ``IGNORE_INDEX``.

    Each item is a dict with one tensor per key of ``encodings`` plus a ``labels`` tensor.
    """

    # Label value skipped by PyTorch's cross-entropy loss (its default ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors of example ``idx``, with label padding masked out."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        if 'attention_mask' in self.labels:
            # Summaries are padded to a common length; without masking, the loss would
            # also be computed on (and dominated by) the padding positions.
            is_real_token = torch.tensor(self.labels['attention_mask'][idx]).bool()
            label_ids = label_ids.masked_fill(~is_real_token, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the label ``input_ids``."""
        return len(self.labels['input_ids'])


In [None]:
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """
    Tokenize the raw text/summary pairs and wrap them as ``PegasusDataset`` objects.

    The tokenizer is loaded from ``model_name``. The training pair is always
    tokenized; the validation and test pairs are tokenized only when both their
    texts and their labels are given, otherwise the corresponding dataset is ``None``.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, tokenizer)``
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    def _encode_pair(documents, summaries):
        # Sources and targets are each truncated and padded by the tokenizer.
        source_encodings = tokenizer(documents, truncation=True, padding=True)
        target_encodings = tokenizer(summaries, truncation=True, padding=True)
        return PegasusDataset(source_encodings, target_encodings)

    def _encode_optional(documents, summaries):
        # An optional split is skipped as soon as either half of the pair is missing.
        if documents is None or summaries is None:
            return None
        return _encode_pair(documents, summaries)

    train_dataset = _encode_pair(train_texts, train_labels)
    val_dataset = _encode_optional(val_texts, val_labels)
    test_dataset = _encode_optional(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset, tokenizer

In [None]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=True,
                        output_dir='./results'):
    """
    Build a ``Trainer`` configured to fine-tune a Pegasus checkpoint.

    Args:
        model_name: checkpoint name or path passed to ``from_pretrained``.
        tokenizer: tokenizer handed to the ``Trainer`` (saved alongside checkpoints).
        train_dataset: dataset used for training.
        val_dataset: optional dataset registered as the trainer's evaluation set.
        freeze_encoder: when true, the encoder parameters are excluded from training.
        output_dir: directory where checkpoints are written.

    Returns:
        The configured ``Trainer``; training has not been started.
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        # Leave the encoder weights untouched; the remaining parameters stay trainable.
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    # No evaluation strategy is set, so no evaluation runs during training.
    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        num_train_epochs=5,  # total number of training epochs
        per_device_train_batch_size=1,  # batch size per device during training, can increase if memory allows
        per_device_eval_batch_size=1,  # batch size for evaluation, can increase if memory allows
        save_steps=1000,  # number of updates steps before checkpoint saves
        save_total_limit=5,  # limit the total amount of checkpoints and deletes the older checkpoints
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=10,
        gradient_accumulation_steps=1
    )

    # The validation set is registered so `trainer.evaluate()` can be called explicitly.
    # No `compute_metrics` is passed: the notebook's `rouge_metric` is defined only in
    # a later cell and takes (preds, summaries, i), which is not the single
    # `EvalPrediction` argument a Trainer metric function receives.
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset,  # evaluation dataset (may be None)
        tokenizer=tokenizer
    )

    return trainer

In [None]:
# Use XSum as the example corpus: the first 1000 training documents for fine-tuning
# and the first 100 validation documents as the validation set.
from datasets import load_dataset

dataset = load_dataset("xsum")

# Slice the rows first and pick the columns afterwards, so only the rows that are
# kept get read instead of a whole column of the split.
train_rows = dataset['train'][:1000]
val_rows = dataset['validation'][:100]
train_texts, train_labels = train_rows['document'], train_rows['summary']
val_texts, val_labels = val_rows['document'], val_rows['summary']

# Use the Pegasus Large model as the base for fine-tuning.
model_name = 'google/pegasus-large'
train_dataset, val_dataset, test_dataset, tokenizer = prepare_data(
    model_name, train_texts, train_labels, val_texts, val_labels)

trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset)
trainer.train()

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 268748800


Step,Training Loss
10,8.902
20,8.8862
30,8.4613
40,9.5213
50,8.8431
60,8.1711
70,8.5965
80,9.9416
90,8.508
100,7.9616


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Configuration saved in ./results/checkpoint-1000/generation_config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Configuration saved in ./results/checkpoint-2000/generation_config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json
Configuration saved in ./results/checkpoint-3000/generation_conf

TrainOutput(global_step=5000, training_loss=5.1586850715637205, metrics={'train_runtime': 2274.2654, 'train_samples_per_second': 2.199, 'train_steps_per_second': 2.199, 'total_flos': 1.444732207104e+16, 'train_loss': 5.1586850715637205, 'epoch': 5.0})

# Inferências:

https://drive.google.com/drive/folders/1Kj4DWV7x5_an0sI2SxiuvudQwKGctqMt

>Clique no link acima e crie, no seu Google Drive, um atalho para a pasta "Results".

In [None]:
# Mount Google Drive at /content/drive; the checkpoint path used for inference
# later in the notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Run inference on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Definindo função Rouge para avaliação da sumarização

In [None]:
from rouge import Rouge

In [None]:
# Module-level logs that rouge_metric appends to: the decoded predictions and the
# decoded reference summaries of every batch it records.
summaries_predict = []
label_predict = []

In [None]:
def rouge_metric(preds, summaries, i):
    """
    Score one batch of generated summaries against its references with ROUGE.

    Args:
        preds: generated token ids, decoded with the global ``tokenizer``.
        summaries: tokenized references; only ``summaries["input_ids"]`` is read.
            Negative ids in it are overwritten in place with the pad token id.
        i: offset of the batch; when it is a multiple of 200, the decoded texts are
            printed and appended to ``summaries_predict`` / ``label_predict``.

    Returns:
        Dict with the averaged F-scores under "Rouge-1", "Rouge-2" and "RougeL".
    """
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Negative label ids cannot be decoded, so they are mapped to padding first.
    label_ids = summaries["input_ids"]
    for row in label_ids:
        row[row < 0] = tokenizer.pad_token_id
    decoded_refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    averaged = Rouge().get_scores(decoded_preds, decoded_refs, avg=True)
    scores = {
        "Rouge-1": averaged["rouge-1"]["f"],
        "Rouge-2": averaged["rouge-2"]["f"],
        "RougeL": averaged["rouge-l"]["f"],
    }

    # Keep a readable sample: only batches whose offset is a multiple of 200.
    should_log = (i % 200 == 0)
    if should_log:
        print("Summary predicted:", decoded_preds)
        print("Summary label:", decoded_refs, "\n\n")
        summaries_predict.append(decoded_preds)
        label_predict.append(decoded_refs)

    return scores

## Instanciando o dataset de teste

Usamos o Xsum para fazer os testes.

In [None]:
from datasets import load_dataset

# Load only the "test" split of XSum; the evaluation subset is drawn from it below.
raw_datasets = load_dataset("xsum", split="test")

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.


In [None]:
# Draw a random 10% of the XSum test split as the evaluation subset (exposed under
# the "test" key). The split is seeded so the same documents, and therefore the same
# ROUGE numbers, come out on every run.
raw_datasets = raw_datasets.train_test_split(
    train_size=0.1, test_size=0.1, seed=42
)

In [None]:
# Keep at most the first 2000 rows of the evaluation subset.
# NOTE(review): presumably this slice yields a dict of column lists; later cells
# index dataset_teste["document"] and dataset_teste["summary"] like lists. Confirm.
dataset_teste = raw_datasets["test"][:2000]

In [None]:
# Drop the name of the split datasets; only dataset_teste is used from here on.
del raw_datasets

## Instanciando o modelo e tokenizador

In [None]:
# Checkpoint directory on the mounted Google Drive.
# NOTE(review): "checkpoint-5000" matches the 5000 optimization steps logged by the
# training run above; confirm it is the checkpoint that was copied to Drive.
PATH = "/content/drive/MyDrive/results/checkpoint-5000"

In [None]:
# Load the tokenizer stored in the checkpoint directory; rouge_metric reads this global.
tokenizer = PegasusTokenizer.from_pretrained(PATH)

In [None]:
# Load the fine-tuned weights from the checkpoint and move the model to the chosen device.
model = PegasusForConditionalGeneration.from_pretrained(PATH).to(device)

## Realizando a tokenização e inferências

In [None]:
# Start every run from empty logs. rouge_metric appends to these module-level lists,
# so re-running this cell without resetting them would pile up duplicate samples
# (and would fail outright once later cells have rebound them to NumPy arrays).
summaries_predict = []
label_predict = []
summaries_preds_rouge = []

BATCH_SIZE = 10  # documents summarised per generate() call

for i in range(0, len(dataset_teste['document']), BATCH_SIZE):
    entrada = tokenizer(dataset_teste["document"][i:i+BATCH_SIZE], max_length=512, truncation=True,
                        padding="longest", return_tensors="pt").to(device)

    # Tokenize the reference summaries with the tokenizer in target mode.
    with tokenizer.as_target_tokenizer():
        label = tokenizer(
            dataset_teste["summary"][i:i+BATCH_SIZE], max_length=512, truncation=True,
            padding="longest", return_tensors="pt").to(device)

    translated = model.generate(**entrada)

    # One dict of averaged ROUGE F-scores per batch.
    metrics = rouge_metric(translated, label, i)
    summaries_preds_rouge.append(metrics)
    



Summary predicted: ['A relative of one of the First World War soldiers whose remains were found in a mass grave in France has been able to identify him using a DNA sample.', "India's economy grew at its fastest pace in a year in the three months to the end of September, official data shows.", 'Police have released CCTV footage of a man they want to speak to in connection with the murder of a taxi driver in Edinburgh.', 'A former Wales cricketer has been jailed for life for murdering a man in Cardiff.', "Wales head coach Warren Gatland has named his 31-man squad for next month's three-Test series against New Zealand.", 'A college in Coventry is challenging staff to teach GCSEs in English and maths to students who did not achieve the benchmark grades at school.', 'Labour leader Ed Miliband has accused the Conservatives of trying to "threaten the future of the UK" by opposing a second independence referendum.', "Venezuela's Supreme Court has ruled that the country's opposition-controlled 

In [None]:
summaries_predict

[['A relative of one of the First World War soldiers whose remains were found in a mass grave in France has been able to identify him using a DNA sample.',
  "India's economy grew at its fastest pace in a year in the three months to the end of September, official data shows.",
  'Police have released CCTV footage of a man they want to speak to in connection with the murder of a taxi driver in Edinburgh.',
  'A former Wales cricketer has been jailed for life for murdering a man in Cardiff.',
  "Wales head coach Warren Gatland has named his 31-man squad for next month's three-Test series against New Zealand.",
  'A college in Coventry is challenging staff to teach GCSEs in English and maths to students who did not achieve the benchmark grades at school.',
  'Labour leader Ed Miliband has accused the Conservatives of trying to "threaten the future of the UK" by opposing a second independence referendum.',
  "Venezuela's Supreme Court has ruled that the country's opposition-controlled cong

In [None]:
label_predict

[['Ten soldiers who died in World War One and whose bodies were found in France five years ago have been named after DNA analysis of samples from relatives.',
  "India's economy grew 7.5% in the three months ending in March, higher than the previous quarter and above expectations.",
  'A missing man last spotted on CCTV footage taken at an Edinburgh train station has now been seen in central London.',
  'Tributes have been paid to a former Glamorgan cricketer who was found dead at his Swansea flat on Friday.',
  "British and Irish Lions head coach Warren Gatland says he will not repeat Graham Henry's 2001 mistake by splitting the squad early in the tour.",
  'As thousands of 16-year-olds in England sit GCSEs in maths and English, BBC education correspondent Gillian Hargreaves goes to meet teenagers trying to achieve good grades second time around.',
  'Plaid Cymru leader Leanne Wood has accused rival parties of "dangerous and divisive rhetoric" in a "desperate" attempt to win votes.',


In [None]:
# Split the per-batch metric dicts into one list per ROUGE variant.
rouges1 = [metric['Rouge-1'] for metric in summaries_preds_rouge]
rouges2 = [metric['Rouge-2'] for metric in summaries_preds_rouge]
rougesL = [metric['RougeL'] for metric in summaries_preds_rouge]

In [None]:
len(summaries_predict)

18

In [None]:
len(label_predict)

18

## Resultados

In [None]:
import numpy as np

In [None]:
# Report the mean of the per-batch F-scores for each ROUGE variant.
print("Resultados Pegasus após 5 épocas - 1000 amostras de treino")

for rotulo, valores in (("Rouge 1", rouges1), ("Rouge 2", rouges2), ("Rouge L", rougesL)):
    print(f"{rotulo}: {np.mean(valores)}")

Resultados Pegasus após 5 épocas - 1000 amostras de treino
Rouge 1: 0.3143802281663199
Rouge 2: 0.1167084542949439
Rouge L: 0.2581034167949539


In [None]:
# Convert the nested list of logged predictions into a NumPy array, under a new name.
summaries_predict_np = np.array(summaries_predict)

In [None]:
# Convert the logged references into a NumPy array.
# NOTE(review): this rebinds label_predict from a list to an ndarray, so the
# label_predict.append(...) call inside rouge_metric fails if the inference loop is
# re-run after this cell.
label_predict = np.array(label_predict)

In [None]:
# Flatten to a single row of shape (1, N) so references can be read as label_predict[0, i].
label_predict = label_predict.reshape(1, -1)

In [None]:
# Flatten the predictions to shape (1, N) as well.
# NOTE(review): this rebinds summaries_predict from a list to an ndarray, so
# rouge_metric can no longer append to it until the list is re-created.
summaries_predict = summaries_predict_np.reshape(1, -1)

In [None]:
dataset_teste["document"][0]

'Since the discovery of the bodies in 2009 the Ministry of Defence (MoD) has been tracking down potential relatives in the hope of identifying them.\nThe remains were spotted during construction work near the French village of Beaucamps-Ligny.\nThey were found alongside five other bodies which are yet to be named.\nAll the soldiers were with 2nd Battalion The York and Lancaster Regiment, and are believed to have died in battle on 18 October 1914.\nThe men are due to be given a funeral with full military honours in October, while investigations continue to try and track down relatives for the remaining bodies.\nRetired computer programmer Peter Hague, 70, of Chinley, Derbyshire said he was "astonished" to find that his cousin twice-removed Cpl Francis Carr Dyson was among those identified.\n"It is always strange, and poignant moment when you discover you are related to someone like this, I suppose the sadness of his death is mitigated when you know they died during service for their cou

In [None]:
# Print every 36th logged (reference, prediction) pair. The upper bound comes from
# the arrays themselves: a hard-coded 180 ran past the end and raised IndexError
# whenever fewer pairs had been logged.
n_logged = min(label_predict.shape[1], summaries_predict.shape[1])
for i in range(0, n_logged, 36):
    print(f"\n\nSumarização Real:\n-> {label_predict[0, i]}\n")
    print(f"Sumarização Predita:\n-> {summaries_predict[0, i]}")



Sumarização Real:
-> Ten soldiers who died in World War One and whose bodies were found in France five years ago have been named after DNA analysis of samples from relatives.

Sumarização Predita:
-> A relative of one of the First World War soldiers whose remains were found in a mass grave in France has been able to identify him using a DNA sample.


Sumarização Real:
-> A nursery suggested referring a four-year-old boy to a de-radicalisation programme after he mispronounced the word "cucumber", it is alleged.

Sumarização Predita:
-> A five-year-old boy who drew a picture of a man cutting a cucumber has been referred to police.


IndexError: ignored

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def calculate_tfidf(text, texts, top_k=5):
    """Suggest hashtags for ``text`` from its highest-scoring TF-IDF terms.

    Args:
        text: the document to tag.
        texts: the corpus the TF-IDF vocabulary and weights are fitted on.
        top_k: maximum number of hashtags to return (default 5).

    Returns:
        A list of at most ``top_k`` strings of the form ``"#term"``, ordered by
        decreasing TF-IDF score. Fewer are returned when ``text`` contains fewer
        than ``top_k`` terms of the fitted vocabulary.
    """
    tfidf = TfidfVectorizer()
    tfidf.fit(texts)
    feature_names = tfidf.get_feature_names_out()
    tfidf_values = tfidf.transform([text]).toarray()[0]
    tfidf_df = pd.DataFrame({'feature_names': feature_names, 'tfidf_values': tfidf_values})
    # Terms that do not occur in `text` score exactly 0; drop them so a short text
    # never receives hashtags for words it does not contain.
    present_terms = tfidf_df[tfidf_df.tfidf_values > 0]
    hashtags = present_terms.nlargest(top_k, 'tfidf_values').feature_names.to_numpy()
    return [f"#{h}" for h in hashtags]

In [None]:
calculate_tfidf(dataset_teste["document"][0], dataset_teste["document"])

['#account', '#messages', '#accounts', '#48hours', '#60minutes']