In [1]:
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(palette='summer')

In [2]:
!pip install -q accelerate -U

In [3]:
!pip install -q datasets

In [4]:
!pip install -q transformers

In [5]:
!pip install -q evaluate

In [6]:
import transformers
from datasets import load_dataset
import evaluate

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Подготовка данных

In [8]:
billsum = load_dataset("billsum", split="ca_test")

Found cached dataset parquet (/home/arsen/.cache/huggingface/datasets/parquet/billsum-35284aeeba767e98/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [9]:
billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})

In [10]:
billsum = billsum.train_test_split(test_size=0.1)

In [11]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1113
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 124
    })
})

In [20]:
tokenizer = transformers.AutoTokenizer.from_pretrained("ainize/bart-base-cnn")

In [22]:
tokenizer

BartTokenizerFast(name_or_path='ainize/bart-base-cnn', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

In [13]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["text"], max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/1113 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

In [15]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("ainize/bart-base-cnn")

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [16]:
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
training_args = transformers.Seq2SeqTrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
    )

In [18]:
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [19]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 

###  Предсказания на тестовом множестве

In [None]:
text_example = billsum["test"]['text'][0]
print(text_example)

In [None]:
input_ids = tokenizer.encode(
    text_example,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
    ).to(device)

In [None]:
input_ids.shape

In [None]:
summary_text_ids = model.generate(
    input_ids=input_ids,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    max_length=142,
    min_length=56,
    num_beams=4,
)

In [None]:
summary_text_ids

In [None]:
decoded_text = tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)

In [None]:
len(decoded_text), len(text_example)

In [None]:
decoded_text

In [None]:
summaries = []

for text in tqdm(billsum["test"]['text']):
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        ).to(device)

    summary_text_ids = model.generate(
        input_ids=input_ids,
        bos_token_id=model.config.bos_token_id,
        eos_token_id=model.config.eos_token_id,
        max_length=142,
        min_length=56,
        num_beams=4,
    )

    decoded_text = tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
    summaries.append(decoded_text)

## Считаем качество

### ROUGE

In [None]:
!pip install rouge_score

In [None]:
rouge = evaluate.load('rouge')

In [None]:
%%time
results = rouge.compute(
        predictions=summaries,
        references=billsum["test"]['summary']
    )

In [None]:
results