**Задание**
1.  Дообучить берт на задачу NER;
2.  Дообучить GPT на генерацию текста;
3*. Дообучить T5 на задачу суммаризации текста.

## T5

In [None]:
!pip install razdel networkx pymorphy2[fast] nltk rouge
!pip install --upgrade datasets tqdm transformers

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

In [3]:
dataset_train = load_dataset('IlyaGusev/gazeta', revision="v1.0", split='train[:10%]')
dataset_test = load_dataset('IlyaGusev/gazeta', revision="v1.0", split='test[:10%]')

Downloading builder script:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/52400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5770 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5265 [00:00<?, ? examples/s]

In [4]:
dataset_train

Dataset({
    features: ['text', 'summary', 'title', 'date', 'url'],
    num_rows: 5240
})

In [5]:
dataset_test

Dataset({
    features: ['text', 'summary', 'title', 'date', 'url'],
    num_rows: 577
})

In [6]:
dataset_test['title'][13]

'«Крайне важно»: Госдума одобрила поправки к Конституции'

In [7]:
dataset_test['summary'][13]

'Госдума поддержала в первом чтении президентский законопроект о поправках к Конституции. Инициатива главы государства была принята единогласно, во втором чтении законопроект будет рассмотрен уже в феврале. При этом спикер Госдумы Вячеслав Володин назвал продолжение работы по документу крайне важным.'

In [8]:
def len_tok(text):
    return len(text.split())

In [9]:
max_len_sum, max_len_tl = max(map(len_tok, dataset_train['summary'])), max(map(len_tok, dataset_train['title']))
max_len_sum, max_len_tl

(75, 18)

In [10]:
max_len_sum, max_len_tl = 60, 15

In [11]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    tokenized_input = tokenizer(batch['summary'], padding='max_length', truncation=True, max_length=max_len_sum)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

dataset_train = dataset_train.map(tokenize, batched=True, batch_size=8)
dataset_test = dataset_test.map(tokenize, batched=True, batch_size=8)

dataset_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
dataset_test.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

Downloading (…)okenizer_config.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Map:   0%|          | 0/5240 [00:00<?, ? examples/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

In [13]:
dataset_train.save_to_disk('gazeta/train')
dataset_test.save_to_disk('gazeta/test')

Saving the dataset (0/1 shards):   0%|          | 0/5240 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/577 [00:00<?, ? examples/s]

In [14]:
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/977M [00:00<?, ?B/s]

In [15]:
# !pip install accelerate -U

In [16]:
output_dir = 'gazeta/output'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.00001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_gazeta', # Wandb run name
    logging_steps=500, # How often to log loss to wandb
    eval_steps=500, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test
)

In [18]:
trainer.train()

Step,Training Loss,Validation Loss
500,8.6846,3.374449
1000,4.1232,3.115216
1500,2.6329,2.978302
2000,2.56,2.909791
2500,2.4774,2.869247
3000,2.4649,2.84871
3500,2.4552,2.822928
4000,2.3906,2.817625
4500,2.3911,2.799202
5000,2.3812,2.796134


TrainOutput(global_step=6550, training_loss=3.043093757920593, metrics={'train_runtime': 1902.35, 'train_samples_per_second': 27.545, 'train_steps_per_second': 3.443, 'total_flos': 4174008606720000.0, 'train_loss': 3.043093757920593, 'epoch': 10.0})

In [19]:
trainer.save_model(output_dir + '/model')

In [23]:
INX = 13
print("Summary: | {}".format(dataset_test['summary'][INX]))
print("Title: | {}".format(dataset_test['title'][INX]))

Summary: | Госдума поддержала в первом чтении президентский законопроект о поправках к Конституции. Инициатива главы государства была принята единогласно, во втором чтении законопроект будет рассмотрен уже в феврале. При этом спикер Госдумы Вячеслав Володин назвал продолжение работы по документу крайне важным.
Title: | «Крайне важно»: Госдума одобрила поправки к Конституции


In [24]:
device = "cuda"

In [25]:
input_text = dataset_test['summary'][INX]

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask,
        max_length=512,
        num_beams=7,
        temperature = 1.3,
        repetition_penalty=1,
        length_penalty=1,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)


output:
Госдума одобрила поправки к Конституции
