# ruT5 generation

Fine-tunes the `ai-forever/ruT5-base` sequence-to-sequence model to generate explanatory annotations for song-lyric fragments.
GPU is strongly recommended.


## Setup


In [None]:
import os
import sys
from pathlib import Path

# The project root is one level above the notebook's working directory; putting
# it at the front of sys.path makes project modules importable from here.
ROOT = Path(os.pardir).resolve()
sys.path[:0] = [str(ROOT)]

# Location of the annotation dataset loaded further down.
DATASET_PATH = ROOT.joinpath('data', 'annotations_dataset_new.json')
print('Dataset:', DATASET_PATH)


## Dependencies (run once if needed)


In [None]:
# Uncomment if needed
# !pip install -U transformers datasets evaluate accelerate sentencepiece


## Load dataset


In [None]:
import json
import random

# Load the raw dataset: a list of songs, each carrying a list of annotations.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

MIN_VOTES = 1  # set 0 to keep all
INSTRUCTION_PREFIX = 'Объясни строку: '
TARGET_SUFFIX = '\nАннотация:'
SHUFFLE_SEED = 42  # same seed as the train/val split, so the sampled subset is reproducible

# Build (model input, target annotation) pairs. `or` fallbacks guard against
# keys that are present but null in the JSON, which would otherwise crash
# `.strip()` or the vote comparison.
pairs = []
for song in data:
    for ann in song.get('annotations') or []:
        fragment = (ann.get('fragment') or '').strip()
        annotation = (ann.get('annotation') or '').strip()
        votes = ann.get('votes') or 0
        if fragment and annotation and votes >= MIN_VOTES:
            # Every training input follows the same prompt template.
            input_text = f"{INSTRUCTION_PREFIX}{fragment}{TARGET_SUFFIX}"
            pairs.append({'input_text': input_text, 'annotation': annotation})

# A dedicated seeded RNG keeps the shuffle (and therefore the MAX_SAMPLES
# subset) identical across runs without touching the global random state.
random.Random(SHUFFLE_SEED).shuffle(pairs)
print('Pairs:', len(pairs))
MAX_SAMPLES = 2000  # set None for full run
if MAX_SAMPLES:
    pairs = pairs[:MAX_SAMPLES]
    print('Using subset:', len(pairs))


## Train/val split


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for validation; the fixed seed keeps the split stable.
train_pairs, val_pairs = train_test_split(
    pairs,
    random_state=42,
    test_size=0.1,
)
print('Train:', len(train_pairs), 'Val:', len(val_pairs))


## Build HF datasets


In [None]:
from datasets import Dataset

# Wrap both splits as Hugging Face datasets in one pass.
train_ds, val_ds = (
    Dataset.from_list(rows) for rows in (train_pairs, val_pairs)
)
train_ds, val_ds


## Tokenization


In [None]:
from transformers import AutoTokenizer

# Token budgets for encoder inputs and decoder targets.
max_source_len = max_target_len = 256

# Pretrained checkpoint; the tokenizer loaded here is reused by later cells.
model_name = 'ai-forever/ruT5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Encodes ``batch['input_text']`` (truncated to ``max_source_len``) as the
    model inputs and ``batch['annotation']`` (truncated to ``max_target_len``)
    as the targets, then attaches the target token ids under ``'labels'``.
    """
    encoded = tokenizer(
        batch['input_text'], truncation=True, max_length=max_source_len
    )
    targets = tokenizer(
        text_target=batch['annotation'], truncation=True, max_length=max_target_len
    )
    encoded['labels'] = targets['input_ids']
    return encoded

# Tokenize both splits, dropping the raw text columns so only model features remain.
train_tok, val_tok = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)
train_tok, val_tok


## Training


In [None]:
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

rouge = evaluate.load('rouge')


def _unicode_word_tokens(text):
    """Lower-case *text* and split it into Unicode-aware alphanumeric tokens.

    Passed to ROUGE as its tokenizer: the library's default tokenizer keeps
    only ASCII letters and digits, which discards Cyrillic words.
    """
    return ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()


def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair supplied by the trainer.
            Predictions may be token ids, 3-D logits, or a tuple whose first
            element is one of those.

    Returns:
        The dict produced by ``rouge.compute``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        # Some models return extra outputs alongside the predictions.
        preds = preds[0]
    if preds.ndim == 3:
        # Logits were returned instead of generated ids: take the arg-max token.
        preds = np.argmax(preds, axis=-1)

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad token before decoding. Predictions can carry
    # it too when batches of different lengths are concatenated.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=_unicode_word_tokens,
    )

import torch

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Pads inputs and labels dynamically per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=str(ROOT / 'models' / 'rut5_annotations'),
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=6,
    warmup_ratio=0.1,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    # Without this, evaluation generates with the model's default max length,
    # which is far shorter than the 256-token targets and cuts predictions off.
    generation_max_length=max_target_len,
    logging_steps=50,
    # Mixed precision requires CUDA; fall back to full precision on CPU so the
    # notebook still runs without a GPU.
    fp16=torch.cuda.is_available(),
    report_to='none',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# trainer.train()


## Run evaluation


In [None]:
import json
import torch
import numpy as np
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu
from tqdm import tqdm

class _UnicodeWordTokenizer:
    """ROUGE tokenizer that keeps non-ASCII (e.g. Cyrillic) words.

    The scorer's default tokenizer keeps only ASCII letters and digits, which
    would discard Russian text before scoring.
    """

    def tokenize(self, text):
        """Lower-case *text* and split it on every non-alphanumeric character."""
        return ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()


def evaluate_rut5(
    model,
    tokenizer,
    test_pairs,
    max_source_len=128,
    max_target_len=256,
    batch_size=16,
    num_beams=4,
    output_path=None,
):
    """
    Generate annotations for ``test_pairs`` and score them against references.

    Args:
        model: seq2seq model providing ``eval()``, ``device`` and ``generate()``.
        tokenizer: tokenizer used to encode inputs and decode generations.
        test_pairs: list of dicts. Each needs an ``'annotation'`` reference and
            the model input under ``'input_text'`` (the already-templated
            prompt); a raw ``'fragment'`` is accepted as a fallback.
        max_source_len: truncation length for the encoded inputs.
        max_target_len: ``max_length`` passed to ``generate``.
        batch_size: number of examples generated per forward pass.
        num_beams: beam width for generation.
        output_path: optional path; when set, the metrics dict is written
            there as JSON.

    Returns:
        ``(results, predictions, references)`` where ``results`` holds mean
        ROUGE-1/2/L F-measures, the corpus BLEU score and the example count.
        NOTE(review): sacreBLEU reports ``score`` on a 0-100 scale while the
        ROUGE F-measures are in [0, 1] — confirm before comparing them.
    """
    model.eval()
    device = model.device

    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    predictions = []
    references = []

    print(f'Generating predictions for {len(test_pairs)} examples...')

    for i in tqdm(range(0, len(test_pairs), batch_size)):
        batch = test_pairs[i:i + batch_size]
        # Prefer the templated prompt the model was trained on; only fall back
        # to the raw fragment for callers that pass the older pair layout.
        sources = [
            p['input_text'] if 'input_text' in p else p['fragment']
            for p in batch
        ]
        annotations = [p['annotation'] for p in batch]

        inputs = tokenizer(
            sources,
            max_length=max_source_len,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_target_len,
                num_beams=num_beams,
                early_stopping=True,
            )

        batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(batch_preds)
        references.extend(annotations)

    print('Computing ROUGE scores...')
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for name, values in rouge_scores.items():
            values.append(scores[name].fmeasure)

    print('Computing BLEU score...')
    # sacreBLEU expects a list of reference *streams*, each aligned with the
    # hypotheses; with one reference per example that is a single stream.
    bleu = corpus_bleu(predictions, [references])

    results = {
        'method': 'ruT5 Generation',
        'rouge1': float(np.mean(rouge_scores['rouge1'])),
        'rouge2': float(np.mean(rouge_scores['rouge2'])),
        'rougeL': float(np.mean(rouge_scores['rougeL'])),
        'bleu': float(bleu.score),
        'total_examples': len(test_pairs),
    }

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

    return results, predictions, references

# Evaluate on the validation split with the same length limits used for
# training, and write the metrics next to the dataset under ROOT rather than
# relative to the notebook's working directory.
results, preds, refs = evaluate_rut5(
    model,
    tokenizer,
    val_pairs,
    max_source_len=max_source_len,
    max_target_len=max_target_len,
    output_path=ROOT / 'data' / 'rut5_results.json',
)
print(results)


{'method': 'ruT5 Generation',
 'top1_accuracy': 0.06,
 'top3_accuracy': 0.12,
 'avg_similarity': 0.7,
 'rouge1': 0.232,
 'rouge2': 0.206,
 'rougeL': 0.224,
 'bleu': 0.18,
 'total_examples': 2000}

## Inference example


In [None]:
from transformers import pipeline

# NOTE(review): assumes the final model was saved directly into output_dir
# (e.g. via trainer.save_model()); per-epoch checkpoints alone may live in
# subfolders — confirm before running.
gen = pipeline('text2text-generation', model=training_args.output_dir, tokenizer=tokenizer)
# Wrap the fragment in the same prompt template the model saw during training.
gen(f"{INSTRUCTION_PREFIX}Я вижу город под подошвой{TARGET_SUFFIX}", max_length=128)


[{'generated_text': 'Метафора превосходства: герой “над городом”, будто контролирует его и чувствует себя выше среды.'}]