In [1]:
!pip install datasets evaluate rouge_score

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0

In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import pandas as pd

In [4]:
df = pd.read_csv("informal_formal.csv", sep=",", encoding="utf-8")

print("Missing values in 'formal':", df["formal"].isnull().sum())
print("Empty strings in 'formal':", (df["formal"].str.strip() == "").sum())


Missing values in 'formal': 1
Empty strings in 'formal': 0


In [5]:
df = df.dropna(subset=["formal"])
df.to_csv('informal_formal.csv')

In [6]:
dataset = load_dataset("csv", data_files="informal_formal.csv", )


Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
dataset['train']

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'informal', 'formal'],
    num_rows: 35699
})

### Loading the model

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "ai-forever/ruT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [9]:
task_prefix = "формализуй текст: "

def preprocess_function(examples):
    inputs = [task_prefix + ex for ex in examples["informal"]]
    targets = [ex for ex in examples["formal"]]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding="max_length",
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

Map:   0%|          | 0/35699 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
)


In [15]:
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    rouge_output = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=["rougeL"]
    )
    return {"rougeL": round(rouge_output["rougeL"], 4)}

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


In [17]:
!zip -r model.zip .

from google.colab import files
files.download("model.zip")

  adding: .config/ (stored 0%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2025.03.07/ (stored 0%)
  adding: .config/logs/2025.03.07/14.25.41.136979.log (deflated 58%)
  adding: .config/logs/2025.03.07/14.25.10.912729.log (deflated 92%)
  adding: .config/logs/2025.03.07/14.25.31.510076.log (deflated 58%)
  adding: .config/logs/2025.03.07/14.25.49.465545.log (deflated 57%)
  adding: .config/logs/2025.03.07/14.25.50.132613.log (deflated 56%)
  adding: .config/logs/2025.03.07/14.25.39.941814.log (deflated 86%)
  adding: .config/.last_update_check.json (deflated 22%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/active_config (stored 0%)
  adding: .config/gce (stored 0%)
  adding: .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: .config/.last_s

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
test_cases = [
    "Фига.....пытаешься извиниться, а им как будто все равно:(",
]

for text in test_cases:
    input_text = task_prefix + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(torch.device('cuda'))

    outputs = model.generate(
        inputs.input_ids,
        max_length=128,
        num_beams=5,
        repetition_penalty=2.5,
        early_stopping=True,
    )

    print(f"Input: {text}")
    print(f"Output: {tokenizer.decode(outputs[0], skip_special_tokens=True)}\n")

Input: Фига.....пытаешься извиниться, а им как будто все равно:(
Output: Пытаешься извиниться, а им как будто всё равно

