In [1]:
import torch
from datasets import load_dataset
import pandas as pd
import zipfile
import os
import re
from pathlib import Path
import argparse
from argparse import ArgumentParser
import glob
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import math
import numpy as np
from torch.optim import AdamW
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from nlp import load_metric
import string
from pathlib import Path
from transformers import (
    Adafactor,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5Config,
    get_linear_schedule_with_warmup,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from torch.utils.data import RandomSampler
import textwrap
from tqdm.auto import tqdm
from nlp import load_dataset

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

#### Load the data

In [3]:
import pandas as pd
from datasets import Dataset

In [4]:
# Only the two text columns are needed. Reading just those columns means rows
# are dropped solely for a missing informal/formal value, not for NaNs in
# unrelated columns, and no in-place column dropping is required afterwards.
TEXT_COLUMNS = ["informal", "formal"]

df = (
    pd.read_csv(
        "informal_formal1.csv",
        sep=",",
        encoding="utf-8",
        usecols=TEXT_COLUMNS,
    )
    .dropna(subset=TEXT_COLUMNS)
    [TEXT_COLUMNS]  # fix the column order regardless of the order in the file
)

# The pandas index carries no information for training, so it is not exported.
dataset = Dataset.from_pandas(df, preserve_index=False)


In [5]:
# Preview the loaded informal/formal pairs; a bare DataFrame expression renders
# a truncated view (first and last rows) in the notebook.
df

Unnamed: 0,informal,formal
0,"Спасибо :3\nНо, к сожалению, мы не застали тот...",Благодарю вас
1,поговорите со мной иначе я усну... ааааа я даж...,"Пожалуйста, поговорите со мной иначе я усну......"
2,"ОБоже, невыносимо голова болит. Не высыпаюсь у...","Боже, невыносимо голова болит. Я не высыпаюсь ..."
3,у меня не получилось через activator сделать T...,"К сожалению, у меня не получилось через activa..."
4,Новый дизайн твиттера... Не очень если честно :(,"Новый дизайн твиттера… Не очень, если честно"
...,...,...
35835,"Да ну, бред!",Это утверждение нелогично.
35836,Ты меня достал.,Ваше поведение меня раздражает.
35837,Какой кошмар!,Ситуация крайне неприятная.
35838,Давайте быстрее!,"Пожалуйста, ускорьте процесс."


In [6]:
class T5Seq2SeqFinetuner(pl.LightningModule):
    """LightningModule wrapping a T5 conditional-generation model for fine-tuning.

    ``hparams`` (a Namespace or mapping) must provide ``model_name_or_path``,
    ``tokenizer_name_or_path``, ``max_output_length`` and ``learning_rate``.
    ``weight_decay`` and ``adam_epsilon`` are honoured when present.
    """

    # Label value that the model's cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, hparams):
        super().__init__()
        self.save_hyperparameters(hparams)
        self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)

    def on_train_start(self):
        """Switch the wrapped model to train mode before the first training batch.

        ``from_pretrained`` hands the model back in eval mode, and the trainer
        keeps whatever mode the submodules are in, so without this call dropout
        stays disabled for the whole fine-tuning run.
        """
        self.model.train()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the T5 model; the returned output carries ``loss`` when ``labels`` is given."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def _mask_label_padding(self, labels):
        """Return a copy of ``labels`` with pad positions set to ``IGNORE_INDEX``.

        Keeps padding out of the loss. Labels that are already masked pass
        through unchanged because ``IGNORE_INDEX`` never equals the pad id.
        """
        return labels.masked_fill(labels == self.tokenizer.pad_token_id, self.IGNORE_INDEX)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        outputs = self(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=self._mask_label_padding(batch["labels"])
        )
        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss, and decode predictions and references.

        Returns a dict with ``val_loss`` (tensor), ``preds`` and ``labels``
        (lists of decoded strings).
        """
        labels = batch["labels"]
        outputs = self(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=self._mask_label_padding(labels)
        )
        loss = outputs.loss
        # Logging makes the metric visible to the progress bar and to callbacks
        # (checkpointing / early stopping) that monitor "val_loss".
        self.log("val_loss", loss, prog_bar=True)

        # Generate predictions
        preds = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=self.hparams.max_output_length
        )

        # Decode predictions and labels. IGNORE_INDEX is not a valid token id,
        # so map it back to the pad id before decoding.
        decodable_labels = labels.masked_fill(
            labels == self.IGNORE_INDEX, self.tokenizer.pad_token_id
        )
        pred_texts = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        label_texts = self.tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

        return {
            "val_loss": loss,
            "preds": pred_texts,
            "labels": label_texts
        }

    def configure_optimizers(self):
        """Build AdamW from the stored hyperparameters.

        ``weight_decay`` and ``adam_epsilon`` fall back to AdamW's own defaults
        when the hyperparameters do not define them.
        """
        return AdamW(
            self.parameters(),
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.get("weight_decay", 0.01),
            eps=self.hparams.get("adam_epsilon", 1e-8),
        )



#### Configure hyperparameters and load the model from a local checkpoint

In [7]:
# Baseline hyperparameters for the fine-tuning run, keyed by option name.
args_dict = {
    "output_dir": "",
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_input_length": 128,
    "max_output_length": 128,
    "freeze_encoder": False,
    "freeze_embeds": False,
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "resume_from_checkpoint": None,
    "val_check_interval": 1.0,
    "n_val": 0,
    "val_percent_check": 0,
    "n_train": -1,
    "n_test": -1,
    "early_stop_callback": False,
    "fp_16": False,
    "opt_level": 'O1',
    "max_grad_norm": 1.0,
    "seed": 101,
}

# Run-specific overrides applied on top of the baseline values.
args_dict["output_dir"] = 't5_finetuning'
args_dict["train_batch_size"] = 8
args_dict["eval_batch_size"] = 8
args_dict["learning_rate"] = 2e-4

# Attribute-style view of the final settings, used by the rest of the notebook.
args = argparse.Namespace(**args_dict)

In [8]:
# 1. Restore the fine-tuning module from the saved checkpoint, mapping its
#    tensors onto the GPU when one is available and onto the CPU otherwise.
if torch.cuda.is_available():
    checkpoint_device = 'cuda'
else:
    checkpoint_device = 'cpu'

model = T5Seq2SeqFinetuner.load_from_checkpoint(
    'iter-step=5000.ckpt',
    map_location=checkpoint_device,
    hparams=args,
)

In [9]:

import torch
from torch.utils.data import DataLoader

# 1. Tokenize informal/formal text into fixed-length token-id lists
def preprocess_function(examples):
    """Tokenize the ``informal`` source and ``formal`` target text.

    Works on a single example (string fields) as well as on a batch (list
    fields), so it can be used with ``Dataset.map`` in either mode.

    Reads the module-level ``model`` (for its tokenizer) and ``args`` (for the
    maximum lengths).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; each value
        is padded/truncated to the configured maximum length. Label padding is
        left as the tokenizer's pad id.
    """
    tokenizer = model.tokenizer

    # Plain Python lists are returned on purpose: the dataset stores lists
    # anyway, and tensors are only built later when batches are collated.
    inputs = tokenizer(
        examples["informal"],
        max_length=args.max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(
        text_target=examples["formal"],
        max_length=args.max_output_length,
        truncation=True,
        padding="max_length",
    )

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"],
    }

# 2. Tokenize every row individually and discard the raw text columns
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=["informal", "formal"],
    batched=False,  # the function receives a single example per call
)

# 3. Robust collate function
def custom_collate_fn(batch):
    """Stack per-example features into batched ``torch.long`` tensors.

    Args:
        batch: sequence of examples, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels``. Every field may be a list,
            tuple, NumPy array or tensor of integers; within one field all
            examples must have the same length.

    Returns:
        dict with the same three keys, each a tensor whose first dimension is
        the number of examples in ``batch``.
    """
    feature_names = ("input_ids", "attention_mask", "labels")
    return {
        # as_tensor accepts any array-like and avoids a copy when the value is
        # already a long tensor; the explicit dtype keeps ids and labels integral.
        name: torch.stack(
            [torch.as_tensor(example[name], dtype=torch.long) for example in batch]
        )
        for name in feature_names
    }



Map:   0%|          | 0/35840 [00:00<?, ? examples/s]

In [10]:
# Hold out 10% of the pairs for validation. Seeding the shuffle makes the split
# identical on every fresh run instead of changing with each kernel restart.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=args.seed)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 32256
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3584
    })
})

In [29]:

# 4. Build the loaders: training batches are shuffled, validation batches are not
train_dataloader = DataLoader(
    dataset=tokenized_dataset["train"],
    collate_fn=custom_collate_fn,
    shuffle=True,
    batch_size=args.train_batch_size,
)

val_dataloader = DataLoader(
    dataset=tokenized_dataset["test"],
    collate_fn=custom_collate_fn,
    batch_size=args.eval_batch_size,
)

# 5. Smoke test: pull one training batch and report the shape of each tensor
sample_batch = next(iter(train_dataloader))
batch_shapes = {name: tensor.shape for name, tensor in sample_batch.items()}
print(batch_shapes)

{'input_ids': torch.Size([8, 128]), 'attention_mask': torch.Size([8, 128]), 'labels': torch.Size([8, 128])}


#### Train the model

In [30]:
# Seed the Python, NumPy and PyTorch RNGs so batch shuffling and dropout are
# reproducible across runs; `args.seed` was declared but never applied.
pl.seed_everything(args.seed, workers=True)

trainer = pl.Trainer(
    max_epochs=1,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu'
)
trainer.fit(model, train_dataloader, val_dataloader)

INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | eval
------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
0         Modules in train mode
541       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [91]:
test_cases = [
    "Ну как-то хреново",
]

# Inference only: switch off dropout so the generated text is deterministic.
model.eval()

for text in test_cases:
    # Use the tokenizer owned by the fine-tuned module and place the inputs on
    # whatever device the module currently lives on (GPU or CPU).
    inputs = model.tokenizer(
        text, return_tensors="pt", max_length=128, truncation=True
    ).to(model.device)

    # `model` is the Lightning wrapper; generation lives on the wrapped T5 model.
    with torch.no_grad():
        outputs = model.model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=128,
            num_beams=5,
            repetition_penalty=2.5,
            early_stopping=True,
        )

    print(f"Input: {text}")
    print(f"Output: {model.tokenizer.decode(outputs[0], skip_special_tokens=True)}\n")

Input: Ну как-то хреново
Output: Как-то нехорошо

