## Классификация стихотворений с использованием NLP
### Часть 4 T5-large
Botasheva Zhanna

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
from google.colab import drive

# Монтируем Google Диск
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Загрузка данных
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/test_data.csv')

In [None]:
train = train.dropna(subset=['Poem'])

In [None]:
df=pd.concat([train,val],axis=0)
df

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
145,Environment,"To pick a tulip from the garden, the red one. ..."
146,Environment,We are as clouds that veil the midnight moon; ...
147,Environment,"When pulled, the spider web took another form...."
148,Environment,Whose woods these are I think I know. His hous...


In [None]:
from sklearn.model_selection import train_test_split as tts

train,val = tts(df,test_size=0.3,random_state=42)

## T5


*   токенизатор t5-large
*   оптимизатор lr=2e-5
*   3 эпохи
*   размер батча 8

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-large')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Параметры
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Параметры ранней остановки
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

# Обучение модели с ранней остановкой
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids.to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Валидация
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

            decoded_preds = []
            for output in outputs:
                pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
                if pred_str in label_encoder.classes_:
                    decoded_preds.append(label_encoder.transform([pred_str])[0])
                else:
                    decoded_preds.append(-1)

            total += len(decoded_preds)
            correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Ранняя остановка
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch [1/10], Loss: 0.5423, Validation Accuracy: 0.37
Epoch [2/10], Loss: 1.0736, Validation Accuracy: 0.43
Epoch [3/10], Loss: 0.8598, Validation Accuracy: 0.47
Epoch [4/10], Loss: 0.6999, Validation Accuracy: 0.43
Epoch [5/10], Loss: 1.0730, Validation Accuracy: 0.46
Epoch [6/10], Loss: 0.8482, Validation Accuracy: 0.44
Early stopping triggered.
Model's Optimized Accuracy is: 0.47


## T5


*   токенизатор t5-large
*   оптимизатор lr=0.0001
*   1 эпоха
*   размер батча 8

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-large')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Параметры
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Параметры ранней остановки
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

# Обучение модели с ранней остановкой
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids.to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Валидация
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

            decoded_preds = []
            for output in outputs:
                pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
                if pred_str in label_encoder.classes_:
                    decoded_preds.append(label_encoder.transform([pred_str])[0])
                else:
                    decoded_preds.append(-1)

            total += len(decoded_preds)
            correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Ранняя остановка
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")

Epoch [1/10], Loss: 0.3535, Validation Accuracy: 0.44
Epoch [2/10], Loss: 1.4343, Validation Accuracy: 0.39
Epoch [3/10], Loss: 0.4804, Validation Accuracy: 0.42
Epoch [4/10], Loss: 0.0143, Validation Accuracy: 0.36
Early stopping triggered.
Model's Optimized Accuracy is: 0.44


## T5


*   токенизатор t5-large
*   оптимизатор lr=1e-5
*   6 эпох
*   размер батча 8

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-large')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-large')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Параметры
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Параметры ранней остановки
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

# Обучение модели с ранней остановкой
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids.to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Валидация
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

            decoded_preds = []
            for output in outputs:
                pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
                if pred_str in label_encoder.classes_:
                    decoded_preds.append(label_encoder.transform([pred_str])[0])
                else:
                    decoded_preds.append(-1)

            total += len(decoded_preds)
            correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Ранняя остановка
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch [1/10], Loss: 3.9396, Validation Accuracy: 0.01
Epoch [2/10], Loss: 1.1586, Validation Accuracy: 0.40
Epoch [3/10], Loss: 1.5815, Validation Accuracy: 0.40
Epoch [4/10], Loss: 0.5006, Validation Accuracy: 0.42
Epoch [5/10], Loss: 0.8030, Validation Accuracy: 0.42
Epoch [6/10], Loss: 0.7954, Validation Accuracy: 0.46
Epoch [7/10], Loss: 0.1867, Validation Accuracy: 0.46
Epoch [8/10], Loss: 0.5872, Validation Accuracy: 0.44
Epoch [9/10], Loss: 0.8078, Validation Accuracy: 0.45
Early stopping triggered.
Model's Optimized Accuracy is: 0.46
