## Классификация стихотворений с использованием NLP
### Часть 2 T5
Botasheva Zhanna

In [None]:
from google.colab import drive

# Монтируем Google Диск
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Загрузка данных
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/test_data.csv')

In [None]:
# Удаление NULL в тренировочных данных
train.dropna(subset=['Poem'], inplace=True)
train.reset_index(inplace=True, drop=True)

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

## T5

*   токенизатор t5-small
*   оптимизатор lr = 2e-5
*   5 эпох
*   размер батча 8





In [None]:
# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Параметры
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Обучение модели
for epoch in range(5):  # Количество эпох
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/5], Loss: {loss.item():.4f}')

# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

        decoded_preds = []
        for output in outputs:
            pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
            if pred_str in label_encoder.classes_:
                decoded_preds.append(label_encoder.transform([pred_str])[0])
            else:
                decoded_preds.append(-1)  # Фиксация предсказания, если оно неизвестно

        total += len(decoded_preds)
        correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch [1/5], Loss: 2.5825
Epoch [2/5], Loss: 1.3587
Epoch [3/5], Loss: 0.9205
Epoch [4/5], Loss: 0.4790
Epoch [5/5], Loss: 0.6033
Model's Optimized Accuracy is: 0.21


## T5

*   токенизатор t5-small
*   оптимизатор lr = 2e-5
*   6 эпох
*   размер батча 8

In [None]:
# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Параметры
optimizer = torch.optim.Adam(model.parameters(),  lr=2e-5)

# Обучение модели
for epoch in range(6):  # Количество эпох
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/6], Loss: {loss.item():.4f}')

# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

        decoded_preds = []
        for output in outputs:
            pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
            if pred_str in label_encoder.classes_:
                decoded_preds.append(label_encoder.transform([pred_str])[0])
            else:
                decoded_preds.append(-1)  # Фиксация предсказания, если оно неизвестно

        total += len(decoded_preds)
        correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")


Epoch [1/6], Loss: 3.2155
Epoch [2/6], Loss: 1.0787
Epoch [3/6], Loss: 0.9036
Epoch [4/6], Loss: 0.5114
Epoch [5/6], Loss: 0.4200
Epoch [6/6], Loss: 0.8772
Model's Optimized Accuracy is: 0.08


## T5

*   токенизатор t5-small
*   оптимизатор lr = 0.0001
*   3 эпох
*   размер батча 8

In [None]:
# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Параметры
optimizer = torch.optim.Adam(model.parameters(),  lr=1e-5)

# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

        decoded_preds = []
        for output in outputs:
            pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
            if pred_str in label_encoder.classes_:
                decoded_preds.append(label_encoder.transform([pred_str])[0])
            else:
                decoded_preds.append(-1)  # Фиксация предсказания, если оно неизвестно

        total += len(decoded_preds)
        correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Epoch [1/3], Loss: 0.5016
Epoch [2/3], Loss: 0.6376
Epoch [3/3], Loss: 0.6120
Model's Optimized Accuracy is: 0.17


Сбалансируем данные и попробуем еще раз провести обучение

In [None]:
# Загрузка данных
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/test_data.csv')

In [None]:
train = train.dropna(subset=['Poem'])

In [None]:
df=pd.concat([train,val],axis=0)
df

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
145,Environment,"To pick a tulip from the garden, the red one. ..."
146,Environment,We are as clouds that veil the midnight moon; ...
147,Environment,"When pulled, the spider web took another form...."
148,Environment,Whose woods these are I think I know. His hous...


In [None]:
from sklearn.model_selection import train_test_split as tts

train,val = tts(df,test_size=0.3,random_state=42)

## T5

*   токенизатор t5-small
*   оптимизатор lr = 0.0001
*   3 эпохи
*   размер батча 8

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Параметры
optimizer = torch.optim.Adam(model.parameters(),  lr=0.0001)

# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

        decoded_preds = []
        for output in outputs:
            pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
            if pred_str in label_encoder.classes_:
                decoded_preds.append(label_encoder.transform([pred_str])[0])
            else:
                decoded_preds.append(-1)  # Фиксация предсказания, если оно неизвестно

        total += len(decoded_preds)
        correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Epoch [1/3], Loss: 1.0417
Epoch [2/3], Loss: 0.6784
Epoch [3/3], Loss: 1.9393
Model's Optimized Accuracy is: 0.24


## T5

*   токенизатор t5-small
*   оптимизатор lr = 0.0001
*   2 эпохи
*   размер батча 8

In [None]:
# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# Инициализация токенизатора
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Создание входных данных в формате T5
train_encodings = tokenizer(
    ["classify: " + text for text in train['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

val_encodings = tokenizer(
    ["classify: " + text for text in val['Poem']],
    padding=True, truncation=True, return_tensors='pt', max_length=128
)

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Определение модели
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Параметры
optimizer = torch.optim.Adam(model.parameters(),  lr=0.0001)

# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Создаем запрашиваемые метки (результат)
        labels_str = [label_encoder.inverse_transform([label.item()])[0] for label in labels]
        labels_encodings = tokenizer(labels_str, padding=True, truncation=True, return_tensors='pt', max_length=5).input_ids

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels_encodings)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/2], Loss: {loss.item():.4f}')

# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=2)

        decoded_preds = []
        for output in outputs:
            pred_str = tokenizer.decode(output, skip_special_tokens=True).strip()
            if pred_str in label_encoder.classes_:
                decoded_preds.append(label_encoder.transform([pred_str])[0])
            else:
                decoded_preds.append(-1)  # Фиксация предсказания, если оно неизвестно

        total += len(decoded_preds)
        correct += sum(pred == label for pred, label in zip(decoded_preds, batch['labels'].tolist()))

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Epoch [1/2], Loss: 1.9084
Epoch [2/2], Loss: 0.7036
Model's Optimized Accuracy is: 0.35


Accuracy 0.35 лучший результат модели T5