## Классификация стихотворений с использованием NLP
### Часть 3 Roberta
Botasheva Zhanna

In [2]:
from google.colab import drive

# Монтируем Google Диск
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
# Загрузка данных
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

In [None]:
# Удаление NULL в тренировочных данных
train.dropna(inplace=True)
train.reset_index(inplace=True, drop=True)

# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   3 эпохи
*   размер батча 8

In [5]:
# Инициализация токенизатора
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [6]:
# Предварительная обработка данных
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [24]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

Epoch [1/3], Loss: 1.3879
Epoch [2/3], Loss: 1.2545
Epoch [3/3], Loss: 0.6861


In [None]:
# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Model's Optimized Accuracy is: 0.17


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   2 эпохи
*   размер батча 8

In [12]:
# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/7], Loss: {loss.item():.4f}')

Epoch [1/7], Loss: 1.0473
Epoch [2/7], Loss: 1.1761


In [13]:
# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Model's Optimized Accuracy is: 0.51


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 0.0001
*   2 эпохи
*   размер батча 8

In [12]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/2], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2], Loss: 1.2274
Epoch [2/2], Loss: 1.5827
Model's Optimized Accuracy is: 0.09


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   2 эпохи
*   размер батча 8

In [13]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/2], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2], Loss: 0.8671
Epoch [2/2], Loss: 0.8661
Model's Optimized Accuracy is: 0.45


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   3 эпохи
*   размер батча 8

In [25]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/3], Loss: 0.8893
Epoch [2/3], Loss: 1.1375
Epoch [3/3], Loss: 1.0420
Model's Optimized Accuracy is: 0.61


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   5 эпох
*   размер батча 8

In [26]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(5):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/5], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Loss: 1.4630
Epoch [2/5], Loss: 1.6404
Epoch [3/5], Loss: 0.9203
Epoch [4/5], Loss: 0.5834
Epoch [5/5], Loss: 0.7828
Model's Optimized Accuracy is: 0.55


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   10 эпох
*   размер батча 8

In [8]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(10):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/10], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.1911
Epoch [2/10], Loss: 1.2654
Epoch [3/10], Loss: 0.8737
Epoch [4/10], Loss: 0.9226
Epoch [5/10], Loss: 0.6412
Epoch [6/10], Loss: 0.7236
Epoch [7/10], Loss: 0.0632
Epoch [8/10], Loss: 0.1082
Epoch [9/10], Loss: 0.4587
Epoch [10/10], Loss: 0.1001
Model's Optimized Accuracy is: 0.47


Accuracy 0.61 лучший результат модели Roberta