## Классификация стихотворений с использованием NLP
### Часть 3 Roberta
Botasheva Zhanna

In [None]:
from google.colab import drive

# Монтируем Google Диск
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Загрузка данных
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

In [None]:
# Удаление NULL в тренировочных данных
train.dropna(inplace=True)
train.reset_index(inplace=True, drop=True)

# Кодирование меток
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   3 эпохи
*   размер батча 8

In [None]:
# Инициализация токенизатора
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Предварительная обработка данных
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Создание datasets
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

# Создание загрузчиков данных
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

Epoch [1/3], Loss: 1.3879
Epoch [2/3], Loss: 1.2545
Epoch [3/3], Loss: 0.6861


In [None]:
# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Model's Optimized Accuracy is: 0.17


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   2 эпохи
*   размер батча 8

In [None]:
# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/7], Loss: {loss.item():.4f}')

Epoch [1/7], Loss: 1.0473
Epoch [2/7], Loss: 1.1761


In [None]:
# Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Model's Optimized Accuracy is: 0.51


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 0.0001
*   2 эпохи
*   размер батча 8

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/2], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2], Loss: 1.2274
Epoch [2/2], Loss: 1.5827
Model's Optimized Accuracy is: 0.09


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   2 эпохи
*   размер батча 8

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(2):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/2], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/2], Loss: 0.8671
Epoch [2/2], Loss: 0.8661
Model's Optimized Accuracy is: 0.45


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   3 эпохи
*   размер батча 8

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(3):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/3], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/3], Loss: 0.8893
Epoch [2/3], Loss: 1.1375
Epoch [3/3], Loss: 1.0420
Model's Optimized Accuracy is: 0.61


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   5 эпох
*   размер батча 8

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(5):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/5], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Loss: 1.4630
Epoch [2/5], Loss: 1.6404
Epoch [3/5], Loss: 0.9203
Epoch [4/5], Loss: 0.5834
Epoch [5/5], Loss: 0.7828
Model's Optimized Accuracy is: 0.55


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   10 эпох
*   размер батча 8

In [None]:
# Определение модели
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]  # Берем [CLS] токен
        return self.fc(x)

# Параметры
num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr= 1e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
for epoch in range(10):  # Количество эпох
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/10], Loss: {loss.item():.4f}')

    # Прогнозирование на валидационном наборе
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Model's Optimized Accuracy is: {accuracy:.2f}")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.1911
Epoch [2/10], Loss: 1.2654
Epoch [3/10], Loss: 0.8737
Epoch [4/10], Loss: 0.9226
Epoch [5/10], Loss: 0.6412
Epoch [6/10], Loss: 0.7236
Epoch [7/10], Loss: 0.0632
Epoch [8/10], Loss: 0.1082
Epoch [9/10], Loss: 0.4587
Epoch [10/10], Loss: 0.1001
Model's Optimized Accuracy is: 0.47


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   2 эпохи
*   размер батча 8

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 1. Data
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

# 2. Removing NULL in training data & Encoding labels
train.dropna(inplace=True)
train.reset_index(inplace=True, drop=True)

label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# 3. Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 4. Pre-processing
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 5. DataLoader, batch size
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Roberta-base, optimizer
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]
        return self.fc(x)

num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes).to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# 7. Train with Early Stopping
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation section
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Early stopping logic
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.0611, Validation Accuracy: 0.23
Epoch [2/10], Loss: 1.2283, Validation Accuracy: 0.59
Epoch [3/10], Loss: 0.9242, Validation Accuracy: 0.57
Epoch [4/10], Loss: 0.3575, Validation Accuracy: 0.45
Epoch [5/10], Loss: 0.9267, Validation Accuracy: 0.52
Early stopping triggered.
Model's Optimized Accuracy is: 0.59


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   2 эпохи
*   размер батча 8

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 1. Data
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

# 2. Removing NULL in training data & Encoding labels
train.dropna(inplace=True)
train.reset_index(inplace=True, drop=True)

label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# 3. Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 4. Pre-processing
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 5. DataLoader, batch size
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Roberta-base, optimizer
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]
        return self.fc(x)

num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes).to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 7. Train with Early Stopping
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation section
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Early stopping logic
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.1805, Validation Accuracy: 0.18
Epoch [2/10], Loss: 1.1668, Validation Accuracy: 0.61
Epoch [3/10], Loss: 0.6862, Validation Accuracy: 0.43
Epoch [4/10], Loss: 0.2636, Validation Accuracy: 0.59
Epoch [5/10], Loss: 0.8924, Validation Accuracy: 0.59
Early stopping triggered.
Model's Optimized Accuracy is: 0.61


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 3e-5
*   6 эпох
*   размер батча 8

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 1. Data
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

# 2. Removing NULL in training data & Encoding labels
train.dropna(inplace=True)
train.reset_index(inplace=True, drop=True)

label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# 3. Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 4. Pre-processing
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 5. DataLoader, batch size
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Roberta-base, optimizer
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]
        return self.fc(x)

num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes).to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss()

# 7. Train with Early Stopping
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation section
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Early stopping logic
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.5398, Validation Accuracy: 0.19
Epoch [2/10], Loss: 0.9778, Validation Accuracy: 0.35
Epoch [3/10], Loss: 0.8547, Validation Accuracy: 0.39
Epoch [4/10], Loss: 0.2747, Validation Accuracy: 0.48
Epoch [5/10], Loss: 0.3827, Validation Accuracy: 0.47
Epoch [6/10], Loss: 0.1106, Validation Accuracy: 0.55
Epoch [7/10], Loss: 0.4018, Validation Accuracy: 0.53
Epoch [8/10], Loss: 0.1622, Validation Accuracy: 0.50
Epoch [9/10], Loss: 0.0305, Validation Accuracy: 0.53
Early stopping triggered.
Model's Optimized Accuracy is: 0.55


In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 1. Data
train = pd.read_csv('/content/drive/MyDrive/data/train_data.csv')
val = pd.read_csv('/content/drive/MyDrive/data/test_data.csv')

In [None]:
train = train.dropna(subset=['Poem'])

In [None]:
df=pd.concat([train,val],axis=0)
df

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
145,Environment,"To pick a tulip from the garden, the red one. ..."
146,Environment,We are as clouds that veil the midnight moon; ...
147,Environment,"When pulled, the spider web took another form...."
148,Environment,Whose woods these are I think I know. His hous...


In [None]:
from sklearn.model_selection import train_test_split as tts

train,val = tts(df,test_size=0.3,random_state=42)

Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 2e-5
*   4 эпохи
*   размер батча 8

In [None]:
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# 3. Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 4. Pre-processing
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 5. DataLoader, batch size
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Roberta-base, optimizer
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]
        return self.fc(x)

num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes).to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 7. Train with Early Stopping
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation section
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Early stopping logic
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.2713, Validation Accuracy: 0.37
Epoch [2/10], Loss: 0.9644, Validation Accuracy: 0.50
Epoch [3/10], Loss: 0.3322, Validation Accuracy: 0.50
Epoch [4/10], Loss: 1.0038, Validation Accuracy: 0.52
Epoch [5/10], Loss: 0.5639, Validation Accuracy: 0.47
Epoch [6/10], Loss: 0.0994, Validation Accuracy: 0.47
Epoch [7/10], Loss: 0.1109, Validation Accuracy: 0.47
Early stopping triggered.
Model's Optimized Accuracy is: 0.52


Roberta

*   токенизатор roberta-base
*   оптимизатор lr = 1e-5
*   4 эпохи
*   размер батча 8

In [None]:
label_encoder = preprocessing.LabelEncoder()
train['Genre_Code'] = label_encoder.fit_transform(train['Genre'])
val['Genre_Code'] = label_encoder.transform(val['Genre'])

# 3. Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 4. Pre-processing
def encode_texts(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)

train_encodings = encode_texts(train['Poem'])
val_encodings = encode_texts(val['Poem'])

class PoemDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 5. DataLoader, batch size
train_dataset = PoemDataset(train_encodings, train['Genre_Code'].values)
val_dataset = PoemDataset(val_encodings, val['Genre_Code'].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Roberta-base, optimizer
class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state[:, 0, :]
        return self.fc(x)

num_classes = len(label_encoder.classes_)
model = TextClassificationModel(num_classes=num_classes).to('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# 7. Train with Early Stopping
num_epochs = 10
patience = 3
best_accuracy = 0
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
        attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
        labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation section
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {accuracy:.2f}")

    # Early stopping logic
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Model's Optimized Accuracy is: {best_accuracy:.2f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Loss: 1.5622, Validation Accuracy: 0.42
Epoch [2/10], Loss: 0.5822, Validation Accuracy: 0.44
Epoch [3/10], Loss: 0.6813, Validation Accuracy: 0.49
Epoch [4/10], Loss: 0.3173, Validation Accuracy: 0.52
Epoch [5/10], Loss: 0.1155, Validation Accuracy: 0.48
Epoch [6/10], Loss: 0.0954, Validation Accuracy: 0.41
Epoch [7/10], Loss: 0.1612, Validation Accuracy: 0.47
Early stopping triggered.
Model's Optimized Accuracy is: 0.52


Accuracy 0.61 лучший результат модели Roberta