In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch

In [2]:
# Завантаження датасету
df = pd.read_csv("cleaned_toxic_comments.csv")

In [3]:
# Перевірка наявності стовпців
print(df.columns)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'cleaned_comment'],
      dtype='object')


In [4]:
# Обираємо потрібні стовпці
df['label'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comment,label
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,"[0, 0, 0, 0, 0, 0]"
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,"[0, 0, 0, 0, 0, 0]"
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...","[0, 0, 0, 0, 0, 0]"
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can't make any real suggestions on impr...,"[0, 0, 0, 0, 0, 0]"
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [6]:
# Конвертація у формат Dataset
dataset = Dataset.from_pandas(df[['cleaned_comment', 'label']])

In [7]:
# Розподіл на тренувальний і тестовий набори
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [8]:
# Ініціалізуємо токенізатор
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

In [9]:
# Функція для токенізації даних
def tokenize_function(examples):
    return tokenizer(examples['cleaned_comment'], padding='max_length', truncation=True)

In [None]:
# Токенізуємо тренувальний та тестовий набори
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/126396 [00:00<?, ? examples/s]

In [11]:
# Визначаємо формат даних для тренування
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Завантаження моделі
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=6)

In [None]:
# Налаштування Data Collator для багатокласових міток
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Налаштування параметрів тренування
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [14]:
# Створення тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [None]:
# Запуск тренування
trainer.train()