# Обучение трансформеров для классификации токсичных комментариев

## Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, average_precision_score
from sklearn.model_selection import train_test_split
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Загрузка и подготовка данных

In [2]:
df = pd.read_csv('modified_train.csv', usecols=['comment_text', 'toxicity_b', 'created_date'],parse_dates=['created_date'],
    date_format='ISO8601')
df = df[df['created_date'] >= '2016-04-01']
df = df.sort_values(by='created_date')[['comment_text', 'toxicity_b']].reset_index(drop=True)

### Очистка текста

In [3]:
url_re = re.compile(r'(https?://\S+|www\.\S+)', re.IGNORECASE)
spaces_re = re.compile(r'\s+')

def clean_text_roberta(text):
    # Удаление ссылок
    text = url_re.sub('', text)
    # Очистка лишних пробелов
    text = spaces_re.sub(' ', text).strip()
    return text

In [4]:
df['comment_text'] = df['comment_text'].parallel_apply(clean_text_roberta)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=297552), Label(value='0 / 297552')…

In [5]:
df.head()

Unnamed: 0,comment_text,toxicity_b
0,"If there is a special session,it should be hel...",0
1,"As they should my friend, as they should. It's...",0
2,Just got back from the 5th Ave mall and was pa...,0
3,"Why bother going into Dutch, Just run the Trus...",0
4,"Sounds like a lot of stereotyping going on, CS...",0


### Разделение данных на Train-Val-Test и подготовка данных для использования в моделях

In [6]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.texts = dataframe['comment_text'].values
        self.labels = dataframe['toxicity_b'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
def compute_train_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    probs = torch.nn.functional.softmax(torch.tensor(p.predictions), dim=-1).numpy()[:, 1]
    
    return {
        'roc_auc': roc_auc_score(p.label_ids, probs),
        'pr_auc': average_precision_score(p.label_ids, probs),
        'f1': f1_score(p.label_ids, preds),
        'accuracy': accuracy_score(p.label_ids, preds),
        'precision': precision_score(p.label_ids, preds),
        'recall': recall_score(p.label_ids, preds)
    }

In [8]:
def freeze_layers(model, num_frozen_layers=18):
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = False
    
    for i in range(num_frozen_layers):
        for param in model.roberta.encoder.layer[i].parameters():
            param.requires_grad = False
    
    for param in model.classifier.parameters():
        param.requires_grad = True
        
    return model

In [9]:
train_df, test_df = df.loc[:79999], df.loc[80000:]
train_df, val_df = train_df.loc[:63999], train_df.loc[64000:]

In [10]:
train_df.toxicity_b.value_counts(normalize = True)

toxicity_b
0    0.904375
1    0.095625
Name: proportion, dtype: float64

In [11]:
val_df.toxicity_b.value_counts(normalize = True)

toxicity_b
0    0.888563
1    0.111437
Name: proportion, dtype: float64

In [12]:
test_df.toxicity_b.value_counts(normalize = True)

toxicity_b
0    0.88749
1    0.11251
Name: proportion, dtype: float64

## Обучение моделей на 100K наблюдениях

### DistilBERT

In [66]:
tokenizer_dbert = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [67]:
train_dataset = TextDataset(train_df, tokenizer_dbert)
val_dataset = TextDataset(val_df, tokenizer_dbert)
test_dataset = TextDataset(test_df, tokenizer_dbert)

In [68]:
model_dbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
training_args = TrainingArguments(
    output_dir='./dbert_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [70]:
trainer = Trainer(
    model=model_dbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [71]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1544,0.178868,0.942403,0.737331,0.657282,0.925438,0.673734,0.641615
2,0.1132,0.199488,0.942881,0.749698,0.667223,0.92525,0.662065,0.672462
3,0.0822,0.227283,0.942714,0.749201,0.661359,0.925562,0.670704,0.652271


TrainOutput(global_step=6000, training_loss=0.12615878677368164, metrics={'train_runtime': 935.3046, 'train_samples_per_second': 205.281, 'train_steps_per_second': 6.415, 'total_flos': 2.5433740541952e+16, 'train_loss': 0.12615878677368164, 'epoch': 3.0})

In [72]:
from scipy.special import softmax

In [73]:
y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.2820
Лучший F1: 0.6700


In [74]:
binary_preds = (probs > 0.5).astype(int)
f1_score(y_val_np, binary_preds)

0.6613591128802957

### BERT

In [75]:
tokenizer_bert = AutoTokenizer.from_pretrained('bert-base-uncased')

In [76]:
train_dataset = TextDataset(train_df, tokenizer_bert)
val_dataset = TextDataset(val_df, tokenizer_bert)
test_dataset = TextDataset(test_df, tokenizer_bert)

In [77]:
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
training_args = TrainingArguments(
    output_dir='./bert_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [81]:
trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [82]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1546,0.178491,0.944058,0.743018,0.651759,0.927,0.695735,0.613012
2,0.1119,0.202396,0.94474,0.756716,0.669485,0.921813,0.632867,0.7106
3,0.0706,0.250403,0.942726,0.75116,0.667044,0.926375,0.672365,0.661806


TrainOutput(global_step=6000, training_loss=0.12013796742757162, metrics={'train_runtime': 1711.9156, 'train_samples_per_second': 112.155, 'train_steps_per_second': 3.505, 'total_flos': 5.051732262912e+16, 'train_loss': 0.12013796742757162, 'epoch': 3.0})

In [83]:
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.2925
Лучший F1: 0.6708


### RoBERTa

In [84]:
tokenizer_roberta = AutoTokenizer.from_pretrained('roberta-base')

In [85]:
train_dataset = TextDataset(train_df, tokenizer_roberta)
val_dataset = TextDataset(val_df, tokenizer_roberta)
test_dataset = TextDataset(test_df, tokenizer_roberta)

In [86]:
model_roberta = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
training_args = TrainingArguments(
    output_dir='./roberta_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [88]:
trainer = Trainer(
    model=model_roberta,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [89]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1605,0.197114,0.946548,0.742811,0.660901,0.916687,0.604749,0.728547
2,0.1228,0.20079,0.946032,0.754868,0.665073,0.921188,0.631685,0.702187
3,0.0947,0.222156,0.945871,0.752541,0.667756,0.92375,0.649021,0.687605


TrainOutput(global_step=6000, training_loss=0.13461771583557128, metrics={'train_runtime': 1748.8667, 'train_samples_per_second': 109.785, 'train_steps_per_second': 3.431, 'total_flos': 5.051732262912e+16, 'train_loss': 0.13461771583557128, 'epoch': 3.0})

In [90]:
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.4430
Лучший F1: 0.6704


### RoBERTa Large

In [12]:
tokenizer_robertal = AutoTokenizer.from_pretrained('roberta-large')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
train_dataset = TextDataset(train_df, tokenizer_robertal)
val_dataset = TextDataset(val_df, tokenizer_robertal)
test_dataset = TextDataset(test_df, tokenizer_robertal)

In [14]:
model_robertal = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=2)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
training_args = TrainingArguments(
    output_dir='./roberta-large_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [16]:
trainer = Trainer(
    model=model_robertal,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1666,0.195231,0.946467,0.745489,0.667186,0.919937,0.621491,0.720135
2,0.1216,0.20571,0.949305,0.759203,0.691515,0.927063,0.654,0.733595
3,0.0802,0.239113,0.946066,0.752732,0.67966,0.927063,0.665591,0.694335


TrainOutput(global_step=6000, training_loss=0.12990824254353842, metrics={'train_runtime': 4831.0525, 'train_samples_per_second': 39.743, 'train_steps_per_second': 1.242, 'total_flos': 1.78930821758976e+17, 'train_loss': 0.12990824254353842, 'epoch': 3.0})

In [18]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.4107
Лучший F1: 0.6819


### DeBERTa V3

Эта модель обучается параметром TextDataset равным 448, в то время как все остальные модели обучались с параметром 512. Это связанно с тем, что модель не поместилась в видеопамять с параметром 512.

In [11]:
tokenizer_deberta3 = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')



In [12]:
train_dataset = TextDataset(train_df, tokenizer_deberta3, max_length=504)
val_dataset = TextDataset(val_df, tokenizer_deberta3, max_length=504)
test_dataset = TextDataset(test_df, tokenizer_deberta3, max_length=504)

In [13]:
model_deberta3 = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=2)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    output_dir='./deberta3_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [15]:
trainer = Trainer(
    model=model_deberta3,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1504,0.19686,0.949212,0.761031,0.675436,0.919813,0.615207,0.748738
2,0.1179,0.196205,0.950499,0.76745,0.679175,0.924125,0.642179,0.720695
3,0.0883,0.212868,0.949917,0.762673,0.674986,0.925125,0.653705,0.697701


TrainOutput(global_step=6000, training_loss=0.1271060587565104, metrics={'train_runtime': 3281.4011, 'train_samples_per_second': 58.512, 'train_steps_per_second': 1.828, 'total_flos': 4.9728881276928e+16, 'train_loss': 0.1271060587565104, 'epoch': 3.0})

In [17]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.6635
Лучший F1: 0.6775


### DeBERTa V3 Large

In [10]:
tokenizer_deberta3l = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')



In [11]:
train_dataset = TextDataset(train_df, tokenizer_deberta3l, max_length=192)
val_dataset = TextDataset(val_df, tokenizer_deberta3l, max_length=192)
test_dataset = TextDataset(test_df, tokenizer_deberta3l, max_length=192)

In [12]:
model_deberta3l = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-large', num_labels=2)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir='./deberta3l_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [14]:
trainer = Trainer(
    model=model_deberta3l,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1749,0.196433,0.937542,0.71587,0.651934,0.92125,0.642352,0.661806
2,0.146,0.210829,0.922463,0.722788,0.655825,0.92725,0.693558,0.621985
3,0.1138,0.188305,0.9423,0.742174,0.675593,0.927312,0.672031,0.679192


TrainOutput(global_step=6000, training_loss=0.15766052627563476, metrics={'train_runtime': 2921.1628, 'train_samples_per_second': 65.727, 'train_steps_per_second': 2.054, 'total_flos': 6.7099511144448e+16, 'train_loss': 0.15766052627563476, 'epoch': 3.0})

In [16]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.5851
Лучший F1: 0.6768


### FP32 DistilBERT

In [10]:
tokenizer_dbert = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
train_dataset = TextDataset(train_df, tokenizer_dbert)
val_dataset = TextDataset(val_df, tokenizer_dbert)
test_dataset = TextDataset(test_df, tokenizer_dbert)

In [12]:
model_dbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir='./dbert_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    #fp16=True,
    report_to='none'
)

In [14]:
trainer = Trainer(
    model=model_dbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1532,0.177036,0.943877,0.742914,0.662097,0.926063,0.674622,0.650028
2,0.1135,0.19845,0.943601,0.751792,0.663377,0.92325,0.648794,0.678632
3,0.082,0.231514,0.942326,0.747733,0.662334,0.925312,0.667426,0.657319


TrainOutput(global_step=6000, training_loss=0.12476190185546875, metrics={'train_runtime': 2052.2772, 'train_samples_per_second': 93.555, 'train_steps_per_second': 2.924, 'total_flos': 2.5433740541952e+16, 'train_loss': 0.12476190185546875, 'epoch': 3.0})

In [16]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.3726
Лучший F1: 0.6647


### Xlmr Large Toxicity Classifier V2

In [11]:
textdetox_tokenizer = AutoTokenizer.from_pretrained('textdetox/xlmr-large-toxicity-classifier-v2')
textdetox_model = AutoModelForSequenceClassification.from_pretrained('textdetox/xlmr-large-toxicity-classifier-v2')

In [12]:
textdetox_frozen_model = freeze_layers(textdetox_model, num_frozen_layers=0)

In [13]:
train_dataset = TextDataset(train_df, textdetox_tokenizer)
val_dataset = TextDataset(val_df, textdetox_tokenizer)
test_dataset = TextDataset(test_df, textdetox_tokenizer)

In [14]:
training_args = TrainingArguments(
    output_dir='./textdetox_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [15]:
trainer = Trainer(
    model=textdetox_frozen_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1697,0.194411,0.93176,0.716111,0.641521,0.924562,0.681818,0.605721
2,0.1303,0.222938,0.942511,0.737749,0.658549,0.917625,0.61194,0.712844
3,0.0965,0.238487,0.94436,0.742468,0.663518,0.924312,0.657489,0.669658


TrainOutput(global_step=6000, training_loss=0.13873130416870116, metrics={'train_runtime': 4873.6608, 'train_samples_per_second': 39.395, 'train_steps_per_second': 1.231, 'total_flos': 1.78930821758976e+17, 'train_loss': 0.13873130416870116, 'epoch': 3.0})

In [17]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.5217
Лучший F1: 0.6650


### Hate-speech-CNERG/dehatebert-mono-english

In [13]:
hs_tokenizer = AutoTokenizer.from_pretrained('Hate-speech-CNERG/dehatebert-mono-english')
hs_model = AutoModelForSequenceClassification.from_pretrained('Hate-speech-CNERG/dehatebert-mono-english')

tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [14]:
train_dataset = TextDataset(train_df, hs_tokenizer)
val_dataset = TextDataset(val_df, hs_tokenizer)
test_dataset = TextDataset(test_df, hs_tokenizer)

In [16]:
training_args = TrainingArguments(
    output_dir='./hs_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [17]:
trainer = Trainer(
    model=hs_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1959,0.245054,0.895098,0.621167,0.568571,0.905625,0.579499,0.558048
2,0.1711,0.24716,0.906541,0.658129,0.598286,0.909188,0.589967,0.606842
3,0.1655,0.238985,0.909721,0.664351,0.599116,0.914937,0.630893,0.570387


TrainOutput(global_step=6000, training_loss=0.1863182856241862, metrics={'train_runtime': 1720.8099, 'train_samples_per_second': 111.575, 'train_steps_per_second': 3.487, 'total_flos': 5.051732262912e+16, 'train_loss': 0.1863182856241862, 'epoch': 3.0})

In [19]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.3623
Лучший F1: 0.6110


## Обучение моделей на полных данных

In [12]:
train_df, test_df = df.loc[:1428245], df.loc[1428246:]
train_df, val_df = train_df.loc[:1142596], train_df.loc[1142597:]

### DistilBERT

In [15]:
tokenizer_dbert = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [16]:
train_dataset = TextDataset(train_df, tokenizer_dbert)
val_dataset = TextDataset(val_df, tokenizer_dbert)
test_dataset = TextDataset(test_df, tokenizer_dbert)

In [17]:
model_dbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = TrainingArguments(
    output_dir='./dbert_training_results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1',
    fp16=True,
    report_to='none'
)

In [19]:
trainer = Trainer(
    model=model_dbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_train_metrics
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Pr Auc,F1,Accuracy,Precision,Recall
1,0.1431,0.161231,0.959167,0.803648,0.709643,0.935225,0.742854,0.679274
2,0.1335,0.162525,0.960119,0.806491,0.714649,0.935655,0.73947,0.691441
3,0.099,0.187396,0.957346,0.797685,0.709071,0.934157,0.730844,0.688557


TrainOutput(global_step=107121, training_loss=0.13017315996813733, metrics={'train_runtime': 15894.8279, 'train_samples_per_second': 215.654, 'train_steps_per_second': 6.739, 'total_flos': 4.540705569064489e+17, 'train_loss': 0.13017315996813733, 'epoch': 3.0})

In [21]:
from scipy.special import softmax

y_val_np = val_df['toxicity_b'].to_numpy()
preds = trainer.predict(val_dataset)
probs = softmax(preds.predictions, axis=1)[:, 1]


thresholds = np.linspace(probs.min(), probs.max(), 100)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    binary_preds = (probs > threshold).astype(int)
    f1 = f1_score(y_val_np, binary_preds)
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Лучший порог: {best_threshold:.4f}")
print(f"Лучший F1: {best_f1:.4f}")

Лучший порог: 0.3837
Лучший F1: 0.7110
