In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert_goit_toxic/pytorch/default/1/model_2.pth
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv


In [29]:
import transformers
import torch
from torch import cuda
import tensorflow as tf
from tensorflow.keras import layers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from datasets import load_dataset
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [30]:
# Налаштування та константи
device = 'cuda' if cuda.is_available() else 'cpu'
MAX_LEN = 200  # Максимальна довжина коментаря,обрано під час дослідження датасету
# обрано керюючись документацією, рекомендується робити дрібні батчі і виставляти малу швидкість навчання
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 2  # Кількість епох - якщо виставити більше, модель перенавчається. Перевірено експериментально
LEARNING_RATE = 1e-05  # Швидкість навчання
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [31]:
df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')

In [32]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [33]:
# Балансування датасету
toxic_comments = df[(df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)) > 0]

aug_comments = df[(df[['severe_toxic', 'threat', 'identity_hate']].sum(axis=1)) > 0]

neutral_comments = df[(df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)) == 0]

balanced_neutral_comments = neutral_comments.sample(n=len(toxic_comments), random_state=42)

# 4. Об'єднуємо токсичні коментарі з підсетом нейтральних для збалансованого датасету
balanced_df = pd.concat([toxic_comments, balanced_neutral_comments])
balanced_df = pd.concat([aug_comments, balanced_df])

# 5. Перемішуємо дані для випадкового розподілу
df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Тепер 'df' — це збалансований датасет.
print(df['toxic'].value_counts())

toxic
1    18149
0    17288
Name: count, dtype: int64


In [34]:
import re

def clean_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r"[^a-zA-Z0-9\s!']", ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['comment_text'] = df['comment_text'].apply(clean_text)

# Display the cleaned data
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,689e721bb62780ad,I don't know about the authors although you se...,0,0,0,0,0,0
1,372f2e5d352b93ce,piece of shit fuck your warning and fuck your ...,1,1,1,0,1,0
2,26b9c2721ab61f7d,your informing pink floyd fans as to how much ...,1,0,0,0,0,0
3,fede57c8b31fa05c,She mentions it in her bio probably because sh...,0,0,0,0,0,0
4,afbb9cb4fbdc6f57,UTC Of these 4 images apparently the first two...,0,0,0,0,0,0


In [35]:
df.shape

(35437, 8)

In [36]:
df['list'] = df[df.columns[2:]].values.tolist()
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,list
0,689e721bb62780ad,I don't know about the authors although you se...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
1,372f2e5d352b93ce,piece of shit fuck your warning and fuck your ...,1,1,1,0,1,0,"[1, 1, 1, 0, 1, 0]"
2,26b9c2721ab61f7d,your informing pink floyd fans as to how much ...,1,0,0,0,0,0,"[1, 0, 0, 0, 0, 0]"
3,fede57c8b31fa05c,She mentions it in her bio probably because sh...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
4,afbb9cb4fbdc6f57,UTC Of these 4 images apparently the first two...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [37]:
formatted_df = df[['comment_text', 'list']].copy()
formatted_df.head()

Unnamed: 0,comment_text,list
0,I don't know about the authors although you se...,"[0, 0, 0, 0, 0, 0]"
1,piece of shit fuck your warning and fuck your ...,"[1, 1, 1, 0, 1, 0]"
2,your informing pink floyd fans as to how much ...,"[1, 0, 0, 0, 0, 0]"
3,She mentions it in her bio probably because sh...,"[0, 0, 0, 0, 0, 0]"
4,UTC Of these 4 images apparently the first two...,"[0, 0, 0, 0, 0, 0]"


In [38]:
class ToxicDataset(Dataset):
    
    """Клас ToxicDataset - Розроблений для обробки задач класифікації токсичних коментарів
    з використанням PyTorch.
    Він розширює клас Dataset з бібліотеки PyTorch
    та надає функціональність для токенізації коментарів,
    управління їх довжинами та підготовки їх до введення в модель."""

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list # мітки токсичності
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text) # повертає кількість коментарів у датасеті

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        # очистка
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        # Ідентифікатори токенів
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }


In [39]:
# Розділення на тренувальний та тестовий набір і перетворення в кастомний клас
train_size = 0.8
train_dataset=formatted_df.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=formatted_df.drop(train_dataset.index).reset_index(drop=True)

training_set = ToxicDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = ToxicDataset(test_dataset, tokenizer, MAX_LEN)

In [40]:
# Налаштування параметрів для тренувального набору
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

In [41]:
# Налаштування параметрів для тесту
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

In [42]:
# Створення лоадера
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [43]:
# Створення моделі

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        cls_output = outputs[0][:, 0, :]  # Взяти перший токен [CLS] для кожного прикладу
        output_2 = self.l2(cls_output)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [44]:
# Функція втрат
def loss_function(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [45]:
# Оптимізатор
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=0.001)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [46]:
# Функція для тренування
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

         # Проганяємо входи через модель
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad() # Очищення градієнтів
        
        # Обчислення функції втрат між прогнозами моделі
        loss = loss_function(outputs, targets)
        
         # Виведення значення втрат для контролю
        if _%1000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()  # Зворотне поширення
        optimizer.step() # Оновлення параметрів моделі


In [47]:
# Тренування 
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7190647125244141
Epoch: 0, Loss:  0.1493789553642273
Epoch: 0, Loss:  0.14451864361763
Epoch: 0, Loss:  0.25945910811424255
Epoch: 1, Loss:  0.1101258248090744
Epoch: 1, Loss:  0.16045092046260834
Epoch: 1, Loss:  0.21150867640972137
Epoch: 1, Loss:  0.07990939170122147


In [48]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [49]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 score (Micro) = {f1_score_micro}")
    print(f"F1 score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7781854099054607
F1 score (Micro) = 0.8974305191400105
F1 score (Macro) = 0.8444132223817057
Accuracy Score = 0.7781854099054607
F1 score (Micro) = 0.8974305191400105
F1 score (Macro) = 0.8444132223817057


In [50]:
print(classification_report(outputs, targets, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

               precision    recall  f1-score   support

        toxic       0.98      0.93      0.96      3844
 severe_toxic       0.69      0.70      0.70       648
      obscene       0.92      0.91      0.92      2209
       threat       0.87      0.78      0.82       218
       insult       0.92      0.82      0.87      2253
identity_hate       0.90      0.73      0.81       654

    micro avg       0.93      0.87      0.90      9826
    macro avg       0.88      0.81      0.84      9826
 weighted avg       0.93      0.87      0.90      9826
  samples avg       0.50      0.47      0.47      9826



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
torch.save(model.state_dict(), '/kaggle/working/model_balanced_aug_with_2_epoch.pth')
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth


In [52]:
def predict_comment(comment):
    # Токенізація коментаря
    inputs = tokenizer.encode_plus(
        comment,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)

    model.eval()

    # Передача через модель
    with torch.no_grad():
        outputs = model(input_ids, attention_mask, token_type_ids)
        # Застосування сигмоїди для отримання ймовірностей
        probabilities = torch.sigmoid(outputs)

    # Застосування порогу для класифікації (наприклад, 0.5)
    predictions = (probabilities > 0.5).int().cpu().numpy()

    return predictions

# Приклад коментаря
comment = "test example!"
predicted_scores = predict_comment(comment)
print(predicted_scores)


[[0 0 0 0 0 0]]


In [53]:
tokenizer.save_pretrained('/kaggle/working/custom_bert_tokenizer')

('/kaggle/working/custom_bert_tokenizer/tokenizer_config.json',
 '/kaggle/working/custom_bert_tokenizer/special_tokens_map.json',
 '/kaggle/working/custom_bert_tokenizer/vocab.txt',
 '/kaggle/working/custom_bert_tokenizer/added_tokens.json')

In [54]:
"""tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BERTClass()
model.load_state_dict(torch.load('/kaggle/working/model_2.pth'))
model.eval()"""

"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n\nmodel = BERTClass()\nmodel.load_state_dict(torch.load('/kaggle/working/model_2.pth'))\nmodel.eval()"