In [1]:
import torch

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import os
import re
from string import punctuation

from nltk.stem.snowball import SnowballStemmer
from pymorphy2 import MorphAnalyzer

import random

import torch
from torch import nn, optim

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()
    
def text2toxicity(text, aggregate=True):
    """ Calculate toxicity of a text (if aggregate=True) or a vector of toxicity aspects (if aggregate=False)"""
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
        proba = torch.sigmoid(model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if aggregate:
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba


In [4]:
df = pd.read_csv('lesson_9/train.csv')
val = pd.read_csv('lesson_9/val.csv')

In [5]:
df.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [6]:
val.head()

Unnamed: 0,id,text,class
0,181467,RT @TukvaSociopat: Максимальный репост! ))) #є...,1
1,181468,чтоб у меня з.п. ежегодно индексировали на инд...,0
2,181469,@chilyandlime нехуя мне не хорошо !!! :((((,0
3,181470,"@inafish нее , когда ногами ахахах когда?ахаха...",0
4,181471,"Хочу сделать как лучше, а получаю как всегда. :(",0


In [7]:
df = df[['text','class']]
val = val[['text','class']]

In [8]:
df = df.drop_duplicates(keep='first')
val = val.drop_duplicates(keep='first')

In [9]:
stop_words = stopwords.words('russian')
stemmer = SnowballStemmer(language='russian')

In [10]:
patterns = '[A-Za-z0-9!@❤є"“’«»#$%&\'()*+,—/:;<=>?^_`{|}~\[\]]'
pattern_url = 'http[s]?://\S+|www\.\S+'
pattern_tags = '<.*?>'


def transformer(text):
    text = text.lower()
    text = re.sub(patterns, ' ', text)
    text = re.sub(pattern_url, ' ', text)
    text = re.sub(pattern_tags, ' ', text)
    text = re.sub(r'[^\w\s\n]', ' ', text)
    text = re.sub(r'ð', ' ', text)
    filtered_tokens = []
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    for token in tokens:
        if token not in stop_words and token != " " and token.strip() not in punctuation:
            filtered_tokens.append(token)
    tokens_word = [t for t in filtered_tokens if t not in stop_words]
    new_text = ' '.join(tokens_word)

    return new_text

In [11]:
df['text'] = df['text'].apply(transformer)
val['text'] = val['text'].apply(transformer)

In [12]:
df.head()

Unnamed: 0,text,class
0,уезжаааааааай хочу уезжала,0
1,ребята девчата кино это любовь сегодня завтра ...,1
2,ненавидит пробки ретвит,0
3,хочется котлету киевски запретный плод,1
4,босапопа есбоса боится мороза,1


In [13]:
val.head()

Unnamed: 0,text,class
0,максимальный репост вромайдан,1
1,з п ежегодно индексировали индекс инфляции тар...,0
2,нехуя,0
3,ногами ахахах ахаха честн помню тебе завтра шк...,0
4,хочу сделать получаю,0


In [14]:
pred = pd.DataFrame(columns=['text_pred', 'class_pred'])

In [15]:
import warnings

warnings.filterwarnings('ignore')

In [16]:
for text in val['text']:
    cls = int(text2toxicity(text).round())
    pred = pred.append({'text_pred': text, 'class_pred': cls}, ignore_index=True)

In [17]:
extracted_col = val[['text', 'class']]

In [18]:
result = pred.join(extracted_col)

In [19]:
result.head()

Unnamed: 0,text_pred,class_pred,text,class
0,максимальный репост вромайдан,0,максимальный репост вромайдан,1.0
1,з п ежегодно индексировали индекс инфляции тар...,0,з п ежегодно индексировали индекс инфляции тар...,0.0
2,нехуя,1,нехуя,0.0
3,ногами ахахах ахаха честн помню тебе завтра шк...,1,ногами ахахах ахаха честн помню тебе завтра шк...,0.0
4,хочу сделать получаю,0,хочу сделать получаю,0.0


In [20]:
result['acc'] = result['class'] == result['class_pred']

In [21]:
result['acc'] = np.where(result['class'] == result['class_pred'], 1, result.acc)

In [22]:
result['acc'].mean()

0.4927865348650815

## Дообучим модель

In [23]:
print(model)
print("Parameters full train:", sum([param.nelement() for param in model.parameters()]))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, element

In [24]:
from torch import nn
from transformers import BertModel
from transformers import BertTokenizer, BertForSequenceClassification

In [25]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [26]:
class TwitterDataset(torch.utils.data.Dataset):
    
    def __init__(self, txts, labels):
        self._labels = labels
        
        self.tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny-toxicity')
        self._txts = [self.tokenizer(text, padding='max_length', max_length=10,
                                     truncation=True, return_tensors="pt")
                      for text in txts]
        
    def __len__(self):
        return len(self._txts)
    
    def __getitem__(self, index):
        return self._txts[index], self._labels[index]

In [27]:
y_train = df['class'].values
y_val = val['class'].values

train_dataset = TwitterDataset(df['text'], y_train)
valid_dataset = TwitterDataset(val['text'], y_val)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=64,
                          shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                          batch_size=64,
                          shuffle=False)

In [None]:
for txt, lbl in train_loader:
    print(txt.keys())
    print(txt['input_ids'].shape) 
    print(txt['attention_mask'].shape) 
    break

In [28]:
class BertClassification(nn.Module):

    def __init__(self, dropout=0.5):
        super().__init__()
        self.pretrained_model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny-toxicity')
        self.dropout = nn.Dropout(dropout)
        self.sigm = nn.Sigmoid()

    def forward(self, x, mask):
        pooled_output = self.pretrained_model(input_ids=x, attention_mask=mask, return_dict=False)[0]  
        dropout_output = self.dropout(pooled_output)
        out = self.sigm(dropout_output)
        return out

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [30]:
model = BertClassification().to(device)
print(model)
print("Parameters full train:", sum([param.nelement() for param in model.parameters()]))
print("Parameters transfer learning:", sum([param.nelement() for param in model.pretrained_model.classifier.parameters()]))

BertClassification(
  (pretrained_model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(29564, 312, padding_idx=0)
        (position_embeddings): Embedding(512, 312)
        (token_type_embeddings): Embedding(2, 312)
        (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=312, out_features=312, bias=True)
                (key): Linear(in_features=312, out_features=312, bias=True)
                (value): Linear(in_features=312, out_features=312, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=312, out_featur

In [31]:
from tqdm import tqdm

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
from torch.optim import Adam

In [41]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.pretrained_model.parameters(), lr=0.01)

In [42]:
for epoch_num in range(5):
    total_acc_train = 0
    total_loss_train = 0

    model.train()
    for train_input, train_label in tqdm(train_loader):
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)
        train_label = train_label.to(device)

        output = model(input_id, mask)
                
        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
                
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
            
    model.eval()
    total_loss_val, total_acc_val = 0.0, 0.0
    for val_input, val_label in valid_loader:
        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, val_label)
        total_loss_val += batch_loss.item()
                    
        acc = (output.argmax(dim=1) == val_label).sum().item()
        total_acc_val += acc
            
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
        | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
        | Val Loss: {total_loss_val / len(valid_dataset): .3f} \
        | Val Accuracy: {total_acc_val / len(valid_dataset): .3f}')

100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [06:06<00:00,  7.46it/s]


Epochs: 1 | Train Loss:  0.021         | Train Accuracy:  0.495         | Val Loss:  0.018         | Val Accuracy:  0.504


100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [06:12<00:00,  7.32it/s]


Epochs: 2 | Train Loss:  0.021         | Train Accuracy:  0.495         | Val Loss:  0.018         | Val Accuracy:  0.504


100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [06:22<00:00,  7.14it/s]


Epochs: 3 | Train Loss:  0.021         | Train Accuracy:  0.495         | Val Loss:  0.018         | Val Accuracy:  0.504


100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [06:30<00:00,  7.00it/s]


Epochs: 4 | Train Loss:  0.021         | Train Accuracy:  0.495         | Val Loss:  0.018         | Val Accuracy:  0.504


100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [06:35<00:00,  6.91it/s]


Epochs: 5 | Train Loss:  0.021         | Train Accuracy:  0.497         | Val Loss:  0.018         | Val Accuracy:  0.504
