## s-nlp/russian_toxicity_classifier for twitter database

## Домашнее задание

1. Возьмите готовую модель из https://huggingface.co/models для классификации сентимента текста.
2. Сделайте предсказания на всем df_val. Посчитайте метрику качества.
3. Дообучите эту модель на df_train. Посчитайте метрику качества на df_val.

Данные на google drive: https://drive.google.com/file/d/1Mev_EEput0LlBj8MDHIJkBtahlJ6J901

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification


In [2]:
import numpy as np
import torch
import torch.nn as nn
#import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm
#from collections import Counter

import pandas as pd

from transformers import pipeline

unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Pytorch is [MASK] than Tensorflow.")

sentiment = pipeline("text-classification", model='Skoltech/russian-inappropriate-messages')
sentiment("Этот ресторан отличный")

In [3]:
df_train = pd.read_csv("data/train.csv")
df_val = pd.read_csv("data/val.csv")

df_train.shape, df_val.shape

((181467, 3), (22683, 3))

In [4]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


In [5]:
df_train['class'].value_counts()

1    92063
0    89404
Name: class, dtype: int64

In [6]:
df_train['text'] = df_train['text'].apply(lambda x: x.lower())
df_val['text'] = df_val['text'].apply(lambda x: x.lower())

In [7]:
# tokenizer.get_vocab()

In [8]:
class TwitterDataset(torch.utils.data.Dataset):

    def __init__(self, txts, labels):
        self._labels = labels

        self.tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
        self._txts = [self.tokenizer.encode(text, padding='max_length', max_length=10,
                                     truncation=True, return_tensors="pt")
                      for text in txts]
        
    def __len__(self):
        return len(self._txts)
    
    def __getitem__(self, index):
        return self._txts[index], self._labels[index]

In [9]:
y_train = df_train['class'].values
y_val = df_val['class'].values

train_dataset = TwitterDataset(df_train['text'], y_train)
valid_dataset = TwitterDataset(df_val['text'], y_val)

train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=64,
                          shuffle=True,
                          drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                          batch_size=64,
                          shuffle=False)

In [10]:
len(train_loader)

2835

In [11]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):
        super().__init__()   
        self.bert = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
  #      self.dropout = nn.Dropout(dropout)
   #     self.linear = nn.Linear(768, 2)
        self.sigm = nn.Sigmoid()

    def forward(self, x, y):
        
        pooled_output = self.bert(input_ids=x, labels=y, return_dict=True)
        # _, pooled_output - набор эмбеддинигов слов, эмбеддинг предложения
 #       dropout_output = self.dropout(pooled_output)
 #       linear_output = self.linear(dropout_output)
 #       final_layer = self.sigm(linear_output)
        final_layer = self.sigm(pooled_output['logits'])
        return final_layer

In [13]:
model = BertClassifier()
criterion = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=0.001)  # полное обучение
#optimizer = Adam(model.linear.parameters(), lr=0.001)  # неполное обучение

In [16]:
print(model)
print("Parameters full train:", sum([param.nelement() for param in model.parameters()]))
#print("Parameters transfer learning:", sum([param.nelement() for param in model.linear.parameters()]))

BertClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tr

In [17]:
for txt, lbl in train_loader:
    print(txt.squeeze(1).size())
    print(txt.type())
    print(lbl)
    break

torch.Size([64, 10])
torch.LongTensor
tensor([0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0])


In [18]:
test_model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')

In [19]:
test_output = []
for txt, lbl in train_loader:
    test_output = test_model(input_ids=txt.squeeze(1))
    break

In [20]:
test_output['logits'][-1]

tensor([ 2.8854, -2.1356], grad_fn=<SelectBackward0>)

In [21]:

for txt, lbl in train_loader:
    print(txt.squeeze(1).size())
    print(txt.type())
    print(lbl)
    break

torch.Size([64, 10])
torch.LongTensor
tensor([0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0])


In [23]:
for epoch_num in range(2):
    total_acc_train = 0
    total_loss_train = 0

    model.train()
    for train_input, train_label in train_loader:
     #   mask = train_input['attention_mask']
        input_id = train_input.squeeze(1)
        train_label = train_label

        output = model(input_id, train_label)
                
        batch_loss = criterion(output, train_label)
        total_loss_train += batch_loss.item()
                
        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
            
    model.eval()
    total_loss_val, total_acc_val = 0.0, 0.0
    for val_input, val_label in valid_loader:
        val_label = val_label
   #     mask = val_input['attention_mask']
        input_id = val_input.squeeze(1)

        output = model(input_id, val_label)

        batch_loss = criterion(output, val_label)
        total_loss_val += batch_loss.item()
                    
        acc = (output.argmax(dim=1) == val_label).sum().item()
        total_acc_val += acc
            
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
        | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
        | Val Loss: {total_loss_val / len(valid_dataset): .3f} \
        | Val Accuracy: {total_acc_val / len(valid_dataset): .3f}')

Epochs: 1 | Train Loss:  0.011         | Train Accuracy:  0.502         | Val Loss:  0.011         | Val Accuracy:  0.505
Epochs: 2 | Train Loss:  0.011         | Train Accuracy:  0.494         | Val Loss:  0.011         | Val Accuracy:  0.495


**impressions on the model**  
The model was randomly chosen between https://huggingface.co/models that worked with russian language and proclaimed to be aimed at classification and finding toxic content.
Comparing with the 'bert-base-multilingual-cased' model, the 'SkolkovoInstitute/russian_toxicity_classifier' didn't have attention_mask output in it's tokenizer, only word/sentence tokens attached to the key ['logit']. 

The model took almost 6 hours to train through one epoch. The result a bit worse (for 3 % points) than 'bert-base-multilingual-cased' model.