In [1]:
# all that's needed
# !pip install transformers

import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import pandas as pd
import numpy as np

import regex as re
import os

from tqdm.notebook import tqdm
tqdm.pandas()



In [2]:
"""
Originally it was meant to be class, but I don't wanna rewrite the code below
"""


# function to clean the text
# in this case we will remove usernames, hashtags, links and emojis
# also we will remove all punctuation
# and we will lowercase the text
def clean_text(text):
    text = text.lower()
    # remove usernames
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    text = re.sub(r'#\w+', '', text)
    # remove links
    text = re.sub(r'http\S+', '', text)
    # remove emojis
    text = re.sub(r'\\x\S+', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

def tokenize(text):
    res = tokenizer.encode_plus(
        text,
        max_length=128,
        # pad_to_max_length=True,
        padding = 'max_length',
        truncation=True,
        return_tensors='pt'
    )
    return pd.Series([res['input_ids'], res['attention_mask']])

def predict(text, path_to_model = None) -> str:
    if path_to_model is None:
        path_to_model = 'models/bert_model.pt'
    text = clean_text(text)
    input_ids, attention_mask = tokenize(text)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    # load the bert model from the saved state if it's not already loaded
    if not os.path.exists(path_to_model):
        raise Exception('the model is not trained yet')
    bert_model.load_state_dict(torch.load(path_to_model))
    bert_model.to(device)

    bert_model.eval()
    with torch.no_grad():
        outputs = bert_model(
        input_ids=input_ids,
            attention_mask=attention_mask
        )
    logits = outputs[0]
    preds = torch.argmax(logits, dim=1).flatten()

    match preds.item():
        case 0:
            return "the sentence is skipped due to low confidence"
        case 1:
            return "positive"
        case 2:
            return "neutral, the text is balanced"
        case 3:
            return "speech, it's a regular text"
        case 4:
            return "negative, ban this user"
        case _:
            return "unknown"

def predict_with_input(text, path_to_model = None) -> (str, str):
    res = predict(text, path_to_model)
    return f"{text} -> {res}\n"



In [3]:
'''
Funciton for training the model
'''
from torch.optim import AdamW

def train(model, train_dataloader, test_dataloader, epochs = None, optimizer = None, scheduler = None):
    if epochs is None:
        epochs = 5
    if optimizer is None:
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    if scheduler is None:
        total_steps = len(train_dataloader) * epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_f1 = 0
        for batch in tqdm(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs[0]
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        train_loss /= len(train_dataloader)
        train_f1 /= len(train_dataloader)
        print(f"Epoch: {epoch+1}")
        print(f"Train Loss: {train_loss}")
        print(f"Train F1: {train_f1}")
        print("Evaluating...")
        model.eval()
        test_loss = 0
        test_f1 = 0
        for batch in tqdm(test_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            loss = outputs[0]
            logits = outputs[1]
            test_loss += loss.item()
            preds = torch.argmax(logits, dim=1).flatten()
            # test_f1 += f1_score(labels, preds, average='macro')
            # use Tensor.CPU() to convert the tensor to a numpy array
            test_f1 += f1_score(labels.cpu(), preds.cpu(), average='macro')
        test_loss /= len(test_dataloader)
        test_f1 /= len(test_dataloader)
        print(f"Test Loss: {test_loss}")
        print(f"Test F1: {test_f1}")


In [4]:
# Load the data
df = pd.read_csv("data/rusentitweet_full.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,@varlamov @McFaul На,skip,1327934765807308801
1,1,велл они всё равно что мусор так что ничего с...,negative,1252943181387350017
2,2,"""трезвая жизнь какая-то такая стрёмная""\r\n(с)...",negative,1323610669061677056
3,3,Ой какие неожиданные результаты 🤭 https://t.co...,neutral,1336231661160247297
4,4,@Shvonder_chief @dimsmirnov175 На заборе тоже ...,neutral,1292421736454127617


In [5]:
# change label column to int value
def change_label(label):
    match label:
        case 'skip':
            return 0
        case 'positive':
            return 1
        case 'neutral':
            return 2
        case 'speech':
            return 3
        case 'negative':
            return 4

df['label'] = df['label'].apply(change_label)

# print all unique labels
print(df['label'].unique())

[0 4 2 3 1]


In [6]:
# prepare the data
df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,на,0,1327934765807308801
1,1,велл они всё равно что мусор так что ничего с...,4,1252943181387350017
2,2,трезвая жизнь какаято такая стрёмная\r\nс артё...,4,1323610669061677056
3,3,ой какие неожиданные результаты,2,1336231661160247297
4,4,на заборе тоже написаноа там другоео сборе д...,2,1292421736454127617


In [7]:

df[['input_ids', 'attention_mask']] = df['text'].progress_apply(tokenize)
df.head()

  0%|          | 0/13392 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,text,label,id,input_ids,attention_mask
0,0,на,0,1327934765807308801,"[[tensor(101), tensor(1469), tensor(102), tens...","[[tensor(1), tensor(1), tensor(1), tensor(0), ..."
1,1,велл они всё равно что мусор так что ничего с...,4,1252943181387350017,"[[tensor(101), tensor(12044), tensor(864), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
2,2,трезвая жизнь какаято такая стрёмная\r\nс артё...,4,1323610669061677056,"[[tensor(101), tensor(67121), tensor(1637), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
3,3,ой какие неожиданные результаты,2,1336231661160247297,"[[tensor(101), tensor(32589), tensor(19201), t...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."
4,4,на заборе тоже написаноа там другоео сборе д...,2,1292421736454127617,"[[tensor(101), tensor(1469), tensor(37272), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ..."


In [8]:
# split the data
test_size = 0.2
batch_size = 32

train_df, test_df = train_test_split(
    df,
    test_size=test_size,
    # shuffle=True,
    random_state=42,
    stratify=df['label'].values
)

train_set = TensorDataset(
    torch.cat(list(train_df["input_ids"].values), dim = 0),
    torch.cat(list(train_df["attention_mask"].values), dim = 0),
    torch.tensor(train_df["label"].values)
)

test_set = TensorDataset(
    torch.cat(list(test_df["input_ids"].values), dim = 0),
    torch.cat(list(test_df["attention_mask"].values), dim = 0),
    torch.tensor(test_df["label"].values)
)

train_dataloader = DataLoader(
    train_set, 
    batch_size=batch_size, 
    shuffle=True
)

test_dataloader = DataLoader(
    test_set, 
    batch_size=batch_size, 
    shuffle=True
)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [10]:
# model first
bert_model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased-sentence', 
    num_labels=5,
)

bert_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [11]:
from transformers import AutoModelForSequenceClassification
auto_model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased-sentence",
    num_labels=5,
)
auto_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [12]:
train(bert_model, train_dataloader, test_dataloader)

# save the model
torch.save(bert_model.state_dict(), 'models/bert_model.pt')

  0%|          | 0/335 [00:00<?, ?it/s]

Epoch: 1
Train Loss: 1.0237127747108687
Train F1: 0.0
Evaluating...


  0%|          | 0/84 [00:00<?, ?it/s]

Test Loss: 0.9414595805463337
Test F1: 0.5840153834516689


  0%|          | 0/335 [00:00<?, ?it/s]

Epoch: 2
Train Loss: 0.746313807056911
Train F1: 0.0
Evaluating...


  0%|          | 0/84 [00:00<?, ?it/s]

Test Loss: 0.9418769347525778
Test F1: 0.6109494747011138


  0%|          | 0/335 [00:00<?, ?it/s]

Epoch: 3
Train Loss: 0.532930530674422
Train F1: 0.0
Evaluating...


  0%|          | 0/84 [00:00<?, ?it/s]

Test Loss: 1.0233185486424536
Test F1: 0.6195566545595559


  0%|          | 0/335 [00:00<?, ?it/s]

Epoch: 4
Train Loss: 0.36657874014840197
Train F1: 0.0
Evaluating...


  0%|          | 0/84 [00:00<?, ?it/s]

Test Loss: 1.1687264272144862
Test F1: 0.6217077278103409


  0%|          | 0/335 [00:00<?, ?it/s]

Epoch: 5
Train Loss: 0.27668073353037903
Train F1: 0.0
Evaluating...


  0%|          | 0/84 [00:00<?, ?it/s]

Test Loss: 1.2230118129934584
Test F1: 0.623463889079143


In [13]:
print(
    predict_with_input("Сегодня был очень хороший день"),
    predict_with_input("Я твой рот отчистил"),
    predict_with_input("Я не хочу тебя больше видеть"),
    predict_with_input("Собака ты сутулая"),
    predict_with_input("Тварь ты сутулая"),
    predict_with_input("Собака ты сутулая, ты мне нравишься"),
)

Сегодня был очень хороший день -> positive
 Я твой рот отчистил -> negative, ban this user
 Я не хочу тебя больше видеть -> negative, ban this user
 Собака ты сутулая -> neutral, the text is balanced
 Тварь ты сутулая -> negative, ban this user
 Собака ты сутулая, ты мне нравишься -> positive

