In [3]:
!pip install transformers &> /dev/null
!pip install datasets &> /dev/null
!pip install razdel &> /dev/null

In [1]:
import torch
import re
import datasets
import numpy as np
import pandas as pd

from tqdm import tqdm
from razdel import sentenize
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

import torchmetrics
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel

In [5]:
# For Google Colab only: uncomment the two lines below to mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


# Предобработка

In [2]:
def clean(string):
    """Normalize raw text before it is split into sentences.

    Steps, applied in this order:
      1. Remove every character that is not a Cyrillic letter in ``а-я``/``А-Я``,
         a Latin letter, a digit, a space, a newline or one of ``) ( + , - . / @ #``.
      2. Collapse each run of one repeated punctuation mark (or space) from that
         list into a single character.
      3. Drop a ``/`` that stands directly before a line break.
      4. Collapse consecutive line breaks into one.

    Args:
        string: the text to clean (``str``).

    Returns:
        The cleaned text.
    """
    # All patterns are raw strings: the previous plain literals contained invalid
    # escape sequences such as '\)' and '\ ', which newer Python versions warn about.
    # The whitelist used to be written as `+-/`, i.e. a character *range* that
    # happens to cover `+ , - . /`; the same characters are now listed explicitly.
    # NOTE(review): the ranges `а-я`/`А-Я` do not contain `ё`/`Ё`, so these
    # letters are removed from the text — confirm this is intended.
    string = re.sub(r'[^а-яА-Яa-zA-Z0-9)(+,\-./@# \n]', '', string)

    # One pass with a back-reference replaces the nine separate per-character
    # substitutions; runs of different characters never merge, so the result is
    # the same. `#` is kept because of Discord tags (original author's note).
    string = re.sub(r'([)(+\-/@.,# ])\1+', r'\1', string)

    # TODO (from the original author): this rule still needs more thought.
    string = re.sub(r'/\n', '\n', string)
    string = re.sub(r'\n+', '\n', string)

    return string

def to_sents(text):
    """Split text into a flat list of sentence strings.

    The text is cut into paragraphs on line breaks, every paragraph is passed to
    razdel's ``sentenize``, and the ``.text`` of each resulting item is collected.
    Falsy items and items with empty text are skipped.

    Args:
        text: the text to split (``str``).

    Returns:
        A list of sentence strings in their original order.
    """
    sentences = []
    for paragraph in text.split('\n'):
        # razdel does the actual sentence segmentation
        for piece in sentenize(paragraph):
            if piece and piece.text:
                sentences.append(piece.text)

    return sentences


## Загрузка трансформера-классификатора единичных предложений

In [21]:
# Load the embedder and its tokenizer from the same checkpoint.
# AutoModel is used instead of AutoModelForSequenceClassification, so the
# checkpoint's classifier head weights are not loaded (see the warning printed
# below this cell) and the model returns hidden states.
# `model` and `tokenizer` are module-level names that embed() reads.
inference_checkpoint = 'rubert-tiny-finetuned-class'
model = AutoModel.from_pretrained(inference_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(inference_checkpoint)

Some weights of the model checkpoint at rubert-tiny-finetuned-class were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
def embed(x):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    An empty input is replaced by the placeholder string 'Пусто' so the tokenizer
    always receives something. Inputs are padded and truncated to at most 512
    tokens, the model runs without gradient tracking, and the hidden state at
    token position 0 is taken for every input and normalized with
    ``torch.nn.functional.normalize``.

    Args:
        x: the text(s) to embed; the notebook passes the list of sentences
            produced by ``to_sents``.

    Returns:
        A 2-D tensor of normalized vectors (presumably one row per input
        text — confirm against the tokenizer's batching).
    """
    texts = x if len(x) > 0 else 'Пусто'

    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        first_token_state = model(**encoded).last_hidden_state[:, 0, :]

    return torch.nn.functional.normalize(first_token_state)

## Выбор объектов и создание эмбеддингов

In [6]:
# Load the data.
df = pd.read_csv('train/train.csv')
# Clean every description and split it into a list of sentences.
# NOTE(review): clean() calls re.sub on the value, so a missing (NaN) description
# would raise here — confirm the column has no missing values.
df['description_razdel'] = df['description'].apply(lambda x: to_sents(clean(x)))

In [None]:
# Build a sample that is balanced by class inside every category.
# For each category the same number of rows is drawn for is_bad == 0 and
# is_bad == 1; the order of the parts (category by category, class 0 then
# class 1) is the same as in the original hand-written list.
n = 5000
sample_seed = 42  # fixed seed so that re-running the cell yields the same sample

# category -> number of rows to draw per class
rows_per_class = {
    'Бытовая электроника': n,
    'Для бизнеса': 2*n,
    'Для дома и дачи': n,
    'Животные': n,
    'Личные вещи': 2*n,
    'Недвижимость': n,
    'Работа': 2*n,
    'Транспорт': n,
    'Услуги': 2*n,
    'Хобби и отдых': n,
}

# (category, is_bad) groups that are taken whole because they are small as it is
taken_whole = {('Для бизнеса', 1)}

balanced_parts = []
for category, size in rows_per_class.items():
    for label in (0, 1):
        group = df[(df['category'] == category) & (df['is_bad'] == label)]
        if (category, label) not in taken_whole:
            # .sample raises ValueError if the group has fewer than `size` rows
            group = group.sample(size, random_state=sample_seed)
        balanced_parts.append(group)

df_to_train_big_balanced = pd.concat(balanced_parts).reset_index(drop=True)

In [None]:
# Compute the embeddings — this takes a while.
# tqdm.pandas() registers progress_apply on pandas objects.
# NOTE(review): embeddings are computed for the whole `df`; the balanced sample
# `df_to_train_big_balanced` built in the previous cell is not used here or in
# any later cell of the visible notebook — confirm which frame was intended.
tqdm.pandas()
df['embeddings'] = df['description_razdel'].progress_apply(embed)

## Создание датасетов из эмбеддингов

In [None]:
# Keep only the model input (embeddings) and the target (is_bad), then split
# the frame into a training part and a held-out part.
# train_test_split is already imported in the imports cell at the top.
df_embeddings = df[['embeddings', 'is_bad']]
# A fixed random_state makes the split reproducible across kernel restarts.
train, test = train_test_split(df_embeddings, test_size=0.2, random_state=42)
# Renumber rows from 0 so each part has a clean positional index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
class EmbeddingsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a DataFrame with `embeddings` and `is_bad` columns.

    Args:
        input_df: DataFrame that holds one sample per row; the `embeddings`
            column is the model input and `is_bad` is the label.
    """

    def __init__(self, input_df):
        self.df = input_df

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the `(embeddings, is_bad)` pair stored at row position `idx`."""
        # .iloc selects by position. The previous `series[idx]` looked the value
        # up by index *label*, which only works when the index is exactly
        # 0..len-1 and breaks on a filtered or shuffled frame.
        x = self.df['embeddings'].iloc[idx]
        y = self.df['is_bad'].iloc[idx]
        return (x, y)

In [None]:
def collate_fn(batch):
    """Combine a list of `(tensor, label)` samples into one padded batch.

    Args:
        batch: list of `(x, y)` pairs, where `x` is a tensor whose first
            dimension may differ between samples and `y` is a label.

    Returns:
        A tuple `(x, y)`: `x` is the result of `pad_sequence(..., batch_first=True)`
        over all sample tensors, `y` is a tensor built from the labels.
    """
    sequences = [features for features, _ in batch]
    targets = [target for _, target in batch]

    # pad_sequence extends shorter sequences so that all of them can be stacked
    padded = pad_sequence(sequences, batch_first=True)
    target_tensor = torch.tensor(targets)

    return (padded, target_tensor)

In [None]:
# Wrap the two splits in datasets. `train_dataset` and `val_dataset` were used
# below without being defined in any cell, so the notebook failed on a fresh kernel.
# NOTE(review): the held-out `test` split is used for validation because it is
# the only other split the notebook creates — confirm this is the intended one.
train_dataset = EmbeddingsDataset(train)
val_dataset = EmbeddingsDataset(test)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32,
                                               shuffle=True, collate_fn=collate_fn)

# Validation data does not need to be shuffled; a fixed order keeps runs comparable.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=128,
                                             shuffle=False, collate_fn=collate_fn)

# Модель и обучение

In [None]:
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM over a sequence of vectors with a linear classification head.

    Args:
        embedding_dim: size of every input vector. The default of 312 is the
            value that used to be hard-coded (presumably the output size of the
            sentence embedder — confirm).
        hidden_dim: LSTM hidden size per direction.
        num_classes: number of output classes.

    The submodule names `lstm` and `linear` are unchanged, so previously saved
    state dicts still load.
    """

    def __init__(self, embedding_dim=312, hidden_dim=100, num_classes=2):
        super().__init__()

        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            batch_first=True,
                            bidirectional=True)

        # The head consumes the final states of both directions side by side.
        self.linear = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape `(batch, num_classes)`.

        Args:
            x: batched input of shape `(batch, sequence, embedding_dim)`.
        """
        # nn.LSTM returns (output, (h_n, c_n)). For this single-layer
        # bidirectional LSTM h_n is (2, batch, hidden_dim): index 0 is the
        # forward direction, index 1 the backward direction.
        # NOTE(review): batches come zero-padded from collate_fn and the LSTM
        # also runs over the padded steps; consider pack_padded_sequence.
        _, (h_n, _) = self.lstm(x)

        # Same result as the previous permute / chunk / cat / squeeze chain.
        features = torch.cat((h_n[0], h_n[1]), dim=1)
        scores = F.log_softmax(self.linear(features), dim=1)

        return scores

In [None]:
class PLModel(pl.LightningModule):
    """Lightning wrapper that trains a classifier returning log-probabilities.

    Args:
        model: module whose forward pass returns log-probabilities per class.
        loss_fn: loss applied to `(log_probs, targets)`. Defaults to
            `nn.NLLLoss()`, the loss the notebook previously took from a
            module-level `loss_fn` variable defined in a later cell.

    Logged metrics: "train loss", "train acc", "val loss" per step;
    "val acc" and "val auroc" once per validation epoch.
    The `*_epoch_end` hooks and the metric constructors follow the library
    versions the notebook was written for.
    """

    def __init__(self, model, loss_fn=None):
        super().__init__()
        self.model = model
        # Keep the loss on the module instead of reading a notebook global.
        self.loss_fn = loss_fn if loss_fn is not None else nn.NLLLoss()

        self.train_accuracy = torchmetrics.Accuracy()
        self.val_accuracy = torchmetrics.Accuracy()

        self.val_auroc = torchmetrics.AUROC(num_classes=2)

    def forward(self, x):
        """Return class probabilities (exponentiated log-probabilities)."""
        return torch.exp(self.model(x))

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log loss and batch accuracy."""
        x, y = batch
        log_probs = self.model(x)
        loss = self.loss_fn(log_probs, y)

        probs = torch.exp(log_probs)
        train_accuracy = self.train_accuracy(probs, y)

        self.log("train loss", loss, prog_bar=True)
        self.log("train acc", train_accuracy, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss and accumulate the validation metrics."""
        x, y = batch
        log_probs = self.model(x)
        loss = self.loss_fn(log_probs, y)

        probs = torch.exp(log_probs)
        # Only accumulate here; values are computed once per epoch in
        # validation_epoch_end. A mean of per-batch AUROC values is not the
        # AUROC of the whole validation set.
        self.val_accuracy.update(probs, y)
        self.val_auroc.update(probs, y)

        self.log("val loss", loss, prog_bar=True)

    def training_epoch_end(self, *args, **kwargs):
        """Clear the accumulated training accuracy state."""
        self.train_accuracy.reset()

    def validation_epoch_end(self, outs):
        """Log epoch-level validation metrics and clear their state."""
        self.log("val acc", self.val_accuracy.compute(), prog_bar=True)
        self.val_accuracy.reset()

        # The key must be exactly "val auroc": the checkpoint and early-stopping
        # callbacks monitor this name (it used to be logged here as 'val aucroc').
        self.log("val auroc", self.val_auroc.compute(), prog_bar=True)
        self.val_auroc.reset()

    def configure_optimizers(self):
        """Return AdamW with an exponential learning-rate decay (gamma=0.9)."""
        optimizer = torch.optim.AdamW(self.model.parameters())
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

        return [optimizer], [scheduler]

In [None]:
# Model, loss, callbacks, logger and trainer for the LSTM classifier.
lstm_model = LSTMClassifier()
loss_fn = nn.NLLLoss()

# AUROC is better when it is higher, so the checkpoint callback must rank
# checkpoints with mode="max"; the default mode is "min", which would treat
# the lowest-scoring checkpoint as the best one.
checkpoint_callback = ModelCheckpoint(dirpath="lstm_checkpoints",
                                      save_top_k=15,
                                      monitor="val auroc",
                                      mode="max"
                                     )

pl_model = PLModel(lstm_model)

logger = TensorBoardLogger('lstm_logs', default_hp_metric=False)

# Stop when "val auroc" has not improved for 5 validation epochs.
early_stop_callback = EarlyStopping(
    monitor="val auroc", min_delta=0.00, patience=5,
    verbose=False, mode="max"
)

trainer = pl.Trainer(
    max_epochs=15, callbacks=[checkpoint_callback, early_stop_callback],
    logger=logger
)

In [None]:
# Run training; the original author notes this is needed for the training
# curve to look the way it should.
trainer.fit(pl_model, train_dataloader, val_dataloader)

In [None]:
# Open TensorBoard inside the notebook on the log directory that the
# TensorBoardLogger above writes to.
%load_ext tensorboard
%tensorboard --logdir lstm_logs/

In [None]:
# Save only the weights of the wrapped classifier (pl_model.model), not the
# Lightning wrapper, to state_dict.pt in the working directory.
torch.save(pl_model.model.state_dict(), 'state_dict.pt')