#Multi-label text classification, using BErT model

Contents:

* Imports and data loading
* Texts preprocessing
* Bert training
* Bert testing

## Imports and data loading

In [None]:
# save model
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
# Installing the transformers library and additional libraries if looking process 

!pip install -q transformers

[K     |████████████████████████████████| 3.8 MB 2.7 MB/s 
[K     |████████████████████████████████| 596 kB 61.1 MB/s 
[K     |████████████████████████████████| 895 kB 43.7 MB/s 
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[K     |████████████████████████████████| 6.5 MB 46.1 MB/s 
[?25h

In [8]:
%%capture
!pip install razdel
!pip install pymorphy2
!pip install slovnet
!pip install navec

In [9]:
import sklearn.metrics as skm

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification

In [10]:
%%capture
from navec import Navec
from slovnet import NER
from gensim.utils import simple_preprocess

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
import random
from razdel import tokenize 
import pymorphy2
from tqdm import tqdm
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm


nltk_stop_words = stopwords.words('russian') + list(string.punctuation)
morph = pymorphy2.MorphAnalyzer()

!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar
!wget https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_ner_news_v1.tar

navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
ner = NER.load('slovnet_ner_news_v1.tar')
ner.navec(navec)

In [5]:
#train-test datasets

!wget https://boosters.pro/api/ch/files/pub/HeadHunter_train.csv
!wget https://boosters.pro/api/ch/files/pub/HeadHunter_test.csv

--2022-03-07 20:21:37--  https://boosters.pro/api/ch/files/pub/HeadHunter_train.csv
Resolving boosters.pro (boosters.pro)... 91.206.14.169
Connecting to boosters.pro (boosters.pro)|91.206.14.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24692086 (24M) [application/octet-stream]
Saving to: ‘HeadHunter_train.csv’


2022-03-07 20:21:39 (13.3 MB/s) - ‘HeadHunter_train.csv’ saved [24692086/24692086]

--2022-03-07 20:21:40--  https://boosters.pro/api/ch/files/pub/HeadHunter_test.csv
Resolving boosters.pro (boosters.pro)... 91.206.14.169
Connecting to boosters.pro (boosters.pro)|91.206.14.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10723842 (10M) [application/octet-stream]
Saving to: ‘HeadHunter_test.csv’


2022-03-07 20:21:41 (9.51 MB/s) - ‘HeadHunter_test.csv’ saved [10723842/10723842]



In [6]:
df_test = pd.read_csv('HeadHunter_test.csv')
df_test.fillna('неизвестно', inplace=True)

df_train = pd.read_csv('HeadHunter_train.csv')
df_test.fillna('неизвестно', inplace=True)

In [7]:
# class imbalance
df_train.target.value_counts()

8          24093
0          21003
1,8         1476
1           1269
3            905
6,8          473
6            368
7            326
3,8          209
1,6          141
5,8          121
5            102
1,5           78
1,6,8         48
4             38
4,8           36
1,5,8         33
7,8           25
1,4           24
1,7           15
5,7           13
2             12
1,3            9
5,6            9
1,5,6          7
5,6,8          6
4,6            5
3,7            5
1,4,8          4
1,4,6          3
6,7            3
1,3,8          3
1,3,5          2
1,7,8          2
1,5,6,8        1
3,6            1
5,7,8          1
4,6,8          1
1,3,6          1
3,6,8          1
3,5            1
1,2,6          1
3,5,8          1
3,5,7          1
Name: target, dtype: int64

## Texts preprocessing

In [None]:
def list_to_ints(x):
  l = []
  for stroka in x.split(','):
    l.append(int(stroka.split()[0]))
  return l

def int_list(x):
  l = []
  l.append(x)
  return l

# one-hot-encoding
def to_ohe(idx, num_of_classes=9):
  arr = [0]*num_of_classes
  for i in idx:
    arr[i] = 1
  return arr 

In [None]:
# Generate synthetic data (reviews)
def generate_reviews(dataframe, text_column, target_column, label, k, words_in_review):
  list_of_tokens = []
  for row in dataframe[dataframe[target_column] == label][text_column].values:
    tokens = list(tokenize(row))
    list_of_words = [t.text for t in tokens]
    list_of_tokens.extend(list_of_words)
  list_of_lemmas = [morph.parse(x)[0].word for x in list_of_tokens]
  counter_of_lemmas = Counter(list_of_lemmas)
  list_of_words = [random.choices(population=list(counter_of_lemmas.keys()), \
                                  weights=counter_of_lemmas.values(), k=words_in_review) for _ in range(k)]
  reviews = []
  for rev in list_of_words:
    reviews.append(' '.join([x for x in rev]))
  return reviews

In [None]:
# Generate specific number of reviews to balance classes 
def paraphraser(df_fe, df_orig):
  pos_words_counter = {}
  for x in range(1,8):
    pos_words_counter[x] = np.sum(df_orig[df_orig.target == str(x)].positive.str.len()) / len(df_orig[df_orig.target == str(x)])

  neg_words_counter = {}
  for x in range(1,8):
    neg_words_counter[x] = np.sum(df_orig[df_orig.target == str(x)].negative.str.len()) / len(df_orig[df_orig.target == str(x)])

  list_reviews_all_labels_pos = {}
  for k,v in tqdm(pos_words_counter.items()):
    list_reviews_all_labels_pos[k] = generate_reviews(df_orig, 'positive', 'target', '1', 5000, int(v))

  list_reviews_all_labels_neg = {}

  for k,v in tqdm(neg_words_counter.items()):
    list_reviews_all_labels_neg[k] = generate_reviews(df_orig, 'negative', 'target', '1', 5000, int(v))

  labels_count = [3117,   13, 1139,  111,  376, 1069,  391]
  df_sint_pos = pd.DataFrame(columns = ['reviews', 'target']) 
  for x in list_reviews_all_labels_pos.keys(): 
    l1 = list_reviews_all_labels_neg[x]
    l2 = [x]*5000
    df_interim = pd.DataFrame({'reviews': l1, 'target':l2})
    df_interim = df_interim.sample(n=5000-labels_count[x-1]) 
    df_sint_pos =  pd.concat([df_sint_pos, df_interim])

  df_sint_pos.target.value_counts()

  df_sint_neg = pd.DataFrame(columns = ['reviews2', 'target']) 
  for x in list_reviews_all_labels_pos.keys(): 
    l1 = list_reviews_all_labels_neg[x]
    l2 = [x]*5000
    df_interim = pd.DataFrame({'reviews2': l1, 'target':l2})
    df_interim = df_interim.sample(n=5000-labels_count[x-1]) 
    df_sint_neg =  pd.concat([df_sint_neg, df_interim])
  print(len(df_sint_neg))

  df_sint_neg = df_sint_neg.sort_values('target').reset_index().drop('index', axis=1)
  df_sint_pos = df_sint_pos.sort_values('target').reset_index().drop('index', axis=1)
  df_sint = pd.merge(df_sint_pos, df_sint_neg, left_index=True, right_index=True)

  df_sint['reviews'] = df_sint['reviews'] + df_sint['reviews2']
  df_sint.drop(['target_y', 'reviews2'], axis=1, inplace=True)

  df_sint.rename(columns = {'reviews':'review', 'target_x':'target'}, inplace=True)
  df_sint = df_sint[['target', 'review', ]]

  df_sint.target = df_sint.target.apply(int_list)

  df_fe = pd.concat([df_fe, df_sint], ignore_index=True)

  mask0 = df_fe.target.isin([[0]])
  mask8 = df_fe.target.isin([[8]])
  df0 = df_fe[mask0].sample(n=5000)
  df8 = df_fe[mask8].sample(n=5000)
  df_fe = df_fe[~mask0]
  df_fe = df_fe[~mask8]

  df_fe = pd.concat([df_fe, df0, df8])
  
  return df_fe


In [None]:
#Feature engineering and applying paraphraser function
def feature_engineering(df_fe, df_orig, type_of_df='train'):
  
  df_fe.drop('review_id', axis=1, inplace=True)
  df_fe[['salary_rating', 'team_rating', 'managment_rating', 'career_rating',
       'workplace_rating', 'rest_recovery_rating']] = df_fe[['salary_rating', 'team_rating', 'managment_rating', 'career_rating',
       'workplace_rating', 'rest_recovery_rating']].replace({1:"один",
                                                             2:"два",
                                                             3:"три",
                                                             4:"четыре",
                                                             5:"пять"})
  cols_eng = ['city', 'position', 'positive', 'negative', 'salary_rating',
       'team_rating', 'managment_rating', 'career_rating', 'workplace_rating',
       'rest_recovery_rating']

  cols_rus = ['город', "позиция", "негативный", "позитивный", "рейтинг зарплаты", 
                        "рейтинг команды", "рейтинг менеджмента", "рейтинг карьеры", 
                        "рейтинг рабочего места", "рейтинг места отдыха"]

  df_fe.rename(columns=dict(zip(cols_eng, cols_rus)), inplace=True)
  cols_to_val = [ "рейтинг зарплаты", 
                       "рейтинг команды", "рейтинг менеджмента", "рейтинг карьеры", 
                       "рейтинг рабочего места", "рейтинг места отдыха"]
  for col in cols_to_val:  
    df_fe[col] = df_fe[col].apply(lambda x: str(x) + ' ' + str(col))
  
  df_fe['review'] = ' '
  for col in cols_rus:
    df_fe['review'] = df_fe['review'] + df_fe[col] + str(' ') 
  df_fe.drop([x for x in cols_rus], axis=1, inplace=True)
  
  if type_of_df == 'train':
    df_fe.target = df_fe.target.apply(list_to_ints)
    #import paraphraser function
    df_fe = paraphraser(df_fe, df_orig)

    df_fe.target = df_fe.target.apply(to_ohe)
  elif type_of_df == 'test':
    pass
  else:
    print('Choose type of df')
    
  return df_fe

In [None]:
# making two datasets
df_orig = df_train.copy()
df_fe = df_train.copy()

#main function
df_bert = feature_engineering(df_fe, df_orig, type_of_df='train')

100%|██████████| 7/7 [01:00<00:00,  8.57s/it]
100%|██████████| 7/7 [01:37<00:00, 13.91s/it]


28784




## Bert training

In [None]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, regime='train'):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.review = dataframe.review
        self.max_len = max_len
        self.regime = regime
        if self.regime == 'train':
          self.targets = self.data.target

    def __len__(self):
        return len(self.review)

    def __getitem__(self, index):
        review = str(self.review[index])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=False
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        if self.regime == 'train':
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'targets': torch.tensor(self.targets[index], dtype=torch.float)
          }
        elif self.regime == 'test':
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              #'targets': torch.tensor(self.targets[index], dtype=torch.float)
          }



In [None]:
model_path = 'cointegrated/rubert-tiny2'
model = BertForSequenceClassification.from_pretrained(model_path)

class BertClassifier:

    def __init__(self,dataframe, model_path, tokenizer_path, n_classes=2, epochs=1, save_path='/content/bert.pt', regime='train',
                 train_batch_size=32, max_seq_len=512, prob_sigmoid_threshhold=0.2, learning_rate=2e-4):
        self.model_save_path=save_path

        if regime == 'train':
          self.model = BertForSequenceClassification.from_pretrained(model_path)
          self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
          self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
          self.regime = 'train'
          
        elif regime == 'test':
          self.model = torch.load(self.model_save_path, map_location ='cpu')
          self.regime = 'test'

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.max_len = max_seq_len
        self.epochs = epochs
        self.model.to(self.device)  
        self.df = dataframe
        self.train_batch_size = train_batch_size
        self.max_seq_len = max_seq_len
        self.prob_sigmoid_threshhold = prob_sigmoid_threshhold
        self.learning_rate = learning_rate
        

    
    def preparation(self):

        train_size = 0.8
        train_dataset=self.df.sample(frac=train_size,random_state=42)
        test_dataset=self.df.drop(train_dataset.index).reset_index(drop=True)
        train_dataset = train_dataset.reset_index(drop=True)


        print("FULL Dataset: {}".format(self.df.shape))
        print("TRAIN Dataset: {}".format(train_dataset.shape))
        print("TEST Dataset: {}".format(test_dataset.shape))

        self.training_set = CustomDataset(train_dataset, self.tokenizer, self.max_len, regime=self.regime)
        self.testing_set = CustomDataset(test_dataset, self.tokenizer, self.max_len, regime=self.regime)

        # create data loaders
        self.train_loader = DataLoader(self.training_set, batch_size=self.train_batch_size, shuffle=True)
        self.valid_loader = DataLoader(self.testing_set, batch_size=8,  shuffle=True)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.BCEWithLogitsLoss().to(self.device)
            
    def fit(self):
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in self.train_loader:
            input_ids = data["ids"].to(self.device)
            attention_mask = data["mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )
            
            loss = self.loss_fn(outputs.logits, targets)
  
            preds = torch.sigmoid(outputs.logits)  # torch.Size([N, C]) e.g. tensor([[0., 0.5, 0.]])
            preds[preds >= self.prob_sigmoid_threshhold] = 1


            summa = torch.sum(preds == targets)

            correct_predictions += summa

            losses.append(loss.item())
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.training_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss
    
    def eval(self):
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0
        predslist = []
        targetslist = []

        with torch.no_grad():
            for data in self.valid_loader:
                input_ids = data["ids"].to(self.device)
                attention_mask = data["mask"].to(self.device)
                targets = data["targets"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                    )

                preds = torch.sigmoid(outputs.logits)  # torch.Size([N, C]) e.g. tensor([[0., 0.5, 0.]])
                preds[preds >= self.prob_sigmoid_threshhold] = 1
                
                loss = self.loss_fn(outputs.logits, targets)

                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())

                #f1
                preds[preds < self.prob_sigmoid_threshhold] = 0
                predslist.append(preds.cpu())
                targetslist.append(targets.cpu())
        

        predslist = np.concatenate(predslist)
        targetslist = np.concatenate(targetslist)
        f1_score_macro = skm.f1_score(targetslist, predslist, average='macro')
        print(f1_score_macro)
        cm = skm.multilabel_confusion_matrix(targetslist, predslist)
        print(cm)
        print(skm.classification_report(targetslist, predslist))

        val_acc = correct_predictions.double() / len(self.testing_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss, f1_score_macro
    
    def train(self):
        best_f1 = 0
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            _, train_loss = self.fit()
            print(f'Train loss {train_loss}')

            _, val_loss, f1_score_eval = self.eval()
            print(f'Val loss {val_loss}')
            print('-' * 10)
            if f1_score_eval > best_f1:
                torch.save(self.model, self.model_save_path)
                best_f1 = f1_score_eval
        self.model = torch.load(self.model_save_path)
    
    def predict(self, text):

      
      inputs = self.tokenizer.encode_plus(
          text,
          None,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True

      )


      out = {
            'input_ids': torch.tensor(inputs['input_ids'],dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'],  dtype=torch.float)
        }
      
      input_ids = out["input_ids"].to(self.device)
      attention_mask = out["attention_mask"].to(self.device)

      outputs = self.model(
          input_ids=input_ids.unsqueeze(0),
          attention_mask=attention_mask.unsqueeze(0)
      )
      
      # print(outputs.logits)
      preds = torch.sigmoid(outputs.logits)  # torch.Size([N, C]) e.g. tensor([[0., 0.5, 0.]])
      print(f'after sigm {preds}')
      preds[preds >= 0.4] = 1
      list_of_preds = [i for i,j in enumerate(preds[0]) if j == 1]
      if list_of_preds == []:
        list_of_preds.append(torch.argmax(preds[0]).cpu().detach().numpy().tolist())

      return list_of_preds 

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [None]:
model_save_name = 'hh_classifier.pt'
save_path = f"/content/gdrive/My Drive/{model_save_name}"

classifier = BertClassifier(
        dataframe=df_bert,
        model_path='cointegrated/rubert-tiny2',
        tokenizer_path='cointegrated/rubert-tiny2',
        n_classes=9,
        epochs=8,
        save_path=save_path,
        regime='train',
        train_batch_size=16, 
        max_seq_len=1024, 
        prob_sigmoid_threshhold=0.2, 
        learning_rate=3e-5
)


In [None]:
classifier.preparation()
classifier.fit()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


FULL Dataset: (44564, 2)
TRAIN Dataset: (35651, 2)
TEST Dataset: (8913, 2)




KeyboardInterrupt: ignored

## Bert testing

In [None]:
classifier = BertClassifier(
        dataframe=df_bert,
        model_path='cointegrated/rubert-tiny2',
        tokenizer_path='cointegrated/rubert-tiny2',
        n_classes=9,
        epochs=8,
        save_path=save_path,
        regime='test',
        train_batch_size=16, 
        max_seq_len=1024, 
        prob_sigmoid_threshhold=0.2, 
        learning_rate=3e-5
)

In [None]:
p = []
for x in tqdm(range(len(df_bert))):
  p.append(classifier.predict(df_bert.iloc[x].review))
  print(x)

In [None]:
df_res = pd.DataFrame({'review_id': df_test.review_id, 'target': p})

In [None]:
df_res.target = df_res.target.apply(lambda x: ','.join(map(str, x)))

In [None]:
df_res.target.value_counts()

In [None]:
# result to csv file
df_res.to_csv('/content/result.csv', index=False)