In [16]:
!pip install nerus
!pip install pymorphy2
!pip install transformers




In [None]:
!wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

--2021-03-14 22:13:40--  https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1961465886 (1.8G) [application/octet-stream]
Saving to: ‘nerus_lenta.conllu.gz’


2021-03-14 22:15:18 (19.1 MB/s) - ‘nerus_lenta.conllu.gz’ saved [1961465886/1961465886]



In [17]:
from PIL import Image
import io
import os

import numpy as np
from transformers import AutoTokenizer, AutoModel

import re
from tqdm.notebook import tqdm
import numpy as np
import requests
import torch
import torch.nn as nn
import nltk


from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split, DataLoader

from torch.utils.tensorboard import SummaryWriter
import torchtext
from nerus import load_nerus

import pymorphy2
morph = pymorphy2.MorphAnalyzer()


device = 'cuda' if torch.cuda.is_available() else 'cpu'
device 

'cpu'

As usually, helper functions for easier life

In [18]:
def prepare_dataset(max_ind: int):
    '''
    extracting fucntion, with the help of neus api 
    Parameters: max_ind: int, number of documnets from corpus to take
    Returns:
    -- documents: list
    -- tokens_mapping: list of dicts, it contains key as POS token and list([offset: int, token_len: int])
    '''
    import pandas as pd
    corpus = load_nerus('nerus_lenta.conllu.gz')
    assert max_ind < 80000, "be careful with RAM usage"
    ind = 0
    documents = []
    tokens_mapping = []

    for ind in tqdm(range(max_ind), total = max_ind):
        document = next(corpus)
        for sentence in document.sents: 
            sent = re.findall(r'[а-яА-Яa-zA-ZёЁ]+[-][а-яА-Яa-zA-ZёЁ]+|[а-яА-Яa-zA-ZёЁ]+|[^\w\s]|\d+', sentence.text.lower(), re.UNICODE)
            documents.append(' '.join([word for word in sent]))
            
            tokens_mapping.append([[token.pos, len(token.text)] for token in sentence.tokens])

    return documents, tokens_mapping

def apply_mapping(tokenized_text: list, tokens_mapping: list) -> list:
  '''
  applies mapping rules for pos tags for given tokenizer
  '''
  mapped_tags = []
  ind =  0
  for token in tokens_mapping:
    tmp_len = 0;
    while tmp_len != token[1]:
        tmp_len += len(tokenized_text[ind].replace("##", "")) ## for BPE's encoding
        ind += 1
    
    mapped_tags.extend([token[0]] *  (ind - len(mapped_tags)))
    # print(mapped_tags)
  return mapped_tags 


def build_vocab(dataset, mode = 'tokens', truncate = False, most_k = None): 
    '''
    dataset: list,
    mode: str, return a vocab of tokens or tags
    '''
    import itertools
    assert mode == 'tokens' or mode == 'tags', "no other modes implemented"
    if mode == 'tokens': vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 3, '[SEP]': 4}
    elif mode == 'tags': vocab = {'[PAD]': 0, '[UNK]': 1}
 
    for doсument in dataset: 
    
        for word in doсument.split() if mode == 'tokens' else doсument:
          if mode == 'tags': word = word[0]
          if word not in vocab: vocab[word] = vocab.__len__()
    if truncate: return dict(itertools.islice(vocab.items(), most_k))
    return vocab

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

def build_vocab_(dataset, truncate = False, most_k = None): 
    '''
    for la bse
    dataset: list,
    mode: str, return a vocab of tokens or tags
    '''
    import itertools
    vocab = {}
    for sentence in dataset:
        tokens = tokenizer(sentence, return_tensors = 'pt')['input_ids'][0]
        for ind, token in enumerate(tokenizer.decode(tokens).split()):
            vocab[token] = tokens[ind].item() 

    if truncate: return dict(itertools.islice(vocab.items(), most_k))
    return vocab


def test_train_split(dataset, split = 0.8):    
    """
    Looks similar to sklearn one.
    Params:
    --------------
    --dataset -- torch.utils.data.Dataset to split
    --split -- float, split (to train size)
    Returns:
    --------------
    --train_set, valid_set -- splitted Datasets of size:
    (split * len(dataset), len(dataset) - len(train_set))
    """
    train_size = int(split * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    return train_dataset, test_dataset

In [19]:


class Tokenizer(object):
    def __init__(self, tokens_vocab: dict, tags_vocab: dict, tokens_mapping: list, tokenizer):
        self.word_to_ind = tokens_vocab
        self.tag_to_ind = tags_vocab
        self.tokenizer = tokenizer
        self.tokens_mapping = tokens_mapping

    def form_sentence(self, max_length: int, tokens: list, vocab: dict):
        tokens = [['[CLS]'] + s + ['[SEP]'] + ['[PAD]'] * (max_length - len(s)) \
          if len(s) < max_length \
          else ['[CLS]'] + s[:max_length] + ['[SEP]'] \
          for s in tokens]
    

        ids = [[vocab.get(w, vocab['[UNK]']) for w in sent] for sent in tokens]
        return ids
    def __call__(self, sentences: list, max_length: int, pad_to_max_length = False, normalize = False, custom_tok_type = "nltk"):
        assert custom_tok_type == 'LaBSE' or custom_tok_type == 'nltk', "Just others are not implemented"
        self.tokens, self.mapped_tags = [], []

        if custom_tok_type == 'nltk':
            for sentence, mapp in zip(sentences, self.tokens_mapping):
                token = self.tokenizer(sentence) 
                mapped_tag = []
                try: mapped_tag = apply_mapping(token, mapp)
                except IndexError: continue
                self.tokens.append(token if not normalize else [morph.parse(t)[0].normal_form for t in token]); self.mapped_tags.append(mapped_tag)
            if not pad_to_max_length: max_length = min(max_length, max(map(len, self.tokens)))

            return torch.tensor(self.form_sentence(max_length, self.tokens, self.word_to_ind)), \
                   torch.tensor(self.form_sentence(max_length, self.mapped_tags, self.tag_to_ind))
        elif custom_tok_type == 'LaBSE':
            tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
            for sentence, mapp  in zip(sentences, self.tokens_mapping):
                token =  list(tokenizer(sentence,
                          truncation = True,
                          padding = True,
                          max_length = max_length,
                          return_tensors = 'pt')['input_ids'][0].numpy())
                mapped_tag = []
                try: mapped_tag = apply_mapping(tokenizer.decode(torch.tensor(token)), mapp)
                except IndexError: continue
                self.mapped_tags.append(mapped_tag)
                self.tokens.append(token)
              
            return torch.tensor([token[:max_length] if len(token) > max_length +2 else token + [0] * (max_length +2 - len(token)) for token in self.tokens]), \
                     torch.tensor(self.form_sentence(max_length, self.mapped_tags, self.tag_to_ind))


In [20]:
max_ind = 1500
documents, tokens_mapping = prepare_dataset(max_ind)

train_data, test_data, train_tags, test_tags = train_test_split(documents, tokens_mapping, test_size = 0.1, random_state = 0)
tokens_vocab = build_vocab(train_data, 'tokens')
tags_vocab = build_vocab(train_tags, 'tags')
inv_tags_vocab = {v: k for k, v in tags_vocab.items()}

HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))




In [21]:
class callback():
    def __init__(self, writer, dataset, loss_function, delimeter = 100, batch_size = 64, model_name = "LSTM", custom_tok_type = "LaBSE"):
        """
        This class provides a callback for Traniner class:
        Parameters:
        -- writer: TensorBoard writer to display stats to:
        -- dataset: torch.utils.data.Dataset for the task
        -- loss_fn: torch.nn.loss_fn for the task or inherited class
        -- delimetet: int, if (#step of optimizer % delim == 0) -> plot results
        -- batch_size: int, size of batch of data
        -- model_name: str, name of file to save

        """
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.model_name = model_name
        self.custom_tok_type = custom_tok_type
        self.dataset = dataset
    def save_model(self, model, name: str):
        """
        save model method; Parameters: 
        -- model: class inherited of nn.Module
        -- name: str, name of file to save
        Output: torch checkpoint
        """
        checkpoint = {'state_dict': model.state_dict(),
                      'loss_dict': self.loss_function.state_dict()}
        with open(name, 'wb') as f: torch.save(checkpoint, f)

    def forward(self, model, loss):

        """
        The main method of class: provides a working function;
        Parameters:
        -- model: class inherited of nn.Module
        -- loss_fn: torch.nn.loss_fn for the task or inherited class
        """
        # raise NotImplementedError("IMPLEMENT IT!")
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        
        if self.step % self.delimeter == 0:
            # _ = self.save_model(model, str(self.model_name) + "_iter_" + str(self.step))

            batch_generator = DataLoader(dataset = self.dataset, batch_size = self.batch_size)
            
            test_loss = 0
            _ = model.eval()
                      
            pred = []
            real = []
            for it, (x_batch, y_batch) in enumerate(batch_generator):
                x_batch = x_batch.to(model.device)
                y_batch = y_batch.to(model.device)

                with torch.no_grad():
                  output = model(x_batch)

                  test_loss += self.loss_function(output.reshape(-1, output.shape[-1]), y_batch.reshape(-1)).cpu().item() * len(x_batch)

                  pred.extend(torch.argmax(output, dim = -1).cpu().numpy().tolist())
                  real.extend(y_batch.cpu().numpy().tolist())

            test_loss /= len(self.dataset)
            
            self.writer.add_scalar('LOSS/test', test_loss, self.step)

            self.writer.add_text('REPORT/test', str(np.mean([accuracy_score(r, p) for r, p in zip(real, pred)])), self.step)

            
            if self.custom_tok_type == 'LaBSE': decoded = "ACTUAL_TAGS: " + " ".join([inv_tags_vocab[tag] for tag in real[-1]]) + '\n' + "PREDICTED_TAGS: +" + \
                                                                                                      " ".join(tokenizer.decode(torch.tensor(pred[-1])))
            else: decoded = "ACTUAL_TAGS: " + " ".join([inv_tags_vocab[tag] for tag in real[-1]]) + '\n' + "PREDICTED_TAGS: +" + " ".join([inv_tags_vocab[tag] for tag in pred[-1]])
            self.writer.add_text('LABELS_EVOLUTION/test', decoded, self.step)
                 

    def __call__(self, model, loss):
        return self.forward(model, loss)

class Trainer():
    def __init__(self, model,  loss_function, optimizer, callback = None, lr = 1e-2):

      """
      This class provides main train and validation functions
      Parameters:
      -- model: class inherited of nn.Module
      -- loss_fn: torch.nn.loss_fn for the task or inherited class
      -- optimizer: torch.optim.optimizer for the task
      -- callback: an initialized object of callback class 
      """
      self.model =  model
      self.loss_function = loss_function
      self.optimizer = optimizer(self.model.parameters(), lr = lr)
      self.callback = callback

    def train_on_batch(self, x_batch, y_batch):
      """
      This function is need to be implemented for 
      any particular model;
      Parameters:
      -- x_batch, y_batch: batches of data and target
      Output: loss: int, value of loss_fn on the batch of data
      """
      # raise NotImplementedError("IMPLEMENT IT!")
      _ = self.model.train()
      
      self.optimizer.zero_grad()
      output = self.model(x_batch.to(self.model.device))

      loss = self.loss_function(output.reshape(-1, output.shape[-1]), y_batch.to(self.model.device).reshape(-1))
      loss.backward()

      self.optimizer.step()
      return loss.cpu().item()

    def train_epoch(self, train_generator):
      """
      method of train for batches of data on 1 epoch; 
      uses train_on_batch method
      Parameters:
      -- train_generator: torch on own batch generator
      Output:
      mean loss per the epoch: int
      """
      epoch_loss = 0
      total = 0
      for it, (batch_x, batch_y) in enumerate(train_generator):
          batch_loss = self.train_on_batch(batch_x, batch_y)
  
          if self.callback is not None:
              with torch.no_grad():
                  self.callback(self.model, batch_loss)
              
          epoch_loss += batch_loss * len(batch_x)
          total += len(batch_x)
      
      return epoch_loss/total

    def train(self, dataset, count_of_epoch, batch_size):
      """
      Trainer of the model; 
      uses train_epoch method
      Parameters:
      -- dataset: torch.utils.data.Dataset for the task or inherited class
      -- count_of_epoch: int, how many epochs to train
      -- batch_size: int, size of batch of data
      Output: self
      """
      _ = self.model.train()
      iterations = tqdm(range(count_of_epoch), desc = 'epoch')
      iterations.set_postfix({'train epoch loss': np.nan})
      for it in iterations:
          batch_generator = tqdm(
              DataLoader(dataset = dataset, batch_size = batch_size, shuffle = True), 
              leave = False, total = len(dataset) // batch_size + (len(dataset) % batch_size > 0))
          
          epoch_loss = self.train_epoch(train_generator = batch_generator)
          
          iterations.set_postfix({'train epoch loss': epoch_loss})
      return self 

In [22]:
class LSTM(nn.Module):
    @property
    def device(self): return next(self.parameters()).device
    def __init__(self, config: dict,
                 input_dim: int,
                 hid_dim: int,
                 embed_dim: int):
      super(LSTM, self).__init__()

      self.input_dim = input_dim
      self.hid_dim = hid_dim
      self.embed_dim = embed_dim
      self.config = config

      self.emb = nn.Embedding(self.input_dim, self.embed_dim)
      self.lstm = nn.LSTM(input_size = self.embed_dim, hidden_size = self.hid_dim,
                          num_layers = config['n_layers'], batch_first = True,
                          dropout = config['dropout_rate'], bidirectional = config['bidir'])
      
      self.bn1 = nn.BatchNorm1d( (int(config['bidir']) + 1) * self.hid_dim ) 
      self.bn2= nn.BatchNorm1d( len(tags_vocab)  ) 
      self.relu1 = nn.ReLU()
      self.fc = nn.Linear((int(config['bidir']) + 1) * self.hid_dim,
                          len(tags_vocab))

    def forward(self, x):
        # assert isinstance(x, torch.LongTensor)
        x, (h, c) = self.lstm(self.emb(x))
        x = self.relu1(x)
        if self.config['use_batchnorm']: x = self.bn1(x.transpose(1, 2)).transpose(1, 2)
        x = self.fc(x)
        if self.config['use_batchnorm']: x = self.bn2(x.transpose(1, 2)).transpose(1, 2)
        return x

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
def set_config(): return  {'n_layers': 4, \
                           'bidir': True, \
                           'use_batchnorm': False, \
                           'use_dropout': True,
                           'dropout_rate': .2}

dataset = torch.utils.data.TensorDataset(*Tokenizer(tokens_vocab, tags_vocab, tokens_mapping, nltk.WordPunctTokenizer().tokenize)(documents, max_length = 30, normalize = True))
dataset_train, dataset_test = test_train_split(dataset)
print("len of tokenized documents =  {}, percent of errors during tokenization and mapping = {}%".format(len(dataset), 100 * (1. - len(dataset)/len(documents))))

loss_function = nn.CrossEntropyLoss(ignore_index = 0)
optimizer = torch.optim.Adam


#####an experiment with hidden_dim size
print("an experiment with hidden_dim size")
hidden_dims = [32, 64, 128]
for hidden_dim in hidden_dims:
    writer = SummaryWriter(log_dir = '/content/drive/MyDrive/Tasks/Task2/hidden_dim_size/{}'.format(hidden_dim))
    config = set_config()
    tr = Trainer(LSTM(config, len(tokens_vocab), hidden_dim, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  ""))
    _ = tr.train(dataset_train, 5, 128)

len of tokenized documents =  17582, percent of errors during tokenization and mapping = 0.243971631205675%


In [None]:

#####an experiment with n_layers size
print("an experiment with n_layers size")
layers = [2, 4]
for l in layers:
    writer = SummaryWriter(log_dir = '/content/drive/MyDrive/Tasks/Task2/n_layers/{}'.format(l))
    config = set_config()
    config['n_layers'] = l
    tr = Trainer(LSTM(config, len(tokens_vocab), 32, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  ""))
    _ = tr.train(dataset_train, 5, 128)

In [None]:
#####an experiment with dropout rate
print("an experiment with dropout rate")
rates = [0., .2, .5]
for dr in rates:еtent/drive/MyDrive/Tasks/Task2/dropout_rate/{}'.format(dr))  
    config = set_config()
    config['dropout_rate'] = dr
    tr = Trainer(LSTM(config, len(tokens_vocab), 32, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  ""))
    _ = tr.train(dataset_train, 5, 128)


In [25]:
#####an experiment with batch norm
print("an experiment with batch norm")
ubn = [False, True]
for u in ubn:
    writer = SummaryWriter(log_dir = '/content/drive/MyDrive/Tasks/Task2/batch_norm/{}'.format(u))
    config = set_config()
    config['use_batchnorm'] = u
    tr = Trainer(LSTM(config, len(tokens_vocab), 32, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  ""))
    _ = tr.train(dataset_train, 5, 128)

an experiment with batch norm


HBox(children=(FloatProgress(value=0.0, description='epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))




In [None]:
#####an experiment with vocab_size
print("an experiment with vocab_size")
ks = [100, 500, 2000, 5000]
for k in ks:
    writer = SummaryWriter(log_dir = '/content/drive/MyDrive/Tasks/Task2/vocab_size/{}'.format(k))
    tokens_vocab = build_vocab(train_data, 'tokens', truncate = True, most_k = k)
    config = set_config()
    dataset = torch.utils.data.TensorDataset(*Tokenizer(tokens_vocab, tags_vocab, tokens_mapping, nltk.WordPunctTokenizer().tokenize)(train_data, max_length = 30, normalize = True))
    dataset_train, dataset_test = test_train_split(dataset)
    print("len of tokenized documents =  {}, percent of errors during tokenization and mapping = {}%".format(len(dataset), 100 * (1. - len(dataset)/len(documents))))
    tr = Trainer(LSTM(config, len(tokens_vocab), 32, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  ""))
    _ = tr.train(dataset_train, 5, 128)


In [None]:
#####an experiment with tokenizer
print("an experiment with tokenizer")
ks = ["LaBSE", "nltk"]
for k in ks:
    writer = SummaryWriter(log_dir = '/content/drive/MyDrive/Tasks/Task2/tokenizer/{}'.format(k))
    tokens_vocab = build_vocab(train_data, 'tokens')
    voc_len = len(tokens_vocab)
    config = set_config()
    if k == "LaBSE":  tokens_vocab = build_vocab_(train_data)
      
    dataset = torch.utils.data.TensorDataset(*Tokenizer(tokens_vocab, tags_vocab, tokens_mapping, None if k == "LaBSE" else nltk.WordPunctTokenizer().tokenize)(documents, max_length = 30, normalize = True, 
                                                                                                                                                               custom_tok_type = k))
    dataset_train, dataset_test = test_train_split(dataset)
    if k == "LaBSE":  voc_len = np.max([torch.max(d[0]).item() for d in dataset])
    print("len of tokenized documents =  {}, percent of errors during tokenization and mapping = {}%".format(len(dataset), 100 * (1. - len(dataset)/len(documents))))
    tr = Trainer(LSTM(config, voc_len + 1, 32, 256).to(device), loss_function, optimizer,
               callback = callback(writer, dataset_test, loss_function, custom_tok_type =  k))
    _ = tr.train(dataset_train, 5, 128)

an experiment with tokenizer
len of tokenized documents =  14165, percent of errors during tokenization and mapping = 19.631205673758867%


HBox(children=(FloatProgress(value=0.0, description='epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))


len of tokenized documents =  17582, percent of errors during tokenization and mapping = 0.243971631205675%


HBox(children=(FloatProgress(value=0.0, description='epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))




In [None]:
%load_ext tensorboard
