In [1]:
import sys
sys.path.append(r"/kaggle/input/custom-torch-utils/")

In [2]:
!pip install pyconll

Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.1.0
[0m

In [3]:
import random
import re
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import numpy
import pyconll
from gensim.corpora import Dictionary
from train_eval_tools import Trainer
from sklearn.metrics import classification_report

In [4]:
def init_random_seed(value=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and configures cuDNN for deterministic runs.

    Args:
        value: Integer seed shared by all generators. Defaults to 0.
    """
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # Seed all GPUs, not only the currently selected one.
    torch.cuda.manual_seed_all(value)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different convolution algorithm on each run.
    torch.backends.cudnn.benchmark = False
init_random_seed()

In [5]:
# Input locations (absolute Kaggle paths) and dataset file names.
DATASET_PATH = '/kaggle/input/old-ru-syntagrus-data/'
# NOTE(review): MODELS_PATH is not referenced in the visible part of the notebook.
MODELS_PATH = '/kaggle/input/torch-pos-tagger-weights/'
TRAIN_DATASET_NAME = 'ru_syntagrus-ud-train.conllu'
EVAL_DATASET_NAME = 'ru_syntagrus-ud-dev.conllu'

# NOTE(review): TRAIN_SIZE_RATIO is not referenced in the visible part of the notebook.
TRAIN_SIZE_RATIO = 0.7
# Batch size used for the DataLoaders and passed to the trainer.
BATCH_SIZE = 256
# Special token wrapped around every tokenized document by tokenize_doc.
PAD_TOKEN = '<PAD>'
# Passed to the trainer as max_epochs.
MAX_EPOCH = 10


# Data preprocessing

In [6]:
# Load the train and dev CoNLL-U files with pyconll; both objects are iterated
# sentence by sentence (and token by token) in the cells below.
main_df = pyconll.load_from_file(DATASET_PATH + TRAIN_DATASET_NAME)
eval_df = pyconll.load_from_file(DATASET_PATH + EVAL_DATASET_NAME)

In [7]:
# Peek at the first three training sentences as (word form, UPOS tag) pairs.
for sent in main_df[:3]:
    form_tag_pairs = [(token.form, token.upos) for token in sent]
    print(form_tag_pairs)
    print()

[('Анкета', 'NOUN'), ('.', 'PUNCT')]

[('Начальник', 'NOUN'), ('областного', 'ADJ'), ('управления', 'NOUN'), ('связи', 'NOUN'), ('Семен', 'PROPN'), ('Еремеевич', 'PROPN'), ('был', 'AUX'), ('человек', 'NOUN'), ('простой', 'ADJ'), (',', 'PUNCT'), ('приходил', 'VERB'), ('на', 'ADP'), ('работу', 'NOUN'), ('всегда', 'ADV'), ('вовремя', 'ADV'), (',', 'PUNCT'), ('здоровался', 'VERB'), ('с', 'ADP'), ('секретаршей', 'NOUN'), ('за', 'ADP'), ('руку', 'NOUN'), ('и', 'CCONJ'), ('иногда', 'ADV'), ('даже', 'PART'), ('писал', 'VERB'), ('в', 'ADP'), ('стенгазету', 'NOUN'), ('заметки', 'NOUN'), ('под', 'ADP'), ('псевдонимом', 'NOUN'), ('"', 'PUNCT'), ('Муха', 'NOUN'), ('"', 'PUNCT'), ('.', 'PUNCT')]

[('В', 'ADP'), ('приемной', 'NOUN'), ('его', 'PRON'), ('с', 'ADP'), ('утра', 'NOUN'), ('ожидали', 'VERB'), ('посетители', 'NOUN'), (',', 'PUNCT'), ('-', 'PUNCT'), ('кое-кто', 'PRON'), ('с', 'ADP'), ('важными', 'ADJ'), ('делами', 'NOUN'), (',', 'PUNCT'), ('а', 'CCONJ'), ('кое-кто', 'PRON'), ('и', 'PART'), ('

In [8]:
# Build one space-separated string of word forms per sentence.
train_text_data = [' '.join(token.form for token in sent) for sent in main_df]
eval_text_data = [' '.join(token.form for token in sent) for sent in eval_df]

In [9]:
train_text_data[:3]

['Анкета .',
 'Начальник областного управления связи Семен Еремеевич был человек простой , приходил на работу всегда вовремя , здоровался с секретаршей за руку и иногда даже писал в стенгазету заметки под псевдонимом " Муха " .',
 'В приемной его с утра ожидали посетители , - кое-кто с важными делами , а кое-кто и с такими , которые легко можно было решить в нижестоящих инстанциях , не затрудняя Семена Еремеевича .']

In [10]:
# Per-sentence lists of UPOS tags, aligned token by token with the sentences.
# Tokens without a tag keep whatever pyconll stores in `upos` for them; such
# values are mapped to index 0 when the labels are encoded later.
train_labels = [[token.upos for token in sent] for sent in main_df]
eval_labels = [[token.upos for token in sent] for sent in eval_df]

In [11]:
# Index 0 is reserved for '<NOTAG>'; the tags seen in the training data follow
# in sorted order, so the mapping is stable between runs.
UNIQUE_TAGS = ['<NOTAG>'] + sorted({token.upos for sent in main_df for token in sent if token.upos is not None})
label2id = {tag: tag_id for tag_id, tag in enumerate(UNIQUE_TAGS)}

In [12]:
label2id

{'<NOTAG>': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'AUX': 4,
 'CCONJ': 5,
 'DET': 6,
 'INTJ': 7,
 'NOUN': 8,
 'NUM': 9,
 'PART': 10,
 'PRON': 11,
 'PROPN': 12,
 'PUNCT': 13,
 'SCONJ': 14,
 'SYM': 15,
 'VERB': 16,
 'X': 17}

## Tokenization

In [13]:
def tokenize_doc(doc, skip_special_tokens=False):
    """Split ``doc`` into a list of its items (characters for a string).

    Args:
        doc: Iterable to split; a string yields one item per character.
        skip_special_tokens: When false (the default) the result is wrapped in
            a leading and a trailing ``PAD_TOKEN``.

    Returns:
        A new list with the items of ``doc``, optionally wrapped in ``PAD_TOKEN``.
    """
    chars = list(doc)
    if skip_special_tokens:
        return chars
    return [PAD_TOKEN, *chars, PAD_TOKEN]

def tokenize_corp(corp, skip_special_tokens=False):
    """Apply ``tokenize_doc`` to every document of a corpus.

    Args:
        corp: Iterable of documents.
        skip_special_tokens: Forwarded unchanged to ``tokenize_doc``.

    Returns:
        A list with one tokenized document per input document, in input order.
    """
    return [tokenize_doc(doc, skip_special_tokens) for doc in corp]


In [14]:
train_tokenized_for_vocab = tokenize_corp(train_text_data)

In [15]:
train_tokenized_for_vocab[0]

['<PAD>', 'А', 'н', 'к', 'е', 'т', 'а', ' ', '.', '<PAD>']

## Vocabulary building

In [16]:
# Special tokens get ids by their position in this list, so PAD_TOKEN maps to 0.
special_tokens = [PAD_TOKEN]
special_tokens_map = {token: token_id for token_id, token in enumerate(special_tokens)}

In [17]:
vocabulary = Dictionary(train_tokenized_for_vocab)

In [18]:
# Prune the character vocabulary while always keeping the special tokens.
# Per gensim's documentation (worth confirming for the installed version):
# no_below=5 keeps only entries found in at least 5 documents (here, sentences),
# and no_above=1.0 disables the upper document-frequency cut-off.
vocabulary.filter_extremes(no_above=1.0, no_below=5, keep_tokens=special_tokens)

In [19]:
# Force the ids from special_tokens_map onto the vocabulary, i.e. PAD_TOKEN -> 0.
# The encoding cells and the model's padding_idx=0 both rely on that id.
vocabulary.patch_with_special_tokens(special_tokens_map)

In [20]:
len(vocabulary)

150

The vocabulary is now built, so `train_tokenized_for_vocab` is no longer needed and can be deleted to free memory.

In [21]:
del train_tokenized_for_vocab

## Dataset building

In [22]:
# Tokenize per word this time: each sentence becomes a list of words and each
# word a list of characters wrapped in PAD_TOKEN (tokenize_doc default).
train_tokenized = [[tokenize_doc(token.form) for token in sent] for sent in main_df]
eval_tokenized = [[tokenize_doc(token.form) for token in sent] for sent in eval_df]

In [23]:
# Map tag strings to integer ids. Any label missing from label2id (for example
# a tag that occurs only in the dev set, or None) falls back to 0 = '<NOTAG>'.
train_labels_encoded = [[label2id.get(label, 0) for label in sent] for sent in train_labels]
eval_labels_encoded = [[label2id.get(label, 0) for label in sent] for sent in eval_labels]

In [24]:
train_tokenized[1][:5]

[['<PAD>', 'Н', 'а', 'ч', 'а', 'л', 'ь', 'н', 'и', 'к', '<PAD>'],
 ['<PAD>', 'о', 'б', 'л', 'а', 'с', 'т', 'н', 'о', 'г', 'о', '<PAD>'],
 ['<PAD>', 'у', 'п', 'р', 'а', 'в', 'л', 'е', 'н', 'и', 'я', '<PAD>'],
 ['<PAD>', 'с', 'в', 'я', 'з', 'и', '<PAD>'],
 ['<PAD>', 'С', 'е', 'м', 'е', 'н', '<PAD>']]

In [25]:
train_labels_encoded[1][:5]

[8, 1, 8, 8, 12]

In [26]:
# Convert every word's character list to vocabulary ids.
# NOTE(review): unknown_word_index=0 gives out-of-vocabulary characters the same
# id as PAD_TOKEN, so the model cannot tell an unknown character from padding.
train_encoded = [[vocabulary.doc2idx(token, unknown_word_index=0) for token in sent] for sent in train_tokenized]
eval_encoded = [[vocabulary.doc2idx(token, unknown_word_index=0) for token in sent] for sent in eval_tokenized]

In [27]:
train_encoded[0]

[[0, 3, 7, 6, 5, 8, 4, 0], [0, 1, 0]]

In [28]:
# Fixed padding sizes used by CharlvlDataset: longest word (in characters,
# including the two PAD_TOKEN wrappers) and longest sentence (in words).
# NOTE(review): both are measured on the training data only; longer dev-set
# words or sentences yield negative pad amounts in CharlvlDataset — confirm
# that the resulting behaviour is intended.
MAX_TOKEN_LEN = max(len(token) for sent in train_encoded for token in sent)
MAX_SENTENCE_LEN = max(len(sent) for sent in train_encoded)

In [29]:
class CharlvlDataset(Dataset):
    """Dataset of character-encoded sentences padded to one fixed shape.

    Each item is a ``(features, targets)`` pair of ``LongTensor``s:
    ``features`` has shape ``(max_sentence_len, max_token_len)`` and holds the
    character ids of every word, ``targets`` has shape ``(max_sentence_len,)``
    and holds one label id per word. Missing positions are filled with 0.

    Args:
        features: Sequence of sentences; a sentence is a list of words and a
            word is a list of integer character ids.
        targets: Sequence of per-sentence lists of integer label ids.
        max_token_len: Width to pad every word to. When ``None`` (default) the
            module-level ``MAX_TOKEN_LEN`` is read at item-access time.
        max_sentence_len: Number of words to pad every sentence to. When
            ``None`` (default) the module-level ``MAX_SENTENCE_LEN`` is read at
            item-access time.
    """

    def __init__(self, features, targets, max_token_len=None, max_sentence_len=None) -> None:
        self.features = features
        self.targets = targets
        self.max_token_len = max_token_len
        self.max_sentence_len = max_sentence_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the padded ``(features, targets)`` tensors of sentence ``idx``."""
        # Resolve the limits lazily so that, without explicit arguments, the
        # notebook-level constants are looked up exactly as before.
        max_token_len = MAX_TOKEN_LEN if self.max_token_len is None else self.max_token_len
        max_sentence_len = MAX_SENTENCE_LEN if self.max_sentence_len is None else self.max_sentence_len

        # One 1-D tensor per word, right-padded to the longest word of the sentence.
        token_tensors = [torch.LongTensor(token) for token in self.features[idx]]
        features = torch.nn.utils.rnn.pad_sequence(token_tensors, batch_first=True)
        # Then pad both axes up to the fixed (sentence, word) size. The pad
        # amounts are passed to F.pad unchanged, also when they are negative.
        features = torch.nn.functional.pad(
            features,
            (0, max_token_len - features.shape[1], 0, max_sentence_len - features.shape[0]),
            value=0,
        )

        targets = torch.LongTensor(self.targets[idx])
        targets = torch.nn.functional.pad(targets, (0, max_sentence_len - targets.shape[0]), value=0)

        return features, targets

In [30]:
train_dataset = CharlvlDataset(train_encoded, train_labels_encoded)
eval_dataset = CharlvlDataset(eval_encoded, eval_labels_encoded)

In [31]:
def custom_collate(data):
    """Stack per-sentence ``(features, labels)`` pairs into batch tensors.

    Args:
        data: Non-empty sequence of ``(features, labels)`` pairs. All feature
            tensors must share one shape, and so must all label tensors.
            ``labels`` may be ``None`` for unlabeled samples; only the first
            sample is inspected to decide this.

    Returns:
        A tuple ``(features, labels)`` where ``features`` is the stacked
        feature tensor (new leading batch dimension) and ``labels`` is the
        stacked label tensor, or ``None`` when the first sample has no labels.

    Raises:
        ValueError: If ``data`` is empty (nothing to unpack).
    """
    features, labels = zip(*data)
    features = torch.stack(features)
    labels = torch.stack(labels) if labels[0] is not None else None
    return (features, labels)

In [32]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate)
# Evaluation gains nothing from shuffling; a fixed order keeps predictions
# aligned with the dataset and makes repeated evaluation runs comparable.
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

In [33]:
next(iter(train_dataloader))[1]

tensor([[ 1,  8, 13,  ...,  0,  0,  0],
        [13, 10, 13,  ...,  0,  0,  0],
        [ 6,  8,  2,  ...,  0,  0,  0],
        ...,
        [ 8, 12, 16,  ...,  0,  0,  0],
        [ 8, 12, 12,  ...,  0,  0,  0],
        [ 5, 11, 10,  ...,  0,  0,  0]])

# Model building

In [34]:
class CustomConv1d(torch.nn.Module):
    """Hand-written stride-1 1D convolution computed as a batched matrix product.

    Learnable parameters:
        weight: shape ``(in_channels * kernel_size, out_channels)``.
        bias: shape ``(out_channels,)``.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Width of the convolution window.
        padding: Number of zeros added to each end of the sequence axis.
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding

        # Standard-normal weights scaled down by the fan-in; the bias starts at zero.
        fan_in = in_channels * kernel_size
        self.weight = torch.nn.Parameter(torch.randn(fan_in, out_channels) / fan_in, requires_grad=True)
        self.bias = torch.nn.Parameter(torch.zeros(out_channels), requires_grad=True)

    def forward(self, x):
        """Convolve ``x`` of shape ``(batch, in_channels, seq_len)``.

        Returns:
            Tensor of shape
            ``(batch, out_channels, seq_len + 2 * padding - kernel_size + 1)``.
        """
        n_batch, n_channels, length = x.shape
        if self.padding > 0:
            zeros = x.new_zeros(n_batch, n_channels, self.padding)
            x = torch.cat((zeros, x, zeros), dim=-1)
            length = x.shape[-1]

        # One shifted copy of the sequence per kernel position; stacking them
        # along the channel axis lines up every window's inputs in one column.
        n_windows = length - self.kernel_size + 1
        shifted = [x[:, :, start:start + n_windows] for start in range(self.kernel_size)]

        # (batch, in_channels * kernel_size, windows) -> (batch, windows, in_channels * kernel_size)
        windows = torch.cat(shifted, dim=1).permute(0, 2, 1)

        batched_weight = self.weight.unsqueeze(0).expand(n_batch, -1, -1)
        projected = torch.bmm(windows, batched_weight) + self.bias.unsqueeze(0).unsqueeze(0)
        # Back to channels-first: (batch, out_channels, windows).
        return projected.permute(0, 2, 1)

In [35]:
class Conv1dCascade(torch.nn.Module):
    """Stack of residual blocks, each ``conv -> dropout -> LeakyReLU`` plus a skip.

    Args:
        emb_size: Channel count, used as both input and output size of every conv.
        n_layers: Number of residual blocks.
        kernel_size: Kernel width of every conv; each conv is padded with
            ``kernel_size // 2``.
        conv_layer: Factory called as
            ``conv_layer(emb_size, emb_size, kernel_size, padding=...)``.
        dropout: Dropout probability applied after every conv.
    """

    def __init__(self, emb_size, n_layers, kernel_size, conv_layer=CustomConv1d, dropout=0.0) -> None:
        super().__init__()
        blocks = [
            torch.nn.Sequential(
                conv_layer(emb_size, emb_size, kernel_size, padding=kernel_size // 2),
                torch.nn.Dropout(dropout),
                torch.nn.LeakyReLU(),
            )
            for _ in range(n_layers)
        ]
        self.layers = torch.nn.ModuleList(blocks)

    def forward(self, x):
        """Run ``x`` through every block, adding each block's input to its output."""
        out = x
        for block in self.layers:
            out = block(out) + out
        return out

In [36]:
class POSTaggerModel(torch.nn.Module):
    """Two-level tagger: characters -> word vectors -> per-word label scores.

    Args:
        charlvl_backbone_model: Module applied to every word's character
            embeddings, shaped ``(n_words, emb_size, n_chars)``.
        sentencelvl_backbone_model: Module applied to every sentence's word
            vectors, shaped ``(n_sentences, emb_size, n_tokens)``.
        emb_size: Size of the character embeddings and the number of input
            channels of the output projection.
        vocabulary_size: Number of rows of the character embedding table.
        n_labels: Number of output classes.
        device: Stored as ``self.device`` for callers; the module itself is not
            moved by the constructor.
    """

    def __init__(self, charlvl_backbone_model, sentencelvl_backbone_model, emb_size, vocabulary_size, n_labels, device) -> None:
        super().__init__()
        self.device = device
        self.n_labels = n_labels
        # Id 0 is the padding id, so its embedding stays zero.
        self.char_embs = torch.nn.Embedding(vocabulary_size, emb_size, padding_idx=0)
        self.charlvl_backbone = charlvl_backbone_model
        self.sentencelvl_backbone = sentencelvl_backbone_model
        self.pooling = torch.nn.AdaptiveMaxPool1d(1)
        # Kernel size 1: an independent linear projection for every word position.
        self.out_transform = torch.nn.Conv1d(emb_size, n_labels, 1)

    def forward(self, input_ids):
        """Score every word of every sentence.

        Args:
            input_ids: Integer tensor of shape ``(n_sentences, n_tokens, n_chars)``.

        Returns:
            Tensor of label scores with ``n_labels`` channels on dim 1; shape
            ``(n_sentences, n_labels, n_tokens)`` when the sentence-level
            backbone preserves the sequence length.
        """
        n_sentences, n_tokens, n_chars = input_ids.shape

        # Treat every word as an independent sample. reshape (rather than view)
        # also accepts non-contiguous inputs such as permuted or sliced batches.
        input_ids = input_ids.reshape(n_sentences * n_tokens, n_chars)
        input_embs = self.char_embs(input_ids)
        input_embs = input_embs.permute(0, 2, 1)  # -> (words, emb_size, n_chars)

        model_out = self.charlvl_backbone(input_embs)
        # Max over the character axis gives one vector per word.
        model_out = self.pooling(model_out).squeeze(-1)

        # Regroup the words into sentences, channels first.
        model_out = model_out.reshape(n_sentences, n_tokens, -1)
        model_out = model_out.permute(0, 2, 1)

        model_out = self.sentencelvl_backbone(model_out)
        model_out = self.out_transform(model_out)

        return model_out

In [37]:
# Model hyperparameters.
VOCAB_SIZE = len(vocabulary)  # rows of the character embedding table
N_LABELS = len(label2id)      # output classes, including '<NOTAG>' at index 0
EMB_SIZE = 64                 # embedding size, also the channel count of every conv
N_CONV_LAYERS = 3             # residual blocks per Conv1dCascade
KERNEL_SIZE = 3               # conv window; odd, so padding=KERNEL_SIZE//2 keeps the length
DROPOUT = 0.3                 # dropout probability inside every residual block

In [38]:
charlvl_conv_stack = Conv1dCascade(EMB_SIZE, N_CONV_LAYERS, KERNEL_SIZE, dropout=DROPOUT)
sentencelvl_conv_stack =  Conv1dCascade(EMB_SIZE, N_CONV_LAYERS, KERNEL_SIZE, dropout=DROPOUT)

In [39]:
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of failing at .to('cuda') on a CPU-only kernel.
# NOTE(review): Trainer comes from train_eval_tools and may do its own device
# placement — confirm it respects the model's device.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = POSTaggerModel(charlvl_conv_stack, sentencelvl_conv_stack, EMB_SIZE, VOCAB_SIZE, N_LABELS, device=DEVICE).to(DEVICE)

In [40]:
from functools import reduce  # no longer used here; kept in case a later cell relies on it

# numel() gives the element count of each parameter directly and also works for
# 0-dimensional parameters, where reducing over an empty shape would raise.
print('N parameters:', sum(p.numel() for p in model.parameters()))

N parameters: 84882


In [41]:
# NOTE(review): no ignore_index is set, and CharlvlDataset pads targets with 0
# ('<NOTAG>'), so padded positions presumably contribute to the loss — confirm
# against how Trainer calls the loss function.
loss_foo = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)

In [42]:
trainer = Trainer(verbose=True)

In [43]:
# Wire the model, loss, optimizer and LR scheduler into the external Trainer.
trainer.init_model(model)
trainer.loss_function = loss_foo
trainer.optimizer = optimizer
# Halve the learning rate after more than `patience` epochs without improvement
# of the monitored value.
# NOTE(review): newer PyTorch releases deprecate the scheduler's `verbose`
# argument — confirm against the installed version.
trainer.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5, verbose=True)
trainer.prerun_check()

In [44]:
trainer.train_eval_procedure(train_dataset, eval_dataset, max_epochs=MAX_EPOCH, batch_size=BATCH_SIZE, custom_collate=custom_collate)

Epoch Number: 1 	| Training time: 2:19 	| Evaluation time: 0:7 	| epoch train loss: 0.14622315496245292 	| epoch eval loss: 0.030617373422361337

Epoch Number: 2 	| Training time: 2:15 	| Evaluation time: 0:7 	| epoch train loss: 0.024476588211212482 	| epoch eval loss: 0.022297524488889255

Epoch Number: 3 	| Training time: 2:12 	| Evaluation time: 0:7 	| epoch train loss: 0.019584490113077363 	| epoch eval loss: 0.01921179424971342

Epoch Number: 4 	| Training time: 2:13 	| Evaluation time: 0:7 	| epoch train loss: 0.01733679876843204 	| epoch eval loss: 0.01787674563148847

Epoch Number: 5 	| Training time: 2:12 	| Evaluation time: 0:8 	| epoch train loss: 0.015987233444073125 	| epoch eval loss: 0.01670093433215068

Epoch Number: 6 	| Training time: 2:12 	| Evaluation time: 0:7 	| epoch train loss: 0.014952814698960457 	| epoch eval loss: 0.01618431232726345

Epoch Number: 7 	| Training time: 2:12 	| Evaluation time: 0:7 	| epoch train loss: 0.01423326950929864 	| epoch eval loss: 

In [45]:
torch.save(trainer.model.state_dict(), 'pos_tagger_conv_model.pth')

In [46]:
# Collect flattened predicted and reference label ids over the whole eval set.
# Padded positions (label 0) are included, exactly as the dataset yields them.
predictions = []
references = []
model.eval()
with torch.no_grad():
    for input_ids, labels in eval_dataloader:
        input_ids = input_ids.to(model.device)
        labels = labels.to(model.device)
        model_out = model(input_ids)
        # The class scores live on dim 1, so argmax(1) picks one label per position.
        predictions.extend(model_out.argmax(1).flatten().tolist())
        references.extend(labels.flatten().tolist())

In [47]:
# Pass the label ids explicitly so every row is paired with the right tag name
# even when some class is absent from both references and predictions.
print(classification_report(references, predictions, labels=list(label2id.values()), target_names=list(label2id.keys())))

              precision    recall  f1-score   support

     <NOTAG>       1.00      1.00      1.00   1231232
         ADJ       0.92      0.93      0.93     11222
         ADP       1.00      0.99      1.00     10585
         ADV       0.92      0.91      0.92      6165
         AUX       0.86      0.95      0.90      1108
       CCONJ       0.94      0.99      0.96      4410
         DET       0.89      0.93      0.91      3085
        INTJ       1.00      0.27      0.43        11
        NOUN       0.98      0.96      0.97     27974
         NUM       0.95      0.95      0.95      1829
        PART       0.97      0.87      0.92      3875
        PRON       0.96      0.90      0.93      5598
       PROPN       0.94      0.95      0.94      4438
       PUNCT       1.00      1.00      1.00     22694
       SCONJ       0.85      0.96      0.90      2258
         SYM       0.96      1.00      0.98        53
        VERB       0.93      0.97      0.95     13078
           X       0.97    