In [None]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

In [None]:
!pip install transformers

In [1]:
import string
import sys
sys.path.append("/content/character-tokenizer")
from charactertokenizer import CharacterTokenizer

chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)

In [2]:
example = "Привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

1. Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).
Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

In [6]:
import numpy as np
import pandas as pd 
import os
import torch
import sys
import torch.nn as nn

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, BertConfig
from transformers import get_cosine_schedule_with_warmup
sys.path.append("/content/character-tokenizer")
from charactertokenizer import CharacterTokenizer

In [7]:
df = pd.read_csv("all_accents.tsv", sep="\t", header=None)
df = df.rename({0: "word", 1:"target"}, axis=1)
df.head()

Unnamed: 0,word,target
0,-де,-д^е
1,-ка,-к^а
2,-либо,-л^ибо
3,-нибудь,-ниб^удь
4,-с,-с


In [8]:
train_data, val_data = train_test_split(df, test_size=0.5, random_state=42)

In [21]:
def get_labels_for_text(target, get_indexes=True):
    class2id = {
        "O": 0,
        "X": 1
    }
    result = []
    last = False
    for char in target:
        if last:
            last = False
        elif char == "^":
            result.append("X")
            last = True
        else:
            result.append("O")
    if get_indexes:
        result = [class2id[value] for value in result]
    return result

In [23]:
class StressDataset(Dataset):
    def __init__(self, df, MAX_LENGTH=64):
        super(StressDataset, self).__init__()
        df = df.reset_index(drop=True).copy()
        self.df = df
        self.MAX_LENGTH = MAX_LENGTH

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        line = self.df.iloc[idx]
        encoded_dict = tokenizer.encode_plus(
                            line["word"],                     
                            add_special_tokens = True, 
                            max_length = self.MAX_LENGTH,           
                            padding='max_length',
                            pad_to_max_length = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',     
                            return_special_tokens_mask=True,
                    )
        lebels = get_labels_for_text(line["target"])
        encoded_dict['input_ids'] = encoded_dict['input_ids'].squeeze()
        encoded_dict['special_tokens_mask'] = encoded_dict['special_tokens_mask'].squeeze()
        encoded_dict['attention_mask'] = encoded_dict['attention_mask'].squeeze()
        lebels = torch.tensor([-100] + lebels + [-100] * (encoded_dict['input_ids'].shape[-1] - (1 + len(lebels)))).long()
        return (encoded_dict, lebels)
    


In [24]:

dataset_train = StressDataset(train_data)
dataset_val = StressDataset(val_data)
train_loader = DataLoader(dataset_train, batch_size=784, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=784, shuffle=False)

In [25]:
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler):
    model.train()
    lst_losses = []
    correct_elements = 0
    total_elements = 0
    
    for idx, batch in tqdm(enumerate(train_loader)):
        inputs, labels = batch[0].to(device), batch[1].to(device)
        outputs = model(
            input_ids = inputs['input_ids'],
            attention_mask =inputs['attention_mask'],
            labels = labels,
        )
        loss, scores = outputs.loss, outputs.logits
        optimizer.zero_grad()
        lst_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    avg_loss = np.array(lst_losses).mean()
    print(f"Train: epoch {epoch} | loss = {avg_loss} ")

In [46]:
def validate_one_epoch(epoch, model, val_loader, optimizer):
    with torch.no_grad():
        model.eval()
        lst_losses = []
        correct_elements = 0
        total_elements = 0

        for idx, batch in tqdm(enumerate(val_loader)):
            inputs, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(
                input_ids = inputs['input_ids'],
                attention_mask =inputs['attention_mask'],
                labels = labels,
            )
            loss, scores = outputs.loss, outputs.logits
            lst_losses.append(loss.item())

            for ind_elem in range(inputs['input_ids'].shape[0]):
                x1 = torch.masked_select(scores[ind_elem, :, :].argmax(dim=-1), inputs['special_tokens_mask'][ind_elem] == 0)
                x2 = torch.masked_select(labels[ind_elem], labels[ind_elem] != -100)
                if torch.equal(x1, x2):
                    correct_elements += 1
                total_elements += 1

        avg_loss = np.array(lst_losses).mean()

        print(f"Val: epoch {epoch} | loss = {avg_loss} | accuracy = {(correct_elements / total_elements)* 100}")

In [47]:
configuration = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

In [48]:
device = "cuda:3" 
model = BertForTokenClassification(configuration,)
model.classifier = torch.nn.Linear(model.classifier.in_features, 2)
configuration = model.config
# model= nn.DataParallel(model,device_ids = [3,4, 5,7])
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(73, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_a

In [49]:
train_loader = DataLoader(dataset_train, batch_size=392, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset_val, batch_size=392, shuffle=False, num_workers=2)

In [50]:
num_warmup_steps = 1000
num_training_steps= NUM_EPOCHS * len(train_loader)
num_cycles = 0.5
last_epoch = -1
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [51]:

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
    num_cycles=num_cycles,
    last_epoch=last_epoch
)

In [370]:
for epoch in range(1, 10):
    train_one_epoch(epoch, model, train_loader, optimizer, scheduler)
    if epoch % 2 == 1:
        validate_one_epoch(epoch, model, val_loader, optimizer)

2144it [02:08, 16.65it/s]

Train: epoch 1 | loss = 0.09263319548205542 



2144it [03:43,  9.59it/s]

Val: epoch 1 | loss = 0.07772094627439197 | accuracy = 79.84



2144it [02:07, 16.86it/s]

Train: epoch 2 | loss = 0.08020266937786963 



2144it [02:18, 15.51it/s]

Train: epoch 3 | loss = 0.07020231828253383 



2144it [03:49,  9.33it/s]

Val: epoch 3 | loss = 0.056171288512257944 | accuracy = 85.97



2144it [02:16, 15.68it/s]

Train: epoch 4 | loss = 0.06239392812912867 



2144it [02:17, 15.62it/s]

Train: epoch 5 | loss = 0.055654147693734446 



2144it [03:43,  9.60it/s]

Val: epoch 5 | loss = 0.04323482880309752 | accuracy = 89.25



2144it [02:14, 15.96it/s]

Train: epoch 6 | loss = 0.04997781533817413 



2144it [02:20, 15.28it/s]

Train: epoch 7 | loss = 0.0456450547963909 



2144it [03:47,  9.41it/s]

Val: epoch 7 | loss = 0.03651020029531931 | accuracy = 91.10000000000001



2144it [02:14, 15.96it/s]

Train: epoch 8 | loss = 0.0427387932345007 



2144it [02:07, 16.83it/s]

Train: epoch 9 | loss = 0.04155977822166047 



2144it [03:50,  9.32it/s]

Val: epoch 9 | loss = 0.03489579213696268 | accuracy = 91.53





### итоги
Луший скор __91,53__

Выбраная архитектура - BERT (BertForTokenClassification)