In [None]:
import pandas as pd
import wandb
from torch import nn
import numpy as np
from datasets import Dataset
from torch import nn
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding,
    TrainerCallback
)
from sklearn.metrics import f1_score

In [None]:
# Authenticate the Weights & Biases CLI through a shell escape; the training cells
# below report their metrics to W&B (`report_to="wandb"`).
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33monishchenko-av[0m. Use [1m`wandb login --relogin`[0m to force relogin


Загрузим датасет

In [None]:
# Both splits are tab-separated files located next to the notebook.
train, val = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('train_data.csv', 'validation_data_labeled.csv')
)

Определим токенайзер

In [None]:
# One marker token per entity type; `TransformString` inserts them into the text.
new_special_tokens = {
    'additional_special_tokens': [
        '[PERSON]',
        '[ORGANIZATION]',
        '[PROFESSION]',
        '[COUNTRY]',
        '[NATIONALITY]',
    ]
}

tokenizer = AutoTokenizer.from_pretrained(
    'ai-forever/ruBert-base',
    do_lower_case=True,
)
# Last expression of the cell: shows how many tokens were added to the vocabulary.
tokenizer.add_special_tokens(new_special_tokens)

5

Сформируем датасет

In [None]:
class TransformString():
    """Rewrites a sentence so that the entity mention is wrapped by marker tokens."""

    def __init__(self, tokenizer):
        # Only `tokenizer.sep_token` is read from the tokenizer.
        self.tokenizer = tokenizer

    def transform_str(self, elem):
        """Insert the entity-type marker before the mention and a separator after it.

        `elem` must expose `sentence`, `entity_tag`, `entity_pos_start_rel` and
        `entity_pos_end_rel`. The result is
        ``<left context> [TAG] <mention> <sep_token> <right context>``,
        with the five parts joined by single spaces.
        """
        text = elem.sentence
        begin, finish = elem.entity_pos_start_rel, elem.entity_pos_end_rel
        marker = "[" + elem.entity_tag + "]"
        pieces = (
            text[:begin],
            marker,
            text[begin:finish],
            self.tokenizer.sep_token,
            text[finish:],
        )
        return " ".join(pieces)

In [None]:
def _build_split(frame, tokenizer, ts, max_length, add_special_tokens):
    """Convert one dataframe split into a tokenized `Dataset`."""
    # Sentence with the entity marker and separator inserted around the mention.
    texts = [ts.transform_str(row) for _, row in frame.iterrows()]

    # Labels are shifted by +1 (presumably from {-1, 0, 1} to {0, 1, 2} - verify against the data).
    labels = (frame["label"] + 1).tolist()

    # Vocabulary id of each row's marker token. One tokenizer call per distinct tag
    # instead of one per row; index 1 skips the token the tokenizer prepends.
    marker_ids = {
        tag: tokenizer("[" + tag + "]")['input_ids'][1]
        for tag in frame["entity_tag"].unique()
    }
    spec_tokens = [marker_ids[tag] for tag in frame["entity_tag"]]

    dataset = Dataset.from_dict({
        "text": texts,              # transformed sentences
        "labels": labels,           # shifted class labels
        "spec_tokens": spec_tokens  # vocabulary id (not position) of the marker token
    })

    return dataset.map(
        lambda elem: tokenizer(
            elem["text"],
            add_special_tokens=add_special_tokens,
            truncation=True,
            max_length=max_length,
            padding='max_length'
        )
    )


def make_dataset(train, val, tokenizer, max_length=200, add_special_tokens=True):
    """Build tokenized train and validation datasets.

    Args:
        train, val: dataframes with the columns `sentence`, `entity_tag`,
            `entity_pos_start_rel`, `entity_pos_end_rel` and `label`.
        tokenizer: tokenizer that already knows the `[TAG]` marker tokens.
        max_length: every example is truncated/padded to this many tokens.
        add_special_tokens: whether the tokenizer adds its own leading/trailing
            special tokens. This used to be hard-coded to False, which meant no
            [CLS] token was ever prepended although `SimpleModel` classifies from
            sequence position 0; it now defaults to True.

    Returns:
        `(train_dataset, val_dataset)`, each holding `text`, `labels`,
        `spec_tokens` and the tokenizer outputs.
    """
    ts = TransformString(tokenizer)
    train_dataset = _build_split(train, tokenizer, ts, max_length, add_special_tokens)
    val_dataset = _build_split(val, tokenizer, ts, max_length, add_special_tokens)
    return train_dataset, val_dataset


In [None]:
# Build both splits, then drop the helper columns that the model does not consume.
train_dataset, val_dataset = (
    split.remove_columns(["text", "spec_tokens"])
    for split in make_dataset(train, val, tokenizer)
)

Функции для заморозки/разморозки слоев.

In [None]:
class Freeze():
    """Static helpers that toggle `requires_grad` on parts of `model.bert`.

    The backbone must expose `embeddings` and a layer stack with a `layer`
    list. The stack is looked up as `encoder` first and as `transformer`
    otherwise, so backbones that use either attribute name are supported.
    """

    @staticmethod
    def _encoder(model):
        ''' Return the layer stack of the backbone (`encoder` or `transformer`) '''
        backbone = model.bert
        encoder = getattr(backbone, "encoder", None)
        if encoder is None:
            encoder = backbone.transformer
        return encoder

    @staticmethod
    def _set_trainable(module, flag):
        ''' Set requires_grad=flag on every parameter of `module` '''
        for param in module.parameters():
            param.requires_grad = flag

    @staticmethod
    def freeze_bert(model):
        ''' Freeze the embeddings and the whole layer stack '''
        Freeze._set_trainable(model.bert.embeddings, False)
        Freeze._set_trainable(Freeze._encoder(model), False)

    @staticmethod
    def unfreeze_layers(model, cnt):
        ''' Unfreeze the last `cnt` layers (all layers if `cnt` exceeds their number) '''
        layers = Freeze._encoder(model).layer
        first = max(len(layers) - cnt, 0)
        for i in range(first, len(layers)):
            Freeze._set_trainable(layers[i], True)

    @staticmethod
    def unfreeze_bert(model):
        ''' Unfreeze the whole layer stack (embeddings are left untouched) '''
        Freeze._set_trainable(Freeze._encoder(model), True)

    @staticmethod
    def unfreeze_embed(model):
        ''' Unfreeze the embeddings together with the whole layer stack '''
        Freeze._set_trainable(model.bert.embeddings, True)
        Freeze._set_trainable(Freeze._encoder(model), True)

Зададим модель

In [None]:
class LargeModelCallbacks(TrainerCallback):
    """Gradual-unfreezing schedule applied at the end of selected epochs.

    The model, optimizer and scheduler are taken from the callback keyword
    arguments: `TrainerState` carries neither a `model` nor a `learning_rate`
    field, so reading/writing them on `state` cannot work.
    """

    # finished epoch -> (unfreezing action, new learning rate or None to keep the current one)
    SCHEDULE = {
        1: (lambda model: Freeze.unfreeze_layers(model, 1), 1e-5),
        2: (lambda model: Freeze.unfreeze_layers(model, 3), None),
        9: (lambda model: Freeze.unfreeze_layers(model, 6), None),
        16: (lambda model: Freeze.unfreeze_layers(model, 9), None),
        20: (lambda model: Freeze.unfreeze_bert(model), None),
        30: (lambda model: Freeze.unfreeze_embed(model), None),
    }

    @staticmethod
    def _set_learning_rate(optimizer, lr_scheduler, new_lr):
        ''' Overwrite the learning rate in the optimizer and in the scheduler's base values '''
        if optimizer is not None:
            for group in optimizer.param_groups:
                group["lr"] = new_lr
                group["initial_lr"] = new_lr
        # Some setups wrap the torch scheduler in an object exposing it as `.scheduler`.
        scheduler = getattr(lr_scheduler, "scheduler", lr_scheduler)
        # Assumes a LambdaLR-style scheduler that multiplies `base_lrs` by its current
        # factor on every step; without this the next step would restore the old rate.
        if scheduler is not None and hasattr(scheduler, "base_lrs"):
            scheduler.base_lrs = [new_lr] * len(scheduler.base_lrs)

    def on_epoch_end(self, args, state, control, **kwargs):
        ''' Unfreeze layers and, where scheduled, change the learning rate '''
        if state.epoch is None:
            return
        # `state.epoch` is a float; round it instead of comparing floats with `==`.
        entry = self.SCHEDULE.get(round(state.epoch))
        if entry is None:
            return

        unfreeze, new_lr = entry
        unfreeze(kwargs["model"])
        # NOTE(review): depending on the transformers version, the optimizer may have been
        # built only from parameters that were trainable when training started - confirm
        # that the newly unfrozen parameters are present in `optimizer.param_groups`.
        if new_lr is not None:
            self._set_learning_rate(kwargs.get("optimizer"), kwargs.get("lr_scheduler"), new_lr)

In [None]:
class SimpleModel(nn.Module):
    """Classifier that feeds the hidden state at sequence position 0 to a linear head.

    Position 0 holds the [CLS] representation only if the inputs were encoded
    with the tokenizer's own special tokens.
    """

    def __init__(self, num_labels, model):
        ''' Load the pretrained backbone `model` and create a `num_labels`-way head '''
        super().__init__()
        try:
            self.bert = AutoModel.from_pretrained(model, add_pooling_layer=False)
        except TypeError:
            # Fallback for backbones whose constructor does not accept
            # `add_pooling_layer` (expected for DistilBERT - verify against the
            # installed transformers version).
            self.bert = AutoModel.from_pretrained(model)
        self.drop = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        ''' Return `(loss, logits)`, or `(logits,)` when no labels are given '''
        output = self.bert(input_ids=input_ids,
          attention_mask=attention_mask)
        first_token = output.last_hidden_state[:, 0, :]
        first_token = self.drop(first_token)
        logits = self.classifier(first_token)
        if labels is None:
            # Inference: there is nothing to compute a loss against.
            return (logits,)
        loss = self.loss(logits, labels)
        return loss, logits

In [None]:
# Three-class head on top of ruBert. The embedding matrix is resized to cover the
# added marker tokens, and the backbone starts out frozen.
model = SimpleModel(num_labels=3, model='ai-forever/ruBert-base')
vocab_size = len(tokenizer)
model.bert.resize_token_embeddings(vocab_size)
Freeze.freeze_bert(model)

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertModel: ['bert.pooler.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Обучим модель

In [None]:
def f1_metric(predictions, labels):
    """Average of the per-class F1 scores of classes 0 and 2; class 1 is left out.

    `predictions` holds one row of class scores per example; the arg-max over
    the last axis is taken as the predicted class.
    """
    predicted = np.argmax(predictions, axis=-1)
    per_class = f1_score(
        labels,
        predicted,
        labels=[0, 1, 2],
        average=None,
        zero_division=0,
    )
    return 1 / 2 * (per_class[0] + per_class[2])

def compute_metrics(eval_pred):
    """Metric hook for `Trainer`: unpack `(predictions, labels)` and report `f1_score`."""
    scores, gold = eval_pred
    value = f1_metric(scores, gold)
    return {"f1_score": value}

In [None]:
# Shared training configuration; later experiments reuse and tweak this object.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=5e-3,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=250,
    logging_steps=250,
    save_strategy="steps",
    load_best_model_at_end=True,
    # Pick the best checkpoint by the metric the notebook reports, not by eval loss.
    metric_for_best_model="f1_score",
    greater_is_better=True,
    fp16=True,
    report_to="wandb",
    # Set directly instead of passing "" and patching the attribute afterwards.
    run_name="simple_bert",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[LargeModelCallbacks]
)

trainer.train()
# Close the W&B run so that the next experiment is logged as a separate run.
wandb.finish()



Step,Training Loss,Validation Loss,F1 Score
250,0.7507,0.914862,0.410401
500,0.9025,0.907058,0.4116
750,0.9077,0.897573,0.416467
1000,0.8905,0.889377,0.420555
1250,0.874,0.882028,0.422213
1500,0.8747,0.873949,0.423782
1750,0.8763,0.867335,0.424682
2000,0.8685,0.860324,0.424132
2250,0.8521,0.852908,0.426807
2500,0.8433,0.845035,0.425686


Step,Training Loss,Validation Loss,F1 Score
250,0.7507,0.914862,0.410401
500,0.9025,0.907058,0.4116
750,0.9077,0.897573,0.416467
1000,0.8905,0.889377,0.420555
1250,0.874,0.882028,0.422213
1500,0.8747,0.873949,0.423782
1750,0.8763,0.867335,0.424682
2000,0.8685,0.860324,0.424132
2250,0.8521,0.852908,0.426807
2500,0.8433,0.845035,0.425686


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1_score,▁▁▂▃▃▄▄▄▄▄▄▃▃▄▄▅▆▆▆▆▇▇█▇▇▇▇▇█▇▇██
eval/loss,██▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
eval/runtime,█▅▆█▃█▂▆▃▅▃▄▃▄▁▄▄▃▅▄▅▆▇▅▅▅█▇▅█▇▄▇
eval/samples_per_second,▁▄▃▁▆▁▇▃▆▄▆▅▆▄█▅▅▆▄▄▄▃▂▄▄▄▁▂▄▁▂▅▂
eval/steps_per_second,▁▄▃▁▆▁▇▃▆▄▆▅▆▄█▅▅▆▄▄▄▃▂▄▄▄▁▂▄▁▂▅▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,▃██▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train/total_flos,▁

0,1
eval/f1_score,0.44521
eval/loss,0.6766
eval/runtime,10.6158
eval/samples_per_second,267.996
eval/steps_per_second,6.782
train/epoch,50.0
train/global_step,8300.0
train/learning_rate,3e-05
train/loss,0.6752
train/total_flos,0.0


**Проведем эксперимент, с тем, что берем предсказание из спец.токена**

In [None]:
def add_position(elem, max_length=200):
    """Build a one-hot vector marking where the marker token sits in `input_ids`.

    Args:
        elem: example with `input_ids` (token id sequence) and `spec_tokens`
            (vocabulary id of the marker token).
        max_length: length of the returned vector.

    Returns:
        `{'spec_token_position': array}` with 1 at the first occurrence of the
        marker id and 0 elsewhere.

    Raises:
        IndexError: if the marker id does not occur in `input_ids` (for example
            because truncation removed it), or its position is >= `max_length`.
    """
    token_id = elem['spec_tokens']
    hits = np.where(np.array(elem['input_ids']) == token_id)[0]
    if len(hits) == 0:
        # Same exception type as the bare `[0][0]` lookup, but with an actionable message.
        raise IndexError(
            f"spec token id {token_id} is absent from input_ids "
            "(was the entity marker truncated away?)"
        )
    res = np.zeros(max_length)
    res[hits[0]] = 1
    return {'spec_token_position': res}

def add_position_of_special_token(train_dataset, val_dataset):
    """Attach the one-hot marker-position column to every example of both splits.

    Returns the pair `(train_dataset, val_dataset)` after mapping `add_position`
    (with its default `max_length`) over each of them.
    """
    def _annotate(example):
        return add_position(example)

    annotated_train = train_dataset.map(_annotate)
    annotated_val = val_dataset.map(_annotate)
    return annotated_train, annotated_val

In [None]:
# Rebuild the datasets, add the marker-position mask, then drop the helper columns
# (the marker id is needed only while the mask is being computed).
train_dataset, val_dataset = add_position_of_special_token(
    *make_dataset(train, val, tokenizer)
)
train_dataset, val_dataset = (
    split.remove_columns(["text", "spec_tokens"])
    for split in (train_dataset, val_dataset)
)

In [None]:
class SpecTokenModel(nn.Module):
    """Classifier that reads the hidden state at the entity-marker position.

    The position is supplied as a one-hot mask over the sequence; multiplying
    the hidden states by it and summing over the sequence axis selects the
    marker's vector.
    """

    def __init__(self, num_labels, model):
        ''' Load the pretrained backbone `model` and create a `num_labels`-way head '''
        super().__init__()
        try:
            self.bert = AutoModel.from_pretrained(model, add_pooling_layer=False)
        except TypeError:
            # Fallback for backbones whose constructor does not accept
            # `add_pooling_layer` (expected for DistilBERT - verify against the
            # installed transformers version).
            self.bert = AutoModel.from_pretrained(model)
        self.drop = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, spec_token_position, attention_mask, labels=None):
        ''' Return `(loss, logits)`, or `(logits,)` when no labels are given '''
        output = self.bert(input_ids=input_ids,
          attention_mask=attention_mask)
        hidden = output.last_hidden_state
        batch_size, max_len, _ = hidden.shape
        # Align the mask dtype with the hidden states so that the product is not
        # promoted to a wider dtype than the classifier weights accept.
        mask = spec_token_position.to(hidden.dtype).reshape(batch_size, max_len, 1)
        spec_token = torch.sum(hidden * mask, dim=1)
        spec_token = self.drop(spec_token)
        logits = self.classifier(spec_token)
        if labels is None:
            # Inference: there is nothing to compute a loss against.
            return (logits,)
        loss = self.loss(logits, labels)
        return loss, logits

In [None]:
# Same setup as for the position-0 classifier: three classes, embeddings resized to
# cover the added marker tokens, backbone frozen at the start.
model = SpecTokenModel(num_labels=3, model="ai-forever/ruBert-base")
vocab_size = len(tokenizer)
model.bert.resize_token_embeddings(vocab_size)
Freeze.freeze_bert(model)

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertModel: ['bert.pooler.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Give this experiment its own W&B run name; otherwise it is logged under the name
# left over from the previous experiment. (The unused `log_steps` variable is gone.)
training_args.run_name = "spec_token_bert"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[LargeModelCallbacks]
)

trainer.train()

# Close the W&B run so that the next experiment is logged as a separate run.
wandb.finish()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1 Score
250,0.661,0.621652,0.347248
500,0.5968,0.607574,0.4093
750,0.5802,0.613542,0.432967
1000,0.5877,0.625601,0.389122
1250,0.5772,0.625385,0.376365
1500,0.5932,0.615259,0.379997
1750,0.5689,0.603769,0.439069
2000,0.5741,0.604254,0.387208
2250,0.5726,0.610496,0.410695
2500,0.5696,0.615938,0.356725


0,1
eval/f1_score,▂▅▇▅▃▁▅▇▄▃▃▇▄▅▂▇▆▄▇▆▃▇▅▇▆▅▄▆▇▆▅▆█▇▆▆▅▅
eval/loss,▆▃▅▅█▆▃▅▇▇▅▃▃▄▅▅▆▇▄▄▃▄▄▆▃▃▄▂▂▂▅▂▃▄▁▁▁▁
eval/runtime,▃▂▂▃▁▃█▃▂▃▃▃▃▃▂▂▃▂▂▃▂▂▂▃▃▃▃▂▃▃▃▂▃▄▃▃▃▃
eval/samples_per_second,▆▇▇▆█▅▁▆▇▆▆▆▆▆▇▇▆▇▇▆▆▇▇▆▆▆▆▆▆▆▆▇▆▅▆▆▆▆
eval/steps_per_second,▆▇▇▆█▅▁▆▇▆▆▆▆▆▇▇▆▇▇▆▆▇▇▆▆▆▆▆▆▆▆▇▆▅▆▆▆▆
train/epoch,▁▁▁▁▂▂▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▂▂▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/learning_rate,███▇▇███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▄▄▄█▅▄▄▄▅▃▄▃▃▃▄▃▃▃▄▃▃▃▃▃▂▂▃▂▃▂▂▂▂▂▁▁
train/total_flos,▁

0,1
eval/f1_score,0.41325
eval/loss,0.59513
eval/runtime,33.7596
eval/samples_per_second,84.272
eval/steps_per_second,2.133
train/epoch,50.0
train/global_step,8300.0
train/learning_rate,3e-05
train/loss,0.5241
train/total_flos,0.0


**Теперь проведем эксперимент с моделью поменьше**

In [None]:
# Switch to the tokenizer of the smaller checkpoint and register the same entity markers.
# NOTE(review): `do_lower_case=True` is passed for a checkpoint whose name says "cased" -
# confirm that lower-casing is intended.
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/distilrubert-base-cased-conversational',  do_lower_case=True)
new_special_tokens = {'additional_special_tokens': ['[PERSON]', '[ORGANIZATION]', '[PROFESSION]', '[COUNTRY]', '[NATIONALITY]']}
tokenizer.add_special_tokens(new_special_tokens)

# The existing datasets were encoded with the previous tokenizer, so their token ids
# refer to a different vocabulary. Re-encode them with the new tokenizer before training.
train_dataset, val_dataset = make_dataset(train, val, tokenizer)
train_dataset = train_dataset.remove_columns(["text", "spec_tokens"])
val_dataset = val_dataset.remove_columns(["text", "spec_tokens"])

In [None]:
class SmallModelCallbacks(TrainerCallback):
    """Gradual-unfreezing schedule for the smaller model.

    The model, optimizer and scheduler are taken from the callback keyword
    arguments: `TrainerState` carries neither a `model` nor a `learning_rate`
    field, so reading/writing them on `state` cannot work.
    """

    # finished epoch -> (number of last layers to unfreeze, new learning rate)
    SCHEDULE = {
        6: (1, 1e-3),
        10: (3, 1e-4),
        15: (5, 1e-5),
    }

    @staticmethod
    def _set_learning_rate(optimizer, lr_scheduler, new_lr):
        ''' Overwrite the learning rate in the optimizer and in the scheduler's base values '''
        if optimizer is not None:
            for group in optimizer.param_groups:
                group["lr"] = new_lr
                group["initial_lr"] = new_lr
        # Some setups wrap the torch scheduler in an object exposing it as `.scheduler`.
        scheduler = getattr(lr_scheduler, "scheduler", lr_scheduler)
        # Assumes a LambdaLR-style scheduler that multiplies `base_lrs` by its current
        # factor on every step; without this the next step would restore the old rate.
        if scheduler is not None and hasattr(scheduler, "base_lrs"):
            scheduler.base_lrs = [new_lr] * len(scheduler.base_lrs)

    def on_epoch_end(self, args, state, control, **kwargs):
        ''' Unfreeze layers and change the learning rate '''
        if state.epoch is None:
            return
        # `state.epoch` is a float; round it instead of comparing floats with `==`.
        entry = self.SCHEDULE.get(round(state.epoch))
        if entry is None:
            return

        layer_count, new_lr = entry
        Freeze.unfreeze_layers(kwargs["model"], layer_count)
        # NOTE(review): depending on the transformers version, the optimizer may have been
        # built only from parameters that were trainable when training started - confirm
        # that the newly unfrozen parameters are present in `optimizer.param_groups`.
        self._set_learning_rate(kwargs.get("optimizer"), kwargs.get("lr_scheduler"), new_lr)

In [None]:
# Position-0 classifier on top of the distilled checkpoint: three classes, embeddings
# resized to cover the added marker tokens, backbone frozen at the start.
model = SimpleModel(num_labels=3, model="DeepPavlov/distilrubert-base-cased-conversational")
vocab_size = len(tokenizer)
model.bert.resize_token_embeddings(vocab_size)
Freeze.freeze_bert(model)

Some weights of the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Здесь обучаем меньшее количество эпох, так как модель переобучается

In [None]:
# Reuse the shared training configuration with a new run name and stronger weight decay.
training_args.run_name = "dist_bert"
training_args.weight_decay = 0.1
# The accompanying text says this model is trained for fewer epochs because it
# overfits, and the logged run ends at epoch 20; the epoch count was never lowered.
training_args.num_train_epochs = 20

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[SmallModelCallbacks]
)

trainer.train()

# Close the W&B run once training has finished.
wandb.finish()



Step,Training Loss,Validation Loss,F1 Score
250,0.7325,0.67695,0.197715
500,0.7093,0.695712,0.260092
750,0.6948,0.70867,0.344768
1000,0.6906,0.671181,0.29828
1250,0.8321,0.877246,0.384533
1500,0.8404,0.870212,0.382552
1750,0.8389,0.865789,0.384043
2000,0.8364,0.863894,0.384096
2250,0.8377,0.862761,0.380099
2500,0.8393,0.861998,0.380923


VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.121520…

0,1
eval/f1_score,▁▃▇▅█████████
eval/loss,▁▂▂▁█████▇▇▇▇
eval/runtime,▅▆▃▁█▇▃▂▂▃▁█▁
eval/samples_per_second,▄▃▆█▁▂▆▇▇▆█▁█
eval/steps_per_second,▄▃▆█▁▂▆▇▇▆█▁█
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇███
train/learning_rate,█▇▇▆▆▅▅▄▃▃▂▂▁
train/loss,▃▂▁▁█████████
train/total_flos,▁

0,1
eval/f1_score,0.38258
eval/loss,0.86154
eval/runtime,5.0862
eval/samples_per_second,559.361
eval/steps_per_second,14.156
train/epoch,20.0
train/global_step,3320.0
train/learning_rate,0.00011
train/loss,0.8345
train/total_flos,0.0


**Вывод:** Меньшая модель, конечно, проиграла в качестве, но в целом показала терпимый результат. Использование специальных токенов для предсказания дало небольшое улучшение: самая лучшая модель показала качество на валидации 0.4521 по сравнению с лучшим значением при использовании CLS — 0.4452.


**Ссылка на графики:** https://wandb.ai/onishchenko-av/huggingface?workspace=user-onishchenko-av