# 1. Обучение нейронной сети Bert

In [1]:
!pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 5.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 41.9 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 12.7 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 1.8 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 43.3 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp3

In [2]:
# загружаем библиотеки
from datasets import load_dataset, load_metric, Dataset
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers.optimization import AdamW
import torch
import warnings 

warnings.filterwarnings('ignore')

# задаем параметры для сети, количество батчей и тип токенайзера
model_checkpoint = "distilbert-base-uncased"
batch_size = 20

In [3]:
# загружаем исходный размеченный датасет
dataset = load_dataset('json', data_files='year.json')
print(dataset)

# разделяем дата сет на тренировочную и тестовую выборку
datasets=dataset["train"].train_test_split(test_size=0.1)
print(datasets)

Using custom data configuration default-307c34af2ab059a4


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-307c34af2ab059a4/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-307c34af2ab059a4/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['description', 'labels'],
        num_rows: 3153
    })
})
DatasetDict({
    train: Dataset({
        features: ['description', 'labels'],
        num_rows: 2837
    })
    test: Dataset({
        features: ['description', 'labels'],
        num_rows: 316
    })
})


In [4]:
# загружаем предобученный токенайзер
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint, padding='max_length')

# подключаем cuda для pytorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [5]:
# функция для токенизации предишкен датасета
def tokenize_dataset(examples):
    tokenized_inputs = tokenizer(
        examples["description"], truncation=True, padding=True)
    return tokenized_inputs

In [6]:
datasets_token=datasets.map(tokenize_dataset, batched=True)
datasets_token

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['description', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2837
    })
    test: Dataset({
        features: ['description', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 316
    })
})

In [7]:
# загружаем предобученную модель BERT
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=4)

# очищаем кеш cuda, это полезно, при файнтюинге
torch.cuda.empty_cache()

# отправляем модель в cuda
model = model.to(device)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [8]:
# задаем параметры для обучения
args = TrainingArguments(
    f"test-ner",
    evaluation_strategy="epoch",
    learning_rate=0.0002,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
)

In [9]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [10]:
# определяем различные метрики для оценки качества
label_list = ['O', 'creation_year', 'born', 'died']


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(
        predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [11]:
# функция оптимизации
optimizer = AdamW(model.parameters(), lr=0.0003)

# шедулер
lr_scheduler = transformers.get_polynomial_decay_schedule_with_warmup(optimizer,
                                                                      num_warmup_steps=30,
                                                                      num_training_steps=800,
                                                                      power=2.0)
# функция тренировки
trainer = Trainer(
    model,
    args,
    train_dataset=datasets_token["train"],
    eval_dataset=datasets_token["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler))

In [12]:
# тренируем модель
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2837
  Num Epochs = 6
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 852


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.00559,0.982402,0.972336,0.977343,0.998617
2,No log,0.004033,0.975659,0.985656,0.980632,0.99879
3,No log,0.003835,0.984678,0.987705,0.986189,0.99917
4,0.023900,0.003583,0.982724,0.990779,0.986735,0.99917
5,0.023900,0.003578,0.982741,0.991803,0.987251,0.999205
6,0.023900,0.003582,0.982741,0.991803,0.987251,0.999205


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 316
  Batch size = 20
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 316
  Batch size = 20
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluatio

TrainOutput(global_step=852, training_loss=0.014689353640090691, metrics={'train_runtime': 1630.1103, 'train_samples_per_second': 10.442, 'train_steps_per_second': 0.523, 'total_flos': 2224057493200896.0, 'train_loss': 0.014689353640090691, 'epoch': 6.0})

# 2.Предсказание

In [13]:
# переходим в режим предсказания
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 316
  Batch size = 20


{'epoch': 6.0,
 'eval_accuracy': 0.9992048401037165,
 'eval_f1': 0.9872514023457419,
 'eval_loss': 0.003581771394237876,
 'eval_precision': 0.9827411167512691,
 'eval_recall': 0.9918032786885246,
 'eval_runtime': 9.6957,
 'eval_samples_per_second': 32.592,
 'eval_steps_per_second': 1.65}

In [20]:
# посмотрим на несколько экземпляров тестовой выборке, которые были предсказаны
predictions, labels, _ = trainer.predict(datasets_token["test"])

for i in range(5):
    print("--------------------------------------------------------------------")
    print(datasets_token["test"][i]['description'])
    real = datasets_token["test"][i]['labels']

    pred = labels[i]

    logit = torch.tensor(predictions[i])
    softmax_score = torch.nn.functional.softmax(logit, dim=-1)
    for j in range(len(real)):

        d = {1: 'create_year',
             2: 'born',
             3: 'died',
             0: "missing"}
        lab = softmax_score[j].argmax().item()
        label = d[softmax_score[j].argmax().item()]
        if (real[j] != 0 and real[j] != -100):
            print('\nWORD:', tokenizer.convert_ids_to_tokens(
                datasets_token["test"][i]['input_ids'][j]))
            print('Real: ', d[real[j]])
            print('Predict:', label)
            print('probably: ', softmax_score[j].max().item())
            print('Score', softmax_score[j])

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: description. If description are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 316
  Batch size = 20


--------------------------------------------------------------------
Philip Alexius de László, P.R.B.A.
1869-1937
Portrait of a Child with a Steiff Lion
signed, dated and inscribed de László / 1927 PARIZ lower right
oil on canvas
Unframed: 74 by 55cm., 29 by 21¾in.
Framed: 87.5 by 67cm., 34½ by 26¼in.
This portrait will be included in the Philip de László catalogue raisonné , currently presented
in progress online: www.delaszlocatalogueraisonne.com

WORD: 1869
Real:  born
Predict: born
probably:  0.9959694147109985
Score tensor([2.0557e-03, 4.4811e-04, 9.9597e-01, 1.5269e-03])

WORD: 1937
Real:  died
Predict: died
probably:  0.9960253238677979
Score tensor([1.6681e-03, 8.3992e-04, 1.4667e-03, 9.9603e-01])

WORD: 1927
Real:  create_year
Predict: create_year
probably:  0.9991650581359863
Score tensor([4.6392e-04, 9.9917e-01, 1.3555e-04, 2.3544e-04])
--------------------------------------------------------------------
signed, dedicated and dated "FOR JO! + BANKSY 07" lower left; screen pr

In [15]:
# сохраним модель в файл, который пойдет в продакшен
model.save_pretrained('ner_year')

Configuration saved in ner_year/config.json
Model weights saved in ner_year/pytorch_model.bin


# 3.Анализ результатов

Т.к. у нас классы не сбалансированы, то в данном случае в качестве метрики качества анализируем F1. На тестовой выборке F1=0.98, что говорит об очень хорошем результате, не стыдно будет отправлять в продакшен.

# Вывод:

- Очевидная задача изначально, стала сложнее после детального исследования предметной области (необходимость собирать даты рождения художников).
- Наличие большого количества аукционных домов, разных структур подачи информации, усложняет дело, т.к. в тренировочны данные необходимо включать примеры из большинства из них. Сейчас у нас больше 20, но будет еще больше. 
- Разметка данных это самый был долгий этап, т.к. несмотря на то, что якобы информация есть, ее необходимо было чистить.
- По дефолтным параметрам модели, f1 было уровня 90%, блягодаря шедулер, удалось значительно улучшить качество модели.
- Плюсы при использовании данной библиотеки transformers:
    - Простая реализация;
    - Предобученная модель и токенайзер; 
    - Гибкость параметров при файтюнинг;
    - Маленький размер модели (около 250Мб) .