# Обучение нейросетевых моделей

## Модули

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
import time
from tqdm.auto import tqdm

import datetime
import os

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoModelForTokenClassification, AutoTokenizer, \
    pipeline, DataCollatorForTokenClassification, get_scheduler

from accelerate import Accelerator

import torch
from torch.optim import AdamW
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

import evaluate

## Исправление орфографии

In [2]:
# path_to_model = "ai-forever/RuM2M100-1.2B" 

path_to_model = "model/M2M100ForConditionalGeneration/" 
path_to_tokenizer = "model/M2M100Tokenizer/"

In [3]:
model_M100_spell = M2M100ForConditionalGeneration.from_pretrained(path_to_model)
tokenizer_M100_spell = M2M100Tokenizer.from_pretrained(path_to_tokenizer)

In [4]:
# model_M100_spell.save_pretrained("model/M2M100ForConditionalGeneration/")
# tokenizer_M100_spell.save_pretrained("model/M2M100Tokenizer/")

In [6]:
def correct_errors(sentence: str) -> str:

    encodings = tokenizer_M100_spell(sentence, return_tensors="pt")
    generated_tokens = model_M100_spell.generate(**encodings, 
                                                forced_bos_token_id=tokenizer_M100_spell.get_lang_id("ru"), 
                                                max_new_tokens = 200)
    answer = tokenizer_M100_spell.batch_decode(generated_tokens, skip_special_tokens=True)

    return(answer[0])

In [5]:
sentence = "Основая цель мероприятия 5 орпеля 2020 - практичиская оттработка навыкоф по ока занию помощь гражданов, попавшим в ДТП, а также повышение и совершенствование уровня профессиональной подготовки сотрудников МЧС при проведении аварийно-спасательных работ по ликвидации последствий дорожно-транспортных проишествий, сокращение временной показатель реагирование."

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

Input: Основая цель мероприятия 5 орпеля 2020 - практичиская оттработка навыкоф по ока занию помощь гражданов, попавшим в ДТП, а также повышение и совершенствование уровня профессиональной подготовки сотрудников МЧС при проведении аварийно-спасательных работ по ликвидации последствий дорожно-транспортных проишествий, сокращение временной показатель реагирование.
Output: Основная цель мероприятия 5 апреля 2020 - практическая отработка навыков по оказанию помощь гражданам, попавшим в ДТП, а также повышение и совершенствование уровня профессиональной подготовки сотрудников МЧС при проведении аварийно-спасательных работ по ликвидации последствий дорожно-транспортных происшествий, сокращение временной показатель реагирование.


In [7]:
sentence = "Фомилию, имя, отчество не изменял"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

Input: Фомилию, имя, отчество не изменял
Output: Register фамилию, имя, отчество не изменял


In [12]:
sentence = "Студент, Уфимский фелеал Масковского нифтеного института им. Академика И.М. Губкина"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

Input: Студент, Уфимский фелеал Масковского нифтеного института им. Академика И.М. Губкина
Output: Secret Студент, Уфимский филиал Московского нефтяного института им. Академика И.М. Губкина


In [9]:
sentence = "Имею загррничный паспорт 6543217 89, ОУМС пос. Хор Хабаровского края, 04.02.2012"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

Input: Имею загррничный паспорт 6543217 89, ОУМС пос. Хор Хабаровского края, 04.02.2012
Output: Имею загрничный паспорт 654321789, ОУМС пос. Хор Хабаровского края, 04.02.2012


## NER (для имен)

In [2]:
# path_to_model = "viktoroo/sberbank-rubert-base-collection3" 
# path_to_tokenizer = "viktoroo/sberbank-rubert-base-collection3"

path_to_model = "ai-forever/ruBert-base" 
path_to_tokenizer = "ai-forever/ruBert-base"

# path_to_model = "DeepPavlov/rubert-base-cased" 
# path_to_tokenizer = "DeepPavlov/rubert-base-cased"

In [3]:
label_names = ['PER-NAME', 'PER-SURN', 'PER-PATR']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [4]:
model_NER = AutoModelForTokenClassification.from_pretrained(path_to_model,
                                                               id2label=id2label,
                                                               label2id=label2id)
tokenizer_NER = AutoTokenizer.from_pretrained(path_to_tokenizer, use_fast=True)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model_NER

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [6]:
tokenizer_NER

BertTokenizerFast(name_or_path='ai-forever/ruBert-base', vocab_size=120138, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [7]:
# model_NER.save_pretrained("model/model_NER_custom/")
# tokenizer_NER.save_pretrained("model/tokenizer_NER_custom/")

In [8]:
# для обычной модели

# sentence = "Меня зовут Вольфганг и я живу в Берлине"

# print("Input:", sentence)

# nlp = pipeline("ner", model=model_NER, tokenizer=tokenizer_NER)

# ner_results = nlp(sentence)

# print("Output:", ner_results)

In [9]:
# "".join([elem['word'] for elem in ner_results[:2]]).replace('#', '').capitalize()

In [10]:
# "".join([elem['word'] for elem in ner_results[2:]]).replace('#', '').capitalize()

### Обучение

In [11]:
names_dataset = pd.read_json("data/names/names_train.json")
print(names_dataset.shape)

(15000, 2)


In [12]:
names_dataset['mode'] = np.random.choice(['train', "val", "test"], size = names_dataset.shape[0], p = [0.8, 0.15, 0.05])

In [13]:
names_dataset['ner_tags'] = names_dataset['ner_tags'].map(lambda x: [i-1 for i in x])
names_dataset.head()

Unnamed: 0,tokens,ner_tags,mode
0,"[Колесова, Потапова, Корнелия, Аникина, Беляев...","[1, 1, 0, 1, 2, 1]",val
1,[Остап],[0],train
2,"[Логинов, Витаутас, Савицкий, Калашников, Левк...","[1, 0, 1, 1, 2]",train
3,"[Горячева, Куприяновна, Алекса]","[1, 2, 0]",val
4,"[Фотий, Медведев, Быкович, Терентьев, Верещагин]","[0, 1, 2, 1, 1]",val


In [14]:
trdf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "train"])
vldf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "val"])
tedf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "test"])

dataset_names = DatasetDict({"train": trdf, "validation": vldf, "test": tedf})
dataset_names

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 11966
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 2254
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 780
    })
})

In [15]:
# model_NER.cuda()

In [16]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples, tokenizer = tokenizer_NER):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [17]:
tokenized_datasets = dataset_names.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset_names["train"].column_names
)

Map:   0%|          | 0/11966 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2254 [00:00<?, ? examples/s]

Map:   0%|          | 0/780 [00:00<?, ? examples/s]

In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_NER)

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11966
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2254
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 780
    })
})

In [20]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(4)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    1,    2,    0,    0,    0,    1,    2,    2,    1,    2,    2,
            2,    2, -100],
        [-100,    1,    2,    1,    2,    2,    2,    1,    2,    0,    0, -100,
         -100, -100, -100],
        [-100,    1,    0,    0,    2,    2,    2, -100, -100, -100, -100, -100,
         -100, -100, -100]])

In [21]:
for i in range(4):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, -100]
[-100, 1, 2, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, -100]
[-100, 1, 2, 1, 2, 2, 2, 1, 2, 0, 0, -100]
[-100, 1, 0, 0, 2, 2, 2, -100]


In [22]:
metric = evaluate.load("seqeval")

In [23]:
dataset_names

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 11966
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 2254
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 780
    })
})

In [24]:
labels = dataset_names["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['PER-NAME']

In [25]:
labels = dataset_names["train"][1]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['PER-SURN', 'PER-NAME', 'PER-SURN', 'PER-SURN', 'PER-PATR']

In [26]:
labels = dataset_names["train"][2]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['PER-SURN', 'PER-SURN', 'PER-PATR', 'PER-SURN', 'PER-NAME']

In [27]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'NAME': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'PATR': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'SURN': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 0.75,
 'overall_f1': 0.8571428571428571,
 'overall_accuracy': 0.8}

In [28]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [29]:
model_NER.config.num_labels

3

In [30]:
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"], 
    collate_fn=data_collator, 
    batch_size=8
)

In [31]:
optimizer = AdamW(model_NER.parameters(), lr=5e-05, betas=(0.9,0.999), eps=1e-08)

In [32]:
# accelerator = Accelerator(cpu=True)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model_NER, optimizer, train_dataloader, eval_dataloader
)

In [33]:
train_dataloader.device

device(type='cuda')

In [34]:
model.device

device(type='cuda', index=0)

In [35]:
num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0.1,
    num_training_steps=num_training_steps,
)

In [36]:
def postprocess(predictions, labels):
    
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [37]:
output_dir = "model/bert-finetuned-ner-names-accelerate"

In [39]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training

    model.train()
    
    for i, batch in enumerate(train_dataloader):
        
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
        
    model.eval()
    
    for batch in eval_dataloader:

        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer_NER.save_pretrained(output_dir)

  0%|          | 0/14960 [00:00<?, ?it/s]



epoch 0: {'precision': 0.997057296174485, 'recall': 0.997057296174485, 'f1': 0.997057296174485, 'accuracy': 0.9985177393133786}
epoch 1: {'precision': 0.9984420979747274, 'recall': 0.9986149584487535, 'f1': 0.9985285207305462, 'accuracy': 0.999043702782825}
epoch 2: {'precision': 0.9996537995499394, 'recall': 0.9995672868887927, 'f1': 0.9996105413475269, 'accuracy': 0.9997131108348475}
epoch 3: {'precision': 0.9995672494374243, 'recall': 0.9996537695836579, 'f1': 0.999610507638378, 'accuracy': 0.9997131108348475}
epoch 4: {'precision': 0.9995672494374243, 'recall': 0.9996537695836579, 'f1': 0.999610507638378, 'accuracy': 0.9997131108348475}


In [41]:
model.cpu()

token_classifier = pipeline(
    "token-classification", model=model, aggregation_strategy="simple", tokenizer=tokenizer_NER
)

In [42]:
print(token_classifier("Макаров Андрей Николаевич"))

[{'entity_group': 'SURN', 'score': 0.9999559, 'word': 'мака', 'start': 0, 'end': 4}, {'entity_group': 'PATR', 'score': 0.9999753, 'word': '##ров', 'start': 4, 'end': 7}, {'entity_group': 'NAME', 'score': 0.9999467, 'word': 'андреи', 'start': 8, 'end': 14}, {'entity_group': 'PATR', 'score': 0.99996763, 'word': 'николаевич', 'start': 15, 'end': 25}]


In [43]:
print(token_classifier("Окрошковская Елена Викторовна"))

print(token_classifier("Кобаева Виола Ролландовна"))

print(token_classifier("Макарова (Пурпурова) Валентина Михайловна"))

[{'entity_group': 'SURN', 'score': 0.99995637, 'word': 'окро', 'start': 0, 'end': 4}, {'entity_group': 'PATR', 'score': 0.9999767, 'word': '##шковская', 'start': 4, 'end': 12}, {'entity_group': 'NAME', 'score': 0.999951, 'word': 'елена', 'start': 13, 'end': 18}, {'entity_group': 'PATR', 'score': 0.9999747, 'word': 'викторовна', 'start': 19, 'end': 29}]
[{'entity_group': 'SURN', 'score': 0.99995744, 'word': 'коба', 'start': 0, 'end': 4}, {'entity_group': 'PATR', 'score': 0.99997675, 'word': '##ева', 'start': 4, 'end': 7}, {'entity_group': 'NAME', 'score': 0.99995106, 'word': 'виола', 'start': 8, 'end': 13}, {'entity_group': 'PATR', 'score': 0.9999754, 'word': 'ролландовна', 'start': 14, 'end': 25}]
[{'entity_group': 'SURN', 'score': 0.9999559, 'word': 'мака', 'start': 0, 'end': 4}, {'entity_group': 'PATR', 'score': 0.9999777, 'word': '##рова', 'start': 4, 'end': 8}, {'entity_group': 'SURN', 'score': 0.9993266, 'word': '( пурпур', 'start': 9, 'end': 16}, {'entity_group': 'PATR', 'score':

In [65]:
import re

In [167]:
def name_reconstruct(name: str) -> str:

    """
    Функция для исправления формата имен в формат ФИО 
    В случае, если в тексте распознается более 1 фамилии, то используется формат 
        Ф (Ф1, Ф2, ... - при наличии старых фамилий) И О

    Параметры:
    name : str
        Строка с именем

    Возвращает:
    string_out : str
        Строка с именем требуемого формата
    """

    # создание словаря для сортировки элементов имени
    entities = ['SURN', 'NAME', 'PATR']
    sort_dict = {key: elem for elem, key in list(enumerate(entities))}

    name_tokens = re.findall("[а-яА-ЯЁё\-]+", name)

    NER_output = token_classifier(name_tokens)

    name_classes =  np.array([elem[0]['entity_group'] for elem in NER_output])

    # переформирование имени
    string_out = " ".join([x for _, x in sorted(zip(name_classes, name_tokens), key = lambda pair: sort_dict[pair[0]])])

    nameparts_counts = np.unique(name_classes, return_counts=True)

    # В случае, если больше одной фамилии - фамилии, следующие после 1й заключить в скобки
    if "SURN" in nameparts_counts[0] and nameparts_counts[1][-1] > 1:

        surnames = string_out.split()[:nameparts_counts[1][-1]]
        other_name_part = string_out.split()[nameparts_counts[1][-1]:]

        string_out = surnames[0] + " (" + ", ".join(surnames[1:]) + ") " + " ".join(other_name_part)

    return(string_out)

In [168]:
name_reconstruct("Пётр Петрович Семёнов-Тян-Шанский")

'Семёнов-Тян-Шанский Пётр Петрович'

In [170]:
name_reconstruct("Валентина (Пурпурова, Ольшанская)")

'Пурпурова (Ольшанская) Валентина'

## NER (для адресов)

In [None]:
sentence = "Калужская обл., г. Обнинск, ул. Аксёнова, д. 33, кв.21, прибыла из г. Воскресенск Московской обл. в 1990 г."

print("Input:", sentence)

nlp = pipeline("ner", model=model_NER, tokenizer=tokenizer_NER)

ner_results = nlp(sentence)

print("Output:", ner_results)

In [None]:
[elem['word'].replace("#", "") for elem in ner_results if elem['entity'] in ['B-LOC', "I-LOC"]]