# Обучение нейросетевых моделей

## Модули

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
import time
from tqdm.auto import tqdm

import datetime
import os
import re

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoModelForTokenClassification, AutoTokenizer, \
    pipeline, DataCollatorForTokenClassification, get_scheduler

from accelerate import Accelerator

import torch
from torch.optim import AdamW
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

import evaluate

## Исправление орфографии

In [None]:
# path_to_model = "ai-forever/RuM2M100-1.2B" 

path_to_model = "model/M2M100ForConditionalGeneration/" 
path_to_tokenizer = "model/M2M100Tokenizer/"

In [None]:
model_M100_spell = M2M100ForConditionalGeneration.from_pretrained(path_to_model)
tokenizer_M100_spell = M2M100Tokenizer.from_pretrained(path_to_tokenizer)

In [None]:
# model_M100_spell.save_pretrained("model/M2M100ForConditionalGeneration/")
# tokenizer_M100_spell.save_pretrained("model/M2M100Tokenizer/")

In [None]:
def correct_errors(sentence: str) -> str:

    encodings = tokenizer_M100_spell(sentence, return_tensors="pt")
    generated_tokens = model_M100_spell.generate(**encodings, 
                                                forced_bos_token_id=tokenizer_M100_spell.get_lang_id("ru"), 
                                                max_new_tokens = 200)
    answer = tokenizer_M100_spell.batch_decode(generated_tokens, skip_special_tokens=True)

    return(answer[0])

In [None]:
sentence = "Основая цель мероприятия 5 орпеля 2020 - практичиская оттработка навыкоф по ока занию помощь гражданов, попавшим в ДТП, а также повышение и совершенствование уровня профессиональной подготовки сотрудников МЧС при проведении аварийно-спасательных работ по ликвидации последствий дорожно-транспортных проишествий, сокращение временной показатель реагирование."

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

In [None]:
sentence = "Фомилию, имя, отчество не изменял"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

In [None]:
sentence = "Студент, Уфимский фелеал Масковского нифтеного института им. Академика И.М. Губкина"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

In [None]:
sentence = "Имею загррничный паспорт 6543217 89, ОУМС пос. Хор Хабаровского края, 04.02.2012"

print("Input:", sentence)
print("Output:", correct_errors(sentence=sentence))

## NER (Общее)

In [2]:
def align_labels_with_tokens(labels, word_ids, label_names):

    new_labels = []
    current_word = None
    
    for word_id in word_ids:
        if word_id != current_word:

            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            # if label % 2 == 1:
            #     if label < len(label_names)-1:
            #         label += 1
            new_labels.append(label)

    return new_labels

In [3]:
def compute_metrics(eval_preds, label_names, metric):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [4]:
def postprocess(predictions, labels, label_names):
    
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

### NER (для имен)

#### Настройки

In [None]:
# path_to_model = "viktoroo/sberbank-rubert-base-collection3" 
# path_to_tokenizer = "viktoroo/sberbank-rubert-base-collection3"

path_to_model = "ai-forever/ruBert-base" 
path_to_tokenizer = "ai-forever/ruBert-base"

# path_to_model = "DeepPavlov/rubert-base-cased" 
# path_to_tokenizer = "DeepPavlov/rubert-base-cased"

In [None]:
tokenizer_NER = AutoTokenizer.from_pretrained(path_to_tokenizer, use_fast=True)

In [None]:
def tokenize_and_align_labels(examples, tokenizer = tokenizer_NER):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
label_names = ['PER-NAME', 'PER-SURN', 'PER-PATR']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
model_NER = AutoModelForTokenClassification.from_pretrained(path_to_model,
                                                               id2label=id2label,
                                                               label2id=label2id)

In [None]:
model_NER

In [None]:
tokenizer_NER

In [None]:
# model_NER.save_pretrained("model/model_NER_custom/")
# tokenizer_NER.save_pretrained("model/tokenizer_NER_custom/")

In [None]:
# для обычной модели

# sentence = "Меня зовут Вольфганг и я живу в Берлине"

# print("Input:", sentence)

# nlp = pipeline("ner", model=model_NER, tokenizer=tokenizer_NER)

# ner_results = nlp(sentence)

# print("Output:", ner_results)

In [None]:
# "".join([elem['word'] for elem in ner_results[:2]]).replace('#', '').capitalize()

In [None]:
# "".join([elem['word'] for elem in ner_results[2:]]).replace('#', '').capitalize()

#### Обучение

In [None]:
names_dataset = pd.read_json("data/names/names_train_large.json")
print(names_dataset.shape)

In [None]:
names_dataset['mode'] = np.random.choice(['train', "val", "test"], size = names_dataset.shape[0], p = [0.9, 0.05, 0.05])

In [None]:
names_dataset['ner_tags'] = names_dataset['ner_tags'].map(lambda x: [i-1 for i in x])
names_dataset.head()

In [None]:
trdf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "train"])
vldf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "val"])
tedf = Dataset.from_pandas(names_dataset[names_dataset['mode'] == "test"])

dataset_names = DatasetDict({"train": trdf, "validation": vldf, "test": tedf})
dataset_names

In [None]:
tokenized_datasets = dataset_names.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset_names["train"].column_names
)

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_NER)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(4)])
batch["labels"]

In [None]:
for i in range(4):
    print(tokenized_datasets["train"][i]["labels"])

In [None]:
metric = evaluate.load("seqeval")

In [None]:
dataset_names

In [None]:
labels = dataset_names["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

In [None]:
labels = dataset_names["train"][1]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

In [None]:
labels = dataset_names["train"][2]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

In [None]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

In [None]:
model_NER.config.num_labels

In [None]:
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"], 
    collate_fn=data_collator, 
    batch_size=8
)

In [None]:
optimizer = AdamW(model_NER.parameters(), lr=5e-05, betas=(0.9,0.999), eps=1e-08)

In [None]:
# accelerator = Accelerator(cpu=True)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model_NER, optimizer, train_dataloader, eval_dataloader
)

In [None]:
train_dataloader.device

In [None]:
model.device

In [None]:
num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

cos_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0.1,
    num_training_steps=num_training_steps,
)

In [None]:
output_dir = "model/bert-finetuned-ner-names-accelerate"

In [None]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training

    model.train()
    
    for i, batch in enumerate(train_dataloader):
        
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        cos_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
        
    model.eval()
    
    for batch in eval_dataloader:

        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered, label_names)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer_NER.save_pretrained(output_dir)

#### Тест

In [None]:
path_to_model_NER_names = "model/bert-finetuned-ner-names-accelerate" 

In [None]:
## NER для имен
label_names = ['PER-NAME', 'PER-SURN', 'PER-PATR']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model_NER = AutoModelForTokenClassification.from_pretrained(path_to_model_NER_names,
                                                               id2label=id2label,
                                                               label2id=label2id)
tokenizer_NER = AutoTokenizer.from_pretrained(path_to_model_NER_names, use_fast=True)

model_NER.cpu()

token_classifier = pipeline(
    "token-classification", model=model_NER, aggregation_strategy="simple", tokenizer=tokenizer_NER
)

In [None]:
print(token_classifier("Макаров Андрей Николаевич"))

In [None]:
print(token_classifier("Окрошковская Елена Викторовна"))

print(token_classifier("Кобаева Виола Ролландовна"))

print(token_classifier("Макарова (Пурпурова) Валентина Михайловна"))

In [None]:
def name_reconstruct(name: str) -> str:

    """
    Функция для исправления формата имен в формат ФИО 
    В случае, если в тексте распознается более 1 фамилии, то используется формат 
        Ф (Ф1, Ф2, ... - при наличии старых фамилий) И О

    Параметры:
    name : str
        Строка с именем

    Возвращает:
    string_out : str
        Строка с именем требуемого формата
    """

    # создание словаря для сортировки элементов имени
    entities = ['SURN', 'NAME', 'PATR']
    sort_dict = {key: elem for elem, key in list(enumerate(entities))}

    name_tokens = re.findall("[а-яА-ЯЁё\-]+", name)

    NER_output = token_classifier(name_tokens)

    name_classes =  np.array([elem[0]['entity_group'] for elem in NER_output])

    # переформирование имени
    string_out = " ".join([x for _, x in sorted(zip(name_classes, name_tokens), key = lambda pair: sort_dict[pair[0]])])

    nameparts_counts = np.unique(name_classes, return_counts=True)

    # В случае, если больше одной фамилии - фамилии, следующие после 1й заключить в скобки
    if "SURN" in nameparts_counts[0] and nameparts_counts[1][-1] > 1:

        surnames = string_out.split()[:nameparts_counts[1][-1]]
        other_name_part = string_out.split()[nameparts_counts[1][-1]:]

        string_out = surnames[0] + " (" + ", ".join(surnames[1:]) + ") " + " ".join(other_name_part)

    return(string_out)

In [None]:
name_reconstruct("Пётр Петрович Семёнов-Тян-Шанский")

In [None]:
name_reconstruct("Валентина (Пурпурова, Ольшанская)")

### NER (для адресов)

#### Настройки

In [None]:
# sentence = "Калужская обл., г. Обнинск, ул. Аксёнова, д. 33, кв.21, прибыла из г. Воскресенск Московской обл. в 1990 г."

# print("Input:", sentence)

# nlp = pipeline("ner", model=model_NER, tokenizer=tokenizer_NER)

# ner_results = nlp(sentence)

# print("Output:", ner_results)

In [None]:
# [elem['word'].replace("#", "") for elem in ner_results if elem['entity'] in ['B-LOC', "I-LOC"]]

In [5]:
# path_to_model = "ai-forever/ruBert-large" 
# path_to_tokenizer = "ai-forever/ruBert-large"

path_to_model = "DeepPavlov/rubert-base-cased" 
path_to_tokenizer = "DeepPavlov/rubert-base-cased"

In [7]:
tokenizer_NER = AutoTokenizer.from_pretrained(path_to_tokenizer, use_fast=True)

In [8]:
label_names = ["O",
               "LOC-REG", 
               "LOC-DIST", 
               "LOC-SETL", 
               "LOC-CDIST", 
               "LOC-STRT", 
               "LOC-HOUS", 
               "LOC-FLAT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [9]:
def tokenize_and_align_labels(examples, tokenizer = tokenizer_NER):

    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    
    all_labels = examples["ner_tags"]
    new_labels = []

    for i, labels in enumerate(all_labels):
        if i == 4: print(labels)
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids, label_names))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [10]:
model_NER_adr = AutoModelForTokenClassification.from_pretrained(path_to_model,
                                                               id2label=id2label,
                                                               label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Обучение

In [11]:
adr_dataset = pd.read_json("data/addresses/addresses_train_med_extra.json")
print(adr_dataset.shape)

(1200, 2)


In [12]:
adr_dataset['mode'] = np.random.choice(['train', "val", "test"], size = adr_dataset.shape[0], p = [0.9, 0.05, 0.05])
adr_dataset['ner_tags'] = adr_dataset['ner_tags'].map(lambda x: [i for i in x])
adr_dataset.head()

Unnamed: 0,tokens,ner_tags,mode
0,"[Семиэтажное, здание, находилось, по, адресу:,...","[0, 0, 0, 0, 0, 4, 4, 1, 1, 2, 2, 3, 3, 0, 0, ...",train
1,"[Проживает, по, адресу:, Нагорское, Деревня, д...","[0, 0, 0, 4, 4, 6, 6, 7, 7, 5, 5, 2, 2, 1, 1]",train
2,"[Проживает, по, адресу:, Боханский, р-н, Тойси...","[0, 0, 0, 2, 2, 3, 3, 6, 6, 5, 5, 7, 7, 1, 1, ...",train
3,"[Выставочный, зал, находится, по, адресу, Чишм...","[0, 0, 0, 0, 0, 2, 2, 1, 1, 3, 3, 4, 4]",train
4,"[Проживает, по, адресу:, Солтонский, р-н, Стер...","[0, 0, 0, 2, 2, 3, 3, 1, 1, 4, 4, 0, 0, 0, 0, ...",train


In [13]:
trdf = Dataset.from_pandas(adr_dataset[adr_dataset['mode'] == "train"])
vldf = Dataset.from_pandas(adr_dataset[adr_dataset['mode'] == "val"])
tedf = Dataset.from_pandas(adr_dataset[adr_dataset['mode'] == "test"])

dataset_adr = DatasetDict({"train": trdf, "validation": vldf, "test": tedf})
dataset_adr

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 1065
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 66
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'mode', '__index_level_0__'],
        num_rows: 69
    })
})

In [14]:
tokenized_datasets = dataset_adr.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset_adr["train"].column_names
)

Map:   0%|          | 0/1065 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[0, 0, 0, 2, 2, 3, 3, 1, 1, 4, 4, 0, 0, 0, 0, 3, 3, 7, 7, 2, 2, 6, 6, 4, 4, 5, 5]
[0, 0, 0, 0, 0, 0, 7, 7, 4, 4, 3, 3, 2, 2, 6, 6, 1, 1, 5, 5, 0, 0, 0, 0]


Map:   0%|          | 0/66 [00:00<?, ? examples/s]

[0, 0, 1, 1, 6, 6, 7, 7, 5, 5, 4, 4, 2, 2, 3, 3, 0, 0, 0, 0, 0, 0]


Map:   0%|          | 0/69 [00:00<?, ? examples/s]

[4, 4, 1, 1, 0, 0, 2, 2, 4, 4, 3, 3, 6, 6, 5, 5, 7, 7]


In [15]:
print(tokenized_datasets['train'][4]['labels'])

[-100, 0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 1, 1, 4, 4, 4, 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, -100]


In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_NER)
batch = data_collator([tokenized_datasets["train"][i] for i in range(5)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    4,    4,    4,    1,
            1,    1,    1,    1,    2,    2,    2,    2,    2,    3,    3,    3,
            3,    3,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    4,    4,    4,    6,    6,    7,    7,
            7,    5,    5,    2,    2,    2,    1,    1, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    2,    2,    2,    2,    2,    3,    3,
            3,    3,    3,    3,    6,    6,    6,    5,    5,    7,    7,    7,
            1,    1,    4,    4,    4,    4,    0,    0,    0,    0,    1,    1,
            4,   

In [17]:
metric = evaluate.load("seqeval")

Using the latest cached version of the module from C:\Users\Artem Kondrashov\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--seqeval\541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Wed Jan 17 23:40:08 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [18]:
labels = dataset_adr["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'LOC-CDIST',
 'LOC-CDIST',
 'LOC-REG',
 'LOC-REG',
 'LOC-DIST',
 'LOC-DIST',
 'LOC-SETL',
 'LOC-SETL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [19]:
predictions = labels.copy()
predictions[2] = "LOC-SETL"
metric.compute(predictions=[predictions], references=[labels])



{'CDIST': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'DIST': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'REG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'SETL': {'precision': 0.5,
  'recall': 1.0,
  'f1': 0.6666666666666666,
  'number': 1},
 'overall_precision': 0.8,
 'overall_recall': 1.0,
 'overall_f1': 0.888888888888889,
 'overall_accuracy': 0.9473684210526315}

In [20]:
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"], 
    collate_fn=data_collator, 
    batch_size=8
)

In [21]:
optimizer = AdamW(model_NER_adr.parameters(), lr=5e-05, betas=(0.9,0.999), eps=1e-08)

In [22]:
# accelerator = Accelerator(cpu=True)
accelerator = Accelerator()

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model_NER_adr, optimizer, train_dataloader, eval_dataloader
)

In [23]:
train_dataloader.device

device(type='cuda')

In [24]:
model.device

device(type='cuda', index=0)

In [25]:
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

cos_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0.1,
    num_training_steps=num_training_steps,
)

In [26]:
output_dir = "model/bert-finetuned-ner-addresses-accelerate"

In [27]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training

    # model.train()
    
    for i, batch in enumerate(train_dataloader):

        # print(i, batch['labels'])
        
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        cos_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
        
    model.eval()
    
    for batch in eval_dataloader:

        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered, label_names)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer_NER.save_pretrained(output_dir)

  0%|          | 0/2670 [00:00<?, ?it/s]

0 tensor([[-100,    0,    0,    3,    3,    3,    5,    5,    5,    6,    6,    6,
            6,    1,    1,    1,    1,    1,    2,    2,    2,    2,    4,    4,
            4,    7,    7,    7,    0,    0,    0,    0,    0,    0,    0, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    5,    5,    5,    1,
            1,    1,    4,    4,    4,    7,    7,    7,    6,    6,    6,    6,
            6,    2,    2,    2,    2,    2,    2,    2,    3,    3,    3,    3,
            0,    0,    0,    0,    0,    0,    0,    0, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    3,
            3,    4,    4,    4,    4,    4,    4,    1,    1,    2,    2,    2,
            2,    6,    



epoch 0: {'precision': 0.9685230024213075, 'recall': 0.9569377990430622, 'f1': 0.9626955475330927, 'accuracy': 0.9805194805194806}
0 tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    2,    2,    2,    2,    2,    4,
            4,    4,    4,    4,    4,    4,    4,    4,    3,    3,    3,    3,
            3,    3,    1,    1,    1, -100],
        [-100,    0,    0,    2,    2,    2,    2,    4,    4,    4,    4,    3,
            3,    3,    3,    0,    0,    0,    0,    0,    0,    0, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    6,    6,    6,    5,    5,    5,    1,    1,    2,
            2,    2,    2,    2,    7,    7,    7,    4,    4,    4,    4,    4,
            4,    3,    3,    3,    3,    3,    0,    0,    0,    0,    0,    0,
            0, -100, -100, -100, -100, -100],


#### Тест

In [28]:
path_to_model_NER_addresses = "model/bert-finetuned-ner-addresses-accelerate" 

In [29]:
## NER для адресов
label_names = ["O",
               "LOC-REG", 
               "LOC-DIST", 
               "LOC-SETL", 
               "LOC-CDIST", 
               "LOC-STRT", 
               "LOC-HOUS", 
               "LOC-FLAT"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model_NER_adr = AutoModelForTokenClassification.from_pretrained(path_to_model_NER_addresses,
                                                               id2label=id2label,
                                                               label2id=label2id)

tokenizer_NER_adr = AutoTokenizer.from_pretrained(path_to_model_NER_addresses, use_fast=True)


In [44]:
path_to_base_model = "viktoroo/sberbank-rubert-base-collection3" 
path_to_base_tokenizer = "viktoroo/sberbank-rubert-base-collection3"

In [45]:
tokenizer_base_NER = AutoTokenizer.from_pretrained(path_to_base_tokenizer, use_fast=True)
model_base_NER = AutoModelForTokenClassification.from_pretrained(path_to_base_model)

Downloading tokenizer.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [51]:
sentence = "Калужская обл., г. Обнинск, ул. Аксёнова, д. 33, кв.21, прибыла из г. Воскресенск Московской обл. в 1990 г."

print("Input:", sentence)

nlp = pipeline("ner", model=model_base_NER, tokenizer=tokenizer_base_NER)

ner_results = nlp(sentence)

[print(i) for i in ner_results]

Input: Калужская обл., г. Обнинск, ул. Аксёнова, д. 33, кв.21, прибыла из г. Воскресенск Московской обл. в 1990 г.
{'entity': 'B-LOC', 'score': 0.99903023, 'index': 1, 'word': 'калу', 'start': 0, 'end': 4}
{'entity': 'I-LOC', 'score': 0.99928755, 'index': 2, 'word': '##жская', 'start': 4, 'end': 9}
{'entity': 'I-LOC', 'score': 0.99918526, 'index': 3, 'word': 'обл', 'start': 10, 'end': 13}
{'entity': 'B-LOC', 'score': 0.99773264, 'index': 8, 'word': 'обни', 'start': 19, 'end': 23}
{'entity': 'I-LOC', 'score': 0.99917656, 'index': 9, 'word': '##нс', 'start': 23, 'end': 25}
{'entity': 'I-LOC', 'score': 0.9992446, 'index': 10, 'word': '##к', 'start': 25, 'end': 26}
{'entity': 'B-PER', 'score': 0.47631174, 'index': 14, 'word': 'аксе', 'start': 32, 'end': 36}
{'entity': 'B-LOC', 'score': 0.9991147, 'index': 29, 'word': 'воскрес', 'start': 70, 'end': 77}
{'entity': 'I-LOC', 'score': 0.99945587, 'index': 30, 'word': '##енс', 'start': 77, 'end': 80}
{'entity': 'I-LOC', 'score': 0.9994547, 'inde

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [30]:
model_NER_adr.cpu()

token_classifier = pipeline(
    "token-classification", model=model_NER_adr, aggregation_strategy="simple", tokenizer=tokenizer_NER_adr
)

In [31]:
token_classifier("Калужская обл., г. Обнинск, ул. Университетская, д.50")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'REG',
  'score': 0.9971412,
  'word': 'Калужская обл.',
  'start': 0,
  'end': 14},
 {'entity_group': 'SETL',
  'score': 0.34576797,
  'word': 'г',
  'start': 16,
  'end': 17},
 {'entity_group': 'DIST',
  'score': 0.36668354,
  'word': '.',
  'start': 17,
  'end': 18},
 {'entity_group': 'SETL',
  'score': 0.7416569,
  'word': 'Обнинск',
  'start': 19,
  'end': 26},
 {'entity_group': 'STRT',
  'score': 0.9991473,
  'word': 'ул. Университетская,',
  'start': 28,
  'end': 48},
 {'entity_group': 'HOUS',
  'score': 0.99976665,
  'word': 'д. 50',
  'start': 49,
  'end': 53}]

In [32]:
[print(i) for i in token_classifier(("Калужская обл., г. Обнинск, ул. Университетская, д.50").split(", "))]

[{'entity_group': 'REG', 'score': 0.99958247, 'word': 'Калужская обл.', 'start': 0, 'end': 14}]
[{'entity_group': 'SETL', 'score': 0.786415, 'word': 'г. Обнинск', 'start': 0, 'end': 10}]
[{'entity_group': 'STRT', 'score': 0.9994958, 'word': 'ул. Университетская', 'start': 0, 'end': 19}]
[{'entity_group': 'HOUS', 'score': 0.9997356, 'word': 'д. 50', 'start': 0, 'end': 4}]


[None, None, None, None]

In [39]:
[print(i) for i in token_classifier(("Проживала по адресу: Республика Башкортостан, г. Салават, б-р Космонавтов, д. 14, кв. 67"))]

{'entity_group': 'REG', 'score': 0.9996827, 'word': 'Республика Башкортостан', 'start': 21, 'end': 44}
{'entity_group': 'STRT', 'score': 0.976499, 'word': 'г. Салават, б - р Космонавтов,', 'start': 46, 'end': 74}
{'entity_group': 'HOUS', 'score': 0.9997561, 'word': 'д. 14', 'start': 75, 'end': 80}
{'entity_group': 'FLAT', 'score': 0.9996807, 'word': 'кв. 67', 'start': 82, 'end': 88}


[None, None, None, None]

In [42]:
[print(i) for i in token_classifier(("Проживала по адресу: Республика Башкортостан, г. Салават, б-р Космонавтов, д. 14, кв. 67").split(", "))]

[{'entity_group': 'REG', 'score': 0.99970347, 'word': 'Республика Башкортостан', 'start': 21, 'end': 44}]
[{'entity_group': 'SETL', 'score': 0.52847666, 'word': 'г', 'start': 0, 'end': 1}, {'entity_group': 'STRT', 'score': 0.4328746, 'word': '.', 'start': 1, 'end': 2}, {'entity_group': 'SETL', 'score': 0.6736115, 'word': 'Салават', 'start': 3, 'end': 10}]
[{'entity_group': 'STRT', 'score': 0.99009085, 'word': 'б - р Космонавтов', 'start': 0, 'end': 15}]
[{'entity_group': 'HOUS', 'score': 0.9997365, 'word': 'д. 14', 'start': 0, 'end': 5}]
[{'entity_group': 'FLAT', 'score': 0.99945134, 'word': 'кв. 67', 'start': 0, 'end': 6}]


[None, None, None, None, None]

In [34]:
[print(i) for i in token_classifier(("Башкирской АССР, г. Уфа, ул. Космонавтов, д. 1").split(", "))]

[{'entity_group': 'REG', 'score': 0.99971014, 'word': 'Башкирской АССР', 'start': 0, 'end': 15}]
[{'entity_group': 'STRT', 'score': 0.567421, 'word': 'г. Уфа', 'start': 0, 'end': 6}]
[{'entity_group': 'STRT', 'score': 0.9993725, 'word': 'ул. Космонавтов', 'start': 0, 'end': 15}]
[{'entity_group': 'HOUS', 'score': 0.9996503, 'word': 'д. 1', 'start': 0, 'end': 4}]


[None, None, None, None]

In [52]:
[i for i in \
 token_classifier(("Калужская обл., г. Обнинск, ул. Аксёнова, д. 33, кв.21, прибыла из г. Воскресенск Московской обл. в 1990 г").split(", "))]

[[{'entity_group': 'REG',
   'score': 0.99958247,
   'word': 'Калужская обл.',
   'start': 0,
   'end': 14}],
 [{'entity_group': 'SETL',
   'score': 0.786415,
   'word': 'г. Обнинск',
   'start': 0,
   'end': 10}],
 [{'entity_group': 'STRT',
   'score': 0.9995245,
   'word': 'ул. Аксёнова',
   'start': 0,
   'end': 12}],
 [{'entity_group': 'HOUS',
   'score': 0.9997351,
   'word': 'д. 33',
   'start': 0,
   'end': 5}],
 [{'entity_group': 'FLAT',
   'score': 0.999176,
   'word': 'кв. 21',
   'start': 0,
   'end': 5}],
 [{'entity_group': 'SETL',
   'score': 0.837485,
   'word': 'г. Воскресенск',
   'start': 11,
   'end': 25},
  {'entity_group': 'REG',
   'score': 0.9994273,
   'word': 'Московской обл.',
   'start': 26,
   'end': 41}]]

In [58]:
[print(i) for i in \
 token_classifier(("г. Москва, ул. Новочерёмушкинская, д. 27, кв. 154, прибыл из г. Обнинска Калужской обл. в 2020 г.").split(", "))]

[{'entity_group': 'REG', 'score': 0.7767597, 'word': 'г. Москва', 'start': 0, 'end': 9}]
[{'entity_group': 'STRT', 'score': 0.998096, 'word': 'ул. Новочерёмушкинская', 'start': 0, 'end': 22}]
[{'entity_group': 'HOUS', 'score': 0.99972206, 'word': 'д. 27', 'start': 0, 'end': 5}]
[{'entity_group': 'FLAT', 'score': 0.99955446, 'word': 'кв. 154', 'start': 0, 'end': 7}]
[{'entity_group': 'SETL', 'score': 0.96924317, 'word': 'г. Обнинска', 'start': 10, 'end': 21}, {'entity_group': 'REG', 'score': 0.99968284, 'word': 'Калужской обл.', 'start': 22, 'end': 36}, {'entity_group': 'SETL', 'score': 0.85763633, 'word': '2020 г', 'start': 39, 'end': 45}]


[None, None, None, None, None]

In [54]:
[print(i) for i in token_classifier(("г. Москва, ул. Новочерёмушкинская, д. 27, кв. 154, прибыл из г. Обнинска Калужской обл. в 2020 г."))]

{'entity_group': 'REG', 'score': 0.8138423, 'word': 'Москва', 'start': 3, 'end': 9}
{'entity_group': 'STRT', 'score': 0.9994006, 'word': 'ул. Новочерёмушкинская,', 'start': 11, 'end': 34}
{'entity_group': 'HOUS', 'score': 0.99947906, 'word': 'д. 27,', 'start': 35, 'end': 41}
{'entity_group': 'FLAT', 'score': 0.9996786, 'word': 'кв. 154', 'start': 42, 'end': 49}
{'entity_group': 'SETL', 'score': 0.9822567, 'word': 'г. Обнинска', 'start': 61, 'end': 72}
{'entity_group': 'REG', 'score': 0.9995957, 'word': 'Калужской обл.', 'start': 73, 'end': 87}
{'entity_group': 'SETL', 'score': 0.48084185, 'word': '2020', 'start': 90, 'end': 94}


[None, None, None, None, None, None, None]

In [104]:
def address_reconstruct(address: str) -> str:

    """
    Функция для исправления формата адреса в формат Регион, район, город/поселок, улица, дом, квартира 

    Параметры:
    address : str
        Строка, содержащая адрес

    Возвращает:
    string_out : str
        Строка с адресом требуемого формата
    """

    # создание словаря для сортировки элементов адреса
    entities = ["O", "REG", "DIST", "SETL", "CDIST", "STRT", "HOUS", "FLAT"]
    
    sort_dict = {key: elem for elem, key in list(enumerate(entities))}

    adr_tokens = address.strip().split(", ")

    # print(adr_tokens)

    NER_output = list(token_classifier(adr_tokens))

    addresses = [[]]
    adr_entities = [[]]
    i = 0

    for token, elem in zip(adr_tokens, NER_output):

        if len(elem) > 1:

            if elem[0]["start"] > 0:

                i += 1
                addresses.append([token[:elem[0]["start"]]])
                adr_entities.append(["O"])

            for subelem in elem:

                addresses[i].append(subelem["word"])
                adr_entities[i].append(subelem["entity_group"])

            if elem[-1]["end"] < len(token)-1:

                addresses.append([token[elem[-1]["end"]:]])
                adr_entities[i].append("O")

        else:
            addresses[i].append(token)
            adr_entities[i].append(elem[0]["entity_group"])

    # print(addresses)
    # print(adr_entities)

    final_list = []

    for adr_lst, entity in zip(addresses, adr_entities):      

        # print("tokens:", adr_lst)
        # print("entities:", entity)    

        if entity[0] == "O":
                idx_start = 1
                left = adr_lst[0]
        else:
                idx_start = 0
                left = ""

        if entity[-1] == "O":
                idx_end = len(entity) - 1
                right = adr_lst[-1]
        else:
                idx_end = len(entity)
                right = ""

        adr_sorted = [x for _, x in sorted(zip(entity[idx_start:idx_end], adr_lst[idx_start:idx_end]), \
                                            key = lambda pair: sort_dict[pair[0]])]

        final_list.append(left + ", ".join(adr_sorted) + right)
        
        # print(final_list)

    # # переформирование адреса
    string_out = ", ".join(final_list)

    return(string_out)

In [105]:
address_reconstruct("ул. Новочерёмушкинская, д. 27, кв. 154, г. Москва, прибыл из г. Обнинска Калужской обл. в 2020 г.")

'г. Москва, ул. Новочерёмушкинская, д. 27, кв. 154, прибыл из Калужской обл., г. Обнинска, 2020 г'

In [None]:
address_reconstruct("ул. Университетская, д.50, г. Обнинск, Калужская обл.")

In [None]:
address_reconstruct("ул. Ярцевская, д. 29, корп.2, кв. 147, г. Москва")

In [None]:
address_reconstruct("Хабаровский край, г. Хабаровск, Центральный р-н, ул. Гоголя, д. 42")