In [5]:
import json
import re
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Shared tokenizer for the notebook: create_dataset_for_group() reads this
# module-level name and passes it on to create_ner_sequence().
tokenizer = AutoTokenizer.from_pretrained("Gherman/bert-base-NER-Russian")


# Load the source data (the file name suggests 307 labeled, de-duplicated resumes).
FOLDER_PATH = '../datasets/'
with open(FOLDER_PATH + '307_labeled_resumes_no_duplicates.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Entity groups: each list is passed to create_dataset_for_group() further down,
# so every group ends up in its own dataset file.
GROUP_1 = ['DEGREE', 'LOCATION', 'TIME', 'LINKS', 'METRICS', 'POSITIONS']
GROUP_2 = ['COMPANIES', 'TECHNOLOGIES', 'NAME']
GROUP_3 = ['PROJECTS', 'ACHIEVEMENTS', 'RESPONSIBILITIES', 'SKILLS', 'EDUCATION', 'CONTACTS']

def extract_annotations_with_text(item):
    """Collect the labeled spans of one annotated item together with its text.

    Args:
        item: Mapping with ``item['data']['text']`` (the raw document) and
            ``item['annotations']``, a list of annotation objects that each
            carry a ``'result'`` list.

    Returns:
        tuple: ``(annotations, text)``. ``annotations`` is a list of dicts with
        the keys ``'start'``, ``'end'``, ``'label'`` (the first entry of the
        result's ``'labels'`` list) and ``'text'`` (the slice
        ``text[start:end]``), in the order the results were encountered.

    Results that do not describe a labeled span are skipped instead of raising:
    entries without a dict ``'value'``, values without ``'start'``/``'end'``,
    and values whose ``'labels'`` list is missing or empty.
    """
    text = item['data']['text']
    annotations = []

    for ann in item['annotations']:
        for result in ann['result']:
            value = result.get('value')
            if not isinstance(value, dict):
                continue

            # Only span-style results carry offsets and labels; anything else
            # in the same result list has nothing to contribute here.
            labels = value.get('labels')
            if not labels or 'start' not in value or 'end' not in value:
                continue

            start = value['start']
            end = value['end']
            annotations.append({
                'start': start,
                'end': end,
                'label': labels[0],
                'text': text[start:end]
            })

    return annotations, text


def create_ner_sequence(text, annotations, target_labels, tokenizer):
    """Tokenize ``text`` and tag every token in B-/I-/O form.

    Spans whose label is in ``target_labels`` are tokenized with
    ``tokenizer.tokenize`` and tagged ``B-<label>`` for the first sub-token and
    ``I-<label>`` for the remaining ones. All other text is tokenized one
    whitespace-delimited chunk at a time (a chunk also ends where a target
    span begins) and tagged ``'O'``.

    Args:
        text: Full document text that the annotation offsets refer to.
        annotations: Iterable of dicts with ``'start'`` and ``'end'``
            (character offsets into ``text``, end exclusive) and ``'label'``.
            Any other keys are ignored; the span text is always taken from
            ``text`` itself.
        target_labels: Container of labels to tag; spans with other labels are
            treated as un-annotated text.
        tokenizer: Object exposing ``tokenize(str)`` that returns a list of
            token strings.

    Returns:
        tuple: ``(tokens, labels)`` - two lists of equal length.

    Overlapping target spans: the span found at the current cursor position is
    emitted from the cursor up to its own end. A span reached while the cursor
    is already inside it (because another span covered its beginning) only
    contributes the part that has not been emitted yet, so no character is
    tokenized twice. A span lying completely inside an already emitted span
    produces no tokens of its own.
    """
    # Character offset -> annotation covering it (target labels only). Spans
    # are visited in order of their start, so later spans overwrite earlier
    # ones on shared offsets.
    annotation_map = {}
    for ann in sorted(annotations, key=lambda a: a['start']):
        if ann['label'] in target_labels:
            for pos in range(ann['start'], ann['end']):
                annotation_map[pos] = ann

    tokens = []
    labels = []
    text_len = len(text)
    i = 0

    while i < text_len:
        if text[i].isspace():
            i += 1
            continue

        current_ann = annotation_map.get(i)
        if current_ann is not None:
            end = current_ann['end']
            # Tokenize only what lies ahead of the cursor. Re-using the whole
            # annotation text here would repeat characters that were already
            # emitted when the cursor enters the span past its start.
            segment = text[i:end]
            # Fall back to the raw segment if the tokenizer returns nothing.
            sub_tokens = tokenizer.tokenize(segment) or [segment]

            label = current_ann['label']
            tokens.extend(sub_tokens)
            labels.append(f'B-{label}')
            labels.extend([f'I-{label}'] * (len(sub_tokens) - 1))

            i = end
        else:
            # Un-annotated text: take the chunk up to the next whitespace or
            # the next target span. text[i] is neither, so the chunk is never
            # empty and the cursor always advances.
            j = i
            while j < text_len and j not in annotation_map and not text[j].isspace():
                j += 1

            chunk = text[i:j]
            sub_tokens = tokenizer.tokenize(chunk) or [chunk]
            tokens.extend(sub_tokens)
            labels.extend(['O'] * len(sub_tokens))

            i = j

    return tokens, labels

def create_dataset_for_group(data, group_labels, group_name):
    """Build, save and summarize the NER dataset for one entity group.

    Every item of ``data`` is converted into a record with the keys
    ``'tokens'`` and ``'ner_tags'`` using the module-level ``tokenizer``;
    items that yield no tokens are left out. The records are written as JSON
    Lines to ``<group_name>_dataset.jsonl`` (a relative path), and the number
    of occurrences of each tag is printed in sorted tag order.

    Args:
        data: Iterable of annotated items accepted by
            ``extract_annotations_with_text``.
        group_labels: Labels to tag for this group.
        group_name: Prefix of the output file name, also used in the printed
            messages.

    Returns:
        tuple: ``(dataset, label_counts)`` - the list of records and a
        ``defaultdict(int)`` mapping each tag to its count.
    """
    dataset = []
    for item in data:
        annotations, text = extract_annotations_with_text(item)
        tokens, labels = create_ner_sequence(text, annotations, group_labels, tokenizer)
        if not tokens:
            continue
        dataset.append({'tokens': tokens, 'ner_tags': labels})

    # One JSON object per line, non-ASCII characters written as-is.
    output_file = f'{group_name}_dataset.jsonl'
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in dataset
        )

    print(f"Создан датасет {output_file} с {len(dataset)} примерами")

    # Tag frequencies over the whole dataset, counted in order of appearance.
    label_counts = defaultdict(int)
    for record in dataset:
        for tag in record['ner_tags']:
            label_counts[tag] += 1

    print(f"Статистика меток для {group_name}:")
    for label in sorted(label_counts):
        print(f"   {label}: {label_counts[label]}")

    return dataset, label_counts

# Build one dataset per entity group; each call also writes
# '<name>_dataset.jsonl' and prints the tag statistics for that group.
dataset1, stat_1 = create_dataset_for_group(data, GROUP_1, "group1")
dataset2, stat_2 = create_dataset_for_group(data, GROUP_2, "group2") 
dataset3, stat_3 = create_dataset_for_group(data, GROUP_3, "group3")
# Per-group tag counts in group order; pickled by a later cell.
statistics = [stat_1, stat_2, stat_3]
    

Создан датасет group1_dataset.jsonl с 307 примерами
Статистика меток для group1:
   B-DEGREE: 559
   B-LINKS: 1638
   B-LOCATION: 1599
   B-METRICS: 425
   B-POSITIONS: 1826
   B-TIME: 3783
   I-DEGREE: 1481
   I-LINKS: 22789
   I-LOCATION: 1117
   I-METRICS: 1827
   I-POSITIONS: 8522
   I-TIME: 13070
   O: 310168
Создан датасет group2_dataset.jsonl с 307 примерами
Статистика меток для group2:
   B-COMPANIES: 2105
   B-NAME: 437
   B-TECHNOLOGIES: 12007
   I-COMPANIES: 10251
   I-NAME: 1787
   I-TECHNOLOGIES: 19284
   O: 322924
Создан датасет group3_dataset.jsonl с 307 примерами
Статистика меток для group3:
   B-ACHIEVEMENTS: 1303
   B-CONTACTS: 199
   B-EDUCATION: 1151
   B-PROJECTS: 1051
   B-RESPONSIBILITIES: 5793
   B-SKILLS: 1194
   I-ACHIEVEMENTS: 31530
   I-CONTACTS: 2828
   I-EDUCATION: 14600
   I-PROJECTS: 25200
   I-RESPONSIBILITIES: 103769
   I-SKILLS: 14284
   O: 165902


In [6]:
# Save the per-group tag statistics to a pickle file next to the source data.
import pickle
with open(FOLDER_PATH + '307_stat.pkl', 'wb') as f:
    pickle.dump(statistics, f)
    

In [8]:
import pickle
import pandas as pd
# Load the tag statistics stored for the two dataset sizes.
# NOTE(review): '150_stat.pkl' is not written by any cell visible here -
# presumably produced by an earlier run on a 150-resume file; confirm it exists
# before a fresh Restart & Run All.
# pickle.load can execute arbitrary code while unpickling, so only load files
# that come from a trusted source.
with open(FOLDER_PATH + '150_stat.pkl', 'rb') as f_150:
    stat_150 = pickle.load(f_150)
with open(FOLDER_PATH + '307_stat.pkl', 'rb') as f_307:
    stat_307 = pickle.load(f_307)
def stat_to_pd(stat_data):
    """Flatten per-group tag counts into a single pandas Series.

    Args:
        stat_data: Iterable of mappings ``tag -> count``, one per entity group.

    Returns:
        pandas.Series: the counts of all groups concatenated in the given
        order and indexed by tag. A tag present in several groups appears once
        per group in the index.
    """
    return pd.concat([pd.Series(group_stat) for group_stat in stat_data])
    
# Turn both statistics into Series and put them side by side, one column per dataset.
# NOTE(review): the printed statistics show 'O' in every group, so the index
# contains it more than once; presumably the concat and the subtraction below
# only work while both Series carry identical indexes - confirm.
stat_150_pd = stat_to_pd(stat_150)
stat_307_pd = stat_to_pd(stat_307)
total_stat_pd = pd.concat([stat_150_pd, stat_307_pd], axis = 1)
total_stat_pd.columns = ['150_resumes', '307_resumes']
# Absolute change in tag counts between the two datasets.
difference = total_stat_pd['307_resumes'] - total_stat_pd['150_resumes']

# Change relative to the 150-resume counts, formatted as a percentage string
# rounded to two decimals.
relative_difference = difference / total_stat_pd['150_resumes']
relative_difference = relative_difference.apply(lambda x: str(round(x*100, 2)) + '%')
# Append both difference columns and name all four columns for the printout.
total_stat_pd = pd.concat([total_stat_pd, difference, relative_difference], axis = 1)
total_stat_pd.columns = ['150_resumes', '307_resumes', 'difference', 'relative difference']
print(total_stat_pd)

                    150_resumes  307_resumes  difference relative difference
O                        151246       310168      158922             105.08%
B-TIME                     1759         3783        2024             115.07%
I-TIME                     5771        13070        7299             126.48%
B-LINKS                     787         1638         851             108.13%
I-LINKS                   12599        22789       10190              80.88%
B-POSITIONS                 850         1826         976             114.82%
I-POSITIONS                3873         8522        4649             120.04%
B-DEGREE                    255          559         304             119.22%
I-DEGREE                    680         1481         801             117.79%
B-LOCATION                  720         1599         879             122.08%
I-LOCATION                  569         1117         548              96.31%
B-METRICS                   201          425         224             111.44%