In [5]:
import warnings 
# Silence every warning from here on: no category or module filter is given,
# so this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd

# Load both splits; lines=True makes pandas parse each file as JSON Lines
# (one JSON object per line) rather than as a single JSON document.
train_data = pd.read_json("/kaggle/input/datasetner/train.jsonl", lines=True)
test_data = pd.read_json("/kaggle/input/datasetner/test.jsonl", lines=True)

In [7]:
# Rename the misspelled 'senences' column to 'sentences', the column name the
# rest of the notebook reads the text from.
test_data = test_data.rename(columns={"senences": "sentences"})
test_data.head()

Unnamed: 0,sentences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


In [8]:
# Preview the training rows; each `ners` cell holds [start, end, label] triples.
train_data.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


In [9]:
from datasets import load_dataset

# Fetch the 'ent_types' configuration of the RuNNE dataset and keep only its
# 'type' column, i.e. the names of the entity labels.
ent_types = load_dataset('MalakhovIlya/RuNNE', 'ent_types')['ent_types']['type']
list(ent_types)

Downloading builder script:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/236 [00:00<?, ?B/s]

Generating ent_types split: 0 examples [00:00, ? examples/s]

['AGE',
 'AWARD',
 'CITY',
 'COUNTRY',
 'CRIME',
 'DATE',
 'DISEASE',
 'DISTRICT',
 'EVENT',
 'FACILITY',
 'FAMILY',
 'IDEOLOGY',
 'LANGUAGE',
 'LAW',
 'LOCATION',
 'MONEY',
 'NATIONALITY',
 'NUMBER',
 'ORDINAL',
 'ORGANIZATION',
 'PENALTY',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'PROFESSION',
 'RELIGION',
 'STATE_OR_PROVINCE',
 'TIME',
 'WORK_OF_ART']

# Dictionary solution

In [10]:
from collections import defaultdict

# Knowledge base: annotated surface string -> {label: how many times the
# training data tagged that exact string with that label}.
entity_label_count = defaultdict(lambda: defaultdict(int))

for _, row in train_data.iterrows():
    sentence = row['sentences']
    for span in row['ners']:
        first, last, label = int(span[0]), int(span[1]), span[2]
        # The slice runs to last + 1, i.e. the end offset is treated as inclusive.
        entity_label_count[sentence[first:last + 1]][label] += 1

In [11]:
# Use the %pip magic rather than a `!pip` shell call: %pip installs into the
# environment of the running kernel, whereas `!pip` runs whichever `pip` is
# first on the shell PATH.
%pip install -q razdel

In [12]:
from collections import defaultdict
from razdel import tokenize

def get_most_common_label(label_counts):
    """Return the label with the highest count in a ``{label: count}`` mapping.

    When several labels share the highest count, the one that comes first in
    the mapping's iteration order is returned. An empty mapping raises
    ``ValueError`` (from ``max``).
    """
    return max(label_counts, key=label_counts.get)

def get_ner_annotations(df, knowledge_base):
    """
    Tag every document in ``df`` by looking its tokens up in ``knowledge_base``.

    Each text is split with ``razdel.tokenize``; a token whose exact surface
    string is a key of ``knowledge_base`` is emitted as an entity carrying the
    label with the highest count for that string. Lookup is per single token,
    so keys that span several tokens are never matched here.

    Args:
        df (pd.DataFrame): Documents to annotate; the text is read from the
            'sentences' column and the identifier from the 'id' column.
        knowledge_base (dict): Maps a surface string to a ``{label: count}``
            dictionary.

    Returns:
        list of dicts: One ``{'id': ..., 'ners': [...]}`` entry per row, where
        'ners' is a list of ``(start, end, label)`` tuples and ``end`` is the
        offset of the token's last character (inclusive).
    """
    annotations = []

    for _, row in df.iterrows():
        text = row['sentences']
        found = []

        for tok in tokenize(text):
            # tok.stop is exclusive; store the inclusive end offset instead.
            first, last = tok.start, tok.stop - 1
            surface = text[first:last + 1]

            if surface not in knowledge_base:
                continue

            found.append((first, last, get_most_common_label(knowledge_base[surface])))

        annotations.append({'id': row['id'], 'ners': found})

    return annotations

def combine_entities(ner_data, max_gap=4):
    """
    Merge consecutive same-label entities that lie close together into one span.

    The entities of each document are scanned in the order given. An entity is
    folded into the span currently being built when it has the same label and
    its start offset is at most ``max_gap`` positions past that span's end
    offset; otherwise the current span is closed and a new one is started.

    Args:
        ner_data (list of dicts): One dictionary per document with the keys
            'id' and 'ners', where 'ners' is a sequence of
            (start, end, label) triples.
        max_gap (int): Largest value of ``start - previous_end`` for which two
            same-label entities are still merged. Defaults to 4, the value
            that used to be hard-coded.

    Returns:
        list of dicts: A new list with the same 'id' values, where 'ners'
        holds the merged entities as (start, end, label) tuples.
    """
    combined_data = []

    for data in ner_data:
        merged = []

        for start, end, label in data['ners']:
            if merged:
                prev_start, prev_end, prev_label = merged[-1]
                if label == prev_label and start - prev_end <= max_gap:
                    # Extend the span being built instead of opening a new one.
                    merged[-1] = (prev_start, end, label)
                    continue
            merged.append((start, end, label))

        combined_data.append({'id': data['id'], 'ners': merged})

    return combined_data

# Tag each test document by dictionary lookup, then fuse neighbouring hits
# that share a label into single spans.
ner_annotations = get_ner_annotations(test_data, entity_label_count)
processed_ners = combine_entities(ner_annotations)

In [13]:
# Inspect the merged predictions for the first test document only.
processed_ners[:1]

[{'id': 584,
  'ners': [(30, 34, 'NUMBER'),
   (40, 45, 'PENALTY'),
   (128, 134, 'DATE'),
   (137, 137, 'NUMBER'),
   (145, 147, 'EVENT'),
   (149, 156, 'STATE_OR_PROVINCE'),
   (158, 167, 'EVENT'),
   (298, 302, 'NUMBER'),
   (320, 329, 'PENALTY'),
   (350, 358, 'AGE'),
   (382, 389, 'EVENT'),
   (403, 414, 'NUMBER'),
   (472, 475, 'DATE'),
   (480, 488, 'DATE'),
   (534, 537, 'DATE'),
   (567, 575, 'NUMBER')]}]

In [14]:
import json

# Write the predictions as JSON Lines: one document per line, with non-ASCII
# characters left unescaped.
with open("test.jsonl", 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in processed_ners
    )