In [9]:
import warnings 
# Install a global "ignore" filter so that no Python warnings are shown in the
# cell outputs of this notebook.
warnings.filterwarnings('ignore')

In [10]:
import pandas as pd

# Directory that holds the three dataset splits. Kept in one constant so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "/kaggle/input/datasetner"

# Each split is stored as JSON Lines (one JSON record per line), hence lines=True.
train_data = pd.read_json(f"{DATA_DIR}/train.jsonl", lines=True)
test_data = pd.read_json(f"{DATA_DIR}/test.jsonl", lines=True)
dev_data = pd.read_json(f"{DATA_DIR}/dev.jsonl", lines=True)

In [11]:
# Preview the first rows of the training split (columns: ners, sentences, id).
train_data.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


In [12]:
# Normalise the text column name to "sentences", the name the rest of the
# notebook reads. NOTE(review): "senences" is presumably a misspelling present
# in the raw file header -- confirm against the dataset.
test_data = test_data.rename(columns={"senences": "sentences"})
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,sentences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


In [13]:
# Normalise the text column name to "sentences", the name the rest of the
# notebook reads. NOTE(review): "senences" is presumably a misspelling present
# in the raw file header -- confirm against the dataset.
dev_data = dev_data.rename(columns={"senences": "sentences"})
# Preview the first rows of the dev split.
dev_data.head()

Unnamed: 0,sentences,id
0,Генерал Д.Петреус назначен на пост главы ЦРУ.\...,519
1,Подозреваемые в нападении на Charlie Hebdo зах...,520
2,Скончалась Джанет Рено — первая женщина-генпро...,521
3,Школьник из Иванова получил «Золотой крест» за...,522
4,Врачи установили причину смерти Сергея Доренко...,523


In [14]:
from datasets import load_dataset

# Pull the entity type names: the 'type' column of the 'ent_types' split in the
# 'ent_types' configuration of the MalakhovIlya/RuNNE dataset.
ent_types = load_dataset('MalakhovIlya/RuNNE', 'ent_types')['ent_types']['type']

# Display the available labels.
list(ent_types)

['AGE',
 'AWARD',
 'CITY',
 'COUNTRY',
 'CRIME',
 'DATE',
 'DISEASE',
 'DISTRICT',
 'EVENT',
 'FACILITY',
 'FAMILY',
 'IDEOLOGY',
 'LANGUAGE',
 'LAW',
 'LOCATION',
 'MONEY',
 'NATIONALITY',
 'NUMBER',
 'ORDINAL',
 'ORGANIZATION',
 'PENALTY',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'PROFESSION',
 'RELIGION',
 'STATE_OR_PROVINCE',
 'TIME',
 'WORK_OF_ART']

# spaCy solution

In [15]:
def sort_entities(entities):
    """
    Return the entities ordered from the longest span to the shortest.

    The span length is measured as ``end_index - start_index``. The sort is
    stable, so entities of equal length keep their original relative order.
    The input is not modified.

    Args:
        entities (iterable): Entities given as (start_index, end_index, label)
                             sequences.

    Returns:
        list: A new list holding the same entity objects, longest first.
    """
    def span_length(entity):
        return entity[1] - entity[0]

    ordered = list(entities)
    ordered.sort(key=span_length, reverse=True)
    return ordered

def remove_overlapping_entities(entities):
    """
    Drop every entity that shares at least one character position with an
    entity kept earlier in the list.

    Entities are visited in the order given, so earlier entries win. When the
    input has been ordered longest-first, the longest of any overlapping group
    is the one that survives.

    Args:
        entities (iterable): Entities as (start_index, end_index, label), where
                             end_index is inclusive (the last covered position).

    Returns:
        list of tuples: The kept entities as (start_index, end_index + 1, label).
                        The end is shifted by one, turning the inclusive end
                        into an exclusive one.
    """
    occupied = set()  # character positions already claimed by a kept entity
    kept = []

    for start, end, label in entities:
        positions = range(start, end + 1)  # inclusive span of this entity
        if all(position not in occupied for position in positions):
            occupied.update(positions)
            # Emit an end-exclusive span: one past the last covered position.
            kept.append((start, end + 1, label))

    return kept


# Build the spaCy training pairs: (text, {"entities": [...]}) for every training
# row, keeping only non-overlapping entities with the longest spans preferred.
prepared_data = [
    (text, {"entities": remove_overlapping_entities(sort_entities(spans))})
    for text, spans in zip(train_data['sentences'], train_data['ners'])
]

In [16]:
# Download the ru_core_news_lg spaCy package (quiet mode) so it can be imported below.
!python3 -m spacy download ru_core_news_lg -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')


In [17]:
import random
import spacy
from spacy.training import Example
import ru_core_news_lg

# Fix the random seeds (Python `random`, NumPy and, when present, the GPU
# backends) so that shuffling and weight initialisation are repeatable.
SEED = 42
spacy.util.fix_random_seed(SEED)

nlp = ru_core_news_lg.load()
# NOTE(review): initialize() prepares a pipeline for training from scratch; the
# spaCy docs describe resume_training() as the call for continuing from the
# weights of a loaded pipeline. Confirm that re-initialising the pretrained
# ru_core_news_lg components is intended here.
optimizer = nlp.initialize()
num_iterations = 10

for iteration in range(num_iterations):
    random.shuffle(prepared_data)  # new example order every epoch (in place)

    losses = {}  # accumulated by nlp.update over the whole epoch
    for text, annotations in prepared_data:
        # One update step per document: tokenise the raw text and pair it with
        # its gold entity offsets.
        doc = nlp.make_doc(text)
        training_example = Example.from_dict(doc, annotations)
        nlp.update([training_example], sgd=optimizer, losses=losses)

    print(f'Epoch {iteration + 1}/{num_iterations}, Loss: {losses.get("ner", 0):.4f}')

[2024-04-28 13:23:55,237] [INFO] Created vocabulary
[2024-04-28 13:23:55,239] [INFO] Finished initializing nlp object


Epoch 1/10, Loss: 34955.8723
Epoch 2/10, Loss: 20337.4654
Epoch 3/10, Loss: 15832.1105
Epoch 4/10, Loss: 12428.6963
Epoch 5/10, Loss: 9994.4448
Epoch 6/10, Loss: 8044.7176
Epoch 7/10, Loss: 6832.7917
Epoch 8/10, Loss: 5691.2122
Epoch 9/10, Loss: 4779.8625
Epoch 10/10, Loss: 4520.9024


In [18]:
import json

# Run the trained pipeline on every test document and collect the predicted
# entities whose label is one of the known entity types.
extracted_entities = []
for index, record in test_data.iterrows():
    document = nlp(record['sentences'])

    entities = []
    for ent in document.ents:
        if ent.label_ not in ent_types:
            continue
        # spaCy's end_char is exclusive; subtract one to store an inclusive end.
        entities.append({'start': ent.start_char, 'end': ent.end_char - 1, 'type': ent.label_})

    extracted_entities.append({'id': record['id'], 'entities': entities})

# Serialise one JSON object per line.
# NOTE(review): records here use the key 'entities', while the dictionary
# solution later in the notebook writes 'ners' to the same "test.jsonl" path
# and therefore overwrites this file -- confirm which format is expected.
with open("test.jsonl", "w", encoding="utf-8") as file:
    for data in extracted_entities:
        print(json.dumps(data), file=file)

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


# Dictionary solution

In [19]:
from collections import defaultdict

# Knowledge base: entity surface string -> {label -> number of occurrences in
# the training annotations}.
entity_label_count = defaultdict(lambda: defaultdict(int))

for index, row in train_data.iterrows():
    text = row['sentences']
    for entity in row['ners']:
        first = int(entity[0])
        last = int(entity[1])
        label = entity[2]
        # The annotated end offset is inclusive, so slice one position past it.
        surface = text[first:last + 1]
        entity_label_count[surface][label] += 1

In [20]:
# Install razdel (quiet mode); its tokenize function is imported in the next cell.
!pip install razdel -q

In [21]:
from collections import defaultdict
from razdel import tokenize

def get_most_common_label(label_counts):
    """
    Return the label with the highest count.

    Args:
        label_counts (dict): Mapping of label -> count. Must not be empty.

    Returns:
        The label whose count is largest; on a tie, the label that comes first
        in the mapping's iteration order.

    Raises:
        ValueError: If ``label_counts`` is empty.
    """
    return max(label_counts, key=label_counts.get)

def get_ner_annotations(df, knowledge_base):
    """
    Label single tokens of each document by exact lookup in a knowledge base.

    Each text is tokenized. Any token whose exact text is a key of
    ``knowledge_base`` becomes an entity carrying that key's most frequent label.

    Args:
        df (pd.DataFrame): Frame with the texts in column 'sentences' and a
                           document identifier in column 'id'.
        knowledge_base (dict): Maps a token string to a dict of label -> count.

    Returns:
        list of dicts: One ``{'id': ..., 'ners': [...]}`` per row, where 'ners'
                       lists (start, end, label) tuples in token order and
                       ``end`` is the inclusive index of the token's last
                       character.
    """
    annotated_documents = []

    for _, row in df.iterrows():
        text = row['sentences']
        matches = []

        for tok in tokenize(text):
            first, last = tok.start, tok.stop - 1  # inclusive character span
            surface = text[first:last + 1]

            if surface not in knowledge_base:
                continue

            label = get_most_common_label(knowledge_base[surface])
            matches.append((first, last, label))

        annotated_documents.append({'id': row['id'], 'ners': matches})

    return annotated_documents

def combine_entities(ner_data, max_gap=4):
    """
    Merge consecutive entities that carry the same label and lie close together.

    Within each document the entities are scanned in the given order. An entity
    is folded into the previous one when both have the same label and
    ``start - previous_end <= max_gap``; the merged entity spans from the first
    start to the latest end. Input records are not modified.

    Args:
        ner_data (list of dicts): Records of the form
            ``{'id': ..., 'ners': [(start, end, label), ...]}``.
        max_gap (int): Largest allowed difference between an entity's start and
            the previous entity's end for the two to be merged. Defaults to 4,
            the distance this function used before it became configurable.

    Returns:
        list of dicts: Records with the same 'id' values and order as the input,
                       whose 'ners' lists hold the merged (start, end, label)
                       tuples.
    """
    combined_data = []

    for data in ner_data:
        combined_entities = []
        current_entity = None  # entity currently being grown, if any

        for start, end, label in data['ners']:
            mergeable = (
                current_entity is not None
                and label == current_entity[2]
                and start - current_entity[1] <= max_gap
            )
            if mergeable:
                # Extend the running entity up to the end of this one.
                current_entity = (current_entity[0], end, label)
            else:
                if current_entity is not None:
                    combined_entities.append(current_entity)
                current_entity = (start, end, label)

        # Flush the last entity that was still being grown.
        if current_entity is not None:
            combined_entities.append(current_entity)

        combined_data.append({'id': data['id'], 'ners': combined_entities})

    return combined_data

# Token-level dictionary lookup on the test split, using the label counts
# gathered from the training annotations.
ner_annotations = get_ner_annotations(test_data, entity_label_count)
# Merge neighbouring same-label hits into longer entities.
processed_ners = combine_entities(ner_annotations)

In [22]:
# Show the merged annotations of the first test document only.
processed_ners[:1]

[{'id': 584,
  'ners': [(30, 34, 'NUMBER'),
   (40, 45, 'PENALTY'),
   (128, 134, 'DATE'),
   (137, 137, 'NUMBER'),
   (145, 147, 'EVENT'),
   (149, 156, 'STATE_OR_PROVINCE'),
   (158, 167, 'EVENT'),
   (298, 302, 'NUMBER'),
   (320, 329, 'PENALTY'),
   (350, 358, 'AGE'),
   (382, 389, 'EVENT'),
   (403, 414, 'NUMBER'),
   (472, 475, 'DATE'),
   (480, 488, 'DATE'),
   (534, 537, 'DATE'),
   (567, 575, 'NUMBER')]}]

In [23]:
# Write the dictionary-based predictions as JSON Lines, one record per line.
# ensure_ascii=False keeps non-ASCII characters unescaped in the output.
with open("test.jsonl", 'w', encoding='utf-8') as file:
    for entry in processed_ners:
        file.write(f"{json.dumps(entry, ensure_ascii=False)}\n")