In [1]:
# !pip install razdel -q

In [2]:
# import dependencies

import pandas as pd
from collections import defaultdict
from razdel import tokenize
import json
import warnings

In [3]:
# Silence warnings for the rest of the session. No category, message or
# module filter is given, so the 'ignore' action applies to every warning.

warnings.filterwarnings('ignore')

In [4]:
# Load the train and test splits (JSON Lines files) as pandas DataFrames.

train_data = pd.read_json("../data/data_train.jsonl", lines=True)

# NOTE(review): presumably the test file spells its text column "senences";
# the rename aligns it with the "sentences" column used below. Verify against
# the data file (rename is a no-op when the column is absent).
test_data = pd.read_json("../data/data_test.jsonl", lines=True).rename(
    columns={"senences": "sentences"}
)

In [5]:
# Build a lookup: annotated span text -> {label -> number of occurrences}
# over every entity annotation in the training data.

label_count = defaultdict(lambda: defaultdict(int))

for sentence, spans in zip(train_data['sentences'], train_data['ners']):
    for first, last, tag in spans:
        # Entity offsets are treated as inclusive, hence the +1 on the slice.
        surface = sentence[first:last + 1]
        label_count[surface][tag] += 1

In [6]:
# Annotate the test data: every token whose text was seen as an entity span
# in training receives the label most frequently attached to that text.

annotations = []

for _, record in test_data.iterrows():
    sentence = record['sentences']
    found = []

    for piece in tokenize(sentence):
        # Convert the tokenizer's exclusive stop offset to an inclusive end.
        first = piece.start
        last = piece.stop - 1
        surface = sentence[first:last + 1]

        # Membership test first, so unseen tokens never create empty
        # entries in the defaultdict.
        if surface not in label_count:
            continue

        counts = label_count[surface]
        # On ties the label encountered first in training wins.
        best_label = max(counts, key=counts.get)
        found.append((first, last, best_label))

    annotations.append({'id': record['id'], 'ners': found})

In [7]:
# merge adjacent named entities in the annotations.


def merge_adjacent_entities(entities, max_gap=4):
    """Merge neighbouring entity spans that share a label.

    Args:
        entities: iterable of ``(start, end, label)`` tuples in document
            order, with inclusive ``end`` offsets.
        max_gap: a span is merged into the previous one when
            ``start - previous_end <= max_gap`` and both carry the same label.

    Returns:
        A list of ``(start, end, label)`` tuples. A merged span keeps the
        start of its first member and the end of its last member.
    """
    merged = []
    current = None  # span still open for extension

    for start, end, label in entities:
        extends_current = (
            current is not None
            and start - current[1] <= max_gap
            and label == current[2]
        )
        if extends_current:
            current = (current[0], end, label)
        else:
            if current is not None:
                merged.append(current)
            current = (start, end, label)

    # Flush the span that is still open once the input is exhausted;
    # without this the last entity of every document would be lost.
    if current is not None:
        merged.append(current)

    return merged


ners = []

for data in annotations:
    ners.append({'id': data['id'], 'ners': merge_adjacent_entities(data['ners'])})

In [None]:
# Write the predictions as JSON Lines: one serialized record per line.

with open("test.jsonl", 'w') as out_file:
    for record in ners:
        out_file.write(json.dumps(record))
        out_file.write('\n')