# Imports

In [56]:
import json
import re
from collections import Counter

# Functions

In [57]:
def read_file(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (for example an empty last line)
        # are skipped; json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# create pairs of words and their corresponding ners
def count_ners(train):
    """Collect, for every annotated entity string, the labels it was given.

    Args:
        train: Iterable of records. Each record has a "ners" entry holding
            (start, end, label) triples and a "sentences" entry that is
            sliced with those offsets; ``end`` is inclusive.

    Returns:
        dict: Entity string -> list of labels, one list item per annotation,
        in the order the annotations were encountered.
    """
    labels_by_surface = {}
    for record in train:
        for first, last, label in record["ners"]:
            # Joining the slice yields the same key whether "sentences" is a
            # string or a list of string pieces.
            surface = "".join(record["sentences"][first:last + 1])
            if surface not in labels_by_surface:
                labels_by_surface[surface] = []
            labels_by_surface[surface].append(label)
    return labels_by_surface

# define the most common ner
def most_common_ner(ners):
    """Reduce each word's label list to its single most frequent label.

    The input mapping is left untouched and a new dict is returned, so the
    function can safely be called again on the same input (for example when
    a notebook cell is re-run).

    Args:
        ners: Mapping of word -> list of labels, as produced by
            ``count_ners``.

    Returns:
        dict: Word -> ``(label, count)`` tuple for the most frequent label.
        When several labels share the top count, the one encountered first
        in the list wins (``Counter.most_common`` ordering). Words whose
        label list is empty are omitted, since they have no most common
        label.
    """
    return {
        word: Counter(labels).most_common(1)[0]
        for word, labels in ners.items()
        if labels
    }

def tokenize(text):
    """Split text into word tokens and newline tokens.

    A token is either a maximal run of word characters (as matched by the
    regex word class) or a single newline character. Every other character
    is dropped.

    Args:
        text: The string to split.

    Returns:
        list: The tokens in the order they occur in ``text``.
    """
    token_pattern = re.compile(r"\w+|\n")
    return [match.group() for match in token_pattern.finditer(text)]

# Predict NERs using the word-to-label dictionary

In [58]:
# Build the lookup table from the training data: every annotated entity
# string mapped to its most frequent label (stored as a (label, count) tuple).
train = read_file(
    "/content/drive/MyDrive/Colab Notebooks/Assignment 3/test/train.jsonl"
)
ner_label_lists = count_ners(train)
ners = most_common_ner(ner_label_lists)

In [59]:
text = read_file("/content/drive/MyDrive/Colab Notebooks/Assignment 3/test/test.jsonl")


def _locate_known_tokens(sentence, ners):
    """Return (start, end, label) for every token of `sentence` found in `ners`.

    `end` is inclusive. Offsets are character positions in `sentence`.
    """
    spans = []
    cursor = 0
    for token in tokenize(sentence):
        # Search from the end of the previous token so that a repeated token
        # gets its own offset instead of the offset of its first occurrence.
        # Only characters that cannot be part of a token lie between two
        # consecutive tokens, so this always lands on the token itself.
        start = sentence.find(token, cursor)
        cursor = start + len(token)
        if token in ners:
            # ners[token] is a (label, count) pair; only the label is needed.
            spans.append((start, cursor - 1, ners[token][0]))
    return spans


def _merge_adjacent(spans):
    """Merge consecutive spans that share a label and are one character apart."""
    merged = []
    for start, end, ner in spans:
        previous = merged[-1] if merged else None
        # `end` is inclusive, so a token separated from the previous one by a
        # single character (e.g. a space) starts at the previous end + 2.
        if previous is not None and ner == previous[2] and start == previous[1] + 2:
            previous[1] = end
        else:
            merged.append([start, end, ner])
    return merged


# For each test entry, find every token that is a known entity string and
# record its inclusive character span together with the predicted label.
# NOTE(review): the key is spelled "senences" here while the training records
# are read through "sentences"; it is left unchanged on the assumption that it
# matches the test file's schema -- confirm.
tuples = [_locate_known_tokens(data["senences"], ners) for data in text]

# Join neighbouring tokens with the same label into a single entity span.
predictions = [_merge_adjacent(prediction) for prediction in tuples]

# One output record per test entry: its merged spans and its id.
result = [{"ners": merged, "id": data["id"]} for merged, data in zip(predictions, text)]

In [60]:
# Write the predictions as JSON Lines: one JSON object per line.
with open('/content/drive/MyDrive/Colab Notebooks/Assignment 3/output_dict/test.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in result)

# Keep a zipped copy of the predictions next to the JSONL file in the Drive
# folder (convenient folder structure in Colab). The archive entry keeps the
# full path (content/drive/MyDrive/...), as the "updating:" output shows.
!zip -r /content/drive/MyDrive/Colab\ Notebooks/Assignment\ 3/output_dict/test.zip /content/drive/MyDrive/Colab\ Notebooks/Assignment\ 3/output_dict/test.jsonl

updating: content/drive/MyDrive/Colab Notebooks/Assignment 3/output_dict/test.jsonl (deflated 79%)


In [61]:
# For submission: add test.jsonl from the current working directory to the
# archive named "test" there (zip appends the .zip suffix).
# NOTE(review): no visible cell writes test.jsonl to the working directory --
# the predictions above are written to the Drive output_dict folder. Confirm
# this zips the intended file.
!zip test test.jsonl

updating: test.jsonl (deflated 79%)
