In [1]:
import pandas as pd
import numpy as np

In [2]:
import json

def _whitespace_token_spans(text):
    """Return (start, end) character offsets of each token of ``text.split()``."""
    spans = []
    token_start = None
    for index, char in enumerate(text):
        if char.isspace():
            # Any whitespace (space, tab, newline, ...) closes the open token,
            # matching how str.split() with no arguments separates tokens.
            if token_start is not None:
                spans.append((token_start, index))
                token_start = None
        elif token_start is None:
            token_start = index
    if token_start is not None:
        spans.append((token_start, len(text)))
    return spans


def convert_to_bio_encoding(annotations, text):
    """Tag every whitespace-delimited token of ``text`` with a BIO label.

    Tokens are the ones produced by ``text.split()``, and the returned list
    always holds exactly one label per token. A token belongs to an entity
    when it shares at least one character with the annotated span, so a span
    that ends inside a token (for example before trailing punctuation) or
    starts inside one still tags that whole token once.

    Args:
        annotations: Iterable of dicts. Each one provides
            ``annotation['value']['start']`` and ``['end']`` (character
            offsets into ``text``, end exclusive) and
            ``annotation['value']['labels']``, whose first item is used as
            the entity label. The order of the annotations does not matter.
        text: The string the offsets refer to.

    Returns:
        list[str]: ``'O'`` for tokens outside every span, ``'B_<label>'`` for
        the first token of a span and ``'I_<label>'`` for its remaining
        tokens.
    """
    token_spans = _whitespace_token_spans(text)
    bio_encoding = ['O'] * len(token_spans)

    # Process spans left to right so the result does not depend on input order.
    ordered = sorted(annotations, key=lambda annotation: annotation['value']['start'])

    for annotation in ordered:
        value = annotation['value']
        start = value['start']
        end = value['end']
        label = value['labels'][0]

        if end <= start:
            # An empty span covers no characters, hence no tokens.
            continue

        prefix = 'B_'
        for token_index, (token_start, token_end) in enumerate(token_spans):
            if token_end <= start:
                continue  # token lies entirely before the span
            if token_start >= end:
                break  # token (and all later ones) lie after the span
            if bio_encoding[token_index] != 'O':
                # Token already claimed by an earlier span sharing it; keep
                # the first tag so each token still gets exactly one label.
                continue
            bio_encoding[token_index] = prefix + label
            prefix = 'I_'

    return bio_encoding

def preprocess_data(original_data):
    """Build a mapping from entry id to its text and BIO labels.

    Args:
        original_data: Iterable of dicts. For each one this reads ``['id']``,
            ``['annotations'][0]['result']`` (only the first annotation set
            is used) and ``['data']['text']``.

    Returns:
        dict: ``{entry id: {'text': <text>, 'labels': <list of BIO tags>}}``,
        where the tags come from ``convert_to_bio_encoding``. If an id occurs
        more than once, the last entry with that id wins.
    """
    records = {}

    for record in original_data:
        key = record['id']
        spans = record['annotations'][0]['result']
        sentence = record['data']['text']

        records[key] = dict(
            text=sentence,
            labels=convert_to_bio_encoding(spans, sentence),
        )

    return records


In [3]:
# Relative paths to the Task 1 NER judgement JSON files (train and test).
t1_train_data, t1_test_data = (
    "./Task1_data/NER_TRAIN_JUDGEMENT.json",
    "./Task1_data/NER_TEST_JUDGEMENT.json",
)

In [4]:
from sklearn.model_selection import train_test_split

# Load the raw train and test JSON files. The encoding is passed explicitly so
# reading does not depend on the platform's default locale encoding.
with open(t1_train_data, 'r', encoding='utf-8') as file:
    tr_data = json.load(file)
with open(t1_test_data, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Hold out 15% of the training entries for validation; the fixed random_state
# keeps the split identical across runs.
train_data, val_data = train_test_split(tr_data, test_size=0.15, random_state=21)

# Convert each split to {entry id: {'text': ..., 'labels': [...]}}.
processed_train_data = preprocess_data(train_data)
processed_val_data = preprocess_data(val_data)
processed_test_data = preprocess_data(test_data)

# Write every processed split to its own JSON file.
for out_path, processed_split in (
    ('./Task1_data/NER_train.json', processed_train_data),
    ('./Task1_data/NER_val.json', processed_val_data),
    ('./Task1_data/NER_test.json', processed_test_data),
):
    with open(out_path, 'w', encoding='utf-8') as file:
        json.dump(processed_split, file, indent=2)