In [5]:
import json
from sklearn.model_selection import train_test_split

# Parse NER_TRAIN_JUDGEMENT.json (read as UTF-8) into `dataset`
with open('NER_TRAIN_JUDGEMENT.json', mode='r', encoding='utf-8') as f:
    dataset = json.loads(f.read())

In [6]:
# Randomly split the dataset 85:15 into training and validation sets.
# No `stratify` argument is passed, so this is a plain random split (not stratified);
# the fixed random_state makes the split reproducible.
train_data, val_data = train_test_split(
    dataset,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Write each split to its own UTF-8 JSON file (non-ASCII kept as-is, 4-space indent)
for out_path, split in (
    ('train_dataset.json', train_data),
    ('val_dataset.json', val_data),
):
    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(split, out_file, ensure_ascii=False, indent=4)

In [8]:
import json
from collections import defaultdict

#Assign BIO labels
def assign_bio_labels(text, annotations):
    """Tag the space-separated tokens of `text` with BIO labels.

    The text is cut at every single space character (consecutive spaces
    therefore produce empty tokens), and each token remembers its first and
    last character index. Every token starts as 'O'; tokens covered by an
    annotated span become 'B_<label>' (first token) or 'I_<label>'
    (following tokens).

    Args:
        text: The raw sentence.
        annotations: Iterable of dicts with a 'result' list; each result has
            a 'value' dict providing 'start', 'end' and 'labels' (only the
            first label is used).

    Returns:
        (bio_labels, flag): one label per token, and flag == 1 when a span
        could not be aligned with the token boundaries (a second 'B' match,
        or an 'I' match with no preceding 'B'), otherwise 0.
    """
    # Build [first_index, token_text, last_index] triples, e.g. [0, 'The', 2], [4, 'quick', 8]
    tokens = []
    offset = 0
    for piece in text.split(' '):
        tokens.append([offset, piece, offset + len(piece) - 1])
        offset += len(piece) + 1  # skip the token and the space that followed it

    bio_labels = ['O' for _ in tokens]
    flag = 0

    for annotation in annotations:
        for result in annotation['result']:
            value = result['value']
            span_start = value['start']
            span_end = value['end']
            label = value['labels'][0]

            tagged = 0  # number of tokens labelled for this span so far
            for idx, (tok_start, _tok_text, tok_end) in enumerate(tokens):
                # A token opens the span if it starts at the span start (or one
                # character before it) and ends no later than span_end + 1.
                opens_span = (
                    tok_start in (span_start - 1, span_start)
                    and tok_end <= span_end + 1
                )
                if opens_span:
                    if tagged != 0:
                        # A second opening token for the same span: misaligned
                        flag = 1
                        break
                    bio_labels[idx] = 'B_' + label
                    tagged += 1
                # NOTE(review): if `end` is an exclusive offset, a token starting
                # at span_end + 1 is the one right after the entity and is still
                # tagged 'I_' here — confirm the offset convention.
                elif span_start - 1 < tok_start <= span_end + 1:
                    if tagged < 1:
                        # Continuation token without an opening token: misaligned
                        flag = 1
                        break
                    bio_labels[idx] = 'I_' + label
                    tagged += 1

            # Stop with this annotation's remaining results once a mismatch is recorded
            if flag == 1:
                break
    return bio_labels, flag

############################################################
# The train, validation and test splits go through the same steps, so one
# helper does the work and is called once per split.
def convert_split_to_bio(input_path, output_path):
    """Label every instance of one JSON split and save the result.

    Args:
        input_path: Path of a UTF-8 JSON file expected to hold a sequence of
            instances; each instance is read through its 'id',
            'data' -> 'text' and 'annotations' keys.
        output_path: Path of the UTF-8 JSON file to write.

    Returns:
        (instances, output): the loaded instances and a defaultdict mapping
        each kept instance id to {'text': ..., 'labels': ...}.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        instances = json.load(file)

    output = defaultdict(dict)
    for instance in instances:
        case_id = instance['id']
        text = instance['data']['text']
        annotations = instance['annotations']
        bio_labels, flag = assign_bio_labels(text, annotations)

        # flag != 0 means assign_bio_labels could not align an annotation with
        # the token boundaries; such instances are left out of the output.
        if flag == 0:
            output[case_id]['text'] = text
            output[case_id]['labels'] = bio_labels

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)

    return instances, output


############################################################
# Train split
train_data, train_output = convert_split_to_bio('train_dataset.json', 'train_bio_labels.json')

############################################################
# Validation split
val_data, val_output = convert_split_to_bio('val_dataset.json', 'val_bio_labels.json')

############################################################
# Test split
test_data, test_output = convert_split_to_bio('NER_TEST_JUDGEMENT.json', 'test_bio_labels.json')
