In [7]:
import json
from sklearn.model_selection import train_test_split

# Read the full annotated corpus from disk and parse it as JSON
with open('NER_TRAIN_JUDGEMENT.json', encoding='utf-8') as dataset_file:
    dataset = json.loads(dataset_file.read())

# print('Number of sentences in the dataset:', len(dataset))

In [8]:
# Split the dataset into training and validation sets with an 85:15 ratio.
# NOTE: no `stratify` argument is passed, so this is a plain random (shuffled)
# split, not a stratified one. random_state=42 pins the shuffle so the same
# split is produced on every run.
train_data, val_data = train_test_split(dataset, test_size=0.15, random_state=42)

In [9]:
# Persist each split to its own JSON file (UTF-8, non-ASCII kept as-is, 4-space indent)
for split_path, split_records in (
    ('train_dataset.json', train_data),
    ('val_dataset.json', val_data),
):
    with open(split_path, 'w', encoding='utf-8') as split_file:
        json.dump(split_records, split_file, ensure_ascii=False, indent=4)

# print("Training set size:", len(train_data))
# print("Validation set size:", len(val_data))

In [11]:
import json
from collections import defaultdict
# Function to tokenize text based on space
def tokenize_text(text):
    """Split ``text`` on single space characters, keeping character offsets.

    Returns a list of ``[start, token, end]`` triples where ``start`` is the
    index of the token's first character and ``end`` is the index of its last
    character (inclusive). Consecutive or trailing spaces yield empty tokens
    whose ``end`` is ``start - 1``; an empty ``text`` yields ``[[0, '', -1]]``.
    """
    spans = []
    offset = 0
    for piece in text.split(' '):
        stop = offset + len(piece)  # index of the separator (or end of text)
        spans.append([offset, piece, stop - 1])
        offset = stop + 1  # skip over the single separating space
    return spans

# Function to perform binary search to find token indices
def binary_search(tokens, target_index):
    """Locate the token whose start offset is the closest one at or before ``target_index``.

    ``tokens`` is a list of ``[start, token, end]`` triples ordered by ``start``.
    Returns the position of the token starting exactly at ``target_index`` if
    there is one; otherwise the position of the last token starting before it.
    Returns -1 when no token starts at or before ``target_index`` (including
    when ``tokens`` is empty).
    """
    left = 0
    right = len(tokens) - 1
    best = -1  # last position seen whose start is strictly below the target
    while left <= right:
        middle = (left + right) // 2
        token_start = tokens[middle][0]
        if token_start > target_index:
            right = middle - 1
            continue
        if token_start == target_index:
            return middle
        best = middle
        left = middle + 1
    return best


# Function to assign BIO labels
def assign_bio_labels(text, annotations):
    """Return one BIO tag per space-delimited token of ``text``.

    Args:
        text: The string that the annotation character offsets index into.
        annotations: Iterable of dicts, each holding a ``'result'`` list whose
            items provide ``value['start']``, ``value['end']`` and
            ``value['labels']`` (only the first label is used).
            NOTE(review): ``end`` is treated as an exclusive offset, as in
            Label Studio exports -- confirm against the dataset.

    Returns:
        A list with one entry per token from ``tokenize_text(text)``: ``'O'``
        for tokens outside every span, ``'B_<label>'`` for the first token of
        a span and ``'I_<label>'`` for the remaining tokens of that span.
        Later spans overwrite the tags written by earlier overlapping spans.
        Spans that are empty, lie outside the text, consist only of spaces,
        or have a negative start are ignored.
    """
    tokens = tokenize_text(text)
    bio_labels = ['O'] * len(tokens)
    text_length = len(text)
    for annotation in annotations:
        for result in annotation['result']:
            value = result['value']
            label = value['labels'][0]
            start = value['start']
            # Clamp so the boundary checks below never index past the text.
            end = min(value['end'], text_length)
            if start < 0:
                continue
            # Trim spaces captured at either edge of the span. Without this a
            # leading space puts B_ on the preceding token and a trailing
            # space (where `end` equals the next token's start) puts I_ on
            # the following token.
            while start < end and text[start] == ' ':
                start += 1
            while end > start and text[end - 1] == ' ':
                end -= 1
            if start >= end:
                continue
            # Both boundaries now sit on non-space characters, so each lookup
            # returns the token containing that character. `end` is exclusive,
            # hence the last covered character is `end - 1`.
            start_index = binary_search(tokens, start)
            end_index = binary_search(tokens, end - 1)
            bio_labels[start_index] = 'B_' + label
            for i in range(start_index + 1, end_index + 1):
                bio_labels[i] = 'I_' + label
    return bio_labels

############################################################
def _write_bio_labels(input_path, output_path):
    """Load a JSON dataset, BIO-tag every instance, and save the result.

    Args:
        input_path: Path of a JSON file containing a list of instances, each
            providing ``'id'``, ``'data'['text']`` and ``'annotations'``.
        output_path: Path of the JSON file to write. It maps each instance id
            to ``{'text': ..., 'labels': ...}``.

    Returns:
        A ``(data, output)`` tuple: the loaded instance list and the
        ``defaultdict`` that was written to ``output_path``.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    output = defaultdict(dict)
    for instance in data:
        case_id = instance['id']
        text = instance['data']['text']
        bio_labels = assign_bio_labels(text, instance['annotations'])
        # Store the text and BIO labels under the instance id
        output[case_id]['text'] = text
        output[case_id]['labels'] = bio_labels

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(output, file, ensure_ascii=False, indent=4)

    return data, output


# Train split
train_data, train_output = _write_bio_labels('train_dataset.json', 'train_bio_labels.json')

# Validation split
val_data, val_output = _write_bio_labels('val_dataset.json', 'val_bio_labels.json')

# Test set
test_data, test_output = _write_bio_labels('NER_TEST_JUDGEMENT.json', 'test_bio_labels.json')