In [None]:
import json

In [None]:
# extract required data per annotator
def extract_data_per_annotator(annotator_number):
    """Load the annotation JSON file of a single annotator.

    Args:
        annotator_number: Value interpolated into the file name
            ``data_extraction/personal_annotation_<annotator_number>.json``.
            The path is relative to the current working directory.

    Returns:
        The deserialized JSON content of that file.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    file_path = f'data_extraction/personal_annotation_{annotator_number}.json'

    # Pin the encoding: without it open() falls back to the platform default,
    # which can mis-decode non-ASCII labels on non-UTF-8 systems.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [None]:
def aggregate_data():
    """Collect the loaded annotation data of annotators 1 through 5.

    Returns:
        A list with one element per annotator, ordered by annotator number.
        Each element is whatever ``extract_data_per_annotator`` returns for
        that annotator.
    """
    return [extract_data_per_annotator(number) for number in range(1, 6)]

In [None]:
from collections import Counter
import re


def normalize_label(label):
    """Return ``label`` lowercased and stripped of punctuation.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed; whitespace itself is kept.
    """
    return re.sub(r'[^\w\s]', '', label.lower())

def count_label_occurrences(labels_list):
    """Count, per distinct normalized label, how many sublists mention it.

    A sublist mentions a label when the label is a substring of at least one
    of the sublist's normalized labels. Matching is by substring, not by
    equality, so ``"dog"`` is also counted for a sublist containing
    ``"black dog"``. Each sublist contributes at most 1 to a label's count.

    Args:
        labels_list: A collection of label collections (strings).

    Returns:
        A ``Counter`` mapping each distinct normalized label to the number of
        sublists that mention it.
    """
    # Normalize each label once up front rather than inside the matching loops.
    normalized_sublists = [
        [normalize_label(label) for label in sublist] for sublist in labels_list
    ]
    distinct_labels = {
        label for sublist in normalized_sublists for label in sublist
    }

    counter = Counter()
    for label in distinct_labels:
        # any() stops at the first match, so a sublist is counted only once.
        counter[label] = sum(
            any(label in other_label for other_label in sublist)
            for sublist in normalized_sublists
        )

    return counter

def filter_labels_by_threshold(counter, threshold):
    """Return the labels whose count is at least ``threshold``.

    Args:
        counter: Mapping of label to occurrence count.
        threshold: Minimum count (inclusive) a label needs to be kept.

    Returns:
        A list of the qualifying labels, in the mapping's iteration order.
    """
    kept = []
    for label, occurrences in counter.items():
        if occurrences < threshold:
            continue
        kept.append(label)
    return kept

def select_most_specific_labels(labels):
    """Drop every label that contains another, different label as a substring.

    Labels are processed shortest first, so of two overlapping labels the
    shorter one is kept and the longer one is discarded.

    NOTE(review): the function name suggests the *longer* (more specific)
    label should win, but the implemented rule keeps the shorter one; this
    behavior is preserved here - confirm the intent.

    Args:
        labels: Iterable of label strings. It is not modified.

    Returns:
        A new list of the surviving labels, ordered by length (ties keep
        their input order).
    """
    kept = []
    # Build a separate result list instead of removing from the list being
    # iterated, which skips elements and leaves some superstrings behind.
    for candidate in sorted(labels, key=len):
        contains_kept_label = any(
            shorter != candidate and shorter in candidate for shorter in kept
        )
        if not contains_kept_label:
            kept.append(candidate)

    return kept


def majority_vote_labels(labels_list, threshold):
    """Run the label voting pipeline on one group of label collections.

    The labels are counted with ``count_label_occurrences``, reduced to those
    reaching ``threshold`` with ``filter_labels_by_threshold``, and finally
    passed through ``select_most_specific_labels``.

    Args:
        labels_list: A collection of label collections.
        threshold: Minimum occurrence count a label needs to be kept.

    Returns:
        The list produced by ``select_most_specific_labels``.
    """
    return select_most_specific_labels(
        filter_labels_by_threshold(
            count_label_occurrences(labels_list),
            threshold,
        )
    )

In [None]:
def get_majority_vote_labels_dataset(threshold=3, num_entries=50):
    """Compute the majority-vote labels for the leading dataset entries.

    For each entry index, the labels that every annotator assigned to that
    entry are gathered and passed to ``majority_vote_labels``. If no label
    reaches ``threshold``, the vote is retried once with ``threshold - 1``.

    Args:
        threshold: Minimum number of annotators that must mention a label.
            Defaults to 3.
        num_entries: Number of entries to process, starting at index 0.
            Defaults to 50.

    Returns:
        A list with ``num_entries`` elements; element ``i`` is the list of
        majority-vote labels for entry ``i`` (possibly empty if the relaxed
        threshold also yields nothing).

    Raises:
        KeyError: If an annotator's data has no ``"labels"`` key.
        Indexing errors from an annotator's ``"labels"`` value propagate if it
        has no entry at a requested index.
    """
    # retrieve labels from all annotators
    annotators_labels = [annotator["labels"] for annotator in aggregate_data()]

    dataset_majority_vote_labels = []

    for entry_index in range(num_entries):
        # collect labels for the current entry from all annotators
        labels = [
            annotator_labels[entry_index]
            for annotator_labels in annotators_labels
        ]

        majority_vote = majority_vote_labels(labels, threshold)
        # if no majority vote found, lower the threshold by 1 and try again
        if not majority_vote:
            majority_vote = majority_vote_labels(labels, threshold - 1)
        dataset_majority_vote_labels.append(majority_vote)

    return dataset_majority_vote_labels