In [37]:
import json

In [31]:
def _load_json(path):
    """Read the JSON document stored at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read the
    same way on every platform instead of depending on the locale's default
    encoding.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# SQuAD 2.0 splits.
squad = _load_json("data/SQuAD/dev-v2.0.json")
squad_train = _load_json("data/SQuAD/train-v2.0.json")

# GermanQuAD splits; the test file is used as the "dev" split further below.
germanquad = _load_json("data/GermanQuAD/GermanQuAD_test.json")
germanquad_train = _load_json("data/GermanQuAD/GermanQuAD_train.json")

def total_dataset(dict1, dict2):
    """Merge two dataset dictionaries into one new combined dictionary.

    Keys are merged recursively:

    * when both sides hold a dictionary under the same key, those dictionaries
      are merged key by key;
    * when both sides hold a list under the same key, the lists are
      concatenated (items of ``dict1`` first);
    * otherwise the value from ``dict2`` replaces the one from ``dict1``.

    Neither ``dict1`` nor ``dict2`` is modified. Only the containers that are
    merged get copied; list items and untouched nested values are shared with
    the inputs.

    Args:
        dict1: First dictionary; its list items come first in the result.
        dict2: Second dictionary; its non-list, non-dict values win conflicts.

    Returns:
        A new dictionary with the merged content.
    """
    def recursive_merge(out, in_dict):
        for key, value in in_dict.items():
            existing = out.get(key)
            if isinstance(existing, dict) and isinstance(value, dict):
                # Merge into a copy so a dict still referenced by an input
                # is never altered.
                out[key] = recursive_merge(existing.copy(), value)
            elif isinstance(value, list):
                if isinstance(existing, list):
                    # Build a new list instead of extending in place:
                    # ``existing`` may be the very list object owned by an
                    # input (reached through a shallow-copied nested dict).
                    out[key] = existing + value
                else:
                    out[key] = value.copy()
            else:
                # Scalars (and first-seen nested dicts) are stored as-is.
                out[key] = value
        return out

    total = {}
    recursive_merge(total, dict1)
    recursive_merge(total, dict2)
    return total

# Registry of every corpus: for each name, the train split, the dev split and
# a "total" entry produced by merging dev and train with total_dataset().
datasets = {
    corpus_name: {
        "train": train_split,
        "dev": dev_split,
        "total": total_dataset(dev_split, train_split),
    }
    for corpus_name, train_split, dev_split in (
        ("SQuAD", squad_train, squad),
        ("GermanQuAD", germanquad_train, germanquad),
    )
}

In [51]:
def apply_on_datasets(operation, collection=None):
    """Run ``operation`` on every dataset split and print one line per result.

    Each line has the form ``<name>   <split> :  <result>``.

    Args:
        operation: Callable that takes a single dataset and returns a
            printable value.
        collection: Optional mapping of ``{name: {split: dataset}}`` to walk.
            When omitted, the notebook-level ``datasets`` mapping is used, so
            existing one-argument calls behave as before.
    """
    if collection is None:
        # Resolved at call time so the function can be defined before the
        # notebook-level mapping exists.
        collection = datasets
    for name, splits in collection.items():
        for datasplit, dataset in splits.items():
            print(name, " ", datasplit, ": ", operation(dataset))

def question_answer_pairs(dataset):
    """Return the total number of entries found in all ``"qas"`` lists.

    Walks every item of ``dataset["data"]``, every paragraph under its
    ``"paragraphs"`` key, and counts each element of that paragraph's
    ``"qas"`` collection.
    """
    return sum(
        1
        for article in dataset["data"]
        for paragraph in article["paragraphs"]
        for _ in paragraph["qas"]
    )

def count_context(dataset):
    """Return how many paragraphs the dataset contains in total.

    Every element of each ``"paragraphs"`` collection under
    ``dataset["data"]`` counts as one.
    """
    return sum(
        1
        for article in dataset["data"]
        for _ in article["paragraphs"]
    )

def count_impossible_answers(dataset):
    """Return the number of questions flagged with ``"is_impossible"``.

    A question is counted only when it has an ``"is_impossible"`` key whose
    value compares equal to ``True``; questions without the key are skipped.
    """
    return sum(
        1
        for article in dataset["data"]
        for paragraph in article["paragraphs"]
        for qa in paragraph["qas"]
        # Equality (not truthiness) is intentional: a missing key yields None
        # and arbitrary truthy values such as strings do not count.
        if qa.get("is_impossible") == True
    )

def avg_number_of_answers(dataset):
    """Return the mean number of answers per question that has answers.

    Questions with an empty ``"answers"`` list are left out of the average.
    While iterating, the function checks that a question is flagged
    ``"is_impossible"`` exactly when its answer list is empty.

    Args:
        dataset: Mapping with a ``"data"`` list; each item holds
            ``"paragraphs"``, each paragraph a ``"qas"`` list, and each
            question an ``"answers"`` list.

    Returns:
        Total answers divided by the number of questions with at least one
        answer, or ``0.0`` when no such question exists (e.g. an empty
        dataset), instead of failing with a division by zero.

    Raises:
        AssertionError: If a question's ``"is_impossible"`` flag disagrees
            with whether its answer list is empty.
    """
    total_answers = 0
    total_questions = 0
    for article in dataset["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                num = len(qa["answers"])
                # Equality keeps the original flag semantics: a missing key
                # (None) or a non-boolean truthy value is not "impossible".
                impossible = qa.get("is_impossible") == True
                if impossible:
                    assert num == 0, "impossible question must not have answers"
                if num == 0:
                    assert impossible, "question without answers must be flagged impossible"
                else:
                    total_answers += num
                    total_questions += 1
    if total_questions == 0:
        # Nothing to average over; avoid ZeroDivisionError.
        return 0.0
    return total_answers / total_questions

In [52]:
# Each report is a header line followed by one result line per dataset split;
# consecutive reports are separated by a blank line.
reports = (
    ("=== Question-Answer Pairs ===", question_answer_pairs),
    ("=== Contexts ===", count_context),
    ("=== Impossible Questions ===", count_impossible_answers),
    ("=== Average Number of Answers per Question ===", avg_number_of_answers),
)

for position, (header, statistic) in enumerate(reports):
    if position > 0:
        print()
    print(header)
    apply_on_datasets(statistic)

=== Question-Answer Pairs ===
SQuAD   train :  130319
SQuAD   dev :  11873
SQuAD   total :  142192
GermanQuAD   train :  11518
GermanQuAD   dev :  2204
GermanQuAD   total :  13722

=== Contexts ===
SQuAD   train :  19035
SQuAD   dev :  1204
SQuAD   total :  20239
GermanQuAD   train :  2540
GermanQuAD   dev :  474
GermanQuAD   total :  3014

=== Impossible Questions ===
SQuAD   train :  43498
SQuAD   dev :  5945
SQuAD   total :  49443
GermanQuAD   train :  0
GermanQuAD   dev :  0
GermanQuAD   total :  0

=== Average Number of Answers per Question ===
SQuAD   train :  1.0
SQuAD   dev :  3.4247638326585697
SQuAD   total :  1.154977412155387
GermanQuAD   train :  1.0
GermanQuAD   dev :  2.9655172413793105
GermanQuAD   total :  1.315697420201137
