In [14]:
import os
import yaml
import json

def filter_and_convert_to_floats(lst):
    """Return the elements of ``lst`` that ``is_number`` accepts, as floats.

    Elements rejected by ``is_number`` are dropped; the relative order of the
    remaining elements is preserved.
    """
    numeric_only = filter(is_number, lst)
    return list(map(float, numeric_only))

def is_number(value):
    """Return True if ``float(value)`` succeeds, otherwise False.

    Args:
        value: Any object; typically a string or number.

    Returns:
        bool: True when ``value`` can be converted with ``float()``.

    Note:
        Anything ``float()`` accepts counts as a number, which includes
        booleans and strings such as ``"nan"`` or ``"inf"``.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None, lists, dicts, ...; ValueError: unparsable strings;
        # OverflowError: integers too large to be represented as a float.
        return False
    return True

def get_points(data):
    """Map each login to its list of numeric scores, padded to 14 entries.

    Entries of ``data`` lacking a ``"login"`` or ``"points"`` key are left
    out. The points are passed through ``filter_and_convert_to_floats``;
    lists shorter than 14 are padded with ``0.0`` and longer lists are kept
    unchanged.
    """
    scores_by_login = {}
    for record in data.values():
        if "login" not in record or "points" not in record:
            continue
        scores = filter_and_convert_to_floats(record["points"])
        # A non-positive multiplier yields an empty list, so lists that are
        # already 14 or longer pass through untouched.
        scores += [0.0] * (14 - len(scores))
        scores_by_login[record["login"]] = scores
    return scores_by_login

def get_dataset(data, points):
    """Flatten per-record questions into a list of question/answer/score rows.

    Args:
        data: Mapping of record key -> record. A processed record must carry
            ``"login"``, ``"questions"`` and ``"answers"``. ``"questions"``
            must be a mapping with integer keys (presumably zero-based, since
            the emitted ``questionNumber`` is ``key + 1``); each key is also
            used to index ``"answers"`` and the score sequence.
        points: Mapping of login -> sequence of scores, e.g. the result of
            ``get_points(data)``.

    Returns:
        list[dict]: One dict per question with the keys ``"questionNumber"``,
        ``"questionText"``, ``"answerTest"`` and ``"score"``.

    Raises:
        KeyError: A processed record lacks ``"questions"`` or an answer for
            one of its question keys.
        IndexError: A question key is outside the record's score sequence.

    Records that are not dicts, or whose login has no entry in ``points``,
    are skipped. ``get_points`` omits records lacking ``"login"`` or
    ``"points"``, so skipping them here keeps both functions consistent
    instead of failing with ``KeyError`` on such records.
    """
    result = []
    for value in data.values():
        if not isinstance(value, dict):
            continue
        login = value.get("login")
        if login not in points:
            continue
        scores = points[login]
        answers = value["answers"]
        for idx, question in value["questions"].items():
            result.append(
                {
                    "questionNumber": idx + 1,
                    "questionText": question,
                    # NOTE(review): the key is spelled "answerTest" (not
                    # "answerText"); kept as-is because consumers of the
                    # generated JSON may rely on it.
                    "answerTest": answers[idx],
                    "score": scores[idx],
                }
            )
    return result

def process_yaml_files(input_dir, output_dir):
    """Convert every ``.yaml`` file in ``input_dir`` to JSON in ``output_dir``.

    Each YAML file is loaded with ``yaml.safe_load``, turned into a list of
    rows via ``get_dataset(data, get_points(data))`` and written as indented,
    non-ASCII-escaped JSON to a file with the same base name and a ``.json``
    extension. A progress line is printed per converted file.

    Args:
        input_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.yaml``.
        output_dir: Directory receiving the JSON files; created if missing.
    """
    yaml_suffix = ".yaml"
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(yaml_suffix):
            continue

        input_path = os.path.join(input_dir, filename)
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".yaml" that occurs earlier in the file name.
        json_name = filename[: -len(yaml_suffix)] + ".json"
        output_path = os.path.join(output_dir, json_name)

        with open(input_path, "r", encoding="utf-8") as file:
            data = yaml.safe_load(file)

        result = get_dataset(data, get_points(data))

        with open(output_path, "w", encoding="utf8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)

        print(f"Processed {filename} -> {output_path}")

# Run the conversion: read YAML from datasets/raw, write JSON to datasets/parsed.
input_directory, output_directory = "datasets/raw", "datasets/parsed"
process_yaml_files(input_dir=input_directory, output_dir=output_directory)

Processed 2021_1_A.yaml -> datasets/parsed/2021_1_A.json
Processed 2022_3.yaml -> datasets/parsed/2022_3.json
Processed 2022_1_A.yaml -> datasets/parsed/2022_1_A.json
Processed 2021_1_B.yaml -> datasets/parsed/2021_1_B.json
Processed 2022_1_B.yaml -> datasets/parsed/2022_1_B.json
Processed 2022_2.yaml -> datasets/parsed/2022_2.json
Processed 2021_2.yaml -> datasets/parsed/2021_2.json
