In [1]:
# ---------------- Imports ----------------
import os
import json
import sys

from pathlib import Path

import yaml



In [None]:
# ---------------- Config ----------------
# Load the project configuration. The path is relative, so it is resolved
# against the current working directory of the kernel.
# The encoding is pinned so the file decodes the same way on every platform,
# matching the explicit utf-8 used for the JSON files later in this notebook.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Data root: the "data" folder under the `paths.proj_store` config entry.
data_folder = os.path.join(config["paths"]["proj_store"], "data")


# Input and output directories
input_directory = f"{data_folder}/intermediate_data/03_json_conversion"
output_directory = f"{data_folder}/yield_v1_base"
# Create the output root up front; a no-op when it already exists.
os.makedirs(output_directory, exist_ok=True)



In [3]:
# ---------------- Setup ----------------
# Lookup table consumed by process_json_files.
#   key   -> folder path relative to the root directory being processed
#   value -> (collection, domain, broad_source) assigned to every JSON file
#            found directly inside that folder
folder_mapping = {
    # harvard_dataverse
    "harvard_dataverse/ai_feedback_moving_beyond": (
        "ai_feedback_moving_beyond",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/biodiversity_offsetting": (
        "biodiversity_offsetting",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/covid_19_threshold": (
        "covid_19_threshold",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/drivers_of_food_choice": (
        "drivers_of_food_choice",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/leaders_leading_organizational_change": (
        "leaders_leading_organizational_change",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/relationship_building_around_farmers": (
        "relationship_building_around_farmers",
        "academic_interviews",
        "harvard_dataverse",
    ),
    # The three healthworker sub-folders all map to the same collection.
    "harvard_dataverse/healthworker_interviews/lira": (
        "healthworker_interviews",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/healthworker_interviews/mbarara": (
        "healthworker_interviews",
        "academic_interviews",
        "harvard_dataverse",
    ),
    "harvard_dataverse/healthworker_interviews/wakiso": (
        "healthworker_interviews",
        "academic_interviews",
        "harvard_dataverse",
    ),

    # jfk_library
    "jfk_library/returned_peace_corps_volunteers": (
        "returned_peace_corps_volunteers",
        "oral_history",
        "jfk_library",
    ),

    # news
    "voa_news": (
        "voa_news",
        "journalistic_investigations",
        "voa_news",
    ),
    "wikinews": (
        "wikinews",
        "journalistic_investigations",
        "wikinews",
    ),

    # nasa
    "jsc_oral_history": (
        "jsc_oral_history",
        "oral_history",
        "jsc_oral_history",
    ),

    # nara
    "nara/assembly_oral_histories": (
        "assembly_oral_histories",
        "oral_history",
        "nara",
    ),
    "nara/nprc_oral_histories": (
        "nprc_oral_histories",
        "oral_history",
        "nara",
    ),
    "nara/oral_history_at_the_national_archives": (
        "oral_history_at_the_national_archives",
        "oral_history",
        "nara",
    ),
    "nara/veterans_oral_histories": (
        "veterans_oral_histories",
        "oral_history",
        "nara",
    ),

    # other_collections
    "other_collections/flu_vaccination_interviews": (
        "flu_vaccination_interviews",
        "academic_interviews",
        "other_collections",
    ),
}



In [4]:
def _as_name_list(raw_names):
    """Normalise a metadata speaker field into a list of clean names.

    Args:
        raw_names: A comma-separated string, an iterable of names, or ``None``
            (which is what a JSON ``null`` decodes to).

    Returns:
        A list of stripped, non-empty strings. Non-string entries of an
        iterable are dropped; ``None`` yields an empty list.
    """
    if raw_names is None:
        # dict.pop(key, []) only falls back to [] when the key is absent; an
        # explicit null still comes through as None and must not be iterated.
        return []
    if isinstance(raw_names, str):
        raw_names = raw_names.split(",")
    return [name.strip() for name in raw_names if isinstance(name, str) and name.strip()]


def _merge_turns(raw_turns, elicitor_list, respondent_list):
    """Collapse consecutive same-speaker turns and attach ids and roles.

    Args:
        raw_turns: Iterable of dicts with "speaker", "timestamp" and
            "utterance" keys.
        elicitor_list: Speaker names that get the role "elicitor".
        respondent_list: Speaker names that get the role "respondent".

    Returns:
        A list of new turn dicts with keys "turn_id", "timestamp", "speaker",
        "role" and "utterance". A merged turn keeps the timestamp of its first
        part; the utterances are joined with a blank line.
    """
    merged = []
    for turn in raw_turns:
        speaker = turn["speaker"]

        # Same speaker as the previous emitted turn: append to it instead of
        # starting a new one.
        if merged and merged[-1]["speaker"] == speaker:
            merged[-1]["utterance"] += "\n\n" + turn["utterance"]
            continue

        # Elicitor takes precedence when a name appears in both lists.
        if speaker in elicitor_list:
            role = "elicitor"
        elif speaker in respondent_list:
            role = "respondent"
        else:
            role = "other"

        merged.append({
            "turn_id": len(merged),  # 0-based position among the merged turns
            "timestamp": turn["timestamp"],
            "speaker": speaker,
            "role": role,
            "utterance": turn["utterance"],
        })
    return merged


def process_json_files(input_dir, output_dir, folder_mapping):
    """Rewrite every ``.json`` file under ``input_dir`` into ``output_dir``.

    The directory tree is mirrored. Each file is restructured so that title,
    elicitors and respondents move out of "metadata" to the top level, a
    ``dialogue_id`` of the form ``<collection-id>-<5-digit counter>`` is
    assigned, and consecutive turns by the same speaker are merged.

    Args:
        input_dir: Root directory containing the source JSON files.
        output_dir: Root directory for the rewritten files (created if needed).
        folder_mapping: Dict mapping a "/"-separated folder path, relative to
            the root, to a ``(collection, domain, broad_source)`` tuple. Only
            exact folder matches are used; files in an unmapped folder get
            empty strings for all three values.

    Raises:
        KeyError: If a file lacks "metadata" or "turns", or a turn lacks
            "speaker", "timestamp" or "utterance".
    """
    # Running dialogue counter per collection id. It is shared across folders
    # so that several folders mapped to one collection get contiguous ids.
    counters = {}

    def process_folder(input_folder, output_folder):
        Path(output_folder).mkdir(parents=True, exist_ok=True)

        # Look up collection, domain and broad source for this folder. The
        # mapping keys use "/" separators, so normalise the OS separator
        # (a no-op on POSIX, required on Windows).
        relative_folder = os.path.relpath(output_folder, output_dir).replace(os.sep, "/")
        collection, domain, broad_source = folder_mapping.get(relative_folder, ("", "", ""))

        # Collection id used as the dialogue_id prefix: dashes instead of
        # underscores/spaces.
        collection_id = collection.replace("_", "-").replace(" ", "-")
        counters.setdefault(collection_id, 0)

        # Sorted so that id assignment is deterministic across runs.
        for item in sorted(os.listdir(input_folder)):
            item_path = os.path.join(input_folder, item)
            output_item_path = os.path.join(output_folder, item)

            if os.path.isdir(item_path):
                process_folder(item_path, output_item_path)
                continue
            if not item.endswith('.json'):
                continue

            # Only the load needs the handle; release it before processing.
            with open(item_path, 'r', encoding='utf-8') as infile:
                data = json.load(infile)

            # Move the speaker lists and title out of the metadata dict.
            metadata = data["metadata"]
            elicitor_list = _as_name_list(metadata.pop("elicitors", []))
            respondent_list = _as_name_list(metadata.pop("respondents", []))
            title = metadata.pop("title", "")

            dialogue_id = f"{collection_id}-{counters[collection_id]:05d}"
            counters[collection_id] += 1

            new_data = {
                "dialogue_id": dialogue_id,
                "metadata": metadata,
                "broad_source": broad_source,
                "collection": collection,
                "domain": domain,
                "title": title,
                "elicitors": elicitor_list,
                "respondents": respondent_list,
                "languages": ["en"],
                "turns": _merge_turns(data["turns"], elicitor_list, respondent_list),
            }

            # Write the restructured JSON to the mirrored output location.
            with open(output_item_path, 'w', encoding='utf-8') as outfile:
                json.dump(new_data, outfile, indent=4, ensure_ascii=False)

    # Start processing from the root input directory
    process_folder(input_dir, output_dir)


In [5]:
# Run the conversion over the whole input tree, writing results to the output tree
process_json_files(
    input_dir=input_directory,
    output_dir=output_directory,
    folder_mapping=folder_mapping,
)

