In [1]:
# ---------------- Imports ----------------
import os
import json
import re
import sys

import yaml



In [None]:
# ---------------- Config ----------------
# Load the project configuration. The path is relative to the notebook's
# working directory. The encoding is given explicitly so that decoding does
# not depend on the platform's locale default.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Data root: the "data" directory under `paths.proj_store` from the config.
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Input and output folder paths (built with os.path.join, like data_folder)
input_folder = os.path.join(data_folder, "intermediate_data", "02_manual_tagging")
output_folder = os.path.join(data_folder, "intermediate_data", "03_json_conversion")
os.makedirs(output_folder, exist_ok=True)



In [3]:
def parse_txt_to_json(input_folder, output_folder):
    """Convert every ``.txt`` file under ``input_folder`` into a JSON file.

    The input tree is walked recursively. Each ``.txt`` file is read as UTF-8,
    parsed with ``parse_content`` and checked with ``validate_speakers``. The
    result is written as ``{"metadata": ..., "turns": ...}`` to a ``.json``
    file with the same base name, in the matching sub-directory of
    ``output_folder`` (sub-directories are created on demand).

    Speaker validation problems do not interrupt the run: messages are
    accumulated across all files, every JSON file is still written, and one
    combined error is raised once the walk is finished.

    Args:
        input_folder: Root directory searched for ``.txt`` files.
        output_folder: Root directory that receives the ``.json`` files.

    Raises:
        ValueError: If ``validate_speakers`` recorded a mismatch for any file.
    """
    collected_errors = []

    for current_dir, _subdirs, filenames in os.walk(input_folder):
        for filename in filenames:
            if not filename.endswith(".txt"):
                continue

            # Load the raw transcript text.
            source_path = os.path.join(current_dir, filename)
            with open(source_path, "r", encoding="utf-8") as src:
                raw_text = src.read()

            metadata, dialogue = parse_content(raw_text)

            # Record mismatches; they are reported together after the walk.
            validate_speakers(metadata, dialogue, filename, collected_errors)

            payload = {"metadata": metadata, "turns": dialogue}

            # Mirror the input sub-directory layout below the output root.
            target_dir = os.path.join(
                output_folder, os.path.relpath(current_dir, input_folder)
            )
            os.makedirs(target_dir, exist_ok=True)
            stem, _ext = os.path.splitext(filename)
            target_path = os.path.join(target_dir, stem + ".json")

            with open(target_path, "w", encoding="utf-8") as dst:
                json.dump(payload, dst, indent=4)

    # Surface every collected problem in a single exception.
    if collected_errors:
        raise ValueError("Speaker mismatches found:\n" + "\n".join(collected_errors))


def clean_text(text):
    """Flatten ``text`` onto one line and tidy its spacing.

    Newlines are turned into spaces, every run of two or more whitespace
    characters is collapsed into a single space, and leading/trailing
    whitespace is removed. A lone non-newline whitespace character between
    words (for example a single tab) is left untouched.
    """
    single_line = " ".join(text.split("\n"))
    return re.sub(r"\s{2,}", " ", single_line).strip()

def parse_content(content):
    """Split a tagged transcript into a metadata dict and a list of turns.

    The text before the first ``--- dialogue ---`` marker is treated as
    metadata (an optional ``--- metadata ---`` header is removed); the text
    after it is the dialogue.

    Metadata lines have the form ``key: value``; lines without ``": "`` are
    ignored. Keys are stripped and lower-cased. The ``elicitors`` and
    ``respondents`` values are split on commas into lists of stripped names.
    Empty names (from a trailing comma or an empty value) are dropped so they
    are not later mistaken for real speakers.

    Dialogue is split on ``<SPEAKER>...</SPEAKER>`` tags. Text before the
    first tag, and text following an empty speaker tag, is discarded. For
    each remaining segment the first ``<TIMESTAMP>...</TIMESTAMP>`` value is
    recorded, all timestamp tags are removed, and the rest is normalised with
    ``clean_text``.

    Args:
        content: Full text of one transcript file.

    Returns:
        A ``(metadata, dialogue)`` tuple. ``metadata`` is a dict; ``dialogue``
        is a list of dicts with the keys ``speaker``, ``role``, ``timestamp``
        and ``utterance``. ``role`` is ``"elicitor"`` when the speaker is
        listed in ``metadata["elicitors"]`` and ``"respondent"`` otherwise.
    """
    # partition() splits on the first marker only, so a later occurrence of
    # the marker stays in the dialogue text instead of being silently dropped.
    metadata_part, _, dialogue_part = content.partition("--- dialogue ---")
    metadata_section = metadata_part.replace("--- metadata ---", "").strip()
    dialogue_section = dialogue_part.strip()

    # Parse metadata into a dictionary
    list_fields = ("elicitors", "respondents")
    metadata = {}
    for line in metadata_section.split("\n"):
        if ": " not in line:
            continue
        key, value = line.split(": ", 1)
        key = key.strip().lower()
        if key in list_fields:
            # Convert these fields to a list, skipping empty entries.
            names = (item.strip() for item in value.split(","))
            metadata[key] = [name for name in names if name]
        else:
            metadata[key] = value.strip()

    # Only the elicitor list is needed to assign roles below.
    elicitors = metadata.get("elicitors", [])

    # Parse dialogue into a list of turns
    dialogue = []
    speaker_pattern = r"<SPEAKER>(.*?)</SPEAKER>"
    timestamp_pattern = r"<TIMESTAMP>(.*?)</TIMESTAMP>"
    # With one capture group, re.split alternates text and speaker names:
    # even indices hold text, odd indices hold the captured speaker.
    segments = re.split(speaker_pattern, dialogue_section)

    current_speaker = None
    for i, segment in enumerate(segments):
        if i % 2 == 1:  # Odd indices are speaker tags
            current_speaker = segment.strip()
        elif current_speaker:  # Even indices belong to the last speaker seen
            role = "elicitor" if current_speaker in elicitors else "respondent"

            # Extract the first timestamp if present, then strip all of them
            timestamp_match = re.search(timestamp_pattern, segment)
            timestamp = timestamp_match.group(1).strip() if timestamp_match else ""
            cleaned_text = clean_text(re.sub(timestamp_pattern, "", segment))

            dialogue.append({
                "speaker": current_speaker,
                "role": role,
                "timestamp": timestamp,
                "utterance": cleaned_text
            })

    return metadata, dialogue

def validate_speakers(metadata, dialogue, filename, errors_list):
    """Check that metadata speakers and dialogue speakers are the same set.

    The speakers declared in ``metadata["elicitors"]`` and
    ``metadata["respondents"]`` are compared with the ``"speaker"`` values of
    the dialogue turns. If they differ, one message describing both
    directions of the mismatch is appended to ``errors_list``; nothing is
    raised here. Names in the message are sorted so that the text is
    reproducible between runs.

    Args:
        metadata: Dict that may contain ``elicitors`` and ``respondents`` lists.
        dialogue: List of turn dicts, each with a ``"speaker"`` key.
        filename: Name used to identify the file in the message.
        errors_list: List that receives the message (mutated in place).
    """
    declared = set(metadata.get("elicitors", [])) | set(metadata.get("respondents", []))
    spoken = {turn["speaker"] for turn in dialogue}

    # Collect inconsistencies in a deterministic order
    missing_from_dialogue = sorted(declared - spoken)
    extra_in_dialogue = sorted(spoken - declared)

    if missing_from_dialogue or extra_in_dialogue:
        error_message = (
            f"\nError in file '{filename}':\n"
            f"- Speakers in metadata but missing from dialogue: {missing_from_dialogue}\n"
            f"- Speakers in dialogue but not in metadata: {extra_in_dialogue}"
        )
        errors_list.append(error_message)  # Collect errors instead of stopping execution



# Convert every .txt file under input_folder to JSON under output_folder.
# A ValueError listing all speaker mismatches is raised after the whole
# tree has been processed, if any file had one.
parse_txt_to_json(input_folder, output_folder)

