In [1]:
# ---------------- Imports ----------------
import sys
import os
import json
import glob
import shutil

from datetime import datetime
from re import sub

import yaml



In [None]:
# ---------------- Config ----------------
# Project settings come from a YAML file; the path is relative to the
# working directory the notebook kernel was started in.
with open("../../../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Everything this notebook reads or writes lives under <proj_store>/data.
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Source transcripts (read-only input).
input_directory = f"{data_folder}/raw_data/machine_collected/oyez_supreme_court/data/transcripts/"

# Destination for the converted files; created up front so later cells can write into it.
output_base_directory = f"{data_folder}/yield_v1_base/oyez_supreme_court/"
os.makedirs(output_base_directory, exist_ok=True)



In [3]:
# helpers
def snake_case(s):
    """Return `s` as a lowercase, underscore-joined string.

    Steps, in order:
      1. hyphens become spaces;
      2. a space is inserted before every run of capital letters and before
         every capital letter that is followed by lowercase letters, so
         "HelloWorld" and "ABCDef" break into separate words;
      3. the text is split on whitespace and re-joined with underscores;
      4. the result is lowercased and commas are removed.

    Underscores already present in `s` are left alone, and because commas are
    stripped only after joining, a stand-alone comma leaves a doubled
    underscore behind (e.g. "a , b" -> "a__b").
    """
    spaced = s.replace('-', ' ')
    # Break before each block of capitals first ...
    spaced = sub('([A-Z]+)', r' \1', spaced)
    # ... then before each Capitalised word, which separates "ABC" from "Def".
    spaced = sub('([A-Z][a-z]+)', r' \1', spaced)

    words = spaced.split()
    joined = '_'.join(words)
    return joined.lower().replace(',', '')

In [4]:
# ---------------- Convert transcripts to dialogue JSON ----------------
# Walks `input_directory`, converts every usable *.json transcript into the
# dialogue schema built below, and writes one file per transcript to
# `output_base_directory/<name of the directory the input file sits in>/`.
# Files without a transcript or without any media entry are skipped.


def _format_timestamp(start_time):
    """Format a turn's start offset as ``MM:SS:mmm``, or return None if it is missing.

    Assumes `start_time` is a number of seconds -- TODO confirm against the raw data.
    The value is rounded to whole milliseconds before being split, so float
    representation error cannot drop a millisecond (1.001 * 1000 evaluates to
    1000.9999..., which plain truncation would render as ``000``).
    The first field is the total minute count and is not wrapped at 60.
    """
    if start_time is None:
        return None
    total_ms = round(start_time * 1000)
    minutes, remainder_ms = divmod(total_ms, 60_000)
    seconds, millis = divmod(remainder_ms, 1000)
    return f"{minutes:02}:{seconds:02}:{millis:03}"


def _speaker_role(speaker_info):
    """Return "elicitor" if the speaker's first listed role is a justice, else "respondent".

    Speakers with no (or an empty) "roles" entry are treated as respondents.
    """
    roles = speaker_info.get("roles")
    if roles and roles[0].get("type") == "scotus_justice":
        return "elicitor"
    return "respondent"


# Running number used in dialogue ids; advanced only for files that are actually written.
dialogue_counter = 1

for root, dirs, files in os.walk(input_directory):
    # os.walk yields entries in arbitrary filesystem order. Sorting `dirs` in
    # place (which also steers the traversal) and sorting `files` makes the
    # dialogue ids reproducible from one run to the next.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(root, file)

        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            original_data = json.load(f)

        # Name of the directory holding the file; presumably the year, given the
        # transcripts/<year>/ layout this notebook expects -- verify against the data.
        year = os.path.basename(root)

        transcript = original_data.get("transcript")
        if transcript is None:
            continue  # nothing to convert

        # Use the first media entry that is actually present. Indexing [0]
        # directly would crash on a list such as [None, {...}].
        media_file = original_data.get("media_file")
        first_media = next((item for item in (media_file or []) if item is not None), None)
        if first_media is None:
            continue  # no media entry at all
        audio_mpeg = first_media["href"]

        # Named `source_id` rather than `id` so the builtin is not shadowed.
        source_id = original_data['id']
        document_title = original_data["title"].strip()
        transcript_title = transcript.get("title")
        duration = transcript.get("duration")

        # Output files are grouped under the same directory name as the input.
        output_directory = os.path.join(output_base_directory, year)
        os.makedirs(output_directory, exist_ok=True)

        # <input stem>_<id>_<title>, snake-cased, colons removed, and empty
        # segments dropped so repeated underscores collapse to one.
        filename_wo_ext = os.path.splitext(file)[0]
        raw_name = snake_case(f"{filename_wo_ext}_{source_id}_{document_title}").replace(":", "")
        output_filename = '_'.join(filter(None, raw_name.split('_'))) + ".json"

        # Dicts serve as insertion-ordered sets: speakers are listed in order of
        # first appearance, which (unlike list(set)) is stable across runs.
        elicitors_seen = {}
        respondents_seen = {}

        transformed_data = {
            "dialogue_id": f"oyez-supreme-court-{dialogue_counter:05}",
            "metadata": {
                "id": source_id,
                "document_title": document_title,
                "duration": duration,
                "links": {
                    "audio_mpeg": audio_mpeg,
                },
                "downloaded_on": None
            },
            "broad_source": "oyez_supreme_court",
            "collection": "oyez_supreme_court",
            "domain": "judicial_proceedings",
            "title": transcript_title,
            "elicitors": [],
            "respondents": [],
            "languages": ['en'],
            "turns": []
        }

        # Flatten every section's turns into one sequentially numbered list.
        turn_id = 1
        for section in transcript["sections"]:
            for turn in section["turns"]:
                speaker_info = turn.get("speaker")
                if not speaker_info:
                    continue  # turns without a speaker are dropped

                speaker = speaker_info["name"]
                role = _speaker_role(speaker_info)
                if role == "elicitor":
                    elicitors_seen[speaker] = None
                else:
                    respondents_seen[speaker] = None

                # All text blocks of a turn become one utterance, separated by blank lines.
                combined_text = "\n\n".join(
                    block["text"] for block in turn["text_blocks"]
                ).strip()

                transformed_data["turns"].append({
                    "turn_id": turn_id,
                    # None when the turn has no "start" value.
                    "timestamp": _format_timestamp(turn.get("start")),
                    "speaker": speaker,
                    "role": role,
                    "utterance": combined_text,
                })
                turn_id += 1

        transformed_data["elicitors"] = list(elicitors_seen)
        transformed_data["respondents"] = list(respondents_seen)

        with open(os.path.join(output_directory, output_filename), 'w', encoding='utf-8') as f:
            json.dump(transformed_data, f, indent=4)

        dialogue_counter += 1

