In [1]:
# ---------------- Imports ----------------
import os
import json
import random
import math
import sys

import yaml



In [2]:
# ---------------- Args ----------------
# Tunable parameters for the whole notebook; edit here, then Restart & Run All.

# Upper bound on the number of dialogues aggregated into one output JSON file.
CHUNK_SIZE = 128

# Fraction of each domain (within each split) that is sampled into the small subset.
SUBSET_SIZE = 0.10


In [3]:
# ---------------- Config ----------------
# Load the project configuration and derive every input/output folder used below.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Directories
input_folder = f"{data_folder}/yield_v1_base/"   # one JSON file per dialogue, scanned recursively
output_folder = f"{data_folder}/yield-v1/"       # full train/dev/test shards
os.makedirs(output_folder, exist_ok=True)

# round(), not int(): SUBSET_SIZE * 100 carries float error (0.29 * 100 ==
# 28.999999999999996), and int() would truncate that to a wrong "28pct" label.
subset_label = f"{round(SUBSET_SIZE * 100)}pct"
subset_output_folder = os.path.join(data_folder, f"yield-v1-small{subset_label}")
os.makedirs(subset_output_folder, exist_ok=True)



In [4]:
# Build index
# Recursively scan input_folder and record, for every readable *.json file,
# its path together with the value of its top-level 'domain' field
# ('unknown' when the field is absent). Unreadable files are reported and skipped.
index_data = []
for dirpath, _, filenames in os.walk(input_folder):
    json_names = [name for name in filenames if name.endswith('.json')]
    for name in json_names:
        file_path = os.path.join(dirpath, name)
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                record = json.load(fh)
            index_data.append({
                'file_path': file_path,
                'domain': record.get('domain', 'unknown'),
            })
        except Exception as e:
            print(f"Error reading {file_path}: {e}")



In [5]:
# Stratified split
# Group the indexed files by domain, split each domain 80/10/10 into
# train/dev/test, then write every split as JSON shards holding at most
# CHUNK_SIZE dialogues each.

# Fixed seed on a private RNG: with the unseeded global shuffle every
# "Run All" produced a different train/dev/test assignment.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

domain_files = {}
for entry in index_data:
    domain_files.setdefault(entry['domain'], []).append(entry['file_path'])

train_files, dev_files, test_files = [], [], []

# Domains and files are visited in sorted order before shuffling: the index
# order comes from os.walk, which depends on the filesystem, and a seeded
# shuffle is only reproducible when its input order is stable.
# key=str keeps sorting safe if a file carries a non-string domain (e.g. null).
for domain in sorted(domain_files, key=str):
    files = domain_files[domain]
    files.sort()
    split_rng.shuffle(files)
    total = len(files)
    train_count = round(total * 0.8)
    dev_count = round(total * 0.1)
    test_count = total - train_count - dev_count  # remainder, so the three counts sum to total

    train_files.extend(files[:train_count])
    dev_files.extend(files[train_count:train_count + dev_count])
    test_files.extend(files[train_count + dev_count:])

    print(f"Domain '{domain}': {len(files)} files → train={train_count}, dev={dev_count}, test={test_count}")

# Guard against leakage: every indexed file must land in exactly one split.
assert len(set(train_files) | set(dev_files) | set(test_files)) == len(index_data), \
    "a file is missing from, or duplicated across, the train/dev/test splits"

splits = {
    'train': train_files,
    'dev': dev_files,
    'test': test_files
}

# write aggregated files
for split_name, file_list in splits.items():
    split_output_folder = os.path.join(output_folder, split_name)
    os.makedirs(split_output_folder, exist_ok=True)

    num_chunks = math.ceil(len(file_list) / CHUNK_SIZE)
    for i in range(num_chunks):
        chunk_files = file_list[i * CHUNK_SIZE : (i + 1) * CHUNK_SIZE]
        dialogues_chunk = []

        for file_path in chunk_files:
            # Best effort: a file that fails to load is reported and left out,
            # so a shard may hold fewer dialogues than len(chunk_files).
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    dialogues_chunk.append(json.load(f))
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

        output_path = os.path.join(split_output_folder, f'{split_name}-{i:03d}.json')
        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(dialogues_chunk, out_file, ensure_ascii=False, indent=2)

        print(f"Wrote {len(dialogues_chunk)} dialogues to {output_path}")

print("Done!")



Domain 'journalistic_investigations': 129 files → train=103, dev=13, test=13
Domain 'judicial_proceedings': 621 files → train=497, dev=62, test=62
Domain 'academic_interviews': 148 files → train=118, dev=15, test=15
Domain 'oral_history': 1383 files → train=1106, dev=138, test=139
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-000.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-001.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-002.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-003.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-004.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-005.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-006.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-007.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/train-008.json
Wrote 128 dialogues to /data/yield-v1/data/yield-v1/train/tra

In [6]:
# Create stratified subset
# For every split, sample SUBSET_SIZE of each domain (at least one file per
# domain) and write the sampled dialogues as JSON shards of at most
# CHUNK_SIZE dialogues under subset_output_folder.

# Fixed seed on a private RNG so the subset is the same on every run
# for a given train/dev/test split.
SUBSET_SEED = 42
subset_rng = random.Random(SUBSET_SEED)

# The domain of every file was already recorded while building the index;
# looking it up avoids re-opening and re-parsing each JSON file a second time.
domain_by_path = {entry['file_path']: entry['domain'] for entry in index_data}

subset_splits = {}
for split_name, file_list in splits.items():
    subset_files = []
    domain_files_split = {}

    # Organize files by domain for the split
    for file_path in file_list:
        domain_files_split.setdefault(domain_by_path[file_path], []).append(file_path)

    # Sample from each domain. Domains and files are sorted first so the seeded
    # sample does not depend on the incoming order; key=str tolerates a
    # non-string domain value.
    for domain in sorted(domain_files_split, key=str):
        files = sorted(domain_files_split[domain])
        # At least one file per domain, and never more than the domain holds
        # (random.sample raises ValueError when k > len(files)).
        k = min(len(files), max(1, round(len(files) * SUBSET_SIZE)))
        subset_files.extend(subset_rng.sample(files, k))
        print(f"[{split_name}] Domain '{domain}': {len(files)} → subset={k}")

    subset_splits[split_name] = subset_files

# write aggregated files for subset
for split_name, file_list in subset_splits.items():
    split_output_folder = os.path.join(subset_output_folder, split_name)
    os.makedirs(split_output_folder, exist_ok=True)

    num_chunks = math.ceil(len(file_list) / CHUNK_SIZE)
    for i in range(num_chunks):
        chunk_files = file_list[i * CHUNK_SIZE : (i + 1) * CHUNK_SIZE]
        dialogues_chunk = []

        for file_path in chunk_files:
            # Best effort: a file that fails to load is reported and left out.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    dialogues_chunk.append(json.load(f))
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

        output_path = os.path.join(split_output_folder, f'{split_name}-{i:03d}.json')
        with open(output_path, 'w', encoding='utf-8') as out_file:
            json.dump(dialogues_chunk, out_file, ensure_ascii=False, indent=2)

        print(f"[subset] Wrote {len(dialogues_chunk)} dialogues to {output_path}")

print("stratified subset done!")


[train] Domain 'journalistic_investigations': 103 → subset=10
[train] Domain 'judicial_proceedings': 497 → subset=50
[train] Domain 'academic_interviews': 118 → subset=12
[train] Domain 'oral_history': 1106 → subset=111
[dev] Domain 'journalistic_investigations': 13 → subset=1
[dev] Domain 'judicial_proceedings': 62 → subset=6
[dev] Domain 'academic_interviews': 15 → subset=2
[dev] Domain 'oral_history': 138 → subset=14
[test] Domain 'journalistic_investigations': 13 → subset=1
[test] Domain 'judicial_proceedings': 62 → subset=6
[test] Domain 'academic_interviews': 15 → subset=2
[test] Domain 'oral_history': 139 → subset=14
[subset] Wrote 128 dialogues to /data/yield-v1/data/yield-v1-small10pct/train/train-000.json
[subset] Wrote 55 dialogues to /data/yield-v1/data/yield-v1-small10pct/train/train-001.json
[subset] Wrote 23 dialogues to /data/yield-v1/data/yield-v1-small10pct/dev/dev-000.json
[subset] Wrote 23 dialogues to /data/yield-v1/data/yield-v1-small10pct/test/test-000.json
strat