In [1]:
# ---------------- Imports ----------------
import os
import json
import yaml


In [2]:
# ---------------- Args ----------------
# Name of the dataset directory to process. It is resolved under the project's
# data path, and the results are written to a sibling directory named
# "<dataset>-with-rtg".
dataset = "yield-v1-small10pct-factualnovelty" 
# Alternative dataset name (swap with the assignment above to use it):
# dataset = "yield_v1_factualnovelty"


In [3]:
# ---------------- Config ----------------
# Read the project configuration. The path is relative to the notebook's
# working directory. The encoding is passed explicitly so the file is decoded
# the same way on every platform, matching the UTF-8 JSON I/O used further down.
with open("../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

proj_store = config["paths"]["proj_store"]
data_path = os.path.join(proj_store, "data")

# Input dataset directory, and the sibling directory that receives the
# RTG-annotated copy (created up front if it does not exist yet).
base_folder = os.path.join(data_path, dataset)
output_folder = os.path.join(data_path, f"{dataset}-with-rtg")
os.makedirs(output_folder, exist_ok=True)


In [4]:


def add_rtg_to_dialogue(dialogue, gamma: float = 1.0):
    """Annotate the respondent turns of a dialogue with a discounted return-to-go.

    The reward of a respondent turn is its ``factual_novelty_score`` (0.0 when
    the key is absent). The return-to-go of a respondent turn is its own reward
    plus ``gamma`` times the return-to-go of the next respondent turn; turns
    with any other role are skipped and contribute nothing.

    The dialogue is modified in place: each respondent turn gains a
    ``return_to_go`` key, and the ``factual_accumulated_entities`` and
    ``factual_novelty_entities`` keys are dropped from every turn.

    Args:
        dialogue: Mapping with an optional ``"turns"`` list of turn dicts.
        gamma: Discount factor applied to future rewards.

    Returns:
        The same ``dialogue`` object. It is returned untouched when it has no
        turns.
    """
    turns = dialogue.get("turns", [])
    if not turns:
        return dialogue

    # Gather the respondent turns together with their rewards in a single pass.
    respondent_turns = []
    scores = []
    for turn in turns:
        if turn["role"] != "respondent":
            continue
        scores.append(float(turn.get("factual_novelty_score", 0.0)))
        respondent_turns.append(turn)

    # Accumulate from the last respondent turn towards the first, then flip the
    # list so that it lines up with the forward order of the turns.
    returns = []
    tail = 0.0
    for score in reversed(scores):
        tail = score + gamma * tail
        returns.append(tail)
    returns.reverse()

    for turn, value in zip(respondent_turns, returns):
        turn["return_to_go"] = value

    # Strip the entity bookkeeping fields from every turn, whatever its role.
    for turn in turns:
        for key in ("factual_accumulated_entities", "factual_novelty_entities"):
            turn.pop(key, None)

    return dialogue


def process_file(input_path, output_path, gamma: float = 1.0):
    """Annotate every dialogue stored in one JSON file and save the result.

    The input is read as UTF-8 JSON and iterated over; each item is passed
    through ``add_rtg_to_dialogue``. The annotated items are written to
    ``output_path`` as a UTF-8 JSON list, indented by two spaces and with
    non-ASCII characters left unescaped.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path of the JSON file to write.
        gamma: Discount factor forwarded to ``add_rtg_to_dialogue``.
    """
    with open(input_path, "r", encoding="utf-8") as src:
        loaded = json.load(src)

    annotated = []
    for dialogue in loaded:
        annotated.append(add_rtg_to_dialogue(dialogue, gamma=gamma))

    # The output file is only opened once every dialogue has been processed.
    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(annotated, dst, ensure_ascii=False, indent=2)


def process_split(input_folder, output_folder, gamma: float = 1.0):
    """Process every ``*.json`` file of one dataset split.

    Each file directly inside ``input_folder`` whose name ends with ``.json``
    is passed to ``process_file`` and written under the same name in
    ``output_folder``, which is created if necessary. One progress line is
    printed per file.

    Args:
        input_folder: Directory containing the split's JSON files.
        output_folder: Directory that receives the processed files.
        gamma: Discount factor forwarded to ``process_file``.

    Raises:
        FileNotFoundError: If ``input_folder`` does not exist (raised by
            ``os.listdir``).
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort them so that files are
    # processed (and logged) in the same order on every run and machine.
    json_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".json"))

    for json_file in json_files:
        input_path = os.path.join(input_folder, json_file)
        output_path = os.path.join(output_folder, json_file)
        process_file(input_path, output_path, gamma=gamma)
        print(f"Processed {json_file} -> {output_path}")





In [5]:
# Procedure: annotate every split of the dataset with discounted returns-to-go.

# Discount factor applied to future rewards. This deliberately overrides the
# default of 1.0 (no discounting) declared by the processing functions.
RTG_GAMMA = 0.9

splits = ["train", "dev", "test"]
for split in splits:
    input_split_folder = os.path.join(base_folder, split)
    output_split_folder = os.path.join(output_folder, split)
    process_split(input_split_folder, output_split_folder, gamma=RTG_GAMMA)




Processed train-001.json -> /data/sequential-ieas/data/yield-v1-small10pct-factualnovelty-with-rtg/train/train-001.json
Processed train-000.json -> /data/sequential-ieas/data/yield-v1-small10pct-factualnovelty-with-rtg/train/train-000.json
Processed dev-000.json -> /data/sequential-ieas/data/yield-v1-small10pct-factualnovelty-with-rtg/dev/dev-000.json
Processed test-000.json -> /data/sequential-ieas/data/yield-v1-small10pct-factualnovelty-with-rtg/test/test-000.json
