In [1]:
# ---------------- Imports ----------------
import os
import json
import sys

import yaml

from transformers import AutoTokenizer



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------- Args ----------------
# Windowing / length budget used when building fine-tuning samples.
WINDOW_SIZE = 6    # number of merged role blocks per sliding window
TOKEN_LIMIT = 512  # max tokens allowed per emitted window (chat-templated)

# Dataset folder name (under the project data path) and tokenizer model id.
dataset = "yield-v1-small10pct"
tokenizer_model = "meta-llama/Llama-3.1-8B-Instruct"


In [3]:
# ---------------- Config ----------------
# NOTE: this path is resolved relative to the kernel's working directory.
# The encoding is given explicitly so that reading the YAML does not depend on
# the platform's default locale encoding (matches the other open() calls here).
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

data_path = os.path.join(config["paths"]["proj_store"], "data")
models_folderpath = config["paths"]["models"]

# Input dataset folder, and a sibling "<dataset>-finetuning" folder for the output.
base_folder = os.path.join(data_path, dataset)
output_folder = f"{base_folder}-finetuning"
os.makedirs(output_folder, exist_ok=True)

# Initialize tokenizer globally (read by process_dialogue).
# local_files_only=True restricts loading to files already present under MODEL_NAME.
MODEL_NAME = os.path.join(models_folderpath, tokenizer_model)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)



In [4]:
def _merge_consecutive_turns(turns, start):
    """Merge runs of consecutive same-role turns, beginning at index ``start``.

    Returns a pair ``(merged_blocks, block_start_indices)`` where
    ``merged_blocks`` is a list of ``(role, text)`` tuples (utterances of one
    run are stripped and joined with a blank line) and
    ``block_start_indices[k]`` is the index in ``turns`` of the first turn of
    block ``k``.
    """
    runs = []  # each entry: (role, [stripped utterances], index of first turn)
    for position in range(start, len(turns)):
        turn = turns[position]
        role = turn["role"]
        utterance = turn["utterance"].strip()
        if runs and runs[-1][0] == role:
            runs[-1][1].append(utterance)
        else:
            runs.append((role, [utterance], position))

    merged_blocks = [(role, "\n\n".join(parts)) for role, parts, _ in runs]
    block_start_indices = [first for _, _, first in runs]
    return merged_blocks, block_start_indices


def process_dialogue(dialogue, file_path, *, window_size=None, token_limit=None,
                     chat_tokenizer=None):
    """Turn one dialogue into chat-formatted sliding-window training samples.

    Leading "elicitor" turns are skipped so the material starts at the first
    non-elicitor turn. Consecutive turns of the same role are merged into one
    block, then a window of ``window_size`` blocks slides over the blocks one
    block at a time. A window is emitted only if its last block is an
    "elicitor" block and its chat-templated text has at most ``token_limit``
    tokens. "elicitor" maps to the chat role "assistant" and "respondent" to
    "user"; a system prompt built from the dialogue's domain is prepended.

    Args:
        dialogue: dict with "turns" (list of dicts with "role" and
            "utterance"), and optionally "domain" and "dialogue_id".
        file_path: source file of the dialogue; used only in error messages.
        window_size: blocks per window; defaults to the global WINDOW_SIZE.
        token_limit: max tokens per window; defaults to the global TOKEN_LIMIT.
        chat_tokenizer: object providing ``apply_chat_template`` and a
            tokenizing ``__call__``; defaults to the global ``tokenizer``.

    Returns:
        List of dicts with keys "block_id" ("<dialogue_id>:<index of the
        window's first turn>"), "domain" and "messages".

    Raises:
        ValueError: if the dialogue has no turns, consists only of elicitor
            turns, ``window_size`` is smaller than 1, or an emitted window
            contains a role other than "elicitor"/"respondent".
    """
    turns = dialogue.get("turns", [])
    if not turns:
        raise ValueError(f"Dialogue {dialogue.get('dialogue_id')} in {file_path} has no turns.")

    domain = dialogue.get("domain", "unknown").replace("_", " ")
    system_prompt = f"Act as an information elicitation agent for {domain}."

    # Skip leading elicitor turns.
    index = 0
    while index < len(turns) and turns[index]["role"] == "elicitor":
        index += 1

    if index >= len(turns):
        raise ValueError(f"Dialogue {dialogue.get('dialogue_id')} in {file_path} has no respondent turn to start with.")

    # Fall back to the notebook-level settings only when no override is given.
    if window_size is None:
        window_size = WINDOW_SIZE
    if token_limit is None:
        token_limit = TOKEN_LIMIT
    if chat_tokenizer is None:
        chat_tokenizer = tokenizer

    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    merged_blocks, block_start_indices = _merge_consecutive_turns(turns, index)

    results = []

    # Sliding window over merged blocks (empty when there are fewer blocks
    # than window_size).
    for window_start in range(len(merged_blocks) - window_size + 1):
        window = merged_blocks[window_start : window_start + window_size]

        # Only keep windows whose last block is elicitor -> assistant.
        if window[-1][0] != "elicitor":
            continue

        messages = [{"role": "system", "content": system_prompt}]
        for role, content in window:
            if role == "elicitor":
                messages.append({"role": "assistant", "content": content})
            elif role == "respondent":
                messages.append({"role": "user", "content": content})
            else:
                raise ValueError(f"Unexpected role: {role}")

        # Format the conversation using the chat template, then tokenize the
        # rendered text to count tokens.
        chat_text = chat_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        tokens = chat_tokenizer(chat_text, return_tensors=None, add_special_tokens=False)["input_ids"]

        # Windows over the token budget are dropped.
        if len(tokens) <= token_limit:
            results.append({
                "block_id": f"{dialogue.get('dialogue_id')}:{block_start_indices[window_start]}",
                "domain": dialogue.get('domain'),
                "messages": messages
            })

    return results




def process_file(input_file):
    """Load a JSON file of dialogues and return its training windows as JSON lines.

    Args:
        input_file: path to a UTF-8 JSON file whose top-level value is a list
            of dialogue dicts.

    Returns:
        List of strings, one per window returned by ``process_dialogue``, each
        serialized with ``json.dumps(..., ensure_ascii=False)``. Note the list
        holds one entry per window, not per dialogue.

    Raises:
        ValueError: if the file's top-level JSON value is not a list.
        RuntimeError: if processing any dialogue fails; the original exception
            is attached as ``__cause__``.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)

    if not isinstance(dialogues, list):
        raise ValueError(f"Expected list of dialogues in file {input_file}, found {type(dialogues)}")

    serialized_windows = []

    for dialogue in dialogues:
        try:
            for window in process_dialogue(dialogue, input_file):
                serialized_windows.append(json.dumps(window, ensure_ascii=False))
        except Exception as e:
            # Chain explicitly so the underlying traceback stays attached.
            raise RuntimeError(f"Error in {input_file}: {e}") from e

    return serialized_windows



def process_split(input_folder, output_folder):
    """Convert every ``*.json`` file in ``input_folder`` to a ``.jsonl`` file.

    Each input file is passed to ``process_file`` and the returned lines are
    written, one per line, to a file of the same base name with a ``.jsonl``
    extension inside ``output_folder`` (created if it does not exist). Files
    are handled in sorted name order so runs are deterministic.

    Args:
        input_folder: directory containing the ``.json`` dialogue files.
        output_folder: directory that receives the ``.jsonl`` files.
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sort for a stable processing/log order.
    json_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.json'))

    for json_file in json_files:
        file_path = os.path.join(input_folder, json_file)
        processed = process_file(file_path)

        # Swap only the trailing extension; str.replace would also rewrite
        # any other ".json" occurrence inside the file name.
        output_name = os.path.splitext(json_file)[0] + '.jsonl'
        output_file_path = os.path.join(output_folder, output_name)

        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            f_out.writelines(line + '\n' for line in processed)

        # The count is the number of emitted lines (one per window).
        print(f"Processed {len(processed)} dialogues from {json_file} into {output_file_path}")




In [5]:
# Procedure
# Run the conversion for every split, mirroring the split sub-folder layout
# of the input dataset under the output folder.
splits = ["train", "dev", "test"]
os.makedirs(output_folder, exist_ok=True)

for split in splits:
    input_split_folder, output_split_folder = (
        os.path.join(root, split) for root in (base_folder, output_folder)
    )
    os.makedirs(output_split_folder, exist_ok=True)
    process_split(input_split_folder, output_split_folder)


Processed 2545 dialogues from train-001.json into /data/yield-v1/data/yield-v1-small10pct-finetuning/train/train-001.jsonl
Processed 8725 dialogues from train-000.json into /data/yield-v1/data/yield-v1-small10pct-finetuning/train/train-000.jsonl
Processed 2050 dialogues from dev-000.json into /data/yield-v1/data/yield-v1-small10pct-finetuning/dev/dev-000.jsonl
Processed 1051 dialogues from test-000.json into /data/yield-v1/data/yield-v1-small10pct-finetuning/test/test-000.jsonl
