In [1]:
import pandas as pd
import json
import random
import re
import os
from pathlib import Path

## Load all CSV files into DataFrames

In [2]:
# Directory layout: the project root is taken to be the parent of the current
# working directory, with all inputs living under <root>/data/raw.
project_root = str(Path.cwd().resolve().parent)
raw_data_dir = os.path.join(project_root, "data", "raw")
# Entity value lists and intent templates sit in sibling sub-folders of raw_data_dir.
entities_dir, intents_dir = (
    os.path.join(raw_data_dir, subfolder) for subfolder in ("entities", "intents")
)

In [3]:
# --- Load Entity Value Lists ---
def _load_entity_values(filename):
    """Read one entity CSV from ``entities_dir``.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``entities_dir``.

    Returns
    -------
    tuple
        ``(frame, values)``: the loaded DataFrame and the list of non-missing
        values from its ``text`` column.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    KeyError
        If the CSV has no ``text`` column.
    """
    # Read ``text`` as strings: these values are later substituted into text
    # templates, and purely numeric entries would otherwise be parsed as numbers.
    frame = pd.read_csv(os.path.join(entities_dir, filename), dtype={"text": str})
    # Blank cells are read as missing values; they cannot be injected, so drop them.
    return frame, frame["text"].dropna().tolist()


try:
    practitioner_df, practitioner_list = _load_entity_values("practitioner_name.csv")
    appointment_type_df, appointment_type_list = _load_entity_values("appointment_type.csv")
    # Load the pre-generated appointment IDs
    appointment_id_df, appointment_id_list = _load_entity_values("appointment_id.csv")

    print(f"Loaded {len(practitioner_list)} practitioners.")
    print(f"Loaded {len(appointment_type_list)} appointment types.")
    print(f"Loaded {len(appointment_id_list)} pre-generated appointment IDs.")

except FileNotFoundError as e:
    print(f"Error loading entity CSVs: {e}")
    print("Please ensure you have run id_generator.py and all entity files are in place.")
    # Stop here: continuing would leave the entity lists undefined (or stale from
    # a previous kernel run) and fail later with a less helpful error.
    raise


# --- Load Intent Template DataFrames ---
# Maps intent name -> DataFrame read from "<intent name>.csv" in intents_dir.
intent_dfs = {}
intent_files = [
    # intents whose templates are processed for entity placeholders
    "schedule",
    "reschedule",
    "cancel",
    "query_avail",
    # intents whose rows are used as-is
    "greeting",
    "bye",
    "positive_reply",
    "negative_reply",
    "oos",
]

for intent_name in intent_files:
    path = os.path.join(intents_dir, f"{intent_name}.csv")
    try:
        intent_dfs[intent_name] = pd.read_csv(path)
    except FileNotFoundError:
        # Best effort: a missing template file is reported and left out of intent_dfs.
        print(f"Warning: Could not find '{intent_name}.csv'. Skipping.")
    else:
        print(f"Loaded '{intent_name}.csv' with {len(intent_dfs[intent_name])} rows.")

Loaded 149 practitioners.
Loaded 147 appointment types.
Loaded 1500 pre-generated appointment IDs.
Loaded 'schedule.csv' with 139 rows.
Loaded 'reschedule.csv' with 143 rows.
Loaded 'cancel.csv' with 142 rows.
Loaded 'query_avail.csv' with 140 rows.
Loaded 'greeting.csv' with 150 rows.
Loaded 'bye.csv' with 150 rows.
Loaded 'positive_reply.csv' with 300 rows.
Loaded 'negative_reply.csv' with 150 rows.
Loaded 'oos.csv' with 1350 rows.


## Process Intents and Inject Entities

In [4]:
# Accumulates one {"text", "intent", "entities"} record per training sample.
final_dataset = []

# --- Process Simple Intents ---
# These intents carry no entities: each CSV row is copied over with an empty entity list.
simple_intents = ["greeting", "bye", "positive_reply", "negative_reply", "oos"]
for intent_name in simple_intents:
    simple_frame = intent_dfs.get(intent_name)
    if simple_frame is None:
        # Template file was not loaded; nothing to add for this intent.
        continue
    final_dataset.extend(
        {"text": row['text'], "intent": row['intent'], "entities": []}
        for _, row in simple_frame.iterrows()
    )
print(f"Processed {len(final_dataset)} entries from simple intents.")

# --- Process Complex Intents ---
complex_intents = ["schedule", "reschedule", "cancel", "query_avail"]

# The entity map now includes appointment_id from the start
entity_map = {
    'practitioner_name': practitioner_list,
    'appointment_type': appointment_type_list,
    'appointment_id': appointment_id_list
}

# Dedicated, seeded generator so the injected values are the same on every run
# of this cell, without touching the global ``random`` state.
INJECTION_SEED = 42
injection_rng = random.Random(INJECTION_SEED)

# A placeholder is any ``{label}`` span in a template.
PLACEHOLDER_PATTERN = re.compile(r"\{(.+?)\}")


def inject_entities(template, entity_map, rng=random):
    """Replace ``{label}`` placeholders in ``template`` with random entity values.

    The template is scanned once, left to right. Offsets are computed while the
    output is being assembled, so they always refer to the returned text, even
    if an injected value or an unknown placeholder happens to contain
    placeholder-like text.

    Parameters
    ----------
    template : str
        Text containing zero or more ``{label}`` placeholders.
    entity_map : dict
        Maps a placeholder label to the list of candidate replacement strings.
    rng : object with a ``choice`` method, optional
        Source of randomness; defaults to the ``random`` module.

    Returns
    -------
    tuple
        ``(text, entities)`` where ``entities`` is a list of
        ``{"start", "end", "label"}`` dicts ordered by ``start``; ``start`` and
        ``end`` are character offsets into ``text`` (``end`` exclusive).

    Raises
    ------
    IndexError
        If a known label maps to an empty list of values.
    """
    pieces = []
    entities = []
    consumed = 0   # how much of the template has been copied so far
    emitted = 0    # length of the output text assembled so far

    for match in PLACEHOLDER_PATTERN.finditer(template):
        literal = template[consumed:match.start()]
        pieces.append(literal)
        emitted += len(literal)

        label = match.group(1)
        if label in entity_map:
            replacement = rng.choice(entity_map[label])
            entities.append({
                "start": emitted,
                "end": emitted + len(replacement),
                "label": label,
            })
        else:
            print(f"Warning: Found unknown placeholder '{label}'")
            # Unknown placeholders are left in the text exactly as written.
            replacement = match.group(0)

        pieces.append(replacement)
        emitted += len(replacement)
        consumed = match.end()

    pieces.append(template[consumed:])
    return "".join(pieces), entities


for intent_name in complex_intents:
    if intent_name not in intent_dfs:
        continue
    complex_frame = intent_dfs[intent_name]
    for template, intent_label in zip(complex_frame['text'], complex_frame['intent']):
        injected_text, entities = inject_entities(template, entity_map, injection_rng)
        final_dataset.append({
            "text": injected_text,
            "intent": intent_label,
            # Already ordered by start offset because of the left-to-right scan.
            "entities": entities
        })
print(f"Processing complete. Total entries in dataset: {len(final_dataset)}.")

Processed 2100 entries from simple intents.
Processing complete. Total entries in dataset: 2664.


## Shuffle and Save the final JSONL dataset

In [5]:
# Randomise the order of the samples in place before writing them out.
random.shuffle(final_dataset)
print("Dataset shuffled.")

# JSONL output: one JSON object per line; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
output_path_jsonl = os.path.join(raw_data_dir, "dataset.jsonl")
with open(output_path_jsonl, mode='w', encoding='utf-8') as f:
    for entry in final_dataset:
        f.write(f"{json.dumps(entry, ensure_ascii=False)}\n")

print("✅ Success! Dataset correctly saved.")

Dataset shuffled.
✅ Success! Dataset correctly saved.
