In [4]:
import json
import random

# --- File locations ---
INPUT_FILE = "generated_dialogues.jsonl"   # source dataset, one JSON object per line; replace with your actual file name
OUTPUT_FILE = "updated_dataset.jsonl"      # dataset written after the removal
REMOVED_FILE = "removed_entries.jsonl"     # entries that were taken out of the dataset

# --- Selection criteria ---
# An entry is a removal candidate only when it lists every one of these services.
TARGET_SERVICES = {"attraction", "restaurant"}

# --- Reproducibility ---
# Seed the module-level RNG once so the random selection is repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

def standardize_services(services_list):
    """
    Normalize service names so that spelling variants compare equal.

    Each name is lowercased, stripped of surrounding whitespace and, when it
    looks like a simple plural, reduced to its singular form by dropping one
    trailing 's' (e.g. 'Restaurants' -> 'restaurant').  Names that end in 's'
    but are already singular are left alone: 'bus' and any name ending in
    'ss' (e.g. 'address').

    Args:
        services_list: Iterable of service-name strings.  ``None`` or an empty
            iterable yields an empty set; a single string is treated as one
            service name rather than being iterated character by character.

    Returns:
        set: The distinct standardized service names.
    """
    if not services_list:
        return set()
    if isinstance(services_list, str):
        services_list = [services_list]

    standardized = set()
    for service in services_list:
        name = service.lower().strip()
        looks_plural = (
            name.endswith('s')
            and not name.endswith('ss')
            and name != 'bus'  # 'bus' is already singular
        )
        if looks_plural:
            # Drop exactly one 's'; rstrip('s') would remove every trailing 's'.
            name = name[:-1]
        standardized.add(name)
    return standardized

def main():
    """
    Remove a random half of the entries that contain every target service.

    The input JSONL file is read twice:

    1. The first pass parses every non-blank line and records the line number
       of each entry whose standardized 'services' include all of
       TARGET_SERVICES and that has a truthy 'dialogue_id'.
    2. Half of those candidates (rounded down) are chosen with
       ``random.sample``; the second pass copies the chosen lines to
       REMOVED_FILE and every other non-blank line to OUTPUT_FILE.

    Candidates are tracked by line number rather than by 'dialogue_id', so
    exactly ``num_to_remove`` lines are removed even when ids repeat in the
    dataset.  Blank lines in the input are ignored instead of aborting the
    run with a JSON decode error.

    If no entry matches, nothing is written and the function returns early.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        OSError: If any of the files cannot be opened.
    """
    # First pass: collect the line numbers of all candidate entries.
    candidate_lines = []
    total_entries = 0

    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            total_entries += 1
            entry = json.loads(line)
            # `or []` also covers an explicit `"services": null`.
            services = entry.get('services') or []
            if not TARGET_SERVICES.issubset(standardize_services(services)):
                continue
            # Entries without a dialogue_id are never candidates for removal.
            if entry.get('dialogue_id'):
                candidate_lines.append(line_number)

    total_matching = len(candidate_lines)
    print(f"Total entries in dataset: {total_entries}")
    print(f"Entries with both {TARGET_SERVICES}: {total_matching}")

    if total_matching == 0:
        print("No matching entries found. No removal performed.")
        return

    # Determine number of entries to remove (50%, rounded down).
    num_to_remove = total_matching // 2
    print(f"Number of entries to remove (50%): {num_to_remove}")

    # Randomly pick which candidate lines to remove.
    lines_to_remove = set(random.sample(candidate_lines, num_to_remove))
    print("Selected entries for removal.")

    # Second pass: route each line to the updated or the removed file.
    # Line numbers identify the entries, so no JSON parsing is needed here.
    updated_count = 0
    removed_count = 0

    with open(INPUT_FILE, 'r', encoding='utf-8') as fin, \
         open(OUTPUT_FILE, 'w', encoding='utf-8') as fout, \
         open(REMOVED_FILE, 'w', encoding='utf-8') as fremoved:

        for line_number, line in enumerate(fin, start=1):
            if not line.strip():
                continue  # keep both passes consistent about blank lines
            if line_number in lines_to_remove:
                fremoved.write(line)
                removed_count += 1
            else:
                fout.write(line)
                updated_count += 1

    print(f"Updated dataset size: {updated_count}")
    print(f"Removed entries size: {removed_count}")
    print(f"Updated dataset saved to {OUTPUT_FILE}")
    print(f"Removed entries saved to {REMOVED_FILE}")

# Run the removal only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Total entries in dataset: 50688
Entries with both {'attraction', 'restaurant'}: 2532
Number of entries to remove (50%): 1266
Selected entries for removal.
Updated dataset size: 49422
Removed entries size: 1266
Updated dataset saved to updated_dataset.jsonl
Removed entries saved to removed_entries.jsonl
