In [39]:
import json
import os
from collections import Counter
from typing import List, Dict

DATASET_PATH = "../datasets/slurp"
CLEANED_DATASET_PATH = "../cleaned-datasets/slurp"
datasets = ['devel.jsonl', 'test.jsonl', 'train.jsonl']

os.makedirs(CLEANED_DATASET_PATH, exist_ok=True)


In [40]:
def remove_fields(dataset_type: str):
    fields_to_remove =["slurp_id", "tokens", "recordings"]

    with open(f"{DATASET_PATH}/{dataset_type}", "r", encoding="utf-8") as inFile, \
        open(f"{CLEANED_DATASET_PATH}/{dataset_type}", "w", encoding="utf-8") as outFile:
        for line in inFile:
            print(line)
            if not line.strip():
                continue

            data = json.loads(line)
            for field in fields_to_remove:
                data.pop(field, None)
            outFile.write(json.dumps(data, ensure_ascii=False) + "\n")


In [41]:
for dataset in datasets:
    remove_fields(dataset)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [42]:
# drop all unique intents with occurrences < 30 from each split,
# then compute intersection across all 3 splits,
# and keep only intents in the intersection in each file

THRESHOLD = 30

def read_jsonl(rel_path: str) -> List[dict]:
    records: List[dict] = []
    with open(f"{CLEANED_DATASET_PATH}/{rel_path}", "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if not s:
                continue
            records.append(json.loads(s))
    return records


def write_jsonl(rel_path: str, records: List[dict]) -> None:
    with open(f"{CLEANED_DATASET_PATH}/{rel_path}", "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def intent_counts(records: List[dict]) -> Counter:
    c = Counter()
    for r in records:
        c[r.get("intent")] += 1
    return c


loaded = {split: read_jsonl(split) for split in datasets}

counts = {split: intent_counts(loaded[split]) for split in datasets}

intents_above_threshold: Dict[str, set] = {
    split: {intent for intent, n in cnt.items() if n >= THRESHOLD}
    for split, cnt in counts.items()
}

print(f"Threshold: {THRESHOLD}")
print("\nIntents with count >= {THRESHOLD} per split:")
for split in datasets:
    print(f"  {split}: {len(intents_above_threshold[split])} unique intents")

intersection_intents = set.intersection(*(intents_above_threshold[split] for split in datasets))
print(f"\nIntersection of intents across all splits: {len(intersection_intents)} intents")
print(f"Intents in intersection: {sorted(intersection_intents)}")

filtered = {
    split: [rec for rec in loaded[split] if rec.get("intent") in intersection_intents]
    for split in datasets
}

print("\nFiltering results:")
for split in datasets:
    original_count = len(loaded[split])
    filtered_count = len(filtered[split])
    dropped = original_count - filtered_count
    print(f"  {split}: {original_count} -> {filtered_count} (dropped {dropped}, {100*dropped/original_count:.1f}%)")

for split in datasets:
    write_jsonl(split, filtered[split])

print("\nCleaned datasets have been overwritten with filtered data.")

Threshold: 30

Intents with count >= {THRESHOLD} per split:
  devel.jsonl: 24 unique intents
  test.jsonl: 35 unique intents
  train.jsonl: 56 unique intents

Intersection of intents across all splits: 24 intents
Intents in intersection: ['alarm_set', 'calendar_query', 'calendar_remove', 'calendar_set', 'cooking_recipe', 'datetime_query', 'email_query', 'email_sendemail', 'general_quirky', 'lists_query', 'lists_remove', 'music_query', 'news_query', 'play_audiobook', 'play_music', 'play_podcasts', 'play_radio', 'qa_currency', 'qa_definition', 'qa_factoid', 'recommendation_locations', 'social_post', 'transport_query', 'weather_query']

Filtering results:
  devel.jsonl: 2033 -> 1489 (dropped 544, 26.8%)
  test.jsonl: 2974 -> 2153 (dropped 821, 27.6%)
  train.jsonl: 11514 -> 8129 (dropped 3385, 29.4%)

Cleaned datasets have been overwritten with filtered data.
