In [2]:
import json
from itertools import combinations

from datasets import load_dataset

def build_structures(samples):
    """Build contrastive records for every pair of samples that share a word.

    For each unordered pair of samples whose word sets overlap, and for each
    shared word, every word unique to the first sample is combined with every
    word unique to the second sample. Each combination yields two records:

    * ``entity_pair = [word1, common]`` with an ``alt_first`` entry, and
    * ``entity_pair = [common, word1]`` with an ``alt_second`` entry,

    where ``connection`` is the first sample's label and the ``alt_*`` entry
    holds the second sample's unique word together with its label.

    Args:
        samples: Sequence of ``(label, words, original_sample)`` tuples, where
            ``words`` is a set of strings.

    Returns:
        A list of dict records. Pairs are visited in the order of ``samples``
        and words in sorted order, so the result is identical on every run.
    """
    structures = []

    # combinations() yields (samples[i], samples[j]) for i < j, in index order.
    for first, second in combinations(samples, 2):
        label1, words1, sample1 = first
        label2, words2, sample2 = second

        common_words = words1 & words2
        if not common_words:
            continue

        # Sets of str iterate in an order that depends on hash randomization;
        # sorting makes the generated file reproducible between runs.
        unique1 = sorted(words1 - common_words)
        unique2 = sorted(words2 - common_words)
        if not unique1 or not unique2:
            continue

        for common in sorted(common_words):
            for word1 in unique1:
                for word2 in unique2:
                    # Separate dict/list objects per record so that mutating
                    # one record never affects another.
                    structures.append({
                        "entity_pair": [word1, common],
                        "connection": label1,
                        "alt_first": {
                            "entity": word2,
                            "connection": label2,
                        },
                        "origin": [sample1, sample2],
                    })
                    structures.append({
                        "entity_pair": [common, word1],
                        "connection": label1,
                        "alt_second": {
                            "entity": word2,
                            "connection": label2,
                        },
                        "origin": [sample1, sample2],
                    })

    return structures


# True both when run as a script and inside a notebook kernel, so the cell
# still executes and defines the same globals; importing the file as a module
# no longer triggers the download and the file write.
if __name__ == "__main__":
    # Load all splits
    dataset = load_dataset("anishthalamati/nyt-connections")

    # Combine all splits into one list of samples
    all_samples = [
        sample
        for split_name in dataset
        for sample in dataset[split_name]
    ]

    # Preprocess into (label, set of words, original sample)
    samples = [
        (sample["label"], set(sample["text"].split()), sample)
        for sample in all_samples
    ]

    # Output list
    new_structures = build_structures(samples)

    print(f"✅ Generated {len(new_structures)} new structured examples from all splits.")

    # Save to JSONL (one JSON object per line)
    output_file = "../../datasets/nyt_connections_common.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in new_structures)

    print(f"📝 Saved full structured dataset to {output_file}")

✅ Generated 25594 new structured examples from all splits.
📝 Saved full structured dataset to ../../datasets/nyt_connections_common.jsonl
