In [None]:
import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from dataset_loader import *
from encoders import *

# Encode the English and Turkish "train" splits with the spectrogram encoder.
# `load_data` returns a mapping of language code -> encoded dataset; an entry
# may be None, in which case there is nothing to persist for that language.
spectrogram_encoder = SpectrogramEncoder()
encoded_datasets = load_data(
    start_idx=0,
    num_samples=10000,
    encoding=spectrogram_encoder,
    lang=['en', 'tr'],
    split="train",
)

# Persist every encoded split under its own per-language directory.
for lang, encoded_split in encoded_datasets.items():
    if encoded_split is None:
        # No dataset came back for this language; skip it.
        continue

    save_path = f"../datasets/fleurs/encoded_spectrogram/{lang}/train"
    print(f"Saving {lang} dataset to {save_path}...")
    encoded_split.save_to_disk(save_path)
    print(f"Saved {lang} dataset.")

  from .autonotebook import tqdm as notebook_tqdm


Loading google/fleurs (en_us) from local storage: ./datasets/fleurs/en_us/train...
Loading google/fleurs (tr_tr) from local storage: ./datasets/fleurs/tr_tr/train...
Deduplicating en: 2602 -> 1476 unique IDs
Deduplicating tr: 2526 -> 1402 unique IDs
Found 1373 common IDs across 2 languages.


Filter: 100%|██████████| 1476/1476 [00:04<00:00, 337.28 examples/s]
Filter: 100%|██████████| 1402/1402 [00:02<00:00, 502.36 examples/s]

Applying encoding: SpectrogramEncoder



Encoding en with Spectrogram (num_proc=8): 100%|██████████| 1373/1373 [00:20<00:00, 66.20 examples/s]
Encoding tr with Spectrogram (num_proc=8): 100%|██████████| 1373/1373 [00:23<00:00, 58.02 examples/s]


Saving en dataset to ./datasets/fleurs/encoded_spectrogram/en/train...


Saving the dataset (2/2 shards): 100%|██████████| 1373/1373 [00:00<00:00, 1488.66 examples/s]


Saved en dataset.
Saving tr dataset to ./datasets/fleurs/encoded_spectrogram/tr/train...


Saving the dataset (2/2 shards): 100%|██████████| 1373/1373 [00:00<00:00, 1612.64 examples/s]

Saved tr dataset.





In [None]:
# Verify parallelism of the encoded datasets
from datasets import load_from_disk

# Load both encoded splits. Only the load is wrapped in the try: the hint
# printed on failure is about missing saved datasets, so it should not be shown
# for errors raised by the verification logic itself.
try:
    encoded_en = load_from_disk("../datasets/fleurs/encoded_spectrogram/en/train")
    encoded_tr = load_from_disk("../datasets/fleurs/encoded_spectrogram/tr/train")
except Exception as e:
    print(f"Could not verify datasets: {e}")
    print("Make sure you have run the previous cell to encode and save the datasets first.")
else:
    len_en, len_tr = len(encoded_en), len(encoded_tr)
    print(f"English encoded dataset size: {len_en}")
    print(f"Turkish encoded dataset size: {len_tr}")

    # Verify lengths
    lengths_match = len_en == len_tr
    if lengths_match:
        print("Lengths match.")
    else:
        print("WARNING: Datasets have different lengths!")

    # Verify IDs match.
    # Read the 'id' column once per dataset instead of indexing row by row, so
    # that a full row does not have to be fetched just to read its ID.
    ids_en = list(encoded_en["id"])
    ids_tr = list(encoded_tr["id"])

    # Only positions present in both datasets can be compared; bounding the
    # check by the shorter one avoids indexing past its end.
    check_count = min(len_en, len_tr)
    print(f"Checking ID alignment for {check_count} samples...")

    max_reported_mismatches = 10  # stop scanning once this many are collected
    mismatches = []
    for i, (id_en, id_tr) in enumerate(zip(ids_en, ids_tr)):
        if id_en != id_tr:
            mismatches.append((i, id_en, id_tr))
            if len(mismatches) >= max_reported_mismatches:
                break

    if mismatches:
        print(f"FAILURE: Found mismatches. First few (Index, EN, TR): {mismatches}")
    elif not lengths_match:
        # Matching IDs on the shared prefix are not enough: datasets of
        # different sizes cannot be parallel.
        print(
            f"FAILURE: IDs match for the first {check_count} samples, "
            "but the datasets have different lengths, so they are not parallel."
        )
    else:
        print("SUCCESS: All checked IDs match. The encoded datasets are parallel.")
        # Show a few examples
        for i, sample_id in enumerate(ids_en[:5]):
            print(f"Sample {i}: ID {sample_id}")

English encoded dataset size: 1373
Turkish encoded dataset size: 1373
Lengths match.
Checking ID alignment for 1373 samples...
SUCCESS: All checked IDs match. The encoded datasets are parallel.
Sample 0: ID 1
Sample 1: ID 2
Sample 2: ID 5
Sample 3: ID 6
Sample 4: ID 7
