In [2]:
import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from dataset_loader import *
from encoders import *

# Encoder instance handed to load_data through its `encoding` argument.
spectrogram_encoder = SpectrogramEncoder()

# Request up to 10,000 English/Turkish samples from the FLEURS training split.
# load_data returns a mapping of language -> dataset (it is iterated with
# .items() below).
encoded_datasets = load_data(
    dataset=["fleurs"],
    split="train",
    lang=['en', 'tr'],
    encoding=spectrogram_encoder,
    start_idx=0,
    num_samples=10000,
)

# Persist each language's dataset in its own directory, relative to the
# notebook's working directory.
for lang, dataset in encoded_datasets.items():
    if dataset is None:
        # No dataset for this language, so there is nothing to write.
        continue
    save_path = f"../datasets/fleurs/encoded_spectrogram/{lang}/train"
    print(f"Saving {lang} dataset to {save_path}...")
    dataset.save_to_disk(save_path)
    print(f"Saved {lang} dataset.")

Loading google/fleurs (en_us) from local storage: /media/zawiatgf/New Volume/Personal Files/Abdurrahman Zawia/University/Grad Project/Speech-To-Speech-Model/datasets/fleurs/en_us/train...
Loading google/fleurs (tr_tr) from local storage: /media/zawiatgf/New Volume/Personal Files/Abdurrahman Zawia/University/Grad Project/Speech-To-Speech-Model/datasets/fleurs/tr_tr/train...


Map: 100%|██████████| 2602/2602 [00:06<00:00, 374.64 examples/s]
Map: 100%|██████████| 2526/2526 [00:07<00:00, 339.49 examples/s]


Deduplicating en: 2602 -> 1476 unique IDs
Deduplicating tr: 2526 -> 1402 unique IDs
Found 1373 common IDs across 2 languages.


Filter: 100%|██████████| 1476/1476 [00:05<00:00, 263.60 examples/s]
Filter: 100%|██████████| 1402/1402 [00:04<00:00, 340.12 examples/s]

Applying encoding: SpectrogramEncoder



Encoding en with Spectrogram (num_proc=8): 100%|██████████| 1373/1373 [00:22<00:00, 60.09 examples/s]
Encoding tr with Spectrogram (num_proc=8): 100%|██████████| 1373/1373 [00:24<00:00, 56.51 examples/s]


Saving en dataset to ../datasets/fleurs/encoded_spectrogram/en/train...


Saving the dataset (2/2 shards): 100%|██████████| 1373/1373 [00:00<00:00, 1415.88 examples/s]


Saved en dataset.
Saving tr dataset to ../datasets/fleurs/encoded_spectrogram/tr/train...


Saving the dataset (2/2 shards): 100%|██████████| 1373/1373 [00:00<00:00, 1426.01 examples/s]

Saved tr dataset.





In [None]:
# Verify parallelism of the encoded datasets
from datasets import load_from_disk

def find_id_mismatches(ids_a, ids_b, limit=10):
    """Return positions where two ID sequences disagree.

    Only positions present in both sequences are compared, so a difference
    in length is NOT reported here and must be checked separately.

    Args:
        ids_a: First iterable of IDs.
        ids_b: Second iterable of IDs.
        limit: Stop scanning once this many mismatches have been collected.

    Returns:
        A list of ``(index, id_a, id_b)`` tuples, at most ``limit`` long.
    """
    mismatches = []
    for index, (id_a, id_b) in enumerate(zip(ids_a, ids_b)):
        if id_a != id_b:
            mismatches.append((index, id_a, id_b))
            if len(mismatches) >= limit:  # A handful is enough to diagnose
                break
    return mismatches


try:
    encoded_en = load_from_disk("../datasets/fleurs/encoded_spectrogram/en/train")
    encoded_tr = load_from_disk("../datasets/fleurs/encoded_spectrogram/tr/train")

    print(f"English encoded dataset size: {len(encoded_en)}")
    print(f"Turkish encoded dataset size: {len(encoded_tr)}")

    # Verify lengths
    lengths_match = len(encoded_en) == len(encoded_tr)
    if not lengths_match:
        print("WARNING: Datasets have different lengths!")
    else:
        print("Lengths match.")

    # Verify IDs match.
    # Read only the 'id' column instead of indexing whole rows: row access
    # would also fetch every other feature of each sample just to get its ID.
    ids_en = list(encoded_en["id"])
    ids_tr = list(encoded_tr["id"])

    # Compare every position both datasets share, so a shorter dataset can
    # never trigger an IndexError that hides the real problem.
    check_count = min(len(ids_en), len(ids_tr))

    print(f"Checking ID alignment for {check_count} samples...")

    mismatches = find_id_mismatches(ids_en, ids_tr)

    if mismatches:
        print(f"FAILURE: Found mismatches. First few (Index, EN, TR): {mismatches}")
    elif not lengths_match:
        # Matching IDs over the shared prefix are not enough: leftover samples
        # in one language have no counterpart in the other.
        print("FAILURE: IDs match over the shared prefix, but the datasets have different lengths, so they are not parallel.")
    else:
        print("SUCCESS: All checked IDs match. The encoded datasets are parallel.")
        # Show a few examples
        for i in range(min(5, check_count)):
            print(f"Sample {i}: ID {ids_en[i]}")

except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print(f"Could not verify datasets: {e}")
    print("Make sure you have run the previous cell to encode and save the datasets first.")

English encoded dataset size: 1373
Turkish encoded dataset size: 1373
Lengths match.
Checking ID alignment for 1373 samples...
SUCCESS: All checked IDs match. The encoded datasets are parallel.
Sample 0: ID 1
Sample 1: ID 2
Sample 2: ID 5
Sample 3: ID 6
Sample 4: ID 7


In [3]:
import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

from dataset_loader import *

# Request up to 50,000 English/German samples from the Seamless Align training
# split. No `encoding` argument is passed in this call.
dataset = load_data(
    dataset=["seamless_align"],
    split="train",
    lang=['en', 'de'],
    start_idx=0,
    num_samples=50000,
)

# Disabled: writing the per-language datasets to storage.
# NOTE(review): if this is re-enabled, rename the loop variable first. As
# written it rebinds `dataset` (the mapping being iterated), so after the loop
# `dataset` would no longer be the language -> dataset mapping.
# # Save the encoded datasets to storage
# for lang, dataset in dataset.items():
#     if dataset is not None:
#         save_path = f"../datasets/seamless_align/encoded_spectrogram/{lang}/train"
#         print(f"Saving {lang} dataset to {save_path}...")
#         dataset.save_to_disk(save_path)
#         print(f"Saved {lang} dataset.")

Seamless Align: Detected pair 'deA-enA' for language 'en_us'.
Loading jhu-clsp/seamless-align-expressive (deA-enA) from local storage: /media/zawiatgf/New Volume/Personal Files/Abdurrahman Zawia/University/Grad Project/Speech-To-Speech-Model/datasets/seamless_align/deA-enA/train...
Seamless Align: Detected pair 'deA-enA' for language 'de_de'.
Loading jhu-clsp/seamless-align-expressive (deA-enA) from local storage: /media/zawiatgf/New Volume/Personal Files/Abdurrahman Zawia/University/Grad Project/Speech-To-Speech-Model/datasets/seamless_align/deA-enA/train...


Map: 100%|██████████| 50000/50000 [05:26<00:00, 153.04 examples/s]
Map: 100%|██████████| 50000/50000 [06:00<00:00, 138.65 examples/s]
Map: 100%|██████████| 50000/50000 [06:15<00:00, 133.14 examples/s]
Map: 100%|██████████| 50000/50000 [06:10<00:00, 135.12 examples/s]
Filter: 100%|██████████| 50000/50000 [22:22<00:00, 37.24 examples/s]


Removed 948 samples with invalid audio in en.


Filter: 100%|██████████| 50000/50000 [23:22<00:00, 35.64 examples/s]


Removed 1743 samples with invalid audio in de.
Found 47709 common IDs across 2 languages.


Filter: 100%|██████████| 49052/49052 [21:14<00:00, 38.49 examples/s]
Filter: 100%|██████████| 48257/48257 [22:21<00:00, 35.97 examples/s]


In [5]:
# Imported locally because NumPy is not imported by any earlier cell shown here.
import numpy as np


def audio_arrays_identical(first, second):
    """Return True when two audio sample sequences hold exactly the same values.

    Works for plain Python lists and for NumPy arrays. Using ``==`` directly
    inside an ``if`` is only safe for lists: on NumPy arrays it yields an
    element-wise result whose truth value is ambiguous.

    Args:
        first: Sequence or array of audio samples.
        second: Sequence or array of audio samples.

    Returns:
        bool: True if both inputs have the same shape and equal elements.
    """
    # Cheap rejection first: differing lengths can never be identical, and
    # this avoids converting long lists to arrays for nothing.
    if len(first) != len(second):
        return False
    return bool(np.array_equal(first, second))


# Look the two splits up once instead of on every iteration.
en_split = dataset['en']
de_split = dataset['de']

# Row indices whose English and German audio are sample-for-sample identical;
# kept in a list so later cells can inspect or filter these rows.
identical_indices = []

# Only walk the positions both splits share, so a shorter split cannot raise
# an IndexError. Note that `i` is the row index, not the sample's 'id' field.
for i in range(min(len(en_split), len(de_split))):
    if audio_arrays_identical(en_split[i]['audio']['array'], de_split[i]['audio']['array']):
        identical_indices.append(i)
        print(f"Audio arrays are at id: {i} IDENTICAL.")

# NOTE(review): `en_data` / `de_data` are not defined in the cells visible here.
# play_audio(en_data[2])
# play_audio(de_data[2])

Audio arrays are at id: 669IDENTICAL.
Audio arrays are at id: 1101IDENTICAL.
Audio arrays are at id: 1109IDENTICAL.
Audio arrays are at id: 1475IDENTICAL.
Audio arrays are at id: 2369IDENTICAL.
Audio arrays are at id: 2788IDENTICAL.
Audio arrays are at id: 2838IDENTICAL.
Audio arrays are at id: 3517IDENTICAL.
Audio arrays are at id: 4001IDENTICAL.
Audio arrays are at id: 4189IDENTICAL.
Audio arrays are at id: 4759IDENTICAL.
Audio arrays are at id: 5011IDENTICAL.
Audio arrays are at id: 5119IDENTICAL.
Audio arrays are at id: 5173IDENTICAL.
Audio arrays are at id: 5777IDENTICAL.
Audio arrays are at id: 6263IDENTICAL.
Audio arrays are at id: 7263IDENTICAL.
Audio arrays are at id: 7280IDENTICAL.
Audio arrays are at id: 7580IDENTICAL.
Audio arrays are at id: 8622IDENTICAL.
Audio arrays are at id: 9974IDENTICAL.
Audio arrays are at id: 11737IDENTICAL.
Audio arrays are at id: 13524IDENTICAL.
Audio arrays are at id: 14866IDENTICAL.
Audio arrays are at id: 16342IDENTICAL.
Audio arrays are at id