In [4]:
import os
import pandas as pd
from datasets import Dataset, Audio

# Destination folder for the preprocessed chunks; created up front so that
# later writes can assume it exists.
output_dir = "preprocessed_audio_dataset_chunks_ca6"
os.makedirs(output_dir, exist_ok=True)

# Root folders of the datasets to preprocess. Each one is expected to hold a
# "validated.tsv" index and a "clips" sub-folder with the audio files.
data_dirs = [
    r"D:\Usuario\Desktop\Uni\4th year\Advanced Machine Learning\PROJECT GROUP\ca6\ca",
]

# Function to load and preprocess a single dataset
def load_local_dataset(data_dir):
    """Build a Hugging Face dataset of the clips listed in ``validated.tsv``.

    Args:
        data_dir: Directory containing ``validated.tsv`` and a ``clips``
            sub-directory holding the audio files the TSV refers to.

    Returns:
        A ``Dataset`` with a single ``path`` column cast to
        ``Audio(sampling_rate=16_000)``.
    """
    tsv_path = os.path.join(data_dir, "validated.tsv")

    # Only the "path" column is ever used, so parse just that column and
    # read it as text. This spares pandas from inferring dtypes for columns
    # that are thrown away and removes the need to subset the frame later.
    validated_data = pd.read_csv(
        tsv_path, sep="\t", usecols=["path"], dtype={"path": str}
    )

    # Turn the bare file names from the TSV into full paths under "clips".
    clips_dir = os.path.join(data_dir, "clips")
    validated_data["path"] = [
        os.path.join(clips_dir, file_name) for file_name in validated_data["path"]
    ]

    # Convert to Hugging Face Dataset
    dataset = Dataset.from_pandas(validated_data)

    # Casting to Audio makes the column resolve to audio data at 16 kHz
    # instead of plain path strings.
    dataset = dataset.cast_column("path", Audio(sampling_rate=16_000))
    return dataset

# Function to preprocess and save in chunks
def preprocess_and_save(dataset, chunk_size=1000, output_dir="preprocessed_audio_dataset"):
    """Process ``dataset`` in consecutive chunks and save each chunk to disk.

    For every chunk, the ``"array"`` entry of each example's ``path`` value
    is copied into a new ``audio`` column and the ``path`` column is
    dropped. Chunk ``k`` is saved under ``<output_dir>/chunk_<k>.arrow``.

    Args:
        dataset: Object supporting ``len()``, ``select(indices)``, and whose
            selections support ``map(...)`` and ``save_to_disk(path)``
            (presumably a Hugging Face ``Dataset`` whose ``path`` column was
            cast to ``Audio``).
        chunk_size: Maximum number of examples per saved chunk.
        output_dir: Directory under which the chunk outputs are written.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently process nothing (negative).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Defined once rather than per chunk: it does not depend on the loop.
    def prepare_audio(batch):
        batch["audio"] = batch["path"]["array"]
        return batch

    total = len(dataset)
    for chunk_index, start in enumerate(range(0, total, chunk_size)):
        # Select a chunk of the dataset; the last one may be shorter.
        stop = min(start + chunk_size, total)
        chunk = dataset.select(range(start, stop))

        # Preprocess the chunk
        processed_chunk = chunk.map(prepare_audio, remove_columns=["path"])

        # Save the chunk to disk
        chunk_file = os.path.join(output_dir, f"chunk_{chunk_index}.arrow")
        processed_chunk.save_to_disk(chunk_file)
        print(f"Saved chunk {chunk_index} to {chunk_file}")

# Process each dataset directory
for dataset_index, data_dir in enumerate(data_dirs):
    dataset = load_local_dataset(data_dir)

    # Chunk numbering restarts at 0 for every dataset, so several datasets
    # written into the same folder would overwrite each other's chunks.
    # With more than one dataset, each gets its own sub-folder; a single
    # dataset keeps writing directly into output_dir.
    if len(data_dirs) > 1:
        target_dir = os.path.join(output_dir, f"dataset_{dataset_index}")
    else:
        target_dir = output_dir

    preprocess_and_save(dataset, chunk_size=4000, output_dir=target_dir)

print("Preprocessing complete.")


  validated_data = pd.read_csv(path, sep='\t')


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 0 to preprocessed_audio_dataset_chunks_ca6\chunk_0.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 1 to preprocessed_audio_dataset_chunks_ca6\chunk_1.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 2 to preprocessed_audio_dataset_chunks_ca6\chunk_2.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 3 to preprocessed_audio_dataset_chunks_ca6\chunk_3.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 4 to preprocessed_audio_dataset_chunks_ca6\chunk_4.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 5 to preprocessed_audio_dataset_chunks_ca6\chunk_5.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 6 to preprocessed_audio_dataset_chunks_ca6\chunk_6.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 7 to preprocessed_audio_dataset_chunks_ca6\chunk_7.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 8 to preprocessed_audio_dataset_chunks_ca6\chunk_8.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 9 to preprocessed_audio_dataset_chunks_ca6\chunk_9.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 10 to preprocessed_audio_dataset_chunks_ca6\chunk_10.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 11 to preprocessed_audio_dataset_chunks_ca6\chunk_11.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 12 to preprocessed_audio_dataset_chunks_ca6\chunk_12.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 13 to preprocessed_audio_dataset_chunks_ca6\chunk_13.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 14 to preprocessed_audio_dataset_chunks_ca6\chunk_14.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 15 to preprocessed_audio_dataset_chunks_ca6\chunk_15.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 16 to preprocessed_audio_dataset_chunks_ca6\chunk_16.arrow


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved chunk 17 to preprocessed_audio_dataset_chunks_ca6\chunk_17.arrow


Map:   0%|          | 0/3210 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/3210 [00:00<?, ? examples/s]

Saved chunk 18 to preprocessed_audio_dataset_chunks_ca6\chunk_18.arrow
Preprocessing complete.
