In [1]:
import os
from datasets import Dataset, Audio

# Locations of the corpus directory and the two index files inside it
dataset_dir = "../Data/NictTib1"
wav_scp_path, label_path = (
    os.path.join(dataset_dir, index_name) for index_name in ("wav.scp", "label.txt")
)

In [2]:
# Parse wav.scp file
def parse_wav_scp(wav_scp_path):
    """Read a ``wav.scp`` index file into a dict.

    Every non-blank line must contain an utterance id, whitespace, and then
    the audio path. Only the first run of whitespace separates the two
    fields, so the path itself may contain spaces.

    Args:
        wav_scp_path: Path of the ``wav.scp`` file; it is read as UTF-8.

    Returns:
        dict mapping utterance id -> audio path. If an id occurs on several
        lines, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line holds an id but no audio path.
    """
    wav_scp = {}
    with open(wav_scp_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no entry.
                continue
            fields = line.split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(
                    f"{wav_scp_path}:{line_no}: expected '<utt_id> <audio_path>', got {line!r}"
                )
            utt_id, audio_path = fields
            wav_scp[utt_id] = audio_path
    return wav_scp

# Parse label.txt file
def parse_label(label_path):
    """Read a ``label.txt`` transcription file into a dict.

    Every non-blank line starts with an utterance id; whatever follows the
    first run of whitespace is taken as the transcription.

    Args:
        label_path: Path of the label file; it is read as UTF-8.

    Returns:
        dict mapping utterance id -> transcription. An id with nothing after
        it maps to ``""``. If an id occurs on several lines, the last
        occurrence wins.
    """
    labels = {}
    with open(label_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if not parts:
                # Blank line: there is no utterance id to record.
                continue
            utt_id = parts[0]
            # An id-only line is kept, with an empty transcription.
            labels[utt_id] = parts[1] if len(parts) > 1 else ""
    return labels

# Extract speaker ID from audio path
def extract_speaker_id(audio_path):
    """Return the speaker id encoded in an audio file path.

    The path is expected to end in ``<speaker-id>/<session-id>/<file>``,
    e.g. ``data/006/006_01/006_72.wav`` gives ``"006"``.

    Args:
        audio_path: Path string using ``/`` or ``\\`` as separator.

    Returns:
        The third-from-last path component.

    Raises:
        IndexError: If the path has fewer than three components.
    """
    # Accept both separators rather than os.sep: the index file stores paths
    # with a fixed separator regardless of the OS the notebook runs on.
    parts = audio_path.replace("\\", "/").split("/")
    if len(parts) < 3:
        raise IndexError(
            f"cannot extract speaker id from {audio_path!r}: "
            "expected '<speaker-id>/<session-id>/<file>'"
        )
    return parts[-3]

In [3]:
# Read both index files into utterance-id keyed mappings
wav_scp = parse_wav_scp(wav_scp_path)
labels = parse_label(label_path)

# Assemble one record per utterance that has a non-empty transcription
data = []
for utt_id, audio_path in wav_scp.items():
    transcription = labels.get(utt_id, "")
    speaker_id = extract_speaker_id(audio_path)  # computed before the filter below
    if not transcription:
        # No label (or an empty one) for this utterance: leave it out
        continue
    data.append(
        dict(audio=audio_path, transcription=transcription, speaker_id=speaker_id)
    )

In [4]:
# Pivot the list of records into one list per column and wrap it in a
# Hugging Face Dataset
dataset = Dataset.from_dict({
    column: [record[column] for record in data]
    for column in ("audio", "transcription", "speaker_id")
})

In [5]:
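# The cast of the 'audio' column to the Audio type is disabled in this cell;
# the same statement is executed in a later cell, after the raw record is displayed.
#dataset = dataset.cast_column("audio", Audio())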

In [6]:
# Display the first record; at this point 'audio' is still the plain path string.
dataset[0]

{'audio': 'data/006/006_01/006_72.wav',
 'transcription': 'ཀྲུང་གོའི་ཐའེ་ཝན་ལའང་གློག་ཀླད་ནད་དུག་གིས་ཤུགས་རྐྱེན་ཆེན་པོ་ཐེབས་ཡོད།',
 'speaker_id': '006'}

In [7]:
# Turn the 'audio' path column into an Audio feature, resampled to a fixed rate.
# Without an explicit sampling_rate the files keep their native rate, and that
# rate is later passed to the feature extractor as-is.
# NOTE(review): assumes the checkpoint's feature extractor expects 16 kHz (the
# Wav2Vec2 default) - confirm against feature_extractor.sampling_rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

In [8]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, AutoTokenizer, Wav2Vec2Config
import torch

# Every component below is loaded from the same Hub checkpoint
MODEL_ID = "openpecha/Garchen_Rinpoche_stt"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Bundle the audio front-end and the text tokenizer behind one processor object
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

config = Wav2Vec2Config.from_pretrained(MODEL_ID)

In [10]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and label ids.

    Args:
        batch: A single example. ``batch["audio"]`` must provide ``"array"``
            and ``"sampling_rate"``. The text is read from
            ``batch["transcription"]``, falling back to the legacy ``"uni"``
            key when ``"transcription"`` is absent.

    Returns:
        The same mapping with two entries added: ``"input_values"`` (the
        first item of the processor's ``input_values``) and ``"labels"``
        (the tokenizer's ``input_ids`` for the text).

    Raises:
        KeyError: If neither ``"transcription"`` nor ``"uni"`` is present.
    """
    audio = batch["audio"]

    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_values[0]

    # The dataset built in this notebook stores the text under "transcription";
    # "uni" is only kept as a fallback for datasets that use the older name.
    text = batch["transcription"] if "transcription" in batch else batch["uni"]

    # Tokenize through processor.tokenizer directly instead of the deprecated
    # processor.as_target_processor() context manager.
    batch["labels"] = processor.tokenizer(text).input_ids
    return batch

In [12]:
# Apply prepare_dataset to every example and drop all pre-existing columns,
# so only the entries that prepare_dataset adds are kept.
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names,
    batch_size=1,
)

Map:   0%|          | 0/16646 [00:00<?, ? examples/s]

: 