In [3]:
import re
import os
import csv
import torch
import torchaudio
import torchaudio.transforms as T
import jiwer
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer,WhisperFeatureExtractor, WhisperTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, DatasetDict
import pandas as pd
from datasets import Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

In [4]:

# Read the transcription table and rename its columns to the names the rest
# of the notebook expects ("audio" for the file path, "sentence" for the text).
csv_file = "transcription_updated.csv"
df = pd.read_csv(csv_file).rename(columns={"audio_file": "audio", "transcription": "sentence"})

# Build a Hugging Face dataset and hold out 10% of the rows (fixed seed so the
# split is reproducible across runs).
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)

# train_test_split names the held-out part "test"; expose it as "validation".
dataset = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['filename', 'sentence', 'audio'],
        num_rows: 7987
    })
    validation: Dataset({
        features: ['filename', 'sentence', 'audio'],
        num_rows: 888
    })
})


In [5]:
# Decode the "audio" column as audio at a 16 kHz sampling rate, then drop the
# "filename" column, which is not used by any later step in this notebook.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)).remove_columns(["filename"])

In [6]:

# Base checkpoint; the data collator below reads model.config.decoder_start_token_id.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Show one decoded training example (sentence text plus audio dict).
print(dataset["train"][0])

# Combined processor; it is handed to the data collator, which uses its
# feature_extractor and tokenizer for padding.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Stand-alone feature extractor and tokenizer used by prepare_dataset().
# The tokenizer is configured for Tagalog transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Tagalog", task="transcribe")
def prepare_dataset(batch):
    """Turn one dataset example into model inputs.

    Reads ``batch["audio"]`` (a dict with ``"array"`` and ``"sampling_rate"``)
    and ``batch["sentence"]``, and adds two keys to the same example:

    * ``"input_features"`` - the first (only) entry of the features produced
      by the module-level ``feature_extractor`` for the audio array.
    * ``"labels"`` - the token ids produced by the module-level ``tokenizer``
      for the sentence.

    No resampling happens here; the sampling rate stored with the audio is
    passed through to the feature extractor unchanged.

    Returns the (mutated) ``batch``.
    """
    clip = batch["audio"]

    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded_text = tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids

    return batch


# Show the dataset columns before any feature extraction has been applied
print("Dataset structure before preprocessing:", dataset, sep="\n")



{'sentence': 'kahapon', 'audio': {'path': 'D:\\Schoolshit\\Thesis\\dataset\\_speaker_001\\rec_001_FIL\\rec_001_287_fil_003.wav', 'array': array([-8.23974609e-04, -7.01904297e-04, -6.10351562e-04, ...,
        9.15527344e-05, -6.10351562e-05,  9.15527344e-05]), 'sampling_rate': 16000}}
Dataset structure before preprocessing:
DatasetDict({
    train: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 7987
    })
    validation: Dataset({
        features: ['sentence', 'audio'],
        num_rows: 888
    })
})


In [7]:

# Run prepare_dataset over every example and drop the original columns so only
# the keys it adds remain.
dataset = dataset.map(
    prepare_dataset,
    remove_columns=dataset.column_names["train"],
)

# Show the dataset columns after preprocessing
print("Dataset structure after preprocessing:", dataset, sep="\n")



Map:   0%|          | 0/7987 [00:00<?, ? examples/s]

Map:   0%|          | 0/888 [00:00<?, ? examples/s]

Dataset structure after preprocessing:
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7987
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 888
    })
})


In [8]:

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into one padded batch.

    Each example must provide ``"input_features"`` and ``"labels"``. Audio
    features and label ids are padded separately, through
    ``processor.feature_extractor.pad`` and ``processor.tokenizer.pad``,
    because they have different lengths and padding rules.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer``, each with a ``pad`` method.
    processor: Any
    # Token id that, when it leads every label row, is stripped from the labels.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded audio batch with a ``"labels"`` tensor attached."""
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        # Audio side: let the feature extractor stack everything into tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Text side: pad the label sequences to the longest one in the batch.
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions the attention mask marks as padding get -100 so the loss skips them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every row already begins with the decoder start token, remove that
        # first column; the token is added again later.
        every_row_has_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
    
# Build the collator from the processor loaded earlier and the decoder start
# token id taken from the loaded model's config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


In [9]:
"""
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}
"""

'\nmetric = evaluate.load("wer")\n\ndef compute_metrics(pred):\n    pred_ids = pred.predictions\n    label_ids = pred.label_ids\n\n    # replace -100 with the pad_token_id\n    label_ids[label_ids == -100] = tokenizer.pad_token_id\n\n    # we do not want to group tokens when computing the metrics\n    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n\n    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n\n    return {"wer": wer}\n'

In [10]:
"""
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

# Define Trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)
"""

'\n# Define training arguments\ntraining_args = Seq2SeqTrainingArguments(\n    output_dir="./whisper-finetuned",  # change to a repo name of your choice\n    per_device_train_batch_size=16,\n    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n    learning_rate=1e-5,\n    warmup_steps=500,\n    max_steps=5000,\n    gradient_checkpointing=True,\n    fp16=True,\n    evaluation_strategy="steps",\n    per_device_eval_batch_size=8,\n    predict_with_generate=True,\n    generation_max_length=225,\n    save_steps=1000,\n    eval_steps=1000,\n    logging_steps=25,\n    report_to=["tensorboard"],\n    load_best_model_at_end=True,\n    metric_for_best_model="wer",\n    greater_is_better=False,\n    push_to_hub=False,\n)\n\n# Define Trainer\ntrainer = Seq2SeqTrainer(\n    args=training_args,\n    model=model,\n    train_dataset=dataset["train"],\n    eval_dataset=dataset["validation"],\n    data_collator=data_collator,\n    compute_metrics=compute_metrics,\n  

In [11]:
"""
# Train model
trainer.train()
"""

'\n# Train model\ntrainer.train()\n'

In [12]:
"""
model.save_pretrained("./whisper-finetuned")
processor.save_pretrained("./whisper-finetuned")
"""

'\nmodel.save_pretrained("./whisper-finetuned")\nprocessor.save_pretrained("./whisper-finetuned")\n'

In [13]:
# Load the processor and model from the local ./whisper-finetuned directory.
# This rebinds `processor` and `model`, replacing the openai/whisper-small
# objects loaded earlier in the notebook.
#model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained("./whisper-finetuned")
model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using : {device}")

Using : cuda


In [14]:
def normalize_text(text):
    """Return a lower-cased, punctuation-free, single-spaced copy of ``text``.

    Steps, in order: lower-case and strip the ends, delete every character
    that is neither a word character nor whitespace, then collapse each run
    of whitespace into one space. Because stripping happens first, whitespace
    exposed by removing leading/trailing punctuation is kept (as one space).
    """
    lowered = text.lower().strip()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return re.sub(r"\s+", " ", without_punctuation)

In [15]:
import jiwer

# Evaluate the loaded model on the preprocessed validation split.
# Each sample gets its own WER; the figures reported at the end are the
# unweighted means of those per-sample WERs, overall and per reference length.

# Track WER separately for 1-word, 2-word, and 3+ word samples
total_wer_1_word = 0
total_wer_2_word = 0
total_wer_3_or_more_words = 0

count_1_word = 0
count_2_word = 0
count_3_or_more_words = 0

file_count = 0
total_wer = 0
skipped_count = 0

# The decoder prompt is the same for every sample, so build it once.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="tagalog", task="transcribe")

for sample in dataset["validation"]:
    input_features = torch.tensor(sample["input_features"]).to(device)  # Precomputed spectrogram
    reference_text = processor.batch_decode([sample["labels"]], skip_special_tokens=True)[0]  # Decode labels

    print("Processing sample...")

    # WER is undefined for an empty reference (jiwer rejects it), and such a
    # sample would otherwise land in the "3 or more words" bucket.
    if not reference_text.strip():
        skipped_count += 1
        print("Skipping sample: reference transcription is empty.\n")
        continue

    # Model inference
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features.unsqueeze(0),  # Ensure batch dimension
            forced_decoder_ids=forced_decoder_ids,
            max_new_tokens=444
        )

    # Decode predictions
    predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Compute WER. jiwer expects (reference, hypothesis): the error count is
    # divided by the number of words in the FIRST argument, so swapping them
    # would normalise by the prediction length instead of the reference length.
    wer = jiwer.wer(reference_text, predicted_text) * 100

    # Count words in the reference text
    word_count = len(reference_text.split())

    if word_count == 1:
        total_wer_1_word += wer
        count_1_word += 1
    elif word_count == 2:
        total_wer_2_word += wer
        count_2_word += 1
    else:
        total_wer_3_or_more_words += wer
        count_3_or_more_words += 1

    total_wer += wer
    file_count += 1

    print(f"Reference: {reference_text}")
    print(f"Predicted: {predicted_text}")
    print(f"WER (File): {wer:.2f}%\n")

# Final WER Statistics (per reference length)
if count_1_word > 0:
    final_wer_1_word = total_wer_1_word / count_1_word
    print(f"\nFinal WER for 1-word samples: {final_wer_1_word:.2f}% ({count_1_word} samples)")

if count_2_word > 0:
    final_wer_2_word = total_wer_2_word / count_2_word
    print(f"Final WER for 2-word samples: {final_wer_2_word:.2f}% ({count_2_word} samples)")

if count_3_or_more_words > 0:
    final_wer_3_or_more_words = total_wer_3_or_more_words / count_3_or_more_words
    print(f"Final WER for 3 or more words: {final_wer_3_or_more_words:.2f}% ({count_3_or_more_words} samples)")

# Final WER Statistics (overall)
if file_count > 0:
    final_wer = total_wer / file_count
    print(f"\nFinal Overall WER: {final_wer:.2f}%")

print(f"\nTotal files processed: {file_count}")

if skipped_count > 0:
    print(f"Samples skipped (empty reference): {skipped_count}")


Processing sample...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Reference: hindi ko alam
Predicted: hindi ko alam
WER (File): 0.00%

Processing sample...
Reference: hindi ko alam
Predicted: hindi ko alam
WER (File): 0.00%

Processing sample...
Reference: anim na beses
Predicted: anim na beses
WER (File): 0.00%

Processing sample...
Reference: limang araw
Predicted: limang araw
WER (File): 0.00%

Processing sample...
Reference: may nana
Predicted: may nana
WER (File): 0.00%

Processing sample...
Reference: kapag hinihingal
Predicted: kapag hinihingal
WER (File): 0.00%

Processing sample...
Reference: hindi ko alam
Predicted: hindi ko alam
WER (File): 0.00%

Processing sample...
Reference: wala
Predicted: wala
WER (File): 0.00%

Processing sample...
Reference: hindi ko matandaan
Predicted: hindi ko matandaan
WER (File): 0.00%

Processing sample...
Reference: nakalimutan ko na
Predicted: nakalimutan ko na
WER (File): 0.00%

Processing sample...
Reference: nung isang buwan
Predicted: nung isang buwan
WER (File): 0.00%

Processing sample...
Reference: q

In [25]:
import csv
import os
import re
import torch
import torchaudio
import torchaudio.transforms as T
import jiwer

# Define paths
# root_folder: directory whose sub-folders are scanned for .wav files below.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where that directory exists.
root_folder = "D:\\Schoolshit\\Thesis\\dataset\\_speaker_002"
# csv_file: transcription table read by load_transcriptions(); this rebinds the
# `csv_file` name used earlier in the notebook for the training CSV.
csv_file = "transcriptionSpeaker_002.csv"
# output_csv: read as a global by evaluate_dataset() when it writes the
# records whose WER is above 0.
output_csv = "wer_above_0.csv"

def load_transcriptions(csv_file):
    """Read the second column of a two-column CSV file.

    The first line is treated as a header and discarded. Only rows with
    exactly two fields contribute; rows with any other field count (including
    blank lines) are silently ignored.

    Args:
        csv_file: Path of a UTF-8 encoded CSV file.

    Returns:
        A list with the second field of every accepted row, in file order.
    """
    with open(csv_file, newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows, None)  # Drop the header line, if there is one
        return [fields[1] for fields in rows if len(fields) == 2]

def normalize_text(text):
    """Canonicalise ``text`` for WER scoring.

    Same definition as the ``normalize_text`` declared earlier in this
    notebook: lower-case and strip, remove every character that is neither a
    word character nor whitespace, then squeeze whitespace runs to one space.
    """
    cleaned = text.lower().strip()
    for pattern, replacement in ((r"[^\w\s]", ""), (r"\s+", " ")):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

transcriptions = load_transcriptions(csv_file)

# Gather every .wav file found directly inside each sub-folder of root_folder.
# Sub-folders and the files within them are both visited in name order; the
# resulting list is later paired with `transcriptions` purely by position.
wav_files = [
    os.path.join(subdir.path, name)
    for subdir in sorted(os.scandir(root_folder), key=lambda e: e.name)
    if subdir.is_dir()
    for name in sorted(os.listdir(subdir.path))
    if name.endswith(".wav")
]

def evaluate_dataset(wav_files, transcriptions, dataset_name, output_path=None):
    """Transcribe audio files, score each against its reference, and log the misses.

    Files and references are paired by position. Every file is loaded,
    down-mixed to mono when it has several channels, resampled to 16 kHz when
    needed, and transcribed with the module-level ``model`` / ``processor`` on
    ``device``. Both texts go through ``normalize_text`` before scoring.
    The overall figure printed at the end is the unweighted mean of the
    per-file WERs.

    Args:
        wav_files: Sequence of audio file paths.
        transcriptions: Sequence of reference texts; item ``i`` belongs to
            ``wav_files[i]``.
        dataset_name: Label used only in the progress message.
        output_path: CSV file that receives one row per file whose WER is
            above 0. Defaults to the module-level ``output_csv``.

    Raises:
        ValueError: If there are more audio files than transcriptions, since
            some files would then have no reference to be scored against.
    """
    if output_path is None:
        output_path = output_csv

    # Fail before any inference instead of hitting an IndexError part-way through.
    if len(wav_files) > len(transcriptions):
        raise ValueError(
            f"Got {len(wav_files)} audio files but only {len(transcriptions)} "
            "transcriptions; every audio file needs a reference."
        )
    if len(wav_files) < len(transcriptions):
        unused = len(transcriptions) - len(wav_files)
        print(f"Warning: {unused} transcription(s) have no matching audio file and are ignored.")

    print(f"\nEvaluating {dataset_name} set...")

    target_sr = 16000
    # Loop-invariant: the decoder prompt is the same for every file.
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="tagalog", task="transcribe")
    # One resampler per source sampling rate, created on first use.
    resamplers = {}

    file_count = 0
    total_wer = 0
    wer_records = []

    for file_path, raw_reference in zip(wav_files, transcriptions):
        print(f"Processing {file_path}...")

        reference_text = normalize_text(raw_reference)
        # WER is undefined for an empty reference (jiwer rejects it).
        if not reference_text.strip():
            print("Skipping: reference transcription is empty after normalization.\n")
            continue

        waveform, sr = torchaudio.load(file_path)

        # A (channels, time) array with several channels would be treated by the
        # processor as a batch of separate clips, so average the channels first.
        if waveform.dim() == 2 and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = T.Resample(orig_freq=sr, new_freq=target_sr)
            waveform = resamplers[sr](waveform)

        inputs = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=target_sr)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            predicted_ids = model.generate(
                inputs["input_features"],
                forced_decoder_ids=forced_decoder_ids,
                max_new_tokens=444
            )

        predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Normalize the prediction the same way as the reference before scoring
        predicted_text = normalize_text(predicted_text)

        # jiwer expects (reference, hypothesis); the error count is divided by
        # the reference length, so the argument order matters.
        wer = jiwer.wer(reference_text, predicted_text) * 100
        total_wer += wer
        file_count += 1

        print(f"Reference: {reference_text}")
        print(f"Predicted: {predicted_text}")
        print(f"WER (File): {wer:.2f}%\n")

        # Save words with WER > 0%
        if wer > 0:
            wer_records.append([file_path, reference_text, predicted_text, wer])

    if file_count > 0:
        final_wer = total_wer / file_count
        print(f"\nFinal Overall WER: {final_wer:.2f}%")
    print(f"\nTotal files processed: {file_count}")

    # Save results to CSV
    with open(output_path, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["File Path", "Reference", "Predicted", "WER (%)"])
        writer.writerows(wer_records)
    print(f"Saved WER > 0% records to {output_path}")

# Score the files collected from root_folder against the CSV transcriptions.
# NOTE(review): "Validation" is only the label shown in the progress message;
# these files come from root_folder, not from dataset["validation"].
evaluate_dataset(wav_files, transcriptions, "Validation")



Evaluating Validation set...
Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_001.wav...
Reference: hindi masakit
Predicted: hindi masakit
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_002.wav...
Reference: isa
Predicted: isa
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_003.wav...
Reference: dalawa
Predicted: dalawa
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_004.wav...
Reference: lima
Predicted: lima
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_005.wav...
Reference: anim
Predicted: anim
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_006.wav...
Reference: walo
Predicted: walo
WER (File): 0.00%

Processing D:\Schoolshit\Thesis\dataset\_speaker_002\rec_002_fil\rec_002_101_FIL_007.wav...
