In [1]:
import torch
import torchaudio
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

print("Libraries loaded successfully!")
print("Torch version:", torch.__version__)
print("Torchaudio version:", torchaudio.__version__)


Libraries loaded successfully!
Torch version: 2.6.0
Torchaudio version: 2.6.0


In [5]:
import os

audio_path = "../audio_samples/Yasser_Al-Dossari_1_1_1.wav" 

if not os.path.exists(audio_path):
    print(f"ERROR: File not found → {audio_path}")
else:
    print(f"File found: {audio_path}")


File found: ../audio_samples/Yasser_Al-Dossari_1_1_1.wav


In [1]:
from huggingface_hub import login

#Hugging Face API token
login(input("Enter your Hugging Face Token: "))

In [None]:
import torchaudio
import librosa
import torch

# Path to test audio (wav)
audio_path = "../audio_samples/Yasser_Al-Dossari_1_1_1.wav"  

# Load + resample audio to 16kHz 
waveform, sample_rate = librosa.load(audio_path, sr=16000)

# Convert to tensor format
input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values

# Perform inference
with torch.no_grad():
    logits = model(input_values).logits

# Decode predictions
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

print("Transcription:", transcription)


Transcription: بِسْ مِلْ لَاْ ھِرْ رَحْ مَاْ نِرْ رَ حِيْمْ


In [8]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

model_name = "facebook/wav2vec2-large-xlsr-53"

# Force re-download
processor = Wav2Vec2Processor.from_pretrained(model_name, force_download=True)
model = Wav2Vec2ForCTC.from_pretrained(model_name, force_download=True)

print("ASR Model Reloaded Successfully!")


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

ValueError: Force download failed due to the above error.

In [2]:
import torch
import torchaudio
import librosa
import pandas as pd
import time
from datetime import datetime
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration

# Models to test
models = [
    # Wav2Vec2
    "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "Nuwaisir/Quran_speech_recognizer",
    "IbrahimSalah/Wav2vecLarge_quran_syllables_recognition",
    "mohammed/arabic-speech-recognition",

    # Whisper 
    "openai/whisper-large-v3",
    "openai/whisper-small",
    "areaz/whisper-small-for-quran",
    "tarteel-ai/whisper-base-ar-quran",
    "cherifkhalifah/quran-recitation-errors-test"
]

# Test audio samples
audio_files = [
    "../audio_samples/Yasser_Al-Dosari_1_1.wav",
    "../audio_samples/Yasser_Al-Dosari_1_2.wav",
    "../audio_samples/Yasser_Al-Dosari_1_3.wav",
    "../audio_samples/Yasser_Al-Dosari_1_4.wav",
    "../audio_samples/Yasser_Al-Dosari_1_5.wav",
    "../audio_samples/Yasser_Al-Dossari_1_6.wav"
]


results = []

# Load + process audio
def load_audio(file_path):
    waveform, sample_rate = librosa.load(file_path, sr=16000)  #resample to 16kHz
    return waveform, sample_rate

#Loop thorugh models & samples
for model_name in models:
    print(f"\nLoading model: {model_name}")

    # Track load times
    start_time = time.time()

    # Identify arch.
    if "whisper" in model_name.lower():
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        model_type = "Whisper"
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        model_type = "Wav2Vec2"

    model.eval()  # Set to eval mode
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f} sec")

    # Loop audio samples
    for audio_path in audio_files:
        print(f"\nTesting audio sample: {audio_path}")
        
        # Load + preprocess
        waveform, sample_rate = load_audio(audio_path)

        # Track inference times
        start_time = time.time()

        if model_type == "Whisper":
            input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            confidence = None  # Whisper default no score
        else:
            input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
            confidence = None  #TODO: check if no scores from Wav2Vec2

        inference_time = time.time() - start_time
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #stamp run

        # Store results
        results.append({
            "Model": model_name,
            "Audio File": audio_path,
            "Transcription": transcription,
            "Confidence": confidence,
            "Runtime (sec)": round(inference_time, 2),
            "Timestamp": timestamp
        })

        print(f"Transcription: {transcription}")
        print(f"Runtime: {inference_time:.2f} sec\n")

# Convert results to df
df_results = pd.DataFrame(results)

# Export results
csv_filename = "ASR_Test_Results1.csv"
df_results.to_csv(csv_filename, index=False)

print(f"\nAll models tested! Results saved to {csv_filename}")

# Display results
# import ace_tools as tools
# tools.display_dataframe_to_user(name="ASR Test Results", dataframe=df_results)
display(df_results)



Loading model: jonatasgrosman/wav2vec2-large-xlsr-53-arabic




Model loaded in 1.13 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: بِسْمِ اللَهِ الرَّحْمَنِ الرّحِيمِ
Runtime: 3.46 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription: الحمدري له رب لعالمٍ
Runtime: 2.24 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_3.wav
Transcription: الرَّحْمَن عَنِ الرَّحِيم
Runtime: 2.16 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_4.wav
Transcription: مالك يوم الدي
Runtime: 1.26 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_5.wav
Transcription: إيا كن اعبد وإيا كان استعي
Runtime: 1.04 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dossari_1_6.wav
Transcription: إهدِينَا الصِرَا - قَلْمُسْتَقِيمَ
Runtime: 0.37 sec


Loading model: elgeish/wav2vec2-large-xlsr-53-arabic
Model loaded in 1.34 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: bisomi All~ahi Alr~aHomani Alr~aHiymi
Runtime: 14.52 sec


Tes



Model loaded in 1.03 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: بسم الله الرحم الرحيم
Runtime: 3.25 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription: الحمدرين إله ربن لعلم
Runtime: 1.70 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_3.wav
Transcription: الرحمن الرحي
Runtime: 0.96 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_4.wav
Transcription: مارك يوم الد
Runtime: 0.71 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_5.wav
Transcription: ا كا نعبد وإا كان ستع
Runtime: 0.76 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dossari_1_6.wav
Transcription: إهدنا الصرات المستقيم
Runtime: 0.41 sec


Loading model: openai/whisper-large-v3
Model loaded in 16.57 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription:  بسم الله الرحمن الرحيم
Runtime: 81.96 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription:  الحمد لله رب العالمين
Runtime: 243.41 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_3.wav
Transcription:  الرحمن الرحيم
Runtime: 210.76 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_4.wav
Transcription:  مالك يوم الدين
Runtime: 133.37 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_5.wav
Transcription:  إياك نعبد وإياك نستعين
Runtime: 241.53 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dossari_1_6.wav
Transcription:  اهدنا الصراط المستقيما
Runtime: 89.41 sec


Loading model: openai/whisper-small
Model loaded in 1.69 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription:  بسم الله الرحمن الرحيم
Runtime: 5.40 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription:  الحمد لله رب العالمين
Runtime: 2.74 sec


Testing audio sample: ../a

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


Model loaded in 1.13 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
Runtime: 3.60 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription: الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
Runtime: 2.19 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_3.wav
Transcription: الرَّحْمَٰنِ الرَّحِيمِ
Runtime: 1.90 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_4.wav
Transcription: مَالِكِ يَوْمِ الدِّينِ
Runtime: 1.79 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_5.wav
Transcription: يَاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
Runtime: 2.34 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dossari_1_6.wav
Transcription: اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ
Runtime: 2.16 sec


Loading model: tarteel-ai/whisper-base-ar-quran
Model loaded in 1.17 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: بِسْمِ اللَّهِ الرَّحْمَنِ ا

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at cherifkhalifah/quran-recitation-errors-test and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'encoder.layers.0.final_l

Model loaded in 2.11 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_1.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.13 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_2.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.22 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_1_3.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Unnamed: 0,Model,Audio File,Transcription,Confidence,Runtime (sec),Timestamp
0,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_1.wav,بِسْمِ اللَهِ الرَّحْمَنِ الرّحِيمِ,,3.46,2025-03-02 16:54:37
1,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_2.wav,الحمدري له رب لعالمٍ,,2.24,2025-03-02 16:54:39
2,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_3.wav,الرَّحْمَن عَنِ الرَّحِيم,,2.16,2025-03-02 16:54:41
3,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_4.wav,مالك يوم الدي,,1.26,2025-03-02 16:54:42
4,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_5.wav,إيا كن اعبد وإيا كان استعي,,1.04,2025-03-02 16:54:43
5,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dossari_1_6.wav,إهدِينَا الصِرَا - قَلْمُسْتَقِيمَ,,0.37,2025-03-02 16:54:44
6,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_1.wav,bisomi All~ahi Alr~aHomani Alr~aHiymi,,14.52,2025-03-02 16:55:00
7,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_2.wav,AlHmd ryl ll h rmDyn AlEAlam,,11.01,2025-03-02 16:55:11
8,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_3.wav,Alr~aHomani Alr~aHiy,,14.12,2025-03-02 16:55:25
9,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_1_4.wav,mElk ywm AlDy,,13.02,2025-03-02 16:55:38


In [3]:
import torch
import torchaudio
import librosa
import pandas as pd
import time
from datetime import datetime
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration

# Models to test
models = [
    # Wav2Vec2
    "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "Nuwaisir/Quran_speech_recognizer",
    "IbrahimSalah/Wav2vecLarge_quran_syllables_recognition",
    "mohammed/arabic-speech-recognition",

    # Whisper 
    "openai/whisper-large-v3",
    "openai/whisper-small",
    "areaz/whisper-small-for-quran",
    "tarteel-ai/whisper-base-ar-quran",
    "cherifkhalifah/quran-recitation-errors-test"
]

# Test audio samples
audio_files = [
    "../audio_samples/Yasser_Al-Dosari_112_1.wav",
    "../audio_samples/Yasser_Al-Dosari_112_2.wav",
    "../audio_samples/Yasser_Al-Dosari_112_3.wav",
    "../audio_samples/Yasser_Al-Dosari_112_4.wav"
]


results = []

# Load + process audio
def load_audio(file_path):
    waveform, sample_rate = librosa.load(file_path, sr=16000)  #resample to 16kHz
    return waveform, sample_rate

#Loop thorugh models & samples
for model_name in models:
    print(f"\nLoading model: {model_name}")

    # Track load times
    start_time = time.time()

    # Identify arch.
    if "whisper" in model_name.lower():
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        model_type = "Whisper"
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        model_type = "Wav2Vec2"

    model.eval()  # Set to eval mode
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f} sec")

    # Loop audio samples
    for audio_path in audio_files:
        print(f"\nTesting audio sample: {audio_path}")
        
        # Load + preprocess
        waveform, sample_rate = load_audio(audio_path)

        # Track inference times
        start_time = time.time()

        if model_type == "Whisper":
            input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            confidence = None  # Whisper default no score
        else:
            input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
            confidence = None  #TODO: check if no scores from Wav2Vec2

        inference_time = time.time() - start_time
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #stamp run

        # Store results
        results.append({
            "Model": model_name,
            "Audio File": audio_path,
            "Transcription": transcription,
            "Confidence": confidence,
            "Runtime (sec)": round(inference_time, 2),
            "Timestamp": timestamp
        })

        print(f"Transcription: {transcription}")
        print(f"Runtime: {inference_time:.2f} sec\n")

# Convert results to df
df_results = pd.DataFrame(results)

# Export results
csv_filename = "ASR_Test_Results2.csv"
df_results.to_csv(csv_filename, index=False)

print(f"\nAll models tested! Results saved to {csv_filename}")

# Display results
# import ace_tools as tools
# tools.display_dataframe_to_user(name="ASR Test Results", dataframe=df_results)
display(df_results)



Loading model: jonatasgrosman/wav2vec2-large-xlsr-53-arabic




Model loaded in 1.18 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_1.wav
Transcription: قُلْ هُوَ اللَه وَأَحَادًاً
Runtime: 3.12 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_2.wav
Transcription: اللهو الصمادة
Runtime: 0.31 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_3.wav
Transcription: لم يا لينتاوع الأم يولادب
Runtime: 0.29 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_4.wav
Transcription: وَلَمْ يَكُونْ إِلَاى هُوَنكُوفُ وَدْنَ أَحَاندًا
Runtime: 0.38 sec


Loading model: elgeish/wav2vec2-large-xlsr-53-arabic
Model loaded in 0.95 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_1.wav
Transcription: qulo huwa All~ahu >aHaAdu
Runtime: 14.78 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_2.wav
Transcription: Alh AlSAmd
Runtime: 15.59 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_3.wav
Transcription: lamo yariDaA walamo yuwladab
Runtime: 14.86 sec


T

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at cherifkhalifah/quran-recitation-errors-test and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'encoder.layers.0.final_l

Model loaded in 1.95 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_1.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.10 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_2.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.09 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_3.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.11 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_112_4.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.14 sec


All models tested! Results saved to AS

Unnamed: 0,Model,Audio File,Transcription,Confidence,Runtime (sec),Timestamp
0,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_1.wav,قُلْ هُوَ اللَه وَأَحَادًاً,,3.12,2025-03-02 17:17:22
1,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_2.wav,اللهو الصمادة,,0.31,2025-03-02 17:17:22
2,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_3.wav,لم يا لينتاوع الأم يولادب,,0.29,2025-03-02 17:17:22
3,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_4.wav,وَلَمْ يَكُونْ إِلَاى هُوَنكُوفُ وَدْنَ أَحَاندًا,,0.38,2025-03-02 17:17:23
4,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_1.wav,qulo huwa All~ahu >aHaAdu,,14.78,2025-03-02 17:17:38
5,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_2.wav,Alh AlSAmd,,15.59,2025-03-02 17:17:54
6,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_3.wav,lamo yariDaA walamo yuwladab,,14.86,2025-03-02 17:18:09
7,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_112_4.wav,walamo yakuni lahu kuwfa waEDi | HaAdaA,,12.73,2025-03-02 17:18:22
8,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al-Dosari_112_1.wav,qulo huwa All~ahu >aHadN,,5.12,2025-03-02 17:18:38
9,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al-Dosari_112_2.wav,All~ahu AlS~amada,,2.09,2025-03-02 17:18:40


In [4]:
import torch
import torchaudio
import librosa
import pandas as pd
import time
from datetime import datetime
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration

# Models to test
models = [
    # Wav2Vec2
    "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "Nuwaisir/Quran_speech_recognizer",
    "IbrahimSalah/Wav2vecLarge_quran_syllables_recognition",
    "mohammed/arabic-speech-recognition",

    # Whisper 
    "openai/whisper-large-v3",
    "openai/whisper-small",
    "areaz/whisper-small-for-quran",
    "tarteel-ai/whisper-base-ar-quran",
    "cherifkhalifah/quran-recitation-errors-test"
]

# Test audio samples
audio_files = [
    "../audio_samples/Yasser_Al-Dosari_2_225.wav"
]


results = []

# Load + process audio
def load_audio(file_path):
    waveform, sample_rate = librosa.load(file_path, sr=16000)  #resample to 16kHz
    return waveform, sample_rate

#Loop thorugh models & samples
for model_name in models:
    print(f"\nLoading model: {model_name}")

    # Track load times
    start_time = time.time()

    # Identify arch.
    if "whisper" in model_name.lower():
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        model_type = "Whisper"
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        model_type = "Wav2Vec2"

    model.eval()  # Set to eval mode
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f} sec")

    # Loop audio samples
    for audio_path in audio_files:
        print(f"\nTesting audio sample: {audio_path}")
        
        # Load + preprocess
        waveform, sample_rate = load_audio(audio_path)

        # Track inference times
        start_time = time.time()

        if model_type == "Whisper":
            input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            confidence = None  # Whisper default no score
        else:
            input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
            confidence = None  #TODO: check if no scores from Wav2Vec2

        inference_time = time.time() - start_time
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #stamp run

        # Store results
        results.append({
            "Model": model_name,
            "Audio File": audio_path,
            "Transcription": transcription,
            "Confidence": confidence,
            "Runtime (sec)": round(inference_time, 2),
            "Timestamp": timestamp
        })

        print(f"Transcription: {transcription}")
        print(f"Runtime: {inference_time:.2f} sec\n")

# Convert results to df
df_results = pd.DataFrame(results)

# Export results
csv_filename = "ASR_Test_Results3.csv"
df_results.to_csv(csv_filename, index=False)

print(f"\nAll models tested! Results saved to {csv_filename}")

# Display results
display(df_results)



Loading model: jonatasgrosman/wav2vec2-large-xlsr-53-arabic




Model loaded in 1.46 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_2_225.wav
Transcription: أله لاا إِلاها إِلاهُوالحيُو القَيومِ لا تَأخذُهُسِنَةٌ ولالَهمْ له ما فِي السّماواتِ وما في الأرض منذًا للذِي يَشفعُو عِنادَهُ إِلا بِإِذنِه يَعلَمُوا ما بًينَ أَيدِيهِم وَما خلفهُمْ ولا يحِيطُونَ بِشَيعِن عِلمِهِيعِلا فِما شا وسِعَكَرصِيُهُ السّماواتِ ولأرضَ وَلا يَءُودُهُ حِفظٍ ما ولا يَؤُودُهُ حِفظما وهُوَ لعَلُِِ العظِيم
Runtime: 11.20 sec


Loading model: elgeish/wav2vec2-large-xlsr-53-arabic
Model loaded in 1.29 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_2_225.wav
Transcription: All~ahu laA AaA <ilaA ha<irlaA huwaloHawliTay laA ta>ox*uhuwASinapN walaA namo lahuhinaA fiy Als~amaA gaAti wamaA filaro man Darina*ihya$ofaEiEn A *ahuw <ilaA biri*onK yaEolamu maA bana >ardiyhimo wamaA xalfahuwmo walaA yurHyTuwna b$aririEirihrilimaA fimaA$oEa waSiyEakaroSihi wamuls~amaA waAti wali>aroDagalaA yarguwDuwA HaforuhmaA  walaA yagiE*uhuwA HayofoDuhmaA gahugalEaliyyliloEa

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at cherifkhalifah/quran-recitation-errors-test and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'encoder.layers.0.final_l

Model loaded in 1.98 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_2_225.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Unnamed: 0,Model,Audio File,Transcription,Confidence,Runtime (sec),Timestamp
0,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_2_225.wav,أله لاا إِلاها إِلاهُوالحيُو القَيومِ لا تَأخذ...,,11.2,2025-03-02 17:26:53
1,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_2_225.wav,All~ahu laA AaA <ilaA ha<irlaA huwaloHawliTay ...,,16.5,2025-03-02 17:27:11
2,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al-Dosari_2_225.wav,laAl~ahu laAAAAaAaAA <il~aA ha}ill~aA huwa Alo...,,12.73,2025-03-02 17:27:25
3,IbrahimSalah/Wav2vecLarge_quran_syllables_reco...,../audio_samples/Yasser_Al-Dosari_2_225.wav,ءَلْ لَاْ ھُ لَاْ ءِ لَ ھَ ءِلْ لَاْ ھُ وَلْ ح...,,12.12,2025-03-02 17:27:44
4,mohammed/arabic-speech-recognition,../audio_samples/Yasser_Al-Dosari_2_225.wav,ب اله لاإل ه إللهوالحيلقيلا تأخذهسنتووللوم له ...,,11.17,2025-03-02 17:27:56
5,openai/whisper-large-v3,../audio_samples/Yasser_Al-Dosari_2_225.wav,الله لا إله إلا هو الحي القيوم لا تأخذه سنة و...,,83.97,2025-03-02 17:29:36
6,openai/whisper-small,../audio_samples/Yasser_Al-Dosari_2_225.wav,111 – الله لا إله إلا هو الحي القيوم,,4.43,2025-03-02 17:29:43
7,areaz/whisper-small-for-quran,../audio_samples/Yasser_Al-Dosari_2_225.wav,اللَّهُ لَا إِلَٰهَ إِلَّا هُوَ الْحَيُّ الْقَ...,,12.49,2025-03-02 17:29:56
8,tarteel-ai/whisper-base-ar-quran,../audio_samples/Yasser_Al-Dosari_2_225.wav,اللَّهُ لَا إِلَهَ إِلَّا هُوَ الْحَيُّ الْقَي...,,19.86,2025-03-02 17:30:17
9,cherifkhalifah/quran-recitation-errors-test,../audio_samples/Yasser_Al-Dosari_2_225.wav,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...,,2.47,2025-03-02 17:30:22


In [5]:
import torch
import torchaudio
import librosa
import pandas as pd
import time
from datetime import datetime
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration

# Models to test
models = [
    # Wav2Vec2
    "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "Nuwaisir/Quran_speech_recognizer",
    "IbrahimSalah/Wav2vecLarge_quran_syllables_recognition",
    "mohammed/arabic-speech-recognition",

    # Whisper 
    "openai/whisper-large-v3",
    "openai/whisper-small",
    "areaz/whisper-small-for-quran",
    "tarteel-ai/whisper-base-ar-quran",
    "cherifkhalifah/quran-recitation-errors-test"
]

# Test audio samples
audio_files = [
    "../audio_samples/Yasser_Al_Dosari_2_282.wav"
]


results = []

# Load + process audio
def load_audio(file_path):
    waveform, sample_rate = librosa.load(file_path, sr=16000)  #resample to 16kHz
    return waveform, sample_rate

#Loop thorugh models & samples
for model_name in models:
    print(f"\nLoading model: {model_name}")

    # Track load times
    start_time = time.time()

    # Identify arch.
    if "whisper" in model_name.lower():
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        model_type = "Whisper"
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        model_type = "Wav2Vec2"

    model.eval()  # Set to eval mode
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f} sec")

    # Loop audio samples
    for audio_path in audio_files:
        print(f"\nTesting audio sample: {audio_path}")
        
        # Load + preprocess
        waveform, sample_rate = load_audio(audio_path)

        # Track inference times
        start_time = time.time()

        if model_type == "Whisper":
            input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            confidence = None  # Whisper default no score
        else:
            input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
            confidence = None  #TODO: check if no scores from Wav2Vec2

        inference_time = time.time() - start_time
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #stamp run

        # Store results
        results.append({
            "Model": model_name,
            "Audio File": audio_path,
            "Transcription": transcription,
            "Confidence": confidence,
            "Runtime (sec)": round(inference_time, 2),
            "Timestamp": timestamp
        })

        print(f"Transcription: {transcription}")
        print(f"Runtime: {inference_time:.2f} sec\n")

# Convert results to df
df_results = pd.DataFrame(results)

# Export results
csv_filename = "ASR_Test_Results4.csv"
df_results.to_csv(csv_filename, index=False)

print(f"\nAll models tested! Results saved to {csv_filename}")

# Display results
display(df_results)



Loading model: jonatasgrosman/wav2vec2-large-xlsr-53-arabic




Model loaded in 2.05 sec

Testing audio sample: ../audio_samples/Yasser_Al_Dosari_2_282.wav
Transcription: يَا أَييُهَلِنَدِينَ أَأمَنُيإِذاتَدَايَمةُ بِدَيلِنْ إِلَا مَمَمَمَمَا أَتَلِمُسَ مَ فَاكتُبُو وَيَكتُبَّينَكُمْ كاِكُمبِلعَدل وَلَا يَأبَكتِبُ أَْ يَكْتُبَكمَا عيِلََهُللَأََغَغَاه فَليَكْتُبَ ولِيُمْرِلِللَذِيعَلَيهِلحَووأَِيَتَقِلَاهَرَدَهُوَلَا يَبَخَسْمِن هُشَلآا فَإِاكَا َلَذِيعَلَيلحَقُسَفِيهَ أَأُطَعِيفَ أَولَا يَستَقِيعُ أَيُلِلَهُوَسَِيُ لِل وَلِيُهو بِلعَدْلِ وَاسْتَشْهِدُوا شَهِيدَيلِلِررِّجَ أَلِكُمْ فَإِللَيَكُونَارٌََيلِسَرْةلُووَرَأَتَاللِنَنَرْطلَلِنَشُّهَدَامَمَم أَل تَطِلَئِحْدَاهُمَا فَتُذَكِرَ ئِحْدَاهُمَنلأَخْرَا وَلَا يَبَشُّأَدَامَمَََمَأَءُإِذامَادُرُوَ وَلَا تَسأَمُوهأَتَكْتُكُوهُ سَتِيرَ أَودِيرَ إِلَ مَأََمَمَا أجَر فَا لِكُمْ أَقْسَطُعِدَلَ هِ وَأَطَمُ لِشَهاتَتِ وَأََنَمَمَمَمَمَا أَلِلَأتَرْتاكو إِنْ لَا ََمَا أَنمَتَكُونَتِتَارًَحَاتِرَةً نتُذِيرُونَ هَا دَلَكُمْ فََيسَعِلَيْكُمْتُنَاحُ أَللَا تَكْتُكُوهَا وَأَشْهِدُوا إِداتَبَا يَعْتُمْ وَلَا يُطَ

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at cherifkhalifah/quran-recitation-errors-test and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'encoder.layers.0.final_l

Model loaded in 2.54 sec

Testing audio sample: ../audio_samples/Yasser_Al_Dosari_2_282.wav
Transcription:  embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass surprising surprising embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass surprising surprising embarrass embarrass surprising embarrass surprising embarrass embarrass embarrass embarrass embarrass embarrass embarrass embarrass surprising surprising embarrass embarrass surprising surprising embarrass embarrass embarrass em

Unnamed: 0,Model,Audio File,Transcription,Confidence,Runtime (sec),Timestamp
0,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al_Dosari_2_282.wav,يَا أَييُهَلِنَدِينَ أَأمَنُيإِذاتَدَايَمةُ بِ...,,38.23,2025-03-02 17:31:24
1,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al_Dosari_2_282.wav,>aniywhanina*inna>amanu <*aA ta*aA ya tubi*ann...,,46.04,2025-03-02 17:32:11
2,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al_Dosari_2_282.wav,yaA >aywha ll~abiyla >a manu <i*aA tabaA yanat...,,40.03,2025-03-02 17:33:02
3,IbrahimSalah/Wav2vecLarge_quran_syllables_reco...,../audio_samples/Yasser_Al_Dosari_2_282.wav,يَاْ ءَيْ يُ ھَلْ لَ ذِيْ نَ ءَاْ مَ نُوْ ءِ ذ...,,41.16,2025-03-02 17:33:52
4,mohammed/arabic-speech-recognition,../audio_samples/Yasser_Al_Dosari_2_282.wav,أإوملبلأمإذاتديتبتينن إلى مماأتلسمفكتوبوه ولي...,,41.58,2025-03-02 17:34:35
5,openai/whisper-large-v3,../audio_samples/Yasser_Al_Dosari_2_282.wav,يا أيها الذين آمنوا إذا تداينتن بدين إلى أجل ...,,96.22,2025-03-02 17:36:25
6,openai/whisper-small,../audio_samples/Yasser_Al_Dosari_2_282.wav,30 يا أيها الذين آمنوا إذا تداياتوا بدين إلى ...,,5.32,2025-03-02 17:36:32
7,areaz/whisper-small-for-quran,../audio_samples/Yasser_Al_Dosari_2_282.wav,يَا أَيُّهَا الَّذِينَ آمَنُوا إِذَا تَدَايَنت...,,11.33,2025-03-02 17:36:55
8,tarteel-ai/whisper-base-ar-quran,../audio_samples/Yasser_Al_Dosari_2_282.wav,يَا أَيُّهَا الَّذِينَ آمَنُوا إِذَا تَدَايَنْ...,,15.82,2025-03-02 17:37:14
9,cherifkhalifah/quran-recitation-errors-test,../audio_samples/Yasser_Al_Dosari_2_282.wav,embarrass embarrass embarrass embarrass embar...,,9.11,2025-03-02 17:37:26


In [6]:
import torch
import torchaudio
import librosa
import pandas as pd
import time
from datetime import datetime
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, WhisperProcessor, WhisperForConditionalGeneration

# Models to test
models = [
    # Wav2Vec2
    "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "Nuwaisir/Quran_speech_recognizer",
    "IbrahimSalah/Wav2vecLarge_quran_syllables_recognition",
    "mohammed/arabic-speech-recognition",

    # Whisper 
    "openai/whisper-large-v3",
    "openai/whisper-small",
    "areaz/whisper-small-for-quran",
    "tarteel-ai/whisper-base-ar-quran",
    "cherifkhalifah/quran-recitation-errors-test"
]

# Test audio samples
audio_files = [
    "../audio_samples/Yasser_Al-Dosari_19_1.wav",
    "../audio_samples/Yasser_Al-Dosari_20_1.wav",
    "../audio_samples/Yasser_Al-Dosari_36_1.wav",
    "../audio_samples/Yasser_Al-Dosari_42_1.wav"
]


results = []

# Load + process audio
def load_audio(file_path):
    waveform, sample_rate = librosa.load(file_path, sr=16000)  #resample to 16kHz
    return waveform, sample_rate

#Loop thorugh models & samples
for model_name in models:
    print(f"\nLoading model: {model_name}")

    # Track load times
    start_time = time.time()

    # Identify arch.
    if "whisper" in model_name.lower():
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        model_type = "Whisper"
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name)
        model = Wav2Vec2ForCTC.from_pretrained(model_name)
        model_type = "Wav2Vec2"

    model.eval()  # Set to eval mode
    load_time = time.time() - start_time
    print(f"Model loaded in {load_time:.2f} sec")

    # Loop audio samples
    for audio_path in audio_files:
        print(f"\nTesting audio sample: {audio_path}")
        
        # Load + preprocess
        waveform, sample_rate = load_audio(audio_path)

        # Track inference times
        start_time = time.time()

        if model_type == "Whisper":
            input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
            with torch.no_grad():
                predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            confidence = None  # Whisper default no score
        else:
            input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
            with torch.no_grad():
                logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
            confidence = None  #TODO: check if no scores from Wav2Vec2

        inference_time = time.time() - start_time
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") #stamp run

        # Store results
        results.append({
            "Model": model_name,
            "Audio File": audio_path,
            "Transcription": transcription,
            "Confidence": confidence,
            "Runtime (sec)": round(inference_time, 2),
            "Timestamp": timestamp
        })

        print(f"Transcription: {transcription}")
        print(f"Runtime: {inference_time:.2f} sec\n")

# Convert results to df
df_results = pd.DataFrame(results)

# Export results
csv_filename = "ASR_Test_Results5.csv"
df_results.to_csv(csv_filename, index=False)

print(f"\nAll models tested! Results saved to {csv_filename}")

# Display results
# import ace_tools as tools
# tools.display_dataframe_to_user(name="ASR Test Results", dataframe=df_results)
display(df_results)



Loading model: jonatasgrosman/wav2vec2-large-xlsr-53-arabic




Model loaded in 1.24 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_19_1.wav
Transcription: كفها يا عييوصد
Runtime: 4.38 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_20_1.wav
Transcription: أأَهاء
Runtime: 0.19 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_36_1.wav
Transcription: يا أصي
Runtime: 0.48 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_42_1.wav
Transcription: حامي
Runtime: 0.47 sec


Loading model: elgeish/wav2vec2-large-xlsr-53-arabic
Model loaded in 0.88 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_19_1.wav
Transcription: ka>fhA ynEyySdA
Runtime: 14.41 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_20_1.wav
Transcription: whA
Runtime: 7.50 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_36_1.wav
Transcription: yaASiyni
Runtime: 2.00 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_42_1.wav
Transcription: HAmyn
Runtime: 0.48 sec


Loading model: Nuwaisir/Q

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at cherifkhalifah/quran-recitation-errors-test and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'encoder.layers.0.final_l

Model loaded in 2.00 sec

Testing audio sample: ../audio_samples/Yasser_Al-Dosari_19_1.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.50 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_20_1.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Runtime: 0.05 sec


Testing audio sample: ../audio_samples/Yasser_Al-Dosari_36_1.wav
Transcription: !!!!!!!!!!!!!!!!!!!!!!!!

Unnamed: 0,Model,Audio File,Transcription,Confidence,Runtime (sec),Timestamp
0,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_19_1.wav,كفها يا عييوصد,,4.38,2025-03-02 17:40:05
1,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_20_1.wav,أأَهاء,,0.19,2025-03-02 17:40:05
2,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_36_1.wav,يا أصي,,0.48,2025-03-02 17:40:06
3,jonatasgrosman/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_42_1.wav,حامي,,0.47,2025-03-02 17:40:06
4,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_19_1.wav,ka>fhA ynEyySdA,,14.41,2025-03-02 17:40:22
5,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_20_1.wav,whA,,7.5,2025-03-02 17:40:29
6,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_36_1.wav,yaASiyni,,2.0,2025-03-02 17:40:31
7,elgeish/wav2vec2-large-xlsr-53-arabic,../audio_samples/Yasser_Al-Dosari_42_1.wav,HAmyn,,0.48,2025-03-02 17:40:32
8,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al-Dosari_19_1.wav,kalfo~aA yaAEiy yawSaAda,,6.26,2025-03-02 17:40:39
9,Nuwaisir/Quran_speech_recognizer,../audio_samples/Yasser_Al-Dosari_20_1.wav,raAhaA,,0.72,2025-03-02 17:40:40


In [7]:
model_name = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

waveform, sample_rate = librosa.load("../audio_samples/Yasser_Al-Dosari_19_1.wav", sr=16000)

input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

# Perform inference with confidence check
output = model.generate(input_features, return_dict_in_generate=True, output_scores=True)

# Check if confidence scores exist
if "scores" in output:
    print("Confidence Scores Available:", output["scores"])
else:
    print("No confidence scores provided by model.")




Confidence Scores Available: (tensor([[ 0.4467,    -inf,    -inf,  ..., -3.4640, -1.4090, -4.1936]]), tensor([[ 0.7507,    -inf,    -inf,  ..., -1.9020, -0.6599, -1.5491]]), tensor([[ 2.5995,    -inf,    -inf,  ..., -1.6763, -1.9276, -4.6736]]), tensor([[ 3.3006,    -inf,    -inf,  ..., -3.2542, -1.1555, -3.5255]]), tensor([[ 1.6550,    -inf,    -inf,  ..., -1.7817, -0.8417, -2.8127]]), tensor([[ 0.9226,    -inf,    -inf,  ..., -0.8326, -0.9346, -0.2151]]), tensor([[ 1.6676,    -inf,    -inf,  ..., -2.6020, -1.6851, -2.3814]]), tensor([[-0.3214,    -inf,    -inf,  ...,  0.0683,  0.6217,  0.8082]]), tensor([[ 3.7132,    -inf,    -inf,  ..., -3.5504, -3.0817, -5.0022]]))


In [8]:
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

waveform, sample_rate = librosa.load("../audio_samples/Yasser_Al-Dosari_19_1.wav", sr=16000)

input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

# Perform inference with confidence check
output = model.generate(input_features, return_dict_in_generate=True, output_scores=True)

# Check if confidence scores exist
if "scores" in output:
    print("Confidence Scores Available:", output["scores"])
else:
    print("No confidence scores provided by model.")


Confidence Scores Available: (tensor([[2.2863,   -inf,   -inf,  ..., 0.7428, 1.1539, 0.9136]]), tensor([[6.6136,   -inf,   -inf,  ..., 0.8602, 2.1934, 2.9324]]), tensor([[5.8940,   -inf,   -inf,  ..., 1.8555, 2.3277, 1.1500]]), tensor([[ 3.7190,    -inf,    -inf,  ..., -0.1803,  1.2867, -1.0666]]), tensor([[4.4006,   -inf,   -inf,  ..., 1.8719, 3.1062, 0.1681]]), tensor([[5.0028,   -inf,   -inf,  ..., 2.5379, 4.4602, 2.8707]]), tensor([[7.1068,   -inf,   -inf,  ..., 3.7309, 5.3738, 4.9451]]), tensor([[5.9824,   -inf,   -inf,  ..., 1.8115, 4.3894, 2.2312]]), tensor([[12.3368,    -inf,    -inf,  ...,  6.3642,  7.8752,  4.7585]]))


In [9]:
model_name = "areaz/whisper-small-for-quran"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

waveform, sample_rate = librosa.load("../audio_samples/Yasser_Al-Dosari_19_1.wav", sr=16000)

input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

# Perform inference with confidence check
output = model.generate(input_features, return_dict_in_generate=True, output_scores=True)

# Check if confidence scores exist
if "scores" in output:
    print("Confidence Scores Available:", output["scores"])
else:
    print("No confidence scores provided by model.")


Confidence Scores Available: (tensor([[7.0041,   -inf,   -inf,  ..., 5.6202, 5.5087, 5.5764]]), tensor([[10.7067,    -inf,    -inf,  ...,  7.3576,  7.7059,  7.1671]]), tensor([[11.3455,    -inf,    -inf,  ...,  9.2720,  9.0600,  7.8532]]), tensor([[5.1740,   -inf,   -inf,  ..., 6.1220, 8.2860, 7.4448]]), tensor([[14.4798,    -inf,    -inf,  ..., 11.8897, 12.9172,  7.9000]]))


In [10]:
model_name = "tarteel-ai/whisper-base-ar-quran"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

waveform, sample_rate = librosa.load("../audio_samples/Yasser_Al-Dosari_19_1.wav", sr=16000)

input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

# Perform inference with confidence check
output = model.generate(input_features, return_dict_in_generate=True, output_scores=True)

# Check if confidence scores exist
if "scores" in output:
    print("Confidence Scores Available:", output["scores"])
else:
    print("No confidence scores provided by model.")


Confidence Scores Available: (tensor([[-2.3277, -0.2643,  3.2322,  ...,  2.4382,  2.4980,  4.0420]]), tensor([[11.7368, 10.2380, 10.5927,  ..., 10.9159, 10.3146, 11.0031]]), tensor([[14.9061, 14.9290, 14.4619,  ..., 13.5511, 13.6797, 14.0803]]), tensor([[ 0.8593,  3.9046,  4.9028,  ..., -0.4469,  0.3671, -0.4713]]), tensor([[ 2.6728,  3.9126,  2.9306,  ...,  0.6534, -0.1332, -1.2278]]), tensor([[10.3623, 12.8753, 10.6782,  ...,  9.6621,  8.7771,  8.6699]]), tensor([[7.0583, 9.5065, 8.9502,  ..., 6.8669, 6.1135, 5.8283]]), tensor([[12.4085, 15.8481, 12.8217,  ..., 11.0386,  9.6883,  7.0158]]))


In [11]:
model_name = "cherifkhalifah/quran-recitation-errors-test"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

waveform, sample_rate = librosa.load("../audio_samples/Yasser_Al-Dosari_19_1.wav", sr=16000)

input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

# Perform inference with confidence check
output = model.generate(input_features, return_dict_in_generate=True, output_scores=True)

# Check if confidence scores exist
if "scores" in output:
    print("Confidence Scores Available:", output["scores"])
else:
    print("No confidence scores provided by model.")


generation_config.json:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Confidence Scores Available: (tensor([[-3.5006,    -inf,    -inf,  ..., -2.8497, -2.6034, -2.5443]]), tensor([[-0.5999,    -inf,    -inf,  ..., -2.3763, -2.5435, -2.6145]]), tensor([[-7.9800,    -inf,    -inf,  ..., -7.5964, -7.6658, -8.2732]]), tensor([[-5.4559,    -inf,    -inf,  ..., -5.0399, -5.2673, -5.9340]]), tensor([[-1.7812,    -inf,    -inf,  ..., -4.1782, -4.4339, -6.3213]]), tensor([[-5.3714,    -inf,    -inf,  ..., -5.8701, -6.2278, -6.6222]]), tensor([[-13.4768,     -inf,     -inf,  ..., -13.4632, -13.3510, -13.5240]]), tensor([[-2.0493,    -inf,    -inf,  ..., -3.8041, -4.0773, -4.4783]]), tensor([[-11.0208,     -inf,     -inf,  ..., -10.7565, -11.4009, -11.2289]]), tensor([[-6.2984,    -inf,    -inf,  ..., -5.0227, -4.6833, -3.9032]]), tensor([[-4.3412,    -inf,    -inf,  ..., -6.7656, -6.5235, -7.2147]]), tensor([[-7.7085,    -inf,    -inf,  ..., -9.3692, -9.8170, -8.8977]]), tensor([[-4.7334,    -inf,    -inf,  ..., -3.9028, -3.5991, -3.2686]]), tensor([[-9.2222,    -