In [None]:
!pip install vosk
!pip install jiwer
from google.colab import drive
drive.mount('/content/drive')

# Assuming the vosk model is not yet installed
!wget https://alphacephei.com/vosk/models/vosk-model-en-us-0.22-lgraph.zip
!unzip vosk-model-en-us-0.22-lgraph.zip -d model


In [None]:
!unzip "/content/drive/MyDrive/Projects/AudioFiles_final/voices_final_trimmed_2.zip" -d "/content/"


In [None]:
# Manifest files: one audio entry per line and one reference transcription per
# line, matched by line position.
audio_paths_file = '/content/audio_paths_file.txt'
text_file = '/content/text_file.txt'

# Each audio manifest line is "ID PATH". Split on the first run of whitespace
# only, so paths that contain spaces stay intact and tabs or repeated spaces
# between the two fields are tolerated. Malformed lines fail loudly with their
# location instead of a bare IndexError.
audio_paths = []
with open(audio_paths_file, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        fields = line.strip().split(maxsplit=1)
        if len(fields) != 2:
            raise ValueError(
                f"{audio_paths_file}:{line_no}: expected 'ID PATH', got {line!r}"
            )
        audio_paths.append(fields[1])

# The whole (stripped) line is used as the reference text.
# NOTE(review): if text_file also uses an "ID TEXT" layout, the ID ends up in
# the reference and inflates WER/CER — confirm the file format.
with open(text_file, 'r', encoding='utf-8') as f:
    transcriptions = [line.strip() for line in f]

# Both lists are paired by position later on, so their lengths must agree.
assert len(audio_paths) == len(transcriptions), "Mismatch between audio paths and transcriptions count"


In [None]:
from vosk import Model, KaldiRecognizer
import wave
import json
import pandas as pd
from jiwer import wer, cer
import numpy as np

# Directory of the unpacked Vosk model, relative to the notebook's working
# directory (presumably the folder extracted into ./model by the setup cell —
# confirm if the working directory differs).
model_path = './model/vosk-model-en-us-0.22-lgraph'
# Load the model once; this single instance is passed to the evaluation code.
model = Model(model_path)


In [None]:
def transcribe_audio(file_path, model, chunk_frames=4000):
    """Transcribe one WAV file with Vosk and return the recognized text.

    Args:
        file_path: Path of the WAV file to read.
        model: Loaded Vosk model used to build the recognizer.
        chunk_frames: Number of audio frames fed to the recognizer per call.

    Returns:
        The ``'text'`` field of the recognizer's final result, or ``""`` when
        the file does not exist (a message is printed in that case).

    Raises:
        Any error raised by ``wave`` (other than ``FileNotFoundError``) or by
        the recognizer propagates to the caller.

    NOTE(review): nothing here checks channel count or sample width; Vosk
    presumably expects mono 16-bit PCM — confirm the dataset format.
    """
    try:
        wf = wave.open(file_path, "rb")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return ""

    # The context manager closes the WAV handle even if recognition raises,
    # so a failing file no longer leaks an open file descriptor.
    with wf:
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        # readframes() returns empty bytes once the file is exhausted.
        for chunk in iter(lambda: wf.readframes(chunk_frames), b""):
            rec.AcceptWaveform(chunk)

    return json.loads(rec.FinalResult())['text']

from tqdm import tqdm

def evaluate_transcriptions(audio_paths, transcriptions, model):
    """Transcribe each audio file and score it against its reference text.

    Audio paths and reference transcriptions are paired by position. Files
    whose transcription or scoring raises are reported on stdout and left out
    of the result.

    Args:
        audio_paths: Sequence of audio file paths.
        transcriptions: Reference texts, in the same order as ``audio_paths``.
        model: Model object forwarded to ``transcribe_audio``.

    Returns:
        A DataFrame with one row per successfully processed file and the
        columns ``audio_path``, ``predicted_transcription``, ``ground_truth``,
        ``wer`` and ``cer``.
    """
    rows = []
    progress = tqdm(
        zip(audio_paths, transcriptions),
        total=len(audio_paths),
        desc="Evaluating Transcriptions",
    )
    for wav_path, reference in progress:
        try:
            hypothesis = transcribe_audio(wav_path, model)
            word_error = wer(reference, hypothesis)
            char_error = cer(reference, hypothesis)
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {wav_path}: {e}")
            continue
        rows.append({
            'audio_path': wav_path,
            'predicted_transcription': hypothesis,
            'ground_truth': reference,
            'wer': word_error,
            'cer': char_error,
        })
    return pd.DataFrame(rows)



In [None]:
# Transcribe every audio file and score it against its reference text.
# The result has one row per successfully processed file, with the columns
# audio_path, predicted_transcription, ground_truth, wer and cer.
results_df = evaluate_transcriptions(audio_paths, transcriptions, model)


In [None]:
# Fail with a clear message when nothing was evaluated; an empty frame has no
# 'wer'/'cer' columns and would otherwise raise an opaque KeyError.
if results_df.empty:
    raise ValueError("results_df is empty: no utterance was evaluated successfully")

references = results_df['ground_truth'].tolist()
hypotheses = results_df['predicted_transcription'].tolist()

# Corpus-level error rates: total edit errors over total reference words
# (characters for CER) across all utterances. Averaging the per-utterance
# rates instead would give a three-word utterance the same weight as a
# thirty-word one and skew the overall figure.
overall_wer = wer(references, hypotheses) * 100
overall_cer = cer(references, hypotheses) * 100

# Report header with the aggregate scores, followed by one
# reference/prediction pair per evaluated utterance.
report_parts = [f"""Dataset: /content/output_data_directory/eval_dataset
WER: {overall_wer:.2f}%, CER: {overall_cer:.2f}%\n"""]
report_parts.extend(
    f'\nReference: "{reference}"\nPredicted:  "{hypothesis}"\n'
    for reference, hypothesis in zip(references, hypotheses)
)
results_text = "".join(report_parts)

# Explicit UTF-8 so non-ASCII characters in transcriptions are written
# the same way regardless of the platform's default encoding.
results_file_path = '/content/evaluation_results.txt'
with open(results_file_path, 'w', encoding='utf-8') as file:
    file.write(results_text)

print(f'Results saved to {results_file_path}')
