In [None]:
# Install the notebook's dependencies listed in requirements.txt via the %pip magic.
# NOTE(review): reproducibility depends on requirements.txt pinning versions — not visible from here.
%pip install -r requirements.txt

In [None]:
# LINTERS
###############################
# !nbqa mypy jiwer.ipynb
# !nbqa black jiwer.ipynb
# !nbqa flake8 jiwer.ipynb
###############################

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv so that the os.getenv(...) lookups
# in the following cells (API tokens for ngrok, Deepgram, ElevenLabs, ...) can resolve.
load_dotenv()

In [None]:
import subprocess
import mlflow # mlFlow
from pyngrok import ngrok # workaround for localhost

# Tokens are read from the environment; os.getenv returns None when a variable is unset.
# NOTE(review): wandb_token is not referenced anywhere else in the visible notebook.
wandb_token = os.getenv("WANDB_TOKEN")
ngrok_token = os.getenv("NGROK_TOKEN")

# Auth tokens are issued at https://dashboard.ngrok.com/authtokens
# NOTE(review): ngrok_token is passed on without checking that it is set.
ngrok.set_auth_token(ngrok_token)
# Kept as a string because it is passed directly as a command-line argument below.
port = "5000"

# Start the MLflow UI as a child process on `port`, then enable MLflow autologging.
# NOTE(review): re-running this cell spawns another `mlflow ui` process for the same port;
# the handle is kept in mlflow_proc so it can be stopped manually with the line below.
mlflow_proc = subprocess.Popen(["mlflow", "ui", "--port", port])
mlflow.autolog()
# mlflow_proc.terminate()

# Open an ngrok tunnel to the same port and print what ngrok.connect returned,
# so the locally running UI can be reached from outside (workaround for localhost).
public_url = ngrok.connect(port)
print(f"MLflow UI: {public_url}")

In [None]:
############################
# Whisper dependencies
from faster_whisper import WhisperModel
############################

###############################
# DeepGram dependencies
import requests
import wave
import io
import time
import os
import torch
import logging
import json
import threading
from datetime import datetime
import deepgram
from deepgram import (
  DeepgramClient,
  DeepgramClientOptions,
  AgentWebSocketEvents,
  AgentKeepAlive,
  PrerecordedOptions,
  FileSource
)
################################

# Elevenlabs dependencies
################################
from elevenlabs.client import ElevenLabs
from io import BytesIO
import requests
################################

import assemblyai as aai
from rev_ai import apiclient
from openai import OpenAI
from google.cloud import speech
from google.oauth2 import service_account
from openai import AzureOpenAI
import psutil
import pandas as pd
import numpy as np

# Initialize the API clients / credentials used by ModelPipeline; every token comes from
# the environment (os.getenv returns None when the variable is missing).
# NOTE: this rebinds the name `deepgram`, which the `import deepgram` above bound to the
# module — from here on `deepgram` is the DeepgramClient instance.
deepgram = DeepgramClient(os.getenv("DEEPCLIENT_TOKEN"))
elevenlabs_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_TOKEN"))
aai.settings.api_key = os.getenv("ASSEMBLY_TOKEN")
# NOTE(review): rev_token is not used by any code in the visible notebook.
rev_token = os.getenv("REV_TOKEN")

###################################
# Mozilla Common Voice for testing
from datasets import load_dataset, Audio

# Greek ("el") configuration of Mozilla Common Voice 17.0, train split.
dataset = load_dataset("mozilla-foundation/common_voice_17_0", "el", split="train")

# Cast the audio column to a 16 kHz sampling rate (16000 Hz — not "16 Hz"); this is the
# rate hf_process_logic checks for before running the Wav2Vec2 processor.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
###################################

In [None]:
"""
Model Loader Pipeline
"""
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

import torch.nn as nn
import torch
import torchaudio
import soundfile as sf

class ModelPipeline:
  """Runs one audio sample through a selectable speech-to-text backend.

  Each ``*_process_logic`` method transcribes ``self.audio`` with one provider
  and returns the transcript as a string. Local backends (Whisper, HuggingFace)
  need their ``load_*`` methods called first; hosted backends rely on
  module-level clients / environment tokens configured elsewhere in the notebook.

  Attributes:
    model_name: Model identifier handed to the local model loaders.
    audio: The sample to transcribe. Most backends expect a file path (or URL
      for ElevenLabs); ``hf_process_logic`` expects a dict with ``"array"`` and
      ``"sampling_rate"`` keys, and ``deepgram_process_logic`` accepts either.
    processor: HuggingFace processor, set by ``load_hf_processor``.
    model: Loaded local model, set by ``load_whisper_model`` / ``load_hf_model``.
  """

  def __init__(self, model_name: str, audio: str):
    self.model_name = model_name
    self.audio = audio
    self.processor = None
    self.model = None

  # Whisper
  #####################################
  def load_whisper_model(self) -> None:
    """Instantiate the faster-whisper model named by ``self.model_name``."""
    self.model = WhisperModel(self.model_name)

  def whisper_process_logic(self) -> str:
    """Transcribe ``self.audio`` with the loaded Whisper model (language "el").

    Returns:
      All segment texts, each followed by a single space.
    """
    segments, _ = self.model.transcribe(self.audio, language="el")  # <- specify language output
    return "".join(segment.text + " " for segment in segments)
  #####################################

  # HuggingFace
  #####################################
  def load_hf_model(self) -> None:
    """Load the Wav2Vec2 CTC model named by ``self.model_name``."""
    self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)

  def load_hf_processor(self) -> None:
    """Load the Wav2Vec2 processor named by ``self.model_name``."""
    self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)

  def hf_process_logic(self) -> str:
    """Transcribe a decoded audio dict with the loaded Wav2Vec2 model.

    ``self.audio`` must be a mapping with ``"array"`` (samples) and
    ``"sampling_rate"`` keys. Audio that is not at 16 kHz is resampled first.

    Returns:
      The greedy (argmax) CTC decoding of the first item in the batch.
    """
    waveform = self.audio["array"]
    sr = self.audio["sampling_rate"]

    if sr != 16000:
      # The original called librosa here, but librosa is never imported in this
      # notebook; torchaudio is, so use its resampler instead.
      resampled = torchaudio.functional.resample(
        torch.as_tensor(waveform, dtype=torch.float32),
        orig_freq=int(sr),
        new_freq=16000,
      )
      waveform = resampled.numpy()
      sr = 16000

    inputs = self.processor(waveform, sampling_rate=sr, return_tensors="pt")

    with torch.no_grad():
      logits = self.model(**inputs).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return self.processor.decode(predicted_ids[0])
  #####################################

  # DeepGram
  #####################################
  def deepgram_process_logic(self) -> str:
    """Transcribe ``self.audio`` with Deepgram's prerecorded API (nova-2, "el").

    Accepts either a decoded audio dict (``"array"`` / ``"sampling_rate"``) or a
    file path; the samples are re-encoded to WAV in memory before upload.

    Raises:
      ValueError: If ``self.audio`` is neither a suitable dict nor a string.
    """
    if isinstance(self.audio, dict) and "array" in self.audio:
      # HuggingFace dataset format
      audio_array = self.audio["array"]
      sample_rate = self.audio["sampling_rate"]
    elif isinstance(self.audio, str):
      # File path format
      audio_array, sample_rate = sf.read(self.audio)
    else:
      raise ValueError(f"Unsupported audio format: {type(self.audio)}")

    # Encode to WAV in memory and hand the SDK raw bytes under the "buffer" key
    # (the original passed the BytesIO object itself).
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, sample_rate, format='WAV')
    wav_bytes = buffer.getvalue()

    options = PrerecordedOptions(
      model="nova-2",
      smart_format=True,
      language="el",
    )

    # `deepgram` is the module-level DeepgramClient instance.
    response = deepgram.listen.rest.v("1").transcribe_file(
      {"buffer": wav_bytes, "mimetype": "audio/wav"},
      options
    )
    return response["results"]["channels"][0]["alternatives"][0]["transcript"]
  #####################################

  # Elevenlabs
  #####################################
  def eleven_process_logic(self) -> str:
    """Transcribe ``self.audio`` (URL or local path) with ElevenLabs scribe_v1.

    Raises:
      ValueError: If ``self.audio`` is not a string.
      FileNotFoundError: If the local file does not exist.
      PermissionError: If the local file cannot be read.
      requests.HTTPError: If downloading a URL returns a bad status code.
    """
    if not isinstance(self.audio, str):
      # Same error style as deepgram_process_logic instead of an AttributeError
      # from calling .startswith on a non-string.
      raise ValueError(f"Unsupported audio format: {type(self.audio)}")

    if self.audio.startswith(('http://', 'https://')):
      # Remote file: download it, bounded by a timeout so a stalled server
      # cannot hang the notebook indefinitely.
      response = requests.get(self.audio, timeout=60)
      response.raise_for_status()  # Raise an exception for bad status codes
      audio_data = BytesIO(response.content)
    else:
      # Local file: read it fully into memory.
      try:
        with open(self.audio, 'rb') as f:
          audio_data = BytesIO(f.read())
      except FileNotFoundError as err:
        raise FileNotFoundError(f"Audio file not found: {self.audio}") from err
      except PermissionError as err:
        raise PermissionError(f"Permission denied accessing audio file: {self.audio}") from err

    transcription = elevenlabs_client.speech_to_text.convert(
      file=audio_data,
      model_id="scribe_v1",  # Model to use, for now only "scribe_v1" is supported
      tag_audio_events=True,  # Tag audio events like laughter, applause, etc.
      language_code="ell",  # Language of the audio file. If set to None, the model will detect the language automatically.
      diarize=True,  # Whether to annotate who is speaking
    )
    return transcription.text
  #####################################

  # AssemblyAI
  #####################################
  def assembly_process_logic(self) -> str:
    """Transcribe ``self.audio`` with AssemblyAI's "best" speech model.

    Raises:
      RuntimeError: If the returned transcript reports an error status.
    """
    # NOTE(review): unlike the other backends, no language is set here — confirm
    # whether a Greek language code should be passed in the config.
    config = aai.TranscriptionConfig(speech_model=aai.SpeechModel.best)

    transcript = aai.Transcriber(config=config).transcribe(self.audio)

    if transcript.status == "error":
      raise RuntimeError(f"Transcription failed: {transcript.error}")

    return transcript.text
  #####################################

  # Speechmatics
  ######################################
  def speechmatics_process_logic(self) -> str:
    """Submit ``self.audio`` as a Speechmatics batch job (language "el") and wait for it.

    Returns:
      The transcript in the 'txt' format, as returned by ``wait_for_completion``.
    """
    from speechmatics.models import ConnectionSettings
    from speechmatics.batch_client import BatchClient

    settings = ConnectionSettings(
      url="https://asr.api.speechmatics.com/v2",
      auth_token=os.getenv("SPEECHMATICS_TOKEN"),
    )

    # Define transcription parameters
    conf = {
      "type": "transcription",
      "transcription_config": {
        "language": "el"
      }
    }

    # Open the client using a context manager
    with BatchClient(settings) as client:
      job_id = client.submit_job(
        audio=self.audio,
        transcription_config=conf,
      )
      print(f'job {job_id} submitted successfully, waiting for transcript')

      # Note that in production, you should set up notifications instead of polling.
      # Notifications are described here: https://docs.speechmatics.com/features-other/notifications
      # To see the full output, try setting transcription_format='json-v2'.
      return client.wait_for_completion(job_id, transcription_format='txt')
  ######################################

  # OpenAI
  ######################################
  def azure_openai_process_logic(self) -> str:
    """Transcribe the file at ``self.audio`` with the Azure OpenAI
    ``gpt-4o-transcribe`` deployment (language "el").
    """
    # azure_endpoint must be the resource base URL: the client appends the
    # deployment/operation path and the api-version query itself. The original
    # passed the full request URL plus a second, conflicting api_version; the
    # version that was embedded in that URL is the one kept here.
    client = AzureOpenAI(
      api_key=os.getenv("AZURE_OPENAI_TOKEN"),
      api_version="2025-03-01-preview",
      azure_endpoint="https://ai-mcoublm5-eastus2.cognitiveservices.azure.com",
    )

    with open(self.audio, "rb") as audio_file:
      transcription = client.audio.transcriptions.create(
        model="gpt-4o-transcribe",
        file=audio_file,
        language="el"  # <- put language here
      )

    return transcription.text
  ######################################

  # Google Cloud
  ######################################
  def google_cloud_process_logic(self) -> str:
    """Transcribe the MP3 file at ``self.audio`` with Google Cloud Speech (el-GR).

    Returns:
      The top alternative of every result, joined by single spaces.
    """
    # Default to the local key file, but do not clobber credentials that were
    # already configured in the environment.
    os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'google.json')
    client = speech.SpeechClient()

    with open(self.audio, "rb") as audio_file:
      content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
      encoding=speech.RecognitionConfig.AudioEncoding.MP3,
      language_code="el-GR"
    )

    response = client.recognize(config=config, audio=audio)

    return " ".join(
      result.alternatives[0].transcript for result in response.results
    ).strip()
  ######################################

  # Google Gemini
  ######################################
  def google_gemini_process_logic(self) -> str:
    """Transcribe the file at ``self.audio`` with Gemini 2.5 Pro.

    The original sent a fixed, unrelated text prompt and never used the audio;
    this version uploads the file and asks for a verbatim transcript.

    The API key is read from the ``GEMINI_TOKEN`` environment variable
    (following this notebook's ``*_TOKEN`` convention) instead of being
    hardcoded. NOTE(review): if it is unset, ``None`` is passed and the SDK is
    expected to fall back to its own environment lookup — confirm.

    Returns:
      The model's text response, or an empty string if it returned no text.
    """
    from google import genai
    client = genai.Client(api_key=os.getenv("GEMINI_TOKEN"))

    uploaded_audio = client.files.upload(file=self.audio)
    response = client.models.generate_content(
      model="gemini-2.5-pro",
      contents=[
        "Transcribe this Greek audio recording verbatim. Return only the transcript text.",
        uploaded_audio,
      ],
    )

    return response.text or ""
  ######################################

In [None]:
"""
  Jiwer Loader Pipeline
"""
import jiwer
from jiwer import wer,cer

"""
wer - word error rate
cer - character error rate
"""

class JiwerMetricsPipeline:
  """Scores a transcription hypothesis against its reference text using jiwer."""

  def __init__(self, hypothesis: str, truth: str):
    self.truth = truth
    self.hypothesis = hypothesis

  def compute_metrics(self) -> tuple[float, float]:
    """Return ``(wer, cer)`` for the stored hypothesis against the stored truth.

    Both texts go through the same normalisation first: lower-casing,
    punctuation removal, collapsing repeated spaces, and stripping.
    """
    normalize = jiwer.Compose([
      jiwer.ToLowerCase(),
      jiwer.RemovePunctuation(),
      jiwer.RemoveMultipleSpaces(),
      jiwer.Strip(),
    ])

    # Reference first, hypothesis second — the order both jiwer calls below use.
    reference, candidate = (
      normalize(text) for text in (self.truth, self.hypothesis)
    )

    return wer(reference, candidate), cer(reference, candidate)

In [None]:
# Module-level counter; launch_model increments it on each call and uses the value as the
# numeric suffix of the MLflow run name. Re-running this cell resets it to 0.
run_counter = 0

# Metrics extensions
from mlflow.models import infer_signature
from mlflow.data.pandas_dataset import PandasDataset

def launch_model(model: str, sentence: str, audio: str, backend: str = "google_gemini") -> None:
  """Transcribe one sample with the chosen backend and log the run to MLflow.

  Inside a single MLflow run this logs the input sample as a dataset, basic
  parameters (model name, backend, timestamp, CPU count, total memory), timing
  and resource metrics, and the WER / CER of the transcript against ``sentence``.

  Args:
    model: Model identifier; used in the run name and passed to ModelPipeline.
    sentence: Ground-truth transcript of the sample.
    audio: The audio sample handed to ModelPipeline (a path for most backends).
    backend: Which ModelPipeline backend to run. One of "whisper",
      "huggingface", "deepgram", "elevenlabs", "assembly", "speechmatics",
      "azure_openai", "google_cloud", "google_gemini". Defaults to
      "google_gemini", the backend this function previously had hardcoded.

  Raises:
    ValueError: If ``backend`` is not a known backend name. Raised before any
      MLflow run is started or the run counter is touched.
  """
  global run_counter

  # backend -> (loader methods to call before timing starts, transcription method).
  # This replaces the former blocks of commented-out code, one per backend.
  backends = {
    "whisper": (("load_whisper_model",), "whisper_process_logic"),
    "huggingface": (("load_hf_model", "load_hf_processor"), "hf_process_logic"),
    "deepgram": ((), "deepgram_process_logic"),
    "elevenlabs": ((), "eleven_process_logic"),
    "assembly": ((), "assembly_process_logic"),
    "speechmatics": ((), "speechmatics_process_logic"),
    "azure_openai": ((), "azure_openai_process_logic"),
    "google_cloud": ((), "google_cloud_process_logic"),
    "google_gemini": ((), "google_gemini_process_logic"),
  }
  if backend not in backends:
    raise ValueError(
      f"Unknown backend {backend!r}; expected one of {sorted(backends)}"
    )
  loader_names, process_name = backends[backend]

  run_counter += 1
  with mlflow.start_run(run_name=f"{model}_audio_{run_counter}"):

    # Dataset Mlflow
    #################################
    sample_info = pd.DataFrame({
      'audio_path': [audio],
      'truth_text': [sentence],
      'text_length': [len(sentence)]
    })

    # Named sample_dataset so it does not shadow the notebook-level `dataset`.
    sample_dataset = mlflow.data.from_pandas(
      sample_info,
      source="huggingface",
      name="mozilla_common_voice",
    )

    mlflow.log_input(sample_dataset, context="training")
    #################################

    # Log params for mlflow
    #################################
    mlflow.log_param("model_name", model)
    mlflow.log_param("backend", backend)
    mlflow.log_param("timestamp", datetime.now().isoformat())
    #################################

    # Specs logging
    #################################
    mlflow.log_param("cpu_count", psutil.cpu_count())
    mlflow.log_param("memory_total_gb", round(psutil.virtual_memory().total / 1024**3, 1))
    #################################

    # Prime the CPU meter: the first non-blocking cpu_percent() call returns a
    # meaningless value, so take (and discard) it here. The call after
    # processing then reports utilisation since this point.
    psutil.cpu_percent(interval=None)

    # Time logging
    ##########################
    start_time = time.time()

    llm = ModelPipeline(model, audio)

    # System-wide used memory (GB) before loading / running the backend.
    memory_before = psutil.virtual_memory().used / 1024**3
    ###############################

    # Model loading counts towards total_time but not process_time.
    for loader_name in loader_names:
      getattr(llm, loader_name)()

    process_start = time.time()
    llm_output = getattr(llm, process_name)()
    process_time = time.time() - process_start

    total_time = time.time() - start_time

    # System-wide used memory (GB) after the backend finished.
    memory_after = psutil.virtual_memory().used / 1024**3

    mlflow.log_metric("process_time_seconds", round(process_time, 2))
    mlflow.log_metric("total_time_seconds", total_time)
    mlflow.log_metric("memory_usage_change_gb", round(memory_after - memory_before, 3))
    mlflow.log_metric("cpu_usage_percent", psutil.cpu_percent())
    #############################

    # compute_metrics returns (wer, cer)
    wer_score, cer_score = JiwerMetricsPipeline(llm_output, sentence).compute_metrics()

    mlflow.log_metric("total_wer", round(wer_score, 2))
    mlflow.log_metric("total_cer", round(cer_score, 2))


In [None]:
# Common launch
N_SAMPLES = 10  # number of leading dataset rows to benchmark

for i in range(N_SAMPLES):
    # Look the row up once and reuse it, instead of indexing dataset[i] three
    # times per iteration (presumably each lookup re-decodes the audio — confirm).
    sample = dataset[i]

    dataset_sentence = sample["sentence"]
    # for whisper models and other
    path_audio = sample["audio"]["path"]
    # for hf models and deepgram models
    common_audio = sample["audio"]

    launch_model("model-type", dataset_sentence, path_audio)
    print(f"Processed sample {i+1}/{N_SAMPLES}")