# Evaluation of Language Identification Models (2 Speechbrain models and Whisper by OpenAI)

## Requirements (installations, imports)

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there
# For other authors: create a shortcut to the shared folder in your own Drive; adjust the paths if necessary
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install necessary packages (speechbrain, Whisper from its GitHub repository, jiwer)
# Set `install` to False to skip this step when the packages are already present in the runtime
install = True # Change if already installed
if install:
  # `!` hands each line to the shell; note that no package versions are pinned here
  ! pip install speechbrain
  ! pip install git+https://github.com/openai/whisper.git
  ! pip install jiwer

In [None]:
# Import necessary packages

import os

import whisper

import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import torchaudio

from tqdm.notebook import tqdm

from speechbrain.pretrained import EncoderClassifier

In [None]:
# Show which audio backend torchaudio reports (printed as text, so a missing backend shows up as "None")
print(torchaudio.get_audio_backend())

## Speechbrain model; trained on VoxLingua107; 107 languages; Official error rate: 7%

In [None]:
# Load the pretrained Speechbrain language-ID classifier; `savedir` is the local directory passed for its files
language_id = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="tmp",
)

## Whisper; trained on multiple sources; 99 languages; Unknown accuracy

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
# (in Google Colab, set the runtime type to GPU)

if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"
print(DEVICE)

In [None]:
"""
Available Models:
Size      Parameters	English-only  model	    Multilingual model	    Required VRAM	Relative speed
tiny	    39 M	      tiny.en	      tiny	    ~1 GB	                  ~32x
base	    74 M	      base.en	      base	    ~1 GB	                  ~16x
small	    244 M	      small.en	    small	    ~2 GB	                  ~6x
medium	  769 M	      medium.en	    medium	  ~5 GB	                  ~2x
large	    1550 M	    N/A	          large	    ~10 GB	                1x
"""

# Load the multilingual "medium" Whisper checkpoint (see the size table above for the alternatives)
model = whisper.load_model(name="medium")

## Combination evaluation

In [None]:
# Set path to directory to be evaluated (expected layout: one sub-directory per language code)
# Named `data_dir` instead of `dir` so the builtin of that name is not shadowed
data_dir = '/content/drive/MyDrive/Language Identification/Fleurs_dev/'

# Maximum number of audio files evaluated per language
sample_no = 20


group_1 = ['hr', 'da', 'nl', 'en', 'fi', 'fr', 'gl', 'de', 'it'] # WE
group_2 = ['bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'ne', 'pa', 'ur'] # SA

groups = [group_1, group_2]

# Position of every candidate language in the Speechbrain classifier output.
# NOTE(review): the indices are carried over unchanged from the previous hard-coded lookups;
# presumably they follow the model's label encoder -- verify before adding languages.
tdnn_index = {
  'hr': 36, 'da': 17, 'nl': 68, 'en': 20, 'fi': 26, 'fr': 28, 'gl': 29, 'de': 18, 'it': 43,
  'bn': 9, 'gu': 31, 'hi': 35, 'kn': 50, 'ml': 61, 'mr': 63, 'ne': 67, 'pa': 72, 'ur': 100,
}

# Whisper codes that are renamed before being compared with the directory names
whisper_code_map = {'jw': 'jv', 'no': 'nb'}


def detect_whisper_language(file_path):
  """Return the language code Whisper rates most probable for the audio file at `file_path`.

  Codes listed in `whisper_code_map` are translated before being returned.
  """
  # load audio and pad/trim it to fit 30 seconds
  audio = whisper.pad_or_trim(whisper.load_audio(file_path))

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(audio).to(model.device)

  # detect the spoken language and keep the most probable code
  _, probs = model.detect_language(mel)
  best = max(probs, key=probs.get)
  return whisper_code_map.get(best, best)


def detect_tdnn_language(file_path, candidates):
  """Return the entry of `candidates` with the largest Speechbrain output value for `file_path`.

  Raises KeyError when a candidate has no entry in `tdnn_index`.
  """
  signal = language_id.load_audio(file_path)
  prediction = language_id.classify_batch(signal)
  # assumes prediction[0][0] holds one value per output neuron for this file -- TODO confirm
  scores = prediction[0][0]
  return max(candidates, key=lambda code: scores[tdnn_index[code]])


# Overall counters across all groups (`all_total` instead of `all`, which shadowed the builtin)
all_total = 0
all_correct_tdnn = 0
all_correct_whisper = 0

# Use both models to identify the language for the sampled files of every language directory
for g, group in enumerate(groups, start=1):
  whisper_acc = {}
  tdnn_acc = {}

  for ln in group:

    # 'zh' is never evaluated as a directory of its own; the checks below match it against 'yue'/'cmn'
    if ln == 'zh':
      continue

    path = os.path.join(data_dir, ln)

    # os.listdir returns entries in arbitrary order; sorting makes the sample identical on every run
    files = sorted(os.listdir(path))[:sample_no]
    if not files:
      # Nothing to evaluate: skip the language instead of dividing by zero below
      print('No files found for', ln, '- skipped')
      continue

    whisper_correct = 0
    tdnn_correct = 0

    for f in files:
      file_path = os.path.join(path, f)

      ln_whisper = detect_whisper_language(file_path)

      if (ln_whisper == ln) or (ln_whisper == 'zh' and ln in ('yue', 'cmn')):
        whisper_correct += 1

      # The Speechbrain model is consulted only when Whisper's guess lies inside the current group;
      # otherwise the file counts as a miss for the combination
      if ln_whisper in group:
        ln_tdnn = detect_tdnn_language(file_path, group)

        if (ln_tdnn == ln) or (ln_tdnn == 'cmn' and ln == 'yue'):
          tdnn_correct += 1

    total = len(files)
    whisper_acc[ln] = whisper_correct / total
    tdnn_acc[ln] = tdnn_correct / total

    all_total += total
    all_correct_whisper += whisper_correct
    all_correct_tdnn += tdnn_correct

  print('Group', str(g))

  if not whisper_acc:
    # Every language of this group was skipped, so there is nothing to average
    print('No files evaluated.', '\n')
    continue

  # Unweighted mean of the per-language accuracies
  whisper_avg = sum(whisper_acc.values()) / len(whisper_acc)
  tdnn_avg = sum(tdnn_acc.values()) / len(tdnn_acc)

  if whisper_avg >= tdnn_avg:
    recommendation = 'No combination recommended for Group ' + str(g) + '.'
  else:
    recommendation = 'Combination recommended for Group ' + str(g) + '.'

  print('Whisper: ', whisper_acc, '; Avg: ', whisper_avg)
  print('Combination: ', tdnn_acc, '; Avg: ', tdnn_avg)
  print(recommendation, '\n')

# Report the totals accumulated over all groups (file-weighted accuracy)
if all_total:
  print('Overall (' + str(all_total) + ' files)')
  print('Whisper: ', all_correct_whisper / all_total)
  print('Combination: ', all_correct_tdnn / all_total)

