# Load audio files

In [None]:
# Mount Google Drive at /content/drive; the zip archives extracted further down
# are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os
import shutil
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

def unzip_files(zip_file_path, destination_folder):
  """Extract every member of a zip archive into a folder, with a progress bar.

  Args:
    zip_file_path: Path of the zip archive to read.
    destination_folder: Directory that receives the extracted members. It is
      created (including missing parents) when it does not exist yet.

  Raises:
    FileNotFoundError: If ``zip_file_path`` does not exist.
    zipfile.BadZipFile: If ``zip_file_path`` is not a valid zip archive.
  """
  # exist_ok=True replaces the separate exists() check, so there is no window
  # between "check" and "create" in which the folder could appear.
  os.makedirs(destination_folder, exist_ok=True)

  with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Read the member list once; it drives both the loop and the bar's total.
    members = zip_ref.namelist()

    # Extract member by member (instead of extractall) so tqdm can report progress.
    for member in tqdm(members, total=len(members), desc=f'Extracting {zip_file_path}', unit='files'):
      zip_ref.extract(member, destination_folder)

In [None]:
# Uncomment to wipe earlier extractions before unpacking again:
# shutil.rmtree('OriginalAudios')
# shutil.rmtree('ClonedAudiosout')

# Each pair is (archive on the mounted Drive, local folder to extract into).
for zip_path, target_dir in (
    ('/content/drive/MyDrive/Colab Notebooks/TFM/Exp2/OriginalAudios.zip', 'OriginalAudios'),
    ('/content/drive/MyDrive/Colab Notebooks/TFM/Exp2/ClonedAudiosout.zip', 'ClonedAudiosout'),
):
  unzip_files(zip_path, target_dir)

Extracting /content/drive/MyDrive/Colab Notebooks/TFM/Exp2/OriginalAudios.zip: 100%|██████████| 7171/7171 [00:31<00:00, 225.49files/s]
Extracting /content/drive/MyDrive/Colab Notebooks/TFM/Exp2/ClonedAudiosout.zip: 100%|██████████| 13355/13355 [01:35<00:00, 140.03files/s]


# Install packages

In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.0


In [None]:
!pip install -U openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/798.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m788.5/798.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.7 MB/s

# Obtain transcriptions

In [None]:
import whisper
from IPython.display import clear_output

# Load the "base" Whisper model.
# NOTE: this rebinds the name `whisper` from the imported module to the loaded
# model object; `speech_to_text` below calls `.transcribe` on this object.
whisper = whisper.load_model("base")

def speech_to_text(metadata_csv_path):
  """Transcribe every audio file listed in a metadata CSV.

  Loads the CSV, runs the module-level `whisper` model on each row's
  'Audio path' value and stores the recognised text in a 'Transcription'
  column. Progress is shown as a percentage that overwrites the previous
  cell output.

  Args:
    metadata_csv_path: Path to a CSV file containing an 'Audio path' column.

  Returns:
    The loaded DataFrame with the 'Transcription' column filled in.
  """
  table = pd.read_csv(metadata_csv_path)

  for row_label, record in table.iterrows():
    # Clear the previous progress line so only the latest percentage is shown.
    clear_output(wait=True)
    done_pct = round(100 * (1 + row_label) / len(table), 3)
    print(f'{done_pct} %')

    result = whisper.transcribe(record['Audio path'])
    table.loc[row_label, 'Transcription'] = result['text']

  return table

100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 23.8MiB/s]


In [None]:
# Transcribe every row of the metadata CSV; the result gains a 'Transcription' column.
# NOTE(review): 'metadata_with_secs.csv' is not created by any visible cell —
# confirm where it comes from (uploaded by hand, or part of an extracted archive?).
metadata_df = speech_to_text('metadata_with_secs.csv')

100.0 %




In [None]:
# metadata_df

# Compute word error rate (WER)

In [None]:
import jiwer

# Text normalisation applied to both the reference text and the transcription
# before they are compared word by word.
transforms = jiwer.Compose(
    [
        # Lowercase first: the contraction patterns are lowercase, so a
        # capitalised "Won't" / "Can't" / "Let's" would otherwise miss the
        # specific rules and be expanded incorrectly.
        jiwer.ToLowerCase(),
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        # Final step: split each sentence into the word lists jiwer compares.
        jiwer.ReduceToListOfListOfWords(),
    ]
)

In [None]:
# Score every transcription against its reference text and store the result in
# a 'WER' column. Rows that cannot be scored (for example an empty or missing
# reference) get NaN and are counted, so failures are visible instead of silent.
total_rows = len(metadata_df)
wer_failures = []  # index labels of the rows whose WER could not be computed

for index in metadata_df.index:

  clear_output(wait=True)

  percentage = round(100 * (1+index) / total_rows, 2)
  print(f'{percentage} %')

  reference = metadata_df.loc[index, 'Text']
  hypothesis = metadata_df.loc[index, 'Transcription']

  try:
    # `reference_transform` is the current keyword; `truth_transform` is its
    # deprecated spelling.
    metadata_df.loc[index, 'WER'] = jiwer.wer(
        reference,
        hypothesis,
        reference_transform=transforms,
        hypothesis_transform=transforms,
    )

  # Exception (not a bare except) keeps the best-effort behaviour but still
  # lets KeyboardInterrupt / SystemExit stop the loop.
  except Exception:
    metadata_df.loc[index, 'WER'] = np.nan
    wer_failures.append(index)

if wer_failures:
  print(f'WER could not be computed for {len(wer_failures)} row(s); stored as NaN.')

100.0 %


In [None]:
# metadata_df

In [None]:
# Move the two computed columns so they sit right after the first 11 metadata
# columns. They are selected by name rather than as "the last two columns", so
# re-running this cell leaves the order unchanged instead of reshuffling it,
# and a frame with fewer than 11 other columns does not get duplicated columns.
# Assumes 'Transcription' and 'WER' were appended by the cells above.
computed_columns = ['Transcription', 'WER']
other_columns = [col for col in metadata_df.columns if col not in computed_columns]
columns = other_columns[:11] + computed_columns + other_columns[11:]
metadata_df = metadata_df[columns]
# metadata_df

In [None]:
# Save the table to a CSV in the current working directory; index=False leaves
# the DataFrame index out of the file.
metadata_df.to_csv('metadata_with_wer.csv', index=False)

In [None]:
# Hand the CSV written above to the browser as a download.
from google.colab import files
files.download('metadata_with_wer.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>