## Install Requirements

In [1]:
# Versions pinned to the ones this notebook was last executed with; bump them together when upgrading.
%pip install -q datasets==2.6.1 jiwer==2.5.1 transformers==4.23.1 colorednoise==2.1.0 pyctcdecode==0.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 26.2 MB/s 
[?25hCollecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 55.1 MB/s 
[?25hCollecting colorednoise
  Downloading colorednoise-2.1.0-py3-none-any.whl (4.5 kB)
Collecting pyctcdecode
  Downloading pyctcdecode-0.4.0-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 3.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 52.0 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |██████████████████████

## Imports

In [2]:
import pandas as pd
import numpy as np
import os
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from datasets import load_dataset
from jiwer import wer
import librosa
import nltk
import tarfile
import torch
import urllib.request
import soundfile as sf

nltk.download('punkt')

import audio_preprocess

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Required Functions

In [3]:
def download_and_extract_dataset_from_url(url, datasets_path):
    """
    downloads the tar archive at url and extracts it into datasets_path/

    The archive is stored temporarily inside datasets_path under the last
    path component of the url and is deleted again afterwards, also when the
    download or the extraction fails.

    url: location of a tar archive (any compression tarfile can detect)
    datasets_path: target directory; created if it does not exist yet

    raises ValueError if url has no file name component; errors from
    urllib / tarfile / os are propagated unchanged
    """
    archive_name = url.split('/')[-1]
    if not archive_name:
        raise ValueError('url does not end in a file name: {!r}'.format(url))

    os.makedirs(datasets_path, exist_ok=True)
    archive_path = os.path.join(datasets_path, archive_name)
    try:
        urllib.request.urlretrieve(url, archive_path)
        # the context manager closes the archive even if extraction raises
        with tarfile.open(archive_path) as archive:
            if hasattr(tarfile, 'data_filter'):
                # the archive comes from the network: where the interpreter
                # supports extraction filters, refuse members that would be
                # written outside datasets_path
                archive.extractall(datasets_path, filter='data')
            else:
                archive.extractall(datasets_path)
    finally:
        # never leave a (possibly partial) archive behind
        if os.path.isfile(archive_path):
            os.remove(archive_path)

In [4]:
def map_to_ground_truth(batch):
    """
    inserts ground truth in dataset

    Expects batch['audio']['path'] to look like
    '<dir>/<chapter-id>-<NNNN>.flac' with the transcript stored next to it as
    '<dir>/<chapter-id>.trans.txt', one '<utterance-id> <text>' entry per line.
    The entry is looked up by utterance id rather than by line position, so
    utterance numbers of 100 and above and transcripts with gaps resolve to
    the correct line.

    returns the same batch with the reference text stored under 'txt'
    raises KeyError if the transcript has no entry for the utterance
    """
    audio_path = batch['audio']['path']
    chapter_dir, file_name = os.path.split(audio_path)
    utterance_id = os.path.splitext(file_name)[0]
    # everything before the trailing '-<NNNN>' names the transcript file
    chapter_id = utterance_id.rpartition('-')[0]
    transcription_file_path = os.path.join(chapter_dir, chapter_id + '.trans.txt')

    with open(transcription_file_path, 'r', encoding='utf-8') as transcript:
        for line in transcript:
            line_id, _, text = line.rstrip('\r\n').partition(' ')
            if line_id == utterance_id:
                batch['txt'] = text
                return batch

    raise KeyError('no transcription for {!r} in {!r}'.format(utterance_id, transcription_file_path))

In [5]:
def load_wav2vec_model(hf_path: str):
    """
    fetch the wav2vec tokenizer and CTC model stored under hf_path on huggingface

    The model is moved to the module-level `device` before it is returned.
    returns a (tokenizer, model) tuple
    """
    # NOTE(review): the recorded output of the loading cell warns that the
    # checkpoint's tokenizer class is 'Wav2Vec2CTCTokenizer', not 'Wav2Vec2Tokenizer'.
    ctc_tokenizer = Wav2Vec2Tokenizer.from_pretrained(hf_path)
    ctc_model = Wav2Vec2ForCTC.from_pretrained(hf_path)
    return ctc_tokenizer, ctc_model.to(device)

In [6]:
def map_to_pred(batch):
    """
    predicts transcription

    Reads the module-level `tokenizer`, `model` and `device`. The waveform in
    batch["audio"]["array"] is tokenized, passed through the model and decoded
    greedily (argmax over the last logits dimension).

    returns the same batch with the decoded text stored under "transcription"
    """
    # tokenize
    input_values = tokenizer(batch["audio"]["array"], return_tensors="pt").input_values
    # take logits; this is pure inference, so skip autograd bookkeeping to
    # save memory and time
    with torch.no_grad():
        logits = model(input_values.to(device)).logits
    # take argmax (most probable token id at every position)
    predicted_ids = torch.argmax(logits, dim=-1)
    # turn the predicted token ids of the first (only) item into text
    transcription = tokenizer.decode(predicted_ids[0])
    batch["transcription"] = transcription
    return batch

In [7]:
def add_noise_to_dataset(batch):
    """
    replaces the waveform of one example with the output of audio_preprocess.add_noise

    returns the same batch object, modified in place
    """
    audio = batch['audio']
    noisy_waveform = audio_preprocess.add_noise(audio['array'])
    audio['array'] = noisy_waveform
    return batch

In [8]:
def downsample_dataset(batch):
    """
    replaces the waveform of one example with the output of audio_preprocess.down_sample

    returns the same batch object, modified in place
    """
    audio = batch['audio']
    resampled_waveform = audio_preprocess.down_sample(audio['array'])
    audio['array'] = resampled_waveform
    return batch

In [9]:
def format_wer(text, transcription, decimal=1):
    """
    word error rate between text and transcription as a rounded percentage

    decimal: number of digits kept after the decimal point
    """
    error_rate = wer(text, transcription)
    percentage = 100 * error_rate
    return round(percentage, decimal)

## wav2vec 2.0 (base-960h, greedy decoding) on Clean, Other, Noisy and Downsampled Data

In [10]:
# set paths (one extraction folder per LibriSpeech subset, below the working directory)
datasets_path_clean = os.path.join(os.getcwd(), 'datasets_clean')
datasets_path_other = os.path.join(os.getcwd(), 'datasets_other')
# create folders if they do not already exist
# (exist_ok avoids the check-then-create race and keeps the cell re-runnable)
for _dataset_dir in (datasets_path_clean, datasets_path_other):
    os.makedirs(_dataset_dir, exist_ok=True)
# set device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# fetch both LibriSpeech test subsets, each into its own folder
for _archive_url, _target_dir in (
    ('https://www.openslr.org/resources/12/test-clean.tar.gz', datasets_path_clean),
    ('https://www.openslr.org/resources/12/test-other.tar.gz', datasets_path_other),
):
    download_and_extract_dataset_from_url(_archive_url, _target_dir)

In [12]:
# load extracted ls data as dataset
# (built from the same path variables the archives were extracted into, so the
# cell does not depend on a hard-coded relative folder name)
librispeech_clean = load_dataset(os.path.join(datasets_path_clean, "LibriSpeech"), "clean", split='test')
librispeech_other = load_dataset(os.path.join(datasets_path_other, "LibriSpeech"), "other", split='test')

Resolving data files:   0%|          | 0/2707 [00:00<?, ?it/s]



Downloading and preparing dataset audiofolder/LibriSpeech to /root/.cache/huggingface/datasets/audiofolder/LibriSpeech-c8c50476963a50ef/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...
                

Downloading data files #3:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/163 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/163 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/163 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/164 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/163 [00:00<?, ?obj/s]

     

Downloading data files #1:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/6 [00:00<?, ?obj/s]

   

Downloading data files #2:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/5 [00:00<?, ?obj/s]

  

Downloading data files #4:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #6:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #9:   0%|          | 0/5 [00:00<?, ?obj/s]

  

Downloading data files #8:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/5 [00:00<?, ?obj/s]

  

Downloading data files #11:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/5 [00:00<?, ?obj/s]

Extracting data files:   0%|          | 0/87 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/LibriSpeech-c8c50476963a50ef/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


Resolving data files:   0%|          | 0/3029 [00:00<?, ?it/s]



Downloading and preparing dataset audiofolder/LibriSpeech to /root/.cache/huggingface/datasets/audiofolder/LibriSpeech-9078579ce3b163a2/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...
                

Downloading data files #3:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/183 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/183 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/183 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/183 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/183 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/184 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/184 [00:00<?, ?obj/s]

   

Downloading data files #0:   0%|          | 0/6 [00:00<?, ?obj/s]

   

Downloading data files #2:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/6 [00:00<?, ?obj/s]

  

Downloading data files #3:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #8:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #6:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #5:   0%|          | 0/6 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/6 [00:00<?, ?obj/s]

  

Downloading data files #12:   0%|          | 0/5 [00:00<?, ?obj/s]

 

Downloading data files #9:   0%|          | 0/6 [00:00<?, ?obj/s]

 

Downloading data files #10:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/5 [00:00<?, ?obj/s]

 

Downloading data files #14:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/5 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/5 [00:00<?, ?obj/s]

Extracting data files:   0%|          | 0/90 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to /root/.cache/huggingface/datasets/audiofolder/LibriSpeech-9078579ce3b163a2/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


In [13]:
# attach the reference transcription ('txt') to every example of both subsets
librispeech_clean, librispeech_other = (
    subset.map(map_to_ground_truth)
    for subset in (librispeech_clean, librispeech_other)
)

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2939 [00:00<?, ?ex/s]

In [14]:
# derive a noisy and a downsampled variant from the clean subset
librispeech_clean_noisy, librispeech_clean_downsampled = (
    librispeech_clean.map(perturb)
    for perturb in (add_noise_to_dataset, downsample_dataset)
)

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2620 [00:00<?, ?ex/s]

In [15]:
# Load the pretrained checkpoint once; map_to_pred reads `tokenizer` and `model` as globals.
tokenizer, model = load_wav2vec_model("facebook/wav2vec2-base-960h")

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def _transcribe(subset):
    """adds a 'transcription' column via map_to_pred and drops the 'audio' column"""
    return subset.map(map_to_pred, remove_columns=["audio"])


ls_clean_result = _transcribe(librispeech_clean)
ls_other_result = _transcribe(librispeech_other)
ls_clean_noisy_result = _transcribe(librispeech_clean_noisy)
ls_clean_downsampled_result = _transcribe(librispeech_clean_downsampled)

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2939 [00:00<?, ?ex/s]

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2620 [00:00<?, ?ex/s]

In [18]:
# one line per evaluated condition: label, WER in percent, unit
for _label, _result in (
    ('WER: wav2vec2-base-960h, ls-test-clean:', ls_clean_result),
    ('WER: wav2vec2-base-960h, ls-test-other:', ls_other_result),
    ('WER: wav2vec2-base-960h, ls-test-clean, noisy:', ls_clean_noisy_result),
    ('WER: wav2vec2-base-960h, ls-test-clean, downsampled:', ls_clean_downsampled_result),
):
    print(_label, format_wer(_result["txt"], _result["transcription"]), '%.')

WER: wav2vec2-base-960h, ls-test-clean: 3.4 %.
WER: wav2vec2-base-960h, ls-test-other: 9.3 %.
WER: wav2vec2-base-960h, ls-test-clean, noisy: 8.3 %.
WER: wav2vec2-base-960h, ls-test-clean, downsampled: 4.2 %.
