## Install Requirements

In [1]:
pip install datasets jiwer transformers colorednoise pyctcdecode https://github.com/kpu/kenlm/archive/master.zip

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
import pandas as pd
import numpy as np
import os
import kenlm
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor, AutoModelForCTC, AutoProcessor
from datasets import load_dataset
from jiwer import wer
import librosa
import nltk
import tarfile
import torch
import urllib.request
import soundfile as sf

nltk.download('punkt')

import audio_preprocess

[nltk_data] Downloading package punkt to /home/jw4169/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Required Functions

In [3]:
def download_and_extract_dataset_from_url(url, datasets_path):
    """
    downloads and extracts dataset from url into datasets_path/

    The archive is saved as datasets_path/<last '/'-separated component of url>,
    unpacked into datasets_path with tarfile, and then deleted again.

    Args:
        url: location of a tar archive (anything urllib can retrieve).
        datasets_path: directory to extract into; created if it does not exist.

    Raises:
        OSError / urllib.error.URLError: the archive could not be retrieved.
        tarfile.TarError: the retrieved file could not be read as a tar archive.
    """
    # urlretrieve cannot write into a directory that does not exist yet
    os.makedirs(datasets_path, exist_ok=True)
    temp = os.path.join(datasets_path, url.split('/')[-1])
    try:
        urllib.request.urlretrieve(url, temp)
        # context manager closes the archive even if extraction fails
        with tarfile.open(temp) as archive:
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            archive.extractall(datasets_path)
    finally:
        # never leave the (possibly partial) archive behind
        if os.path.exists(temp):
            os.remove(temp)

In [4]:
def map_to_ground_truth(batch):
    """
    inserts ground truth in dataset

    Reads the transcript file belonging to the audio file and stores the text
    of the matching line in batch['txt'].

    The audio path must end in 10 characters of the form '-NNNN.ext4'
    (separator, four-digit utterance number, dot, four-character extension);
    the transcript is expected at <path without those 10 chars>.trans.txt.
    NOTE(review): this looks like the LibriSpeech layout - confirm.

    Each transcript line is '<utterance id> <text>'; line NNNN (0-based) is
    used, so the transcript is assumed to be ordered and gap-free.

    Args:
        batch: mapping with batch['audio']['path'] set to the audio file path.

    Returns:
        The same batch object with the 'txt' key added.

    Raises:
        FileNotFoundError: the transcript file does not exist.
        ValueError: the four characters before the extension are not digits.
        IndexError: the transcript has no such line, or the line has no text.
    """
    audio_path = batch['audio']['path']
    transcription_file_path = audio_path[:-10] + '.trans.txt'
    # context manager so the file handle is released for every example
    with open(transcription_file_path, 'r') as f:
        lines = f.read().splitlines()
    # use all four digits of the utterance number; reading only the last two
    # would wrap around for utterances numbered 100 and above
    utterance_index = int(audio_path[-9:-5])
    batch['txt'] = lines[utterance_index].split(' ', 1)[1]
    return batch

In [5]:
def load_wav2vec_model(process_path: str):
    """
    load and return wav2vec processor and model from huggingface

    Args:
        process_path: identifier or path handed to from_pretrained for both
            the model and the processor.

    Returns:
        (processor, model) - note the order. The model is moved to the GPU
        when torch reports one, otherwise it stays on the CPU.
    """
    # fall back to the CPU instead of failing on machines without CUDA
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModelForCTC.from_pretrained(process_path).to(target_device)
    processor = AutoProcessor.from_pretrained(process_path)
    return processor, model

In [6]:
def map_to_pred(batch):
    """
    predicts transcription

    Uses the notebook-level `processor` and `model` globals, so both must be
    loaded before this function is mapped over a dataset.

    Args:
        batch: mapping whose batch["audio"]["array"] holds the waveform; it is
            passed to the processor with sampling_rate=16_000.

    Returns:
        The same batch object with the decoded text stored under
        "transcription".
    """
    #tokenize
    inputs = processor(batch["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
    # send the inputs to whichever device the model lives on, rather than
    # assuming CUDA is present
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    #take logits
    with torch.no_grad():
        logits = model(**inputs).logits

    # batch_decode gets the logits as a numpy array on the CPU; only the first
    # entry of its .text is kept
    transcription = processor.batch_decode(logits.cpu().numpy()).text[0]
    batch["transcription"] = transcription
    return batch
    

In [7]:
def add_noise_to_dataset(batch):
    """
    replaces the audio array of an example with its noisy version

    The array under batch['audio']['array'] is passed through
    audio_preprocess.add_noise and written back in place; the same batch
    object is returned.
    """
    audio = batch['audio']
    noisy_array = audio_preprocess.add_noise(audio['array'])
    audio['array'] = noisy_array
    return batch

In [8]:
def downsample_dataset(batch, output_sr=16000):
    """
    replaces the audio array of an example with its downsampled version

    The array under batch['audio']['array'] is passed, together with
    output_sr, to audio_preprocess.down_sample and written back in place;
    the same batch object is returned.
    """
    audio = batch['audio']
    resampled_array = audio_preprocess.down_sample(audio['array'], output_sr)
    audio['array'] = resampled_array
    return batch

In [9]:
def format_wer(text, transcription, decimal=1):
    """
    returns wer(text, transcription) as a percentage rounded to `decimal` places
    """
    error_rate = wer(text, transcription)
    percentage = 100 * error_rate
    return round(percentage, decimal)

## wav2vec 2.0 (with 4-gram LM) on Downsampled and Noisy Data

In [10]:
# use the GPU when torch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [17]:
# load_wav2vec_model returns (processor, model); both become the globals used by map_to_pred
processor, model = load_wav2vec_model(
    process_path="patrickvonplaten/wav2vec2-base-960h-4-gram",
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at patrickvonplaten/wav2vec2-base-960h-4-gram and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Please use `allow_patterns` and `ignore_patterns` instead.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import utils
from datasets import load_dataset, load_from_disk

# set paths for input/output
# NOTE(review): `root` is an absolute, machine-specific location, and neither
# datasets_path nor predictions_path is referenced by any other cell visible
# in this notebook - confirm whether they are still needed.
root = '/home'
datasets_path = os.path.join(root, 'datasets')
predictions_path = os.path.join(root, 'predictions')
# Disabled: folder creation for the two paths above.
# # create folders if they do not already exist
# if not os.path.exists(datasets_path): os.makedirs(datasets_path)
# if not os.path.exists(predictions_path): os.makedirs(predictions_path)

# Copy everything under the bucket's predictions/ prefix into the current
# working directory (-m: parallel transfer, -n: skip files that already exist,
# -r: recursive).
# NOTE(review): the load_from_disk calls further down use relative directory
# names, presumably the ones fetched here - confirm.
!gsutil -m cp -n -r gs://capstone_datasets/librispeech/test/predictions/* .

In [13]:
import utils
from datasets import load_dataset, load_from_disk

In [14]:
# Load one saved dataset per sample rate (Hz), keyed by that rate.
# NOTE(review): unlike the noise cell, no result columns are dropped here.

sr = [500, 1000, 2000, 4000, 8000, 16000]
datasets = dict()
for i in sr:
    saved_dir = 'lr_clean_test_ds_{}Hz_w2v2_base_960h'.format(i)
    datasets[i] = load_from_disk(saved_dir)

FileNotFoundError: Directory lr_clean_test_ds_500Hz_w2v2_base_960h not found

In [18]:
# noise levels as they appear in the saved directory names
noise = list(range(1, 7))
datasets_noise = dict()

# load each saved dataset and strip the result columns stored with it
for i in noise:
    noisy_ds = load_from_disk('lr_clean_test_ns_{}%_w2v2_base_960h'.format(i))
    datasets_noise[i] = noisy_ds.remove_columns(['logits', 'transcription', 'label'])

In [16]:
# transcribe every downsampled dataset, one sample rate at a time
results = dict()
for i in sr:
    print("Start eval on", i, "Hz")
    current_ds = datasets[i]
    results[i] = current_ds.map(map_to_pred, writer_batch_size=1000)


Start eval on 500 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 1000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 2000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 4000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 8000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 16000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

In [19]:
# transcribe every noisy dataset, one noise level at a time
results_noise = dict()
for i in noise:
    print("Start eval on", i, "% noisy Librispeech")
    current_ds = datasets_noise[i]
    results_noise[i] = current_ds.map(map_to_pred, writer_batch_size=1000)


Start eval on 1 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 2 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 3 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 4 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 5 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Start eval on 6 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

In [None]:
# report the word error rate (in percent) for each sample rate
for i in sr:
    wer_percent = format_wer(results[i]["ground_truth"], results[i]["transcription"])
    print('WER: wav2vec2.0+4gram, ls-test-clean-{}Hz:'.format(i), wer_percent, '%.')

In [21]:
# report the word error rate (in percent) for each noise level
for i in noise:
    wer_percent = format_wer(results_noise[i]["ground_truth"], results_noise[i]["transcription"])
    print('WER: wav2vec2.0+4gram, ls-test-clean-{}%:'.format(i), wer_percent, '%.')

WER: wav2vec2.0+4gram, ls-test-clean-1%: 6.0 %.
WER: wav2vec2.0+4gram, ls-test-clean-2%: 23.8 %.
WER: wav2vec2.0+4gram, ls-test-clean-3%: 50.0 %.
WER: wav2vec2.0+4gram, ls-test-clean-4%: 70.2 %.
WER: wav2vec2.0+4gram, ls-test-clean-5%: 82.5 %.
WER: wav2vec2.0+4gram, ls-test-clean-6%: 89.7 %.
