In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install matplotlib>=3.3.2
!apt-get install libsox-fmt-all libsox-dev sox > /dev/null
!pip install torchaudio
!python -m pip install git+https://github.com/facebookresearch/WavAugment.git > /dev/null
!pip install wandb

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

# install beam search decoder
!apt-get install -y swig
!git clone https://github.com/NVIDIA/NeMo -b "$BRANCH"
!cd NeMo && bash scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh

%rm -rf asr
!git clone https://github.com/alexjercan/asr-toolkit.git asr > /dev/null

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()
from IPython.display import clear_output
clear_output()

In [1]:
import os
import re
import wget
import gzip
import shutil

import nemo
import nemo.collections.asr as nemo_asr
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import augment
import torchaudio
import torchaudio.datasets
from torchaudio.datasets.librispeech import load_librispeech_item
from pathlib import Path
from google.colab import files

from datetime import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt

from asr.metrics import ASRMetricFunction, CTCLossFunction
from asr.visualisation import play_audio, print_err_html, print_stats, plot_waveform
from asr.general import set_parameter_requires_grad, load_checkpoint, save_checkpoint, tensors_to_device, tensor_to_string
from asr.models import BeamSearchDecoderWithLM
from asr.datasets import LibriSpeechBookDataset
from IPython.display import YouTubeVideo, clear_output
clear_output()

# Notebook-wide configuration: compute device, pretrained model name,
# language-model file and dataset root directory.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = 'stt_en_jasper10x5dr'
LM_3GRAM_PATH = '3-gram.arpa'
ROOT = "."

# Report the torch version together with the GPU properties (or 'CPU' when no GPU is present).
print(
    'Setup complete. Using torch %s %s'
    % (
        torch.__version__,
        torch.cuda.get_device_properties(0) if DEVICE == "cuda" else 'CPU',
    )
)

Setup complete. Using torch 1.9.0+cu102 _CudaDeviceProperties(name='Tesla K80', major=3, minor=7, total_memory=11441MB, multi_processor_count=13)


In [4]:
class LibriSpeechBookDataset(torchaudio.datasets.LIBRISPEECH):
    """LibriSpeech regrouped so that every item is a whole book instead of one utterance.

    The chapter metadata in ``CHAPTERS.TXT`` is filtered to the requested subset and
    grouped by book id; titles come from ``BOOKS.TXT``. Indexing returns the
    concatenated audio and the joined transcript of every utterance of one book.

    NOTE(review): this notebook also imports a class with the same name from
    ``asr.datasets``; this definition shadows that import once the cell has run.

    Args:
        root: Directory that contains (or will receive) the dataset.
        url: Subset name or URL; its basename is matched against the SUBSET column.
        folder_in_archive: Top-level dataset folder below ``root``.
        download: Forwarded to ``torchaudio.datasets.LIBRISPEECH``.
    """

    def __init__(self, root, url, folder_in_archive="LibriSpeech", download=False):
        super().__init__(root, url, folder_in_archive, download)

        # Map chapter id (directory name) -> chapter directory, two levels below the subset path.
        chapter_dirs = {p.stem: str(p) for p in Path(self._path).glob('*/*/')}

        chapter_columns = ["ID", "READER", "MINUTES", "SUBSET", "PROJ.", "BOOK ID", "CH. TITLE", "PROJECT TITLE"]
        chapter_converters = {"BOOK ID": str.strip, "SUBSET": str.strip, "CH. TITLE" : str.strip, "PROJECT TITLE" : str.strip}
        chapters_file = os.path.join(root, folder_in_archive, "CHAPTERS.TXT")
        chapters = pd.read_csv(chapters_file, delimiter='|', comment=';', names=chapter_columns, converters=chapter_converters)

        # Keep only the chapters of this subset and collapse them to one row per book.
        by_book = chapters[chapters["SUBSET"] == os.path.basename(url)].groupby("BOOK ID")
        books = pd.DataFrame({"CHAPTERS": by_book["ID"].apply(list), "MINUTES": by_book["MINUTES"].apply(sum)})
        books['CHAPTER_PATH'] = books["CHAPTERS"].apply(
            lambda chapter_ids: [chapter_dirs[str(chapter_id)] for chapter_id in chapter_ids]
        )
        books = books.reset_index(level=0)
        books = books.astype({"BOOK ID": 'object'})

        title_columns = ["BOOK ID", "BOOK TITLE"]
        title_converters = {"BOOK ID": str.strip, "BOOK TITLE": str.strip}
        books_file = os.path.join(root, folder_in_archive, "BOOKS.TXT")
        titles = pd.read_csv(books_file, delimiter='|', comment=';', names=title_columns, converters=title_converters, usecols=title_columns)

        # Inner join: books without a BOOKS.TXT entry are dropped.
        # The parent class's walker is replaced by the per-book table.
        self._walker = pd.merge(books, titles, how="inner", on="BOOK ID")

    def __getitem__(self, n):
        """Return ``(waveform, transcript, book_title, minutes)`` for the n-th book.

        The waveform is the concatenation (along dim 1) of every utterance of the
        book and the transcript is the space-joined text of the same utterances.
        """
        row = self._walker.iloc[n]

        # Directory listings come back in filesystem-dependent order, so the files of
        # each chapter are sorted to make the assembled audio/text deterministic.
        audiofileids = [
            audio_path.stem
            for chapterpath in row["CHAPTER_PATH"]
            for audio_path in sorted(Path(chapterpath).glob('*' + self._ext_audio))
        ]
        items = [load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt) for fileid in audiofileids]

        waveforms, _, utterances, _, _, _ = zip(*items)
        return torch.cat(waveforms, dim=1), " ".join(utterances), row["BOOK TITLE"], row["MINUTES"]

In [6]:
def download_lm(lm_path, base_url="https://www.openslr.org/resources/11"):
    """Download ``<base_url>/<basename(lm_path)>.gz`` and decompress it to ``lm_path``.

    The compressed archive is kept next to the result as ``<lm_path>.gz``. The
    decompressed data is first written to ``<lm_path>.part`` and only renamed to
    ``lm_path`` once decompression has succeeded, so a failed download or a
    corrupt archive never leaves a truncated file at ``lm_path``.

    Args:
        lm_path: Destination path of the decompressed language model.
        base_url: Location that serves the ``.gz`` archive.

    Raises:
        urllib.error.URLError: If the archive cannot be fetched.
        OSError: If the archive cannot be written or is not valid gzip data.
    """
    import urllib.request  # local import: the notebook's import cell does not provide it

    archive_path = f"{lm_path}.gz"
    partial_path = f"{lm_path}.part"
    archive_url = f"{base_url.rstrip('/')}/{os.path.basename(lm_path)}.gz"

    # Stream the archive to disk; an existing archive is overwritten.
    with urllib.request.urlopen(archive_url) as response, open(archive_path, "wb") as archive:
        shutil.copyfileobj(response, archive)

    try:
        with gzip.open(archive_path, "rb") as compressed, open(partial_path, "wb") as partial:
            shutil.copyfileobj(compressed, partial)
        # Publish the finished file in a single step.
        os.replace(partial_path, lm_path)
    finally:
        # Only present when decompression failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Pretrained CTC model, moved to the selected device.
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=MODEL_NAME, strict=False).to(DEVICE)

# Upper-cased model vocabulary; '<pad>' is appended last and BLANK is its index.
VOCABULARY = [symbol.upper() for symbol in model.decoder.vocabulary]
vocab = VOCABULARY + ['<pad>']
BLANK = len(vocab) - 1

# Symbol -> index and index -> symbol lookup tables.
DICTIONARY = dict(zip(vocab, range(len(vocab))))
LABELS = {index: symbol for symbol, index in DICTIONARY.items()}

# Book-level views of the three LibriSpeech splits (downloaded on first use).
train_dataset = LibriSpeechBookDataset(root=ROOT, url="train-clean-100", folder_in_archive="LibriSpeech", download=True)
dev_dataset = LibriSpeechBookDataset(root=ROOT, url="dev-clean", folder_in_archive="LibriSpeech", download=True)
test_dataset = LibriSpeechBookDataset(root=ROOT, url="test-clean", folder_in_archive="LibriSpeech", download=True)

# Fetch the 3-gram language model only when it is not already on disk.
if not os.path.exists(LM_3GRAM_PATH):
    download_lm(LM_3GRAM_PATH)

beam_search_lm = BeamSearchDecoderWithLM(
    vocab=VOCABULARY,
    beam_width=16,
    alpha=1.5, beta=1.5,
    lm_path=LM_3GRAM_PATH,
    # os.cpu_count() returns None when the count cannot be determined;
    # max(None, 1) would raise TypeError, so fall back to a single worker.
    num_cpus=os.cpu_count() or 1)
clear_output()

In [7]:
def get_best_transcriptions(transcriptions):
    """Return, for each entry of ``transcriptions``, element ``[0][1]`` of that entry.

    Each entry is expected to be an indexable sequence of pairs; the second
    member of the first pair is collected into the returned list.
    """
    return [candidates[0][1] for candidates in transcriptions]

In [21]:
# Collect the reference text, title and duration of every book in the training split.
loop = tqdm(train_dataset, position=0, leave=True)

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
records = []
for waveform, transcription, booktitle, duration in loop:
    records.append({"REAL TEXT": transcription, "BOOK TITLE": booktitle, "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv("train-clean-100.csv")
files.download("train-clean-100.csv")

# Summary statistics: words per book and the DURATION column.
print(df["REAL TEXT"].apply(lambda t: len(t.split(" "))).describe())
print(df["DURATION"].describe())

100%|██████████| 305/305 [03:13<00:00,  1.58it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count      305.000000
mean      3226.773770
std       2456.083601
min         78.000000
25%       1445.000000
50%       2792.000000
75%       4135.000000
max      19479.000000
Name: REAL TEXT, dtype: float64
count    305.000000
mean      19.675639
std       15.336142
min        0.540000
25%        9.170000
50%       17.020000
75%       25.110000
max      127.620000
Name: DURATION, dtype: float64


In [None]:
# Transcribe every book of the training split with the acoustic model + LM beam search.
model.eval()
loop = tqdm(train_dataset, position=0, leave=True)

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
records = []
for waveform, transcription, booktitle, duration in loop:
    # Keep the first row of the waveform tensor and re-add a leading batch dimension of one.
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # The third model output (greedy predictions) is not needed; beam search decodes instead.
        log_probs, encoded_len, _ = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": booktitle, "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv("train-clean-100.csv")
files.download("train-clean-100.csv")

In [None]:
# Transcribe every book of the test split with the acoustic model + LM beam search.
model.eval()
loop = tqdm(test_dataset, position=0, leave=True)

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
records = []
for waveform, transcription, booktitle, duration in loop:
    # Keep the first row of the waveform tensor and re-add a leading batch dimension of one.
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # The third model output (greedy predictions) is not needed; beam search decodes instead.
        log_probs, encoded_len, _ = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": booktitle, "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv("test-clean.csv")
files.download("test-clean.csv")

  0%|          | 0/54 [00:00<?, ?it/s][NeMo W 2021-08-16 11:09:03 patch_utils:50] torch.stft() signature has been updated for PyTorch 1.7+
    Please update PyTorch to remain compatible with later versions of NeMo.
    To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
      return torch.floor_divide(self, other)
    
 35%|███▌      | 19/54 [04:47<08:19, 14.28s/it]

In [None]:
# Transcribe every book of the dev split with the acoustic model + LM beam search.
model.eval()
loop = tqdm(dev_dataset, position=0, leave=True)

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
records = []
for waveform, transcription, booktitle, duration in loop:
    # Keep the first row of the waveform tensor and re-add a leading batch dimension of one.
    waveform = waveform[0].to(DEVICE).unsqueeze(0)
    valid_lengths = torch.tensor([waveform.shape[-1]], device=DEVICE)

    with torch.no_grad():
        # The third model output (greedy predictions) is not needed; beam search decodes instead.
        log_probs, encoded_len, _ = model(input_signal=waveform, input_signal_length=valid_lengths)
        transcriptions = beam_search_lm(log_probs=log_probs, log_probs_length=encoded_len)

    best_transcriptions = get_best_transcriptions(transcriptions)
    records.append({"TEXT": best_transcriptions[0], "REAL TEXT": transcription, "BOOK TITLE": booktitle, "DURATION": duration})

loop.close()
df = pd.DataFrame(records, columns=["TEXT", "REAL TEXT", "BOOK TITLE", "DURATION"])
df.to_csv("dev-clean.csv")
files.download("dev-clean.csv")