# ESPnet2 ASR Inference

## 1.1 Connect to Google Drive

In [9]:
# Mount Google Drive at /content/gdrive (this import only exists on Google Colab).
# Later cells read the corpus archives and CSV manifests from
# /content/gdrive/MyDrive/..., so this cell has to run first.
# force_remount=True is passed so that re-running the cell requests a fresh mount.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


## 1.2 Installation

In [None]:
!pip install -q espnet_model_zoo

In [2]:
import time
import torch
import string
import tarfile
import pandas as pd
import soundfile
import numpy as np
from csv import reader

In [4]:
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


## 1.3 Functions

In [5]:
def text_normalizer(text):
    """
    Upper-case ``text`` and delete every character found in ``string.punctuation``.

    Inputs:
    text: str- Raw transcript (ASR hypothesis or reference)
    Returns:
    str- The upper-cased transcript with ASCII punctuation removed
    """
    # Map each punctuation character to None so that translate() drops it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    uppercased = text.upper()
    return uppercased.translate(drop_punctuation)

In [6]:
def compute_wer(hyp_sentence="",ref_sentence=""):
    """
    Word error rate between an ASR hypothesis and its reference, as a percentage.

    The sentences are split on whitespace and the word-level edit distance
    (substitutions, insertions and deletions all cost 1) is divided by the
    number of reference words.

    Inputs:
    hyp_sentence: str- Sentence of text from the ASR Hypothesis
    ref_sentence: str- Sentence of text from the Ground Truth Reference
    Returns:
    wer_score: float- WER Score in percent, rounded to two decimal places.
               An empty reference gives 0.0 when the hypothesis is empty too,
               and float("inf") otherwise (every hypothesis word is an
               insertion and there are no reference words to normalise by).
    """
    hyp_words = hyp_sentence.split()
    ref_words = ref_sentence.split()

    # Guard the division below: with no reference words the ratio is undefined.
    if not ref_words:
        return float("inf") if hyp_words else 0.0

    # Rolling-row edit distance: prev_row[j] is the distance between the
    # hypothesis words consumed so far and the first j reference words.
    prev_row = list(range(len(ref_words) + 1))
    for i, hyp_word in enumerate(hyp_words, start=1):
        curr_row = [i]  # i hypothesis words vs. an empty reference prefix
        for j, ref_word in enumerate(ref_words, start=1):
            if hyp_word == ref_word:
                curr_row.append(prev_row[j - 1])
            else:
                curr_row.append(1 + min(prev_row[j], curr_row[j - 1], prev_row[j - 1]))
        prev_row = curr_row

    return round(100.0 * prev_row[-1] / len(ref_words), 2)

## 1.4 Parameters

In [7]:
#@title Choose English ASR model { run: "auto" }
# Pick the values in the Colab form on the right, or edit the assignments below.
# The trailing #@param annotations define that form and must stay on the same
# line as the assignment they describe.

# Presumably the audio sampling rate in Hz; no other cell shown here reads it — TODO confirm.
fs = 16000 #@param {type:"integer"}
# Model identifier handed to ModelDownloader.download_and_unpack() in the next cell.
tag = 'kamo-naoyuki/wsj' #@param ["Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave", "kamo-naoyuki/librispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_bpe5000_scheduler_confwarmup_steps40000_optim_conflr0.0025_sp_valid.acc.ave", "kamo-naoyuki/wsj"] {type:"string"}

In [9]:
# Fetch the pretrained model selected by `tag` and build the `speech2text`
# callable that the evaluation cells below apply to each waveform.
d = ModelDownloader()
speech2text = Speech2Text(
    # The result of download_and_unpack(tag) is expanded into keyword arguments.
    **d.download_and_unpack(tag),
    # device="cuda",  # uncomment to request decoding on a GPU (assumes one is available)
    # Decoding options below; their exact semantics are defined by
    # espnet2's Speech2Text — see its documentation before changing them.
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    # Only the top hypothesis is requested; later cells read nbests[0] only.
    nbest=1
)

## 2.1 Unzip Data

In [None]:
# Change into the Drive folder that holds the corpus archives: the two untar
# cells below open them by bare filename, i.e. relative to this directory.
%cd /content/gdrive/MyDrive/CMU/18781_Speech_Rec/data

/content/gdrive/MyDrive/CMU/18781_Speech_Rec/data


In [None]:
# Untar LDC93S6B (csr_1_senn) (wsj0) into /content.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes (or fails) instead of staying open in the kernel.
filename = "LDC93S6B.tgz"
with tarfile.open(filename) as tf:
    tf.extractall('/content')

In [None]:
# Untar LDC94S13B (csr_senn) (wsj1) into /content.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes (or fails) instead of staying open in the kernel.
filename = "LDC94S13B.tgz"
with tarfile.open(filename) as tf:
    tf.extractall('/content')

## 2.2 Read Data and Evaluate ASR

### 2.2.1 WSJ

In [None]:
# LDC93S6B (csr_1_senn) (wsj0)
# Decode every utterance listed in the manifest and print its reference,
# hypothesis and per-utterance WER. The first CSV column of each row is expected
# to hold three '|'-separated fields: audio path, utterance id, reference text.
with open('/content/gdrive/MyDrive/CMU/18781_Speech_Rec/csr_1_senn.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline); skip those instead of failing on row[0].
        if not row:
            continue
        info = row[0]
        # maxsplit=2 keeps any further '|' inside the reference text, and the id
        # is bound to `utt_id` so the builtin `id` is not shadowed in the kernel.
        path, utt_id, ref_text = info.split('|', 2)
        # Drop the first four '/'-separated components of the manifest path and
        # re-root the remainder under the local extraction directory.
        path = "/".join(path.split('/')[4:])
        wav_path = "/content/csr_1_senn/" + path
        # NOTE(review): `rate` is not compared with the sampling rate the model
        # expects — confirm the corpus audio matches it.
        speech, rate = soundfile.read(wav_path)

        # Keep only the text of the best hypothesis; the remaining items of the
        # n-best entry are discarded.
        nbests = speech2text(speech)
        hyp_text, *_ = nbests[0]

        # Normalise both sides identically so case and punctuation do not count as errors.
        hyp_text = text_normalizer(hyp_text)
        ref_text = text_normalizer(ref_text)

        print(f"Input Speech: {wav_path}")
        print(f"Reference text: {ref_text}")
        print(f"ASR hypothesis: {hyp_text}")
        print(f"Score: {compute_wer(hyp_text, ref_text)}")
        print("*" * 50)

In [None]:
# LDC94S13B (csr_senn) (wsj1)
# Decode every utterance listed in the manifest and print its reference,
# hypothesis and per-utterance WER. The first CSV column of each row is expected
# to hold three '|'-separated fields: audio path, utterance id, reference text.
with open('/content/gdrive/MyDrive/CMU/18781_Speech_Rec/csr_senn.csv', 'r') as read_obj:
    csv_reader = reader(read_obj)
    for row in csv_reader:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline); skip those instead of failing on row[0].
        if not row:
            continue
        info = row[0]
        # maxsplit=2 keeps any further '|' inside the reference text, and the id
        # is bound to `utt_id` so the builtin `id` is not shadowed in the kernel.
        path, utt_id, ref_text = info.split('|', 2)
        # Drop the first four '/'-separated components of the manifest path and
        # re-root the remainder under the local extraction directory.
        path = "/".join(path.split('/')[4:])
        wav_path = "/content/csr_senn/" + path
        # NOTE(review): `rate` is not compared with the sampling rate the model
        # expects — confirm the corpus audio matches it.
        speech, rate = soundfile.read(wav_path)

        # Keep only the text of the best hypothesis; the remaining items of the
        # n-best entry are discarded.
        nbests = speech2text(speech)
        hyp_text, *_ = nbests[0]

        # Normalise both sides identically so case and punctuation do not count as errors.
        hyp_text = text_normalizer(hyp_text)
        ref_text = text_normalizer(ref_text)

        print(f"Input Speech: {wav_path}")
        print(f"Reference text: {ref_text}")
        print(f"ASR hypothesis: {hyp_text}")
        print(f"Score: {compute_wer(hyp_text, ref_text)}")
        print("*" * 50)

### 2.2.2 Single Audio File

In [19]:
# Example: decode one audio file and print the normalised hypothesis.
# NOTE(review): the path below is an absolute, machine-specific location (unlike
# the /content paths used by the corpus cells) — adjust it before running elsewhere.
speech, rate = soundfile.read("/home/ubuntu/mnt/jl/ID-DEID/data/wav/4k0/4k0a010a.wv1.wav")
nbests = speech2text(speech)

# Keep only the text of the best hypothesis.
hyp_text, *_ = nbests[0]
# Scoring is switched off in this cell. To enable it, set ref_text to the
# transcript of the file being decoded and uncomment the ref_text lines and the
# two prints below. The sample sentences here are placeholders from other utterances.
# ref_text = "IT WILL NOT BE SAFE FOR YOU TO STAY HERE NOW"
# ref_text = "IT WILL BE NO DISAPPOINTMENT TO ME"

hyp_text = text_normalizer(hyp_text)
# ref_text = text_normalizer(ref_text)

# print(f"Reference text: {ref_text}")
print(f"ASR hypothesis: {hyp_text}")
# print(f"Score: {compute_wer(hyp_text, ref_text)}")


ASR hypothesis: PRICES COULD FALL FURTHER BARRING A PERSIAN GULF WAR
