# ESPnet2 real streaming Transformer demonstration
Details in "Streaming Transformer ASR with Blockwise Synchronous Beam Search"
(https://arxiv.org/abs/2006.14941)

This notebook runs locally and demonstrates streaming ASR with a Transformer model using ESPnet2.

You can recognize either a recorded audio file or live (online) speech.

## Import packages
Make sure that you have installed the latest version of ESPnet.

In [1]:
import sys
from pathlib import Path
import espnet
from espnet2.bin.asr_inference_streaming import Speech2TextStreaming
import argparse
import numpy as np
import wave
import yaml

  from .autonotebook import tqdm as notebook_tqdm


## Define paths and prepare for inference

In [2]:
# Define paths. Everything is resolved relative to the notebook's working
# directory.
data_dir = Path("data")
# NOTE(review): exp_dir, feats_stats_path and token_path are not referenced
# again in the visible cells; they are kept because the training config or a
# later cell may rely on them -- confirm before removing.
exp_dir = Path("exp/asr_stats_raw_jp_word/train")
config_path = data_dir / "config.yaml"
model_path = data_dir / "valid.acc.best.pth"
feats_stats_path = data_dir / "feats_stats.npz"
token_path = data_dir / "tokens.txt"

# Load configuration.
# Decode explicitly as UTF-8 instead of the platform default encoding: the
# config presumably carries non-ASCII text (e.g. a Japanese token list), which
# would fail or be mis-decoded on systems whose default encoding is not UTF-8.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Initialize Speech2Text with local model.
# The constructor receives the config/model *paths* (as strings); the parsed
# `config` dict above is not passed to it.
speech2text = Speech2TextStreaming(
    asr_train_config=str(config_path),
    asr_model_file=str(model_path),
    token_type=None,
    bpemodel=None,
    maxlenratio=0.0,
    minlenratio=0.0,
    beam_size=20,
    ctc_weight=0.5,
    lm_weight=0.0,
    penalty=0.0,
    nbest=1,
    device="cpu",
    disable_repetition_detection=True,
    decoder_text_length_limit=0,
    encoded_feat_length_limit=0
)

## Define helper functions

In [3]:
# Number of rows drawn by the most recent progress_output() call.
prev_lines = 0


def progress_output(text):
    """Write the running transcript to stderr, wrapped into short rows.

    ``text`` is cut into rows of at most 101 characters (a row is closed once
    it holds more than 100 characters); an empty ``text`` still yields one
    empty row. Each row is preceded by a cursor-control prefix and written to
    ``sys.stderr``. The module-level ``prev_lines`` is then updated to the
    number of rows written.

    Args:
        text: The transcript string to display.
    """
    global prev_lines
    row_width = 101
    rows = [
        text[start:start + row_width]
        for start in range(0, len(text), row_width)
    ] or ['']

    for row_no, row in enumerate(rows):
        # The row whose index equals the previous row count starts on a fresh
        # line; every other row moves the cursor down one line (ESC[B) and
        # erases to the end of that line (ESC[K) before being redrawn.
        lead = '\n\r' if row_no == prev_lines else '\r\033[B\033[K'
        sys.stderr.write(lead)
        sys.stderr.write(row)

    prev_lines = len(rows)
    sys.stderr.flush()

In [4]:
def recognize(wavfile, sim_chunk_length=640):
    """Run simulated streaming recognition on a WAV file and display the text.

    The whole file is read, scaled by 1/32767 and fed to the module-level
    ``speech2text`` object in consecutive chunks of ``sim_chunk_length``
    samples with ``is_final=False``. The samples left over after the last
    full chunk (possibly none) are sent with ``is_final=True``. The best
    hypothesis after every call is rendered with ``progress_output``.

    Args:
        wavfile: Path of the WAV file to read. It must contain 16-bit PCM
            samples. If it has several channels, only the first is used.
        sim_chunk_length: Number of samples per simulated streaming chunk.
            A value <= 0 sends the whole signal in a single final call.

    Raises:
        ValueError: If the file does not use 16-bit samples.
    """
    # Use a separate name for the reader so the `wavfile` argument is not
    # shadowed by the open file object.
    with wave.open(wavfile, 'rb') as wav_reader:
        n_channels = wav_reader.getnchannels()
        sample_width = wav_reader.getsampwidth()
        buf = wav_reader.readframes(-1)

    # The buffer is decoded as int16 below, so any other sample width would
    # silently produce garbage audio.
    if sample_width != 2:
        raise ValueError(
            "recognize() expects 16-bit PCM audio, got %d-bit samples"
            % (8 * sample_width)
        )

    data = np.frombuffer(buf, dtype='int16')
    if n_channels > 1:
        # Samples of a frame are interleaved channel by channel; keep only
        # the first channel instead of treating the mix as one mono signal.
        data = data.reshape(-1, n_channels)[:, 0]

    # float32 represents every int16 value exactly; float16 (11-bit
    # significand) would quantise the waveform before it reaches the model.
    speech = data.astype(np.float32) / 32767.0

    if sim_chunk_length > 0:
        n_chunks = len(speech) // sim_chunk_length
        for idx in range(n_chunks):
            chunk = speech[idx * sim_chunk_length:(idx + 1) * sim_chunk_length]
            results = speech2text(speech=chunk, is_final=False)
            # No hypothesis yet -> show an empty transcript.
            progress_output(results[0][0] if results else "")

        # The tail is computed from n_chunks rather than the loop variable so
        # that clips shorter than one chunk (zero iterations) also work.
        results = speech2text(speech[n_chunks * sim_chunk_length:], is_final=True)
    else:
        results = speech2text(speech, is_final=True)

    # Each result is a (text, token, token_int, hyp) tuple; show the text of
    # the best one, or nothing if the recogniser returned no hypothesis.
    progress_output(results[0][0] if results else "")

## Recognize the audio file

In [7]:
# Audio file to transcribe; point this at your own WAV file.
wavfile = 'data/sample.wav'
recognize(wavfile)

[B[K(D す ー ) (F あ の ) し ま せ ん ふ ら ん す る よ り あ を 隣 で 探 し て い る ん で す け ど ー (F え っ と ー ) そ う だ な ちょ っ と 家 族
[B[K の 誕 生 日 用 に 使 う ん で す け ど ー (F え ー と ) に っ ぽ り の 近 く で 何 か い い 店 や り ま す か ねと ー ) そ う だ な ちょ っ と 家 族