In [7]:
import sys
import espnet
from espnet2.bin.asr_inference_streaming import Speech2TextStreaming
from espnet_model_zoo.downloader import ModelDownloader
import argparse
import numpy as np
import wave
import pyaudio
import os
import gradio as gr

In [16]:
# Model-zoo tag of the pretrained streaming ASR model to load.
tag        = "D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave"
d          = ModelDownloader()
# Fetch and unpack the model files; the returned mapping is expanded into
# keyword arguments of the recognizer constructor below.
model_info = d.download_and_unpack(tag)

# Initialize streaming ASR
speech2text = Speech2TextStreaming(
    **model_info,
    token_type="bpe",
    maxlenratio=0.0,
    minlenratio=0.0,
    beam_size=5,          # Beam width; presumably kept small for decoding speed - confirm
    ctc_weight=0.7,       # Weight of the CTC score in hybrid CTC/attention decoding
    lm_weight=0.3,        # NOTE(review): non-zero weight, although the earlier note here read "No external LM"; presumably ignored when model_info carries no LM - confirm, or set to 0.0
    penalty=0.0,
    nbest=1,              # Return only the best hypothesis
    device="cpu",        # Runs on CPU; no GPU auto-detection happens here
    disable_repetition_detection=True,  # NOTE(review): presumably switches off the decoder's repetition check (earlier note: "Avoid stuck loops") - confirm against ESPnet
)

# Announce that capture is about to start (printed before the device is opened).
print('recording ........')

# Capture parameters for the microphone stream.
CHUNK    = 5120 # frames read per loop iteration (alternative noted by the author: 2048)
FORMAT   = pyaudio.paInt16  # 16-bit signed integer samples
CHANNELS = 1  # mono
RATE     = 16000 # sampling rate in Hz

# Time per chunk
# ms = (chunk/rate) * 1000
# With the values above: 5120 / 16000 * 1000 = 320 ms of audio per chunk.

p = pyaudio.PyAudio()

# The explanatory text below uses generic example values (44100 Hz, 1024
# frames); this script actually opens the stream with RATE and CHUNK above.
'''
rate=RATE: Sampling rate in Hz (e.g., 44100 means 44100 samples per second).
How many samples captured per second
frames_per_buffer=CHUNK: How many audio samples per chunk are processed at a time.
A typical value is 1024.

input=True: Specifies that this is an input stream (microphone).
channels=CHANNELS: Number of audio channels (e.g., 1 for mono, 2 for stereo).
format=FORMAT: Specifies the data type (e.g., pyaudio.paInt16 for 16-bit integers)
'''

# Open the input (microphone) stream; it is read chunk by chunk further down.
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

# Segmenting
#   chunks per second  = RATE / CHUNK
#   chunks per segment = chunks per second * SEGMENT_SECONDS
# After CUT_AT_CHUNK chunks the current utterance is finalized and the
# recognizer is reset, so the next chunk starts a fresh hypothesis.
SEGMENT_SECONDS = 10

# Whole chunks per second (3 for 16000 / 5120). Kept for reference only:
# truncating here and multiplying afterwards would cut at 30 chunks (9.6 s).
CHUNKS_PER_SECOND = int(RATE/CHUNK)

# Multiply first, truncate last: int(3.125 * 10) = 31 chunks (~9.9 s).
# max(1, ...) guards against a zero cut-off when CHUNK is longer than a segment.
CUT_AT_CHUNK = max(1, int(RATE / CHUNK * SEGMENT_SECONDS))

chunk_counter = 0    # chunks fed to the recognizer in the current segment
final_flag = False   # True only for the chunk that closes a segment

try:
    while True:
        # Blocking read of CHUNK frames of 16-bit mono PCM from the microphone.
        raw = stream.read(CHUNK)
        samples = np.frombuffer(raw, dtype=np.int16)

        # Normalize to roughly [-1, 1]. float32 represents every int16 value
        # exactly; float16 (11-bit significand) would round samples whose
        # magnitude exceeds 2048 and add quantization noise before decoding.
        data = samples.astype(np.float32) / 32767.0

        chunk_counter += 1
        final_flag = chunk_counter >= CUT_AT_CHUNK

        # Feed ONLY the new chunk, also on the segment-closing call. The
        # recognizer keeps its own state between calls (the partial hypothesis
        # grows although a single chunk is passed each time), so passing
        # previously sent chunks again would make it decode that audio twice.
        #
        # results is a list of n-best entries; with nbest=1 it holds one tuple:
        #   (text, token, token_int, hypothesis object)
        results = speech2text(speech=data, is_final=final_flag)

        if final_flag:
            # Start the next segment from a clean recognizer state.
            chunk_counter = 0
            speech2text.reset()

        # Print the best transcription so far, skipping empty hypotheses.
        if results and results[0][0]:
            print(results[0][0])
finally:
    # The loop only ends through an exception (typically KeyboardInterrupt);
    # release the audio device either way so a re-run can reopen it.
    stream.stop_stream()
    stream.close()
    p.terminate()

Fetching 32 files: 100%|██████████| 32/32 [00:00<?, ?it/s]


recording ........
of the cla
of the cla
of the clas
of the clas
of the class i start
of the class i start to feel sc
of the class i start to feel sc
of the class i start to feel scared
of the class i start to feel scared
of the class i start to feel scared
of the class i start to feel scared
of the class i start to feel scared
of the class i start to feel scared my hand
of the class i start to feel scared my hand shak
of the class i start to feel scared my hand shak
of the class i start to feel scared my hand shake
of the class i start to feel scared my hand shake and my vo
of the class i start to feel scared my hand shake and my vo
of the class i start to feel scared my hand shake and my voice become
of the class i start to feel scared my hand shake and my voice become
of the class i start to feel scared my hand shake and my voice becomes by it
of the class i start to feel scared my hand shake and my voice becomes by it
of the class i start to feel scared my hand shake and my voice b

KeyboardInterrupt: 