In [1]:
import sys
sys.path.append("../")

In [2]:
import toml
from src.dataset import AudioToTextDataLayer
from src.helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict, __ctc_decoder_predictions_tensor
from src.model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
from src.parts.features import audio_from_file
import torch
import random
import numpy as np
import time

In [3]:
def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels):
    """Transcribe one utterance and print the encoder/decoder latency.

    Args:
        audio_processor: callable invoked as ``audio_processor(audio, audio_len)``;
            element 0 of its result is fed to ``encoderdecoder``.
        encoderdecoder: callable mapping the features to log-probabilities.
        greedy_decoder: callable invoked as ``greedy_decoder(log_probs=...)``.
        audio: audio input forwarded to ``audio_processor``.
        audio_len: length information forwarded to ``audio_processor``.
        labels: vocabulary forwarded to ``__ctc_decoder_predictions_tensor``.

    Returns:
        None. The timing (milliseconds spent in ``encoderdecoder`` only) and the
        first hypothesis are printed.
    """
    def _sync():
        # Only synchronise when a CUDA runtime exists; an unconditional call
        # raises on CPU-only hosts. On GPU it makes the wall-clock timing
        # cover the queued kernels rather than just their launch.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        features = audio_processor(audio, audio_len)
        _sync()
        t0 = time.perf_counter()
        t_log_probs_e = encoderdecoder(features[0])
        _sync()
        t1 = time.perf_counter()
        t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

    hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
    print("INFERENCE TIME\t\t: {} ms".format((t1-t0)*1000.0))
    print("TRANSCRIPT\t\t:", hypotheses[0])


def inference(wav, model, model_toml, seed=42, cudnn_benchmark=False):
    """Load a Jasper checkpoint and transcribe a single audio file.

    Args:
        wav: path of the audio file handed to ``audio_from_file``.
        model: path of the checkpoint loaded with ``torch.load``.
        model_toml: path of the TOML model definition.
        seed: seed applied to ``random``, NumPy and PyTorch.
        cudnn_benchmark: value assigned to ``torch.backends.cudnn.benchmark``.

    Returns:
        None. The configuration, the inference time and the transcript are
        printed (the last two by ``run_once``).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.benchmark = cudnn_benchmark
    print("CUDNN BENCHMARK ", cudnn_benchmark)

    optim_level = 0

    jasper_model_definition = toml.load(model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    # Note: this mutates the loaded definition in place, so the
    # 'optimization_level' entry also shows up in the model_config dump below.
    featurizer_config = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level

    print('=== model_config ===')
    print_dict(jasper_model_definition)
    print('=== feature_config ===')
    print_dict(featurizer_config)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    encoderdecoder = JasperEncoderDecoder(jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab))

    print("loading model from ", model)
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(model, map_location="cpu")
    # The checkpoint stores the preprocessor weights under an
    # "audio_preprocessor." prefix; re-key them so they match the bare module.
    for k in audio_preprocessor.state_dict().keys():
        checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
    audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
    encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)

    greedy_decoder = GreedyCTCDecoder()

    print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)

    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    audio, audio_len = audio_from_file(wav)

    # Pass the preprocessor built above (the original referenced the undefined
    # name `audio_processor`, which raised NameError).
    run_once(audio_preprocessor, encoderdecoder, greedy_decoder, audio, audio_len, ctc_vocab)

In [4]:
# Transcribe example1.wav with the epoch-179 Jasper checkpoint.
inference(
    wav="example1.wav",
    model="../models/Jasper_1612265229.5877585-epoch-179.pt",
    model_toml="../configs/jasper10x5dr_sp_offline_specaugment.toml",
    seed=42,
)

CUDNN BENCHMARK  False
=== model_config ===
Arguments:
	   encoder : {'activation': 'relu', 'convmask': True}
	     input : {'normalize': 'per_feature', 'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'window': 'hann', 'features': 64, 'n_fft': 512, 'frame_splicing': 1, 'dither': 1e-05, 'feat_type': 'logfbank', 'normalize_transcripts': True, 'trim_silence': True, 'pad_to': 16, 'max_duration': 15, 'speed_perturbation': True, 'cutout_rect_regions': 0, 'cutout_rect_time': 60, 'cutout_rect_freq': 25, 'cutout_x_regions': 2, 'cutout_y_regions': 2, 'cutout_x_width': 6, 'cutout_y_width': 6}
	input_eval : {'normalize': 'per_feature', 'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'window': 'hann', 'features': 64, 'n_fft': 512, 'frame_splicing': 1, 'dither': 1e-05, 'feat_type': 'logfbank', 'normalize_transcripts': True, 'trim_silence': True, 'pad_to': 16, 'optimization_level': 0}
	    jasper : [{'filters': 256, 'repeat': 1, 'kernel': [11], 'stride': [2], 'dila

In [15]:
import wave

import pyaudio

# Record a short mono clip from the default input device and save it as a WAV file.
CHUNK = 4024            # frames requested per stream.read() call
WIDTH = 2               # sample width in bytes
CHANNELS = 1
RATE = 16000            # sampling rate in Hz
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"   # destination of the recorded clip

p = pyaudio.PyAudio()
try:
    FORMAT = p.get_format_from_width(WIDTH)
    # Input-only stream: nothing is played back, so no output device is opened.
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")
        frames = []
        # Whole chunks only: the clip is slightly shorter than RECORD_SECONDS
        # whenever RATE * RECORD_SECONDS is not a multiple of CHUNK.
        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))

        data = b''.join(frames)
        print("* done")
    finally:
        # Release the device even if reading fails.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# WIDTH is already the per-sample byte count, so no PyAudio call is needed
# after the PortAudio session has been terminated.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(WIDTH)
    wf.setframerate(RATE)
    wf.writeframes(data)

* recording
* done
