In [1]:
import argparse
import os
import shlex
import subprocess
import sys
import wave

import numpy as np


In [2]:
from deepspeech import Model, printVersions
from timeit import default_timer as timer

In [3]:
# `quote` shell-escapes a single command-line argument. It lives in `shlex`;
# `pipes.quote` is kept only as a fallback for interpreters whose `shlex` does
# not provide it (the `pipes` module no longer exists in recent Python
# releases, so it must not be the primary source).
try:
    from shlex import quote
except ImportError:
    from pipes import quote


In [4]:
# Audio sample rate in Hz. convert_samplerate() asks SoX to resample to this
# rate and returns it alongside the samples.

SAMPLE_RATE = 16000
# These constants control the beam search decoder

# Beam width used in the CTC decoder when building candidate transcriptions
# (passed to the Model constructor)
BEAM_WIDTH = 500

# The alpha hyperparameter of the CTC decoder: language model weight
# (passed to enableDecoderWithLM)
LM_ALPHA = 0.75

# The beta hyperparameter of the CTC decoder: word insertion bonus
# (passed to enableDecoderWithLM)
LM_BETA = 1.85


# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training. Both are passed to the Model constructor.

# Number of MFCC features to use
N_FEATURES = 26

# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9


In [5]:
def convert_samplerate(audio_path):
    """Resample an audio file to ``SAMPLE_RATE`` Hz by piping it through SoX.

    SoX is asked to write headerless, mono, 16-bit signed little-endian PCM to
    stdout; those bytes are then viewed as an ``int16`` NumPy array.

    Args:
        audio_path: Path of the audio file handed to the ``sox`` executable.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple, where ``samples`` is a
        one-dimensional ``np.int16`` array built from SoX's stdout.

    Raises:
        RuntimeError: If SoX exits with a non-zero status.
        OSError: If the SoX process could not be started (reported as SoX not
            being found).
    """
    # Arguments are passed as a list, so no shell parsing is involved and the
    # path needs neither quoting nor a quote()/shlex.split() round trip.
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(SAMPLE_RATE),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write the converted audio to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr)) from e
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(SAMPLE_RATE, e.strerror)) from e

    return SAMPLE_RATE, np.frombuffer(output, np.int16)


def metadata_to_string(metadata):
    """Join the ``character`` attribute of every entry in ``metadata.items``.

    Returns the characters concatenated in iteration order, or an empty
    string when there are no items.
    """
    pieces = []
    for entry in metadata.items:
        pieces.append(entry.character)
    return ''.join(pieces)


class VersionAction(argparse.Action):
    """argparse action that calls ``printVersions()`` and exits with status 0.

    The action consumes no command-line values (``nargs=0``), so it behaves
    like a flag.
    """

    def __init__(self, *args, **kwargs):
        # Force nargs=0 so the option takes no argument.
        super(VersionAction, self).__init__(*args, nargs=0, **kwargs)

    def __call__(self, *args, **kwargs):
        printVersions()
        # sys.exit rather than the `exit` helper: the latter is installed by
        # the `site` module for interactive use and closes sys.stdin before
        # raising SystemExit.
        sys.exit(0)



In [8]:
# Directory holding the DeepSpeech model files. Set the DEEPSPEECH_MODEL_DIR
# environment variable to point somewhere else; the default is the location
# this notebook was originally run against.
MODEL_DIR = os.environ.get(
    'DEEPSPEECH_MODEL_DIR',
    '/home/absin/Downloads/deepspeech-0.5.1-models',
)

alphabet = os.path.join(MODEL_DIR, 'alphabet.txt')
model = os.path.join(MODEL_DIR, 'output_graph.pb')
lm = os.path.join(MODEL_DIR, 'lm.binary')
trie = os.path.join(MODEL_DIR, 'trie')

In [9]:
# Construct the DeepSpeech Model from the graph file and time how long it takes.
# Progress messages go to stderr.
print('Loading model from file {}'.format(model), file=sys.stderr)
model_load_start = timer()
ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
# Despite its name, this holds the elapsed seconds, not an end timestamp.
model_load_end = timer() - model_load_start
# '{:.3}' renders the duration with three significant digits.
print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)


Loading model from file /home/absin/Downloads/deepspeech-0.5.1-models/output_graph.pb
Loaded model in 0.0963s.


In [10]:
# Enable the language-model decoder on the loaded model (using the lm binary
# and trie files) and time how long it takes. Progress messages go to stderr.
print('Loading language model from files {} {}'.format(lm, trie), file=sys.stderr)
lm_load_start = timer()
ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
# Despite its name, this holds the elapsed seconds, not an end timestamp.
lm_load_end = timer() - lm_load_start
# '{:.3}' renders the duration with three significant digits.
print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

Loading language model from files /home/absin/Downloads/deepspeech-0.5.1-models/lm.binary /home/absin/Downloads/deepspeech-0.5.1-models/trie
Loaded language model in 0.159s.


In [24]:
# Debugging aid: list the attributes exposed by the loaded model object.
print(dir(ds))
# NOTE(review): this import sits outside the notebook's import cells, so any
# cell that uses `time` only works after this cell has been run.
import time

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_impl', 'enableDecoderWithLM', 'feedAudioContent', 'finishStream', 'finishStreamWithMetadata', 'intermediateDecode', 'setupStream', 'stt', 'sttWithMetadata']


In [42]:
from scipy.io import wavfile

# Recording to transcribe.
audio_path = '/home/absin/Downloads/RE16255e9fec360777c31418c2853372dc_1.wav'
fs, frames = wavfile.read(audio_path)

# Later cells treat the samples as SAMPLE_RATE Hz audio, so resample through
# SoX when the file was recorded at a different rate instead of feeding the
# model mismatched audio.
if fs != SAMPLE_RATE:
    print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs, SAMPLE_RATE), file=sys.stderr)
    fs, frames = convert_samplerate(audio_path)

In [45]:
# Length of the first axis of `frames`, i.e. how many sample frames were read.
frames.shape[0]

1518720

In [44]:
# Peek at the first 4000 samples, the same amount the streaming cell feeds per call.
frames[0:4000]

array([8, 7, 4, ..., 3, 4, 5], dtype=int16)

In [None]:
# Stream the recording through the model in fixed-size chunks, printing the
# intermediate transcription after every chunk and the final one at the end.
CHUNK_SIZE = 4000  # samples handed to feedAudioContent per call

total_samples = frames.shape[0]
stream_context = ds.setupStream()
# Use the same `timer` as the model-loading cells rather than relying on
# `time` having been imported by an unrelated cell.
start_time = timer()
for start in range(0, total_samples, CHUNK_SIZE):
    ds.feedAudioContent(stream_context, np.frombuffer(frames[start:start+CHUNK_SIZE], np.int16))
    # The last chunk may be shorter than CHUNK_SIZE; clamp so the reported
    # position never exceeds the length of the recording.
    fed_samples = min(start + CHUNK_SIZE, total_samples)
    print('Done '+str(fed_samples/SAMPLE_RATE)+":"+str(timer()-start_time))

    print("Recognized: %s" % ds.intermediateDecode(stream_context))
text = ds.finishStream(stream_context)
print("Recognized: %s" % text)

Done 0.25:0.0016019344329833984
Recognized: 
Done 0.5:0.0038461685180664062
Recognized: 
Done 0.75:0.23262929916381836
Recognized: 
Done 1.0:0.4796748161315918
Recognized: 
Done 1.25:0.7293710708618164
Recognized: 
