In [2]:
# Mount Google Drive at /content/drive; the evaluation code later in this
# notebook reads its data from, and writes its results to, paths under that
# mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install deepspeech-gpu

Collecting deepspeech-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/59/ff/f0a16fabbad933f1e5eca297840d0ed3582ee9ddb0ac996823cca7a61c1a/deepspeech_gpu-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (22.3MB)
[K     |████████████████████████████████| 22.3MB 1.2MB/s 
Installing collected packages: deepspeech-gpu
Successfully installed deepspeech-gpu-0.9.3


In [3]:
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   652  100   652    0     0   3134      0 --:--:-- --:--:-- --:--:--  3119
100  180M  100  180M    0     0  37.2M      0  0:00:04  0:00:04 --:--:-- 47.0M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   654  100   654    0     0   3388      0 --:--:-- --:--:-- --:--:--  3371
100  909M  100  909M    0     0  42.2M      0  0:00:21  0:00:21 --:--:-- 44.7M


In [7]:
!apt -qq install -y sox

The following additional packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3
Suggested packages:
  file libsox-fmt-all
The following NEW packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3 sox
0 upgraded, 8 newly installed, 0 to remove and 15 not upgraded.
Need to get 760 kB of archives.
After this operation, 6,717 kB of additional disk space will be used.
Selecting previously unselected package libopencore-amrnb0:amd64.
(Reading database ... 145480 files and directories currently installed.)
Preparing to unpack .../0-libopencore-amrnb0_0.1.3-2.1_amd64.deb ...
Unpacking libopencore-amrnb0:amd64 (0.1.3-2.1) ...
Selecting previously unselected package libopencore-amrwb0:amd64.
Preparing to unpack .../1-libopencore-amrwb0_0.1.3-2.1_amd64.deb ...
Unpacking libopencore-amrwb0:amd64 (0.1.3-2.1) ...
Selecting previously unselected pa

In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function

import argparse
import numpy as np
import shlex
import subprocess
import sys
import wave
import json
from os.path import join

from deepspeech import Model, version
from timeit import default_timer as timer
import time

# `quote` shell-escapes a single argument. Prefer the `shlex` implementation
# and fall back to `pipes.quote` only where `shlex.quote` is unavailable.
# (The module name was previously misspelled, so the fallback always ran.)
try:
    from shlex import quote
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    """Resample an audio file with SoX and return it as 16-bit samples.

    SoX is asked to write raw, mono, signed 16-bit little-endian audio at
    ``desired_sample_rate`` to its standard output, which is captured.

    Args:
        audio_path: Path of the audio file handed to SoX.
        desired_sample_rate: Target sample rate passed to SoX's ``--rate``.

    Returns:
        A ``(desired_sample_rate, samples)`` tuple where ``samples`` is a
        NumPy ``int16`` array built from SoX's output.

    Raises:
        RuntimeError: SoX exited with a non-zero status.
        OSError: The SoX process could not be started.
    """
    command_template = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '
    # The path is shell-quoted before formatting so that shlex.split turns it
    # back into exactly one argument, even when it contains spaces.
    argv = shlex.split(command_template.format(quote(audio_path), desired_sample_rate))

    try:
        raw_pcm = subprocess.check_output(argv, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as sox_failure:
        raise RuntimeError('SoX returned non-zero status: {}'.format(sox_failure.stderr))
    except OSError as launch_failure:
        raise OSError(launch_failure.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, launch_failure.strerror))

    samples = np.frombuffer(raw_pcm, np.int16)
    return desired_sample_rate, samples


def metadata_to_string(metadata):
    """Concatenate the ``text`` of every token in ``metadata.tokens``."""
    pieces = [token.text for token in metadata.tokens]
    return ''.join(pieces)


def words_from_candidate_transcript(metadata):
    """Group the character tokens of a transcript into timed words.

    Args:
        metadata: Object exposing ``tokens``, a sequence whose items have a
            ``text`` attribute (one character) and a ``start_time`` attribute.

    Returns:
        A list of dicts with keys ``"word"``, ``"start_time"`` and
        ``"duration"`` (both times rounded to 4 decimals). An entry is emitted
        at every space token and at the final token.
    """
    tokens = metadata.tokens
    final_position = len(tokens) - 1

    collected = []
    current = ""
    started_at = 0

    for position, token in enumerate(tokens):
        is_space = token.text == " "

        if not is_space:
            if not current:
                # First character of a new word: remember when it began.
                started_at = token.start_time
            current += token.text

        # A word ends at a space or at the very last token.
        if is_space or position == final_position:
            # Duration runs up to the boundary token's start; never negative.
            elapsed = max(token.start_time - started_at, 0)
            collected.append({
                "word": current,
                "start_time": round(started_at, 4),
                "duration": round(elapsed, 4),
            })
            current = ""
            started_at = 0

    return collected


def metadata_json_output(metadata):
    """Serialise every candidate transcript in ``metadata`` to a JSON string.

    Each entry of ``metadata.transcripts`` becomes an object holding its
    ``confidence`` and the word list produced by
    ``words_from_candidate_transcript``. The result is indented by 2 spaces.
    """
    candidates = []
    for candidate in metadata.transcripts:
        candidates.append({
            "confidence": candidate.confidence,
            "words": words_from_candidate_transcript(candidate),
        })
    return json.dumps({"transcripts": candidates}, indent=2)



class VersionAction(argparse.Action):
    """argparse action that prints the DeepSpeech version and then exits."""

    def __init__(self, *args, **kwargs):
        # nargs=0 makes this a pure flag: it consumes no command-line values.
        super().__init__(*args, nargs=0, **kwargs)

    def __call__(self, *args, **kwargs):
        version_string = version()
        print('DeepSpeech ', version_string)
        exit(0)


In [12]:
def _load_audio(wav_file_path, desired_sample_rate):
    """Return the int16 samples of a WAV file at ``desired_sample_rate``.

    A file whose frame rate already matches is read directly; any other file
    is resampled through ``convert_samplerate``.
    """
    # The context manager closes the WAV handle even if resampling raises.
    with wave.open(wav_file_path, 'rb') as fin:
        if fin.getframerate() != desired_sample_rate:
            _, audio = convert_samplerate(wav_file_path, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    return audio


def evaluate_model(model_file_path, scorer_file_path,
                   data_root='/content/drive/MyDrive/Speech2Pickup',
                   recordings_per_line=2):
    """Transcribe the test recordings with DeepSpeech and report exact-match accuracy.

    For every index in the ``arr_1`` array of ``<data_root>/divide_img_idx.npz``
    the script ``<data_root>/train_script/<idx, 4 digits>.txt`` is read. On each
    line the first two whitespace-separated fields are parsed as float32 values
    and the remaining fields, joined by single spaces, form the reference text.
    Each line is paired with ``recordings_per_line`` consecutive WAV files named
    ``<data_root>/test_speech_data/<n>.wav``, where ``n`` is a running counter
    starting at 1. A recording counts as correct only when the lower-cased
    transcript equals the reference text exactly.

    Results are written under ``data_root``:
      * ``ASR_deepspeech2_text2pickup_evaluate.npz`` holds ``img_idxs``,
        ``pos_outputs``, ``real_text_inputs`` and ``STT_text_inputs``.
      * ``ASR_deepspeech2_STT_time_evaluate.npz`` holds ``times``, the
        per-recording inference durations in seconds.

    Args:
        model_file_path: Path to the DeepSpeech model file.
        scorer_file_path: Path to an external scorer; a falsy value skips it.
        data_root: Directory containing the index file, scripts and recordings.
        recordings_per_line: Number of WAV files associated with each script
            line (presumably one per speaker/take -- TODO confirm).
    """
    print('Loading model from file {}'.format(model_file_path))
    model_load_start = timer()
    # sphinx-doc: python_ref_model_start
    ds = Model(model_file_path)
    # sphinx-doc: python_ref_model_stop
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    desired_sample_rate = ds.sampleRate()

    if scorer_file_path:
        print('Loading scorer from files {}'.format(scorer_file_path))
        scorer_load_start = timer()
        ds.enableExternalScorer(scorer_file_path)
        scorer_load_end = timer() - scorer_load_start
        print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    # Data paths
    script_dir_path = join(data_root, 'train_script')
    wav_dir_path = join(data_root, 'test_speech_data')
    # Read the index array inside a context manager so the archive is closed.
    with np.load(join(data_root, 'divide_img_idx.npz')) as npzfile:
        test_img_idx = npzfile['arr_1']

    # Accumulators
    img_idxs = []
    pos_outputs = []
    real_text_inputs = []
    STT_text_inputs = []
    total_data_count = 0
    STT_correct_count = 0
    STT_error_data = []
    times = []

    for idx in test_img_idx:
        script_file = '%04d.txt' % idx

        with open(join(script_dir_path, script_file), 'r') as curr_file:
            curr_file_lines = curr_file.readlines()

        for line in curr_file_lines:
            fields = line.split()
            pos_output = np.asarray(fields[:2], dtype=np.float32)
            real_text_input = ' '.join(fields[2:])

            for _ in range(recordings_per_line):
                total_data_count += 1
                print('Processing {} data'.format(total_data_count))

                wav_file_path = join(wav_dir_path, '{}.wav'.format(total_data_count))
                audio = _load_audio(wav_file_path, desired_sample_rate)

                # Time only the recognizer call, using the same timer() as the
                # load timings above.
                inference_start = timer()
                STT_text_input = ds.stt(audio)
                times.append(timer() - inference_start)
                STT_text_input = STT_text_input.lower()

                img_idxs.append(idx)
                pos_outputs.append(pos_output)
                real_text_inputs.append(real_text_input)
                STT_text_inputs.append(STT_text_input)

                if real_text_input != STT_text_input:
                    STT_error_data.append((real_text_input, STT_text_input))
                    print(STT_error_data[-1])
                else:
                    STT_correct_count += 1

    # Guard against an empty test set instead of raising ZeroDivisionError.
    if total_data_count:
        accuracy = (STT_correct_count/total_data_count)*100
    else:
        accuracy = 0.0
    print('Total number of data: {}'.format(total_data_count))
    print('Total number of correct STT data: {}'.format(STT_correct_count))
    print('STT accuracy: {}%'.format(accuracy))
    print(STT_error_data)
    np.savez_compressed(join(data_root, 'ASR_deepspeech2_text2pickup_evaluate'), img_idxs=np.asarray(img_idxs), pos_outputs=np.asarray(pos_outputs), real_text_inputs=real_text_inputs, STT_text_inputs=STT_text_inputs)

    times = np.asarray(times)
    np.savez_compressed(join(data_root, 'ASR_deepspeech2_STT_time_evaluate'), times=times)

In [13]:
# Model and scorer files, looked up relative to the current working directory.
model_file_path = 'deepspeech-0.9.3-models.pbmm'
scorer_file_path = 'deepspeech-0.9.3-models.scorer'

# Run the full evaluation with the scorer enabled.
evaluate_model(
    model_file_path=model_file_path,
    scorer_file_path=scorer_file_path,
)

Loading model from file deepspeech-0.9.3-models.pbmm
Loading scorer from files deepspeech-0.9.3-models.scorer
Processing 1 data


Loaded model in 0.0116s.
Loaded scorer in 0.000266s.


Processing 2 data
Processing 3 data
Processing 4 data
Processing 5 data
('pick up the right blue block', 'pick up the right blue black')
Processing 6 data
Processing 7 data
Processing 8 data
Processing 9 data
Processing 10 data
Processing 11 data
Processing 12 data
Processing 13 data
Processing 14 data
Processing 15 data
Processing 16 data
Processing 17 data
Processing 18 data
Processing 19 data
Processing 20 data
Processing 21 data
Processing 22 data
Processing 23 data
Processing 24 data
Processing 25 data
Processing 26 data
Processing 27 data
Processing 28 data
Processing 29 data
Processing 30 data
Processing 31 data
Processing 32 data
Processing 33 data
Processing 34 data
Processing 35 data
('pick up the block on the rightmost bottom', 'pick up the block on the right most bottom')
Processing 36 data
('pick up the block on the rightmost bottom', 'pick up the block on the right most bottom')
Processing 37 data
Processing 38 data
Processing 39 data
Processing 40 data
Processing 41 data