# Isolated Word Recognition

In [1]:
import librosa
import os, sys
import numpy as np
import scipy
import cPickle as pickle
from sklearn.mixture import GaussianMixture

## Import Audio Files

In [2]:
DATA_DIR = '/Users/alexwang/Google Drive/CompSci/speech-recognition/data'

In [46]:
def read_file_list(filename):
    """Read a text file with one item per line."""
    items = []
    with open(filename, 'r') as f:
        for line in f:
            items.append(line.strip())
    return items

no_list_file_ids = read_file_list(os.path.join(DATA_DIR, 'nolist_train.txt'))
yes_list_file_ids = read_file_list(os.path.join(DATA_DIR, 'yeslist_train.txt'))

all_train_files = no_list_file_ids + yes_list_file_ids
print all_train_files

['no/no1', 'no/no2', 'no/no3', 'no/no4', 'no/no5', 'no/no6', 'no/no7', 'no/no8', 'no/no9', 'no/no10', 'yes/yes1', 'yes/yes2', 'yes/yes3', 'yes/yes4', 'yes/yes5', 'yes/yes6', 'yes/yes7', 'yes/yes8', 'yes/yes9', 'yes/yes10']


## Generate the MFCC vectors

In [4]:
def write_mfcc_file(filename, mfcc_array):
    """Write the mfcc vectors for that file"""
    # Create the enclosing directory if needed.
    directory = os.path.dirname(filename)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    with open(filename, "wb") as f:
        pickle.dump(mfcc_array, f, pickle.HIGHEST_PROTOCOL)
    return

def read_mfcc_file(filename):
    """Read back an array of mfcc vectors."""
    with open(filename, "rb") as f:
        mfcc_array = pickle.load(f)
    return mfcc_array

for file_id in all_train_files:
    filename = os.path.join(DATA_DIR, 'mfcc', file_id + '.pkl')
    y, sr = librosa.load(os.path.join(DATA_DIR, file_id + '.wav'), sr=None)
    mfcc_file = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=39).transpose() # I think n_mfcc specifies the dimension of the vector
    write_mfcc_file(filename, mfcc_file)

In [5]:
# Initialize GMM
gmm_no = GaussianMixture(covariance_type="diag", n_components=39)
gmm_yes = GaussianMixture(covariance_type="diag", n_components=39)

In [6]:
# Train the "no" GMM
for file_id in no_list_file_ids:
    mfcc_array = read_mfcc_file(os.path.join(DATA_DIR, 'mfcc', file_id + '.pkl')) # read the computed pkl files
    gmm_no.fit(mfcc_array)

In [18]:
# Train the "yes" GMM
for file_id in yes_list_file_ids:
    mfcc_array = read_mfcc_file(os.path.join(DATA_DIR, 'mfcc', file_id + '.pkl')) # read the computed pkl files
    gmm_yes.fit(mfcc_array)

In [19]:
# # we are currently using the training data to test the model as a sanity check
# i = 0
# for file_id in no_list_file_ids:
#     mfcc_array = read_mfcc_file(os.path.join(DATA_DIR, 'mfcc', file_id + '.pkl')) # read the computed pkl files
#     print "---------------"
#     print file_id
#     print gmm_no.score(mfcc_array)
#     print gmm_yes.score(mfcc_array)
#     print gmm_no.score(mfcc_array) > gmm_yes.score(mfcc_array)
#     i += 1

# print "***************"
# j=0
# for file_id in yes_list_file_ids:
#     mfcc_array = read_mfcc_file(os.path.join(DATA_DIR, 'mfcc', file_id + '.pkl')) # read the computed pkl files
#     print "---------------"
#     print file_id
#     print gmm_no.score(mfcc_array)
#     print gmm_yes.score(mfcc_array)
#     print gmm_no.score(mfcc_array) < gmm_yes.score(mfcc_array)
#     j += 1

## Making the Record Audio Interface

In [20]:
import pyaudio
import wave
from pydub import AudioSegment
from pydub.playback import play

In [21]:
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms

    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0
    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    return trim_ms

def trim_audio():
    sound = AudioSegment.from_file("data/output.wav", format="wav") # hard coding for now

    start_trim = detect_leading_silence(sound)
    end_trim = detect_leading_silence(sound.reverse())

    duration = len(sound)    
    trimmed_sound = sound[start_trim:duration-end_trim]
    
    trimmed_sound.export("data/output.wav", format="wav")
    play(trimmed_sound)

In [22]:
def record_audio():
    """PyAudio example: Record a few seconds of audio and save to a WAVE file."""
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 3
    WAVE_OUTPUT_FILENAME = "data/output.wav" # hard coding for now

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print "* Listening..."

    frames = []

    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print "* Done Listening"

    stream.stop_stream()
    stream.close()
    p.terminate()
    
    # Write the data to a wav file
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

## Recognition Demo

In [43]:
def run_recognition_system_demo(directory):
    file_id = 'output'
    record_audio() # Make sure this function writes the output file before proceeding
    trim_audio()
    
    y, sr = librosa.load(os.path.join(DATA_DIR, file_id + '.wav'), sr=None)
    test_mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=39).transpose()
    
    if gmm_no.score(test_mfcc) >= gmm_yes.score(test_mfcc):
        print "I think you just said 'no'."
    else:
        print "I think you just said 'yes'."
    
    print gmm_no.score(test_mfcc)
    print gmm_yes.score(test_mfcc)

run_recognition_system_demo(DATA_DIR)

* Listening...
* Done Listening
I think you just said 'yes'.
-5492.73028832
-4624.30339896


## Testing the Accuracy of the Model

In [13]:
from sklearn.metrics import accuracy_score

In [65]:
no_list_test = read_file_list(os.path.join(DATA_DIR, 'nolist_test.txt'))
yes_list_test = read_file_list(os.path.join(DATA_DIR, 'yeslist_test.txt'))
print yes_list_test

[]


In [62]:
def test_recognition_system(directory, file_id):
    y, sr = librosa.load(os.path.join(DATA_DIR, file_id + '.wav'), sr=None)
    test_mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=39).transpose()
    
    if gmm_no.score(test_mfcc) >= gmm_yes.score(test_mfcc):
#         print "I think you just said 'no'."
        return "n"
    else:
#         print "I think you just said 'yes'."
        return "y"
    
#     print gmm_no.score(test_mfcc)
#     print gmm_yes.score(test_mfcc)

    return

In [66]:
# Testing the accuracy of recognizing "no"
predictions_no = []
truth_labels = ['n']*9
for filename in no_list_test:
    predictions_no.append(test_recognition_system(DATA_DIR, filename))

accuracy_score(truth_labels, predictions_no)

ValueError: Found input variables with inconsistent numbers of samples: [10, 9]

In [None]:
# Testing the accuracy of recognizing "yes"
predictions_yes = []
truth_labels = ['y']*10
for filename in yes_list_test:
    predictions_yes.append(test_recognition_system(DATA_DIR, filename))

accuracy_score(truth_labels, predictions_yes)