In [None]:
import os
import numpy as np
import pandas as pd
from scipy.fftpack import fft
from scipy.io import wavfile
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from python_speech_features import mfcc

## Data used: Speech Commands Data Set v0.01

In [None]:
# Notebook kernels do not define __file__, so the *quoted* name below is just a
# relative path resolved against the current working directory; ROOT_DIR ends
# up being the (resolved) directory the notebook is run from.
ROOT_DIR = os.path.normpath(os.path.dirname(os.path.realpath('__file__')))

# Both data folders sit side by side under <ROOT_DIR>/data_asr.
DATA_DIR, DATA_INFO = (
    os.path.join(ROOT_DIR, 'data_asr', leaf) for leaf in ('data', 'data_info')
)

In [None]:
# Will hold the path of every .wav file found under DATA_DIR (filled by the
# directory walk further down).
data_list = []

In [None]:
# Read the official test / validation listings (space-separated, no header row)
# and keep the first column of each as a plain Python list.
test_files, val_files = (
    pd.read_csv(os.path.join(DATA_INFO, listing), sep = ' ', header = None)[0].tolist()
    for listing in ('testing_list.txt', 'validation_list.txt')
)

In [None]:
# A clip's label is the directory part of its listed path.
# Only .wav entries are considered, mirroring the filter used to build
# test_list / val_list, so labels and file paths stay index-aligned even if a
# listing ever contains a non-.wav entry.
test_lab = [os.path.dirname(f) for f in test_files if f.endswith('.wav')]
val_lab = [os.path.dirname(f) for f in val_files if f.endswith('.wav')]

In [None]:
# Turn the listed .wav entries into paths below DATA_DIR (test first, then
# validation); entries that do not end in .wav are dropped.
test_list, val_list = (
    [os.path.join(DATA_DIR, rel_path) for rel_path in listing if rel_path.endswith('.wav')]
    for listing in (test_files, val_files)
)

In [None]:
# Collect every .wav file below DATA_DIR.
# The list is rebuilt from scratch (instead of `+=` onto a list created in
# another cell) so re-running this cell cannot append the same paths twice.
# The `root + '/' + f` form is kept exactly as before so these strings keep
# comparing equal to the entries of test_list / val_list in the set difference.
# NOTE(review): this also picks up .wav files from any non-command folder under
# DATA_DIR (the Speech Commands archive ships a `_background_noise_` folder) --
# confirm those are meant to end up in the training pool.
data_list = [
    root + '/' + f
    for root, dirs, files in os.walk(DATA_DIR)
    for f in files
    if f.endswith('.wav')
]

In [None]:
# Everything that is not listed for testing or validation is training data.
# sorted() pins the order: iterating a set of strings may differ from one
# interpreter session to the next (string hashing is randomised by default),
# which would silently break the row alignment between arrays saved in one
# session and labels rebuilt in another.
train_list = sorted(set(data_list) - set(test_list) - set(val_list))

In [None]:
# The label of a training clip is the name of the folder that directly
# contains it (last component of the clip's parent directory).
train_lab = [os.path.split(os.path.dirname(wav_path))[1] for wav_path in train_list]

In [None]:
# Encode class names as integer ids with ONE shared mapping.
# Re-fitting the encoder on every split only yields matching ids when all
# splits contain exactly the same set of labels; one extra or missing class in
# a split shifts its ids relative to the others. Re-encoding the already
# encoded ids was a no-op that also threw away the class names held in
# `lab.classes_`.
lab = LabelEncoder()

# The training labels define the name -> id mapping.
train_encode = lab.fit_transform(train_lab)
y_encode = train_encode.copy()

# transform() raises ValueError for a label never seen during fit, which
# surfaces an inconsistent split instead of silently mis-numbering it.
val_encode = lab.transform(val_lab)
yval_encode = val_encode.copy()

test_encode = lab.transform(test_lab)
ytest_encode = test_encode.copy()

In [None]:
# One-hot encode the integer class ids.
# The number of columns is derived from the largest id present in ANY split
# rather than from the test labels alone; an id outside the fitted range would
# otherwise be turned into an all-zero row by transform().
n_classes = int(max(y_encode.max(), yval_encode.max(), ytest_encode.max())) + 1

lab_binarizer = LabelBinarizer()
lab_binarizer.fit(np.arange(n_classes))

ytrain = lab_binarizer.transform(y_encode)
yval = lab_binarizer.transform(yval_encode)
ytest = lab_binarizer.transform(ytest_encode)

In [None]:
def wav2np(wav_list):
    """Read every WAV file named in ``wav_list``.

    Parameters
    ----------
    wav_list : iterable of str
        Paths handed to ``scipy.io.wavfile.read`` one after another.

    Returns
    -------
    tuple(list, list)
        ``(samp_list, x_data)``: the sample rate and the sample array of each
        file, both in the order of ``wav_list``.
    """
    # wavfile.read yields a (rate, samples) pair per file.
    loaded = [wavfile.read(wav_path) for wav_path in wav_list]

    samp_list = [rate for rate, _ in loaded]
    x_data = [samples for _, samples in loaded]
    return samp_list, x_data

In [None]:
# Load each split from disk: wav2np returns (sample rates, sample arrays), both
# in the order of the path list it was given. The three loads are kept as
# separate statements so an error in one leaves the earlier results bound.
samp_train, x_train = wav2np(train_list)
samp_val, x_val = wav2np(val_list)
samp_test, x_test = wav2np(test_list)

In [None]:
#Zero-pad or truncate every clip to a fixed number of samples (default 16000)
def len_pad(arr, target_len=16000):
    """Stack the clips in ``arr`` into one 2-D array with ``target_len`` columns.

    Clips shorter than ``target_len`` are right-padded with zeros; longer clips
    are cut down to their first ``target_len`` samples. (The earlier version
    only rebound its loop variable for long clips, so they were never actually
    shortened and the stacked result was ragged.)

    Parameters
    ----------
    arr : sequence of 1-D array-likes
        One entry per clip. Clips are assumed to be one-dimensional (mono).
    target_len : int, default 16000
        Number of samples every row of the result has.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(arr), target_len)``. Its dtype is the promoted
        dtype of the input clips (e.g. int16 clips stay int16). An empty
        ``arr`` gives an array of shape ``(0, target_len)``.
    """
    clips = [np.asarray(clip) for clip in arr]
    if not clips:
        return np.zeros((0, target_len))

    # Use a dtype that can hold every clip without loss.
    dtype = clips[0].dtype
    for clip in clips[1:]:
        dtype = np.promote_types(dtype, clip.dtype)

    # Start from all zeros so that padding needs no extra work.
    padded = np.zeros((len(clips), target_len), dtype=dtype)
    for row, clip in enumerate(clips):
        n_keep = min(len(clip), target_len)
        padded[row, :n_keep] = clip[:n_keep]

    return padded

In [None]:
# Run the raw test clips through len_pad and stack them into one array.
x_test1 = len_pad(x_test)

In [None]:
# Display the shape of the padded test array as a quick sanity check.
x_test1.shape

In [None]:
# Run the raw validation clips through len_pad and stack them into one array.
x_val1 = len_pad(x_val)

In [None]:
# Display the shape of the padded validation array as a quick sanity check.
x_val1.shape

In [None]:
# Run the raw training clips through len_pad and stack them into one array.
x_train1 = len_pad(x_train)

In [None]:
# Display the shape of the padded training array as a quick sanity check.
x_train1.shape

In [None]:
# Persist the padded training array; the relative file name is resolved
# against the current working directory.
np.save('asr_train.npy', x_train1)

In [None]:
# Persist the padded validation array; the relative file name is resolved
# against the current working directory.
np.save('asr_val.npy', x_val1)

In [None]:
# Persist the padded test array; the relative file name is resolved against
# the current working directory.
np.save('asr_test.npy', x_test1)

## Load and pre-process each array individually to save memory

In [None]:
#x_train = np.load('asr_train.npy')

In [None]:
#x_val = np.load('asr_val.npy')

In [None]:
# Reload the padded test array written by np.save earlier in the notebook.
# Note: this rebinds x_test, which earlier held the raw clips from wav2np.
x_test = np.load('asr_test.npy')

In [None]:
#Returns one MFCC matrix per clip (shape (98, 40) each, per the original note)
def mfcc_gen(arr):
    """Compute MFCC features for every clip in ``arr``.

    Each clip is passed to ``python_speech_features.mfcc`` with a 30 ms window
    (``winlen = 0.03``), 40 cepstral coefficients and 40 filters; all other
    settings are the library defaults.

    Parameters
    ----------
    arr : iterable of 1-D sample arrays
        The clips to convert.  # assumes equal-length clips -- TODO confirm

    Returns
    -------
    numpy.ndarray
        The per-clip feature matrices stacked with ``np.asarray``, in input
        order.
    """
    return np.asarray(
        [mfcc(clip, winlen = 0.03, numcep = 40, nfilt = 40) for clip in arr]
    )

In [None]:
#train_feat = mfcc_gen(x_train)

In [None]:
#np.save('train_feat.npy', train_feat)

In [None]:
#val_feat = mfcc_gen(x_val)

In [None]:
#np.save('val_feat.npy', val_feat)

In [None]:
# Compute the MFCC features of every (padded) test clip.
test_feat = mfcc_gen(x_test)

In [None]:
# Display the shape of the test feature array as a quick sanity check.
test_feat.shape

In [None]:
# Persist the test features; the relative file name is resolved against the
# current working directory.
np.save('test_feat.npy', test_feat)