In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.utils import to_categorical
#from scipy.fftpack import fft
from python_speech_features import mfcc
from scipy.io import wavfile
from sklearn.preprocessing import LabelEncoder

## Data used: Speech Commands Data Set v0.02

In [None]:
# realpath() is given the *literal* name '__file__', which is resolved against the
# current working directory; its dirname is therefore the working directory itself.
ROOT_DIR = os.path.normpath(os.path.dirname(os.path.realpath('__file__')))

# Absolute, machine-specific locations of the audio, the split lists and the outputs.
DATA_DIR, DATA_INFO, FEATURES_PATH = (
    os.path.abspath(location)
    for location in (
        '/data/aumkar/data_asr/data',
        '/data/aumkar/data_asr/data_info',
        '/data/aumkar/data_asr/features',
    )
)

In [None]:
# Accumulator for the path of every .wav file found under DATA_DIR (filled by the
# os.walk loop further down).
data_list = []

In [None]:
# First column of each split list, as a plain Python list of the listed entries.
test_files, val_files = (
    pd.read_csv(os.path.join(DATA_INFO, list_name), sep = ' ', header = None)[0].tolist()
    for list_name in ('testing_list.txt', 'validation_list.txt')
)

In [None]:
# Label of each listed clip = the directory part of its relative path.
test_lab = list(map(os.path.dirname, test_files))
val_lab = list(map(os.path.dirname, val_files))

In [None]:
# Full paths of the listed test / validation clips (only entries ending in '.wav').
test_list, val_list = (
    [os.path.join(DATA_DIR, rel_path) for rel_path in split_files if rel_path.endswith('.wav')]
    for split_files in (test_files, val_files)
)

In [None]:
# Walk DATA_DIR recursively and record the path of every .wav file found.
for folder, _subdirs, filenames in os.walk(DATA_DIR):
    data_list.extend(folder + '/' + name for name in filenames if name.endswith('.wav'))

In [None]:
# Training set = every clip on disk that is in neither the test nor the validation list.
# sorted() pins the order: iterating a set of str depends on per-process hash
# randomisation, so list(set(...)) would order the clips differently in every kernel
# session, and the labels and audio that are saved/reloaded separately (and sliced by a
# fixed index later on) could no longer be matched up by position.
train_list = sorted(set(data_list) - set(test_list) - set(val_list))

In [None]:
# Label of a training clip = name of the folder that directly contains it.
train_lab = [os.path.basename(parent_dir) for parent_dir in map(os.path.dirname, train_list)]

In [None]:
# Encoder used to turn the class-name strings into integer ids.
lab = LabelEncoder()

In [None]:
# Fit the encoder on the training class names and map every name to its integer id.
train_encode = lab.fit_transform(train_lab)
# The ids are already final. Re-fitting `lab` on them returned the very same values but
# replaced the name -> id table in lab.classes_ with plain integers, so it is dropped.
y_encode = train_encode

In [None]:
# Encode validation and test labels with the mapping learned from the TRAINING labels.
# Fitting a fresh encoder per split only gives the same ids when every split contains
# exactly the same set of class names; a folder present in one split only would shift
# every id after it. transform() raises ValueError for a name never seen in training.
lab.fit(train_lab)

val_encode = lab.transform(val_lab)
yval_encode = val_encode

test_encode = lab.transform(test_lab)
ytest_encode = test_encode

In [None]:
# One-hot encode the integer ids. The width is fixed explicitly: without num_classes,
# to_categorical() infers it from the largest id of each array, so a split that lacks
# the highest class would get a narrower matrix than the others.
num_classes = int(max(y_encode.max(), yval_encode.max(), ytest_encode.max())) + 1

ytrain = to_categorical(y_encode, num_classes = num_classes)
yval = to_categorical(yval_encode, num_classes = num_classes)
ytest = to_categorical(ytest_encode, num_classes = num_classes)

In [None]:
# Shapes of the one-hot label matrices of the three splits.
(ytrain.shape, yval.shape, ytest.shape)

In [None]:
# Write the training labels to ytrain.npy in the current working directory.
np.save(file = 'ytrain.npy', arr = ytrain)

In [None]:
# Write the validation labels to yval.npy in the current working directory.
np.save(file = 'yval.npy', arr = yval)

In [None]:
# Write the test labels to ytest.npy in the current working directory.
np.save(file = 'ytest.npy', arr = ytest)

In [None]:
def scale_(x1, scale_factor = 0.1):
    """Amplitude-scaling augmentation: multiply a 1-D signal by one random gain.

    Parameters
    ----------
    x1 : 1-D numpy array (it is reshaped to a single column, so x1.shape[0] must
        equal its total size).
    scale_factor : standard deviation of the gain, which is drawn from a normal
        distribution centred on 1.0.

    Returns
    -------
    1-D numpy array of the same length: every sample times the same gain value.
    """
    n_samples = x1.shape[0]
    column = np.reshape(x1, (n_samples, 1))

    # A single (1, 1) gain, repeated down all rows so it lines up with the column.
    gain = np.random.normal(loc = 1.0, scale = scale_factor, size = (1, column.shape[1]))
    gain_column = np.matmul(np.ones((n_samples, 1)), gain)

    return (column * gain_column).reshape(n_samples)

In [None]:
def jitter(x, sigma = 0.05):
    """Additive-noise augmentation: add zero-mean Gaussian noise to every sample.

    Parameters
    ----------
    x : numpy array holding the signal.
    sigma : standard deviation of the noise. It is an absolute value, not relative to
        the amplitude of ``x``.

    Returns
    -------
    numpy array with the shape of ``x`` containing ``x`` plus the noise.
    """
    perturbation = np.random.normal(loc = 0, scale = sigma, size = x.shape)
    noisy = x + perturbation

    return np.asarray(list(noisy))

In [None]:
#Function taken from https://github.com/PJansson/speech/blob/master/utils/data.py
def timeshift(x, max_shift = 0.2):
    """Time-shift augmentation: slide the signal by a random offset, zero-filling the gap.

    The offset is drawn uniformly from [-max_shift, max_shift] as a fraction of
    ``len(x)`` and truncated to an integer number of samples. A positive offset delays
    the signal (zeros in front, tail cut off); otherwise it is advanced (head cut off,
    zeros at the end). The result always has the length of ``x``.
    """
    n_samples = len(x)
    offset = int(n_samples * np.random.uniform(-max_shift, max_shift))

    if offset <= 0:
        # Pad at the end and keep the last n_samples values.
        return np.asarray(np.pad(x, (0, -offset), "constant")[-n_samples:])

    # Pad at the front and keep the first n_samples values.
    return np.asarray(np.pad(x, (offset, 0), "constant")[:n_samples])

In [None]:
#Wav to numpy
def wav2np(wav_list):
    """Read every WAV file in ``wav_list``.

    Returns
    -------
    (samp_list, x_data) : two lists in the order of ``wav_list`` holding, for each
        file, the sample rate and the sample array returned by ``wavfile.read``.
    """
    decoded = [wavfile.read(wav_path) for wav_path in wav_list]

    samp_list = [rate for rate, _samples in decoded]
    x_data = [samples for _rate, samples in decoded]

    return samp_list, x_data

In [None]:
# Read the raw audio of all three splits. x_val and x_test are saved and turned into
# features further down, so they have to be loaded here; leaving these calls commented
# out makes those cells fail with NameError on a fresh kernel.
# This holds all decoded audio in memory at once.
samp_train, x_train = wav2np(train_list)
samp_val, x_val = wav2np(val_list)
samp_test, x_test = wav2np(test_list)

In [None]:
# Number of training clips that were loaded.
len(x_train)

In [None]:
# Clips may differ in length (which is why the MFCCs are zero-padded later). Current
# NumPy refuses to build an array from such a ragged list implicitly, so the 1-D object
# array is filled by hand in that case. Equal-length clips still give the ordinary 2-D
# array. An object array has to be read back with np.load(..., allow_pickle=True).
if len({len(clip) for clip in x_train}) > 1:
    x_train_arr = np.empty(len(x_train), dtype = object)
    for idx, clip in enumerate(x_train):
        x_train_arr[idx] = clip
else:
    x_train_arr = np.asarray(x_train)

np.save(os.path.join(FEATURES_PATH, 'asr_train.npy'), x_train_arr)

In [None]:
# Clips may differ in length; current NumPy refuses to build an array from a ragged
# list implicitly, so the 1-D object array is filled by hand in that case. Equal-length
# clips still give the ordinary 2-D array. Written to asr_val.npy in the working directory.
if len({len(clip) for clip in x_val}) > 1:
    x_val_arr = np.empty(len(x_val), dtype = object)
    for idx, clip in enumerate(x_val):
        x_val_arr[idx] = clip
else:
    x_val_arr = np.asarray(x_val)

np.save('asr_val', x_val_arr)

In [None]:
# Clips may differ in length; current NumPy refuses to build an array from a ragged
# list implicitly, so the 1-D object array is filled by hand in that case. Equal-length
# clips still give the ordinary 2-D array. Written to asr_test.npy in the working directory.
if len({len(clip) for clip in x_test}) > 1:
    x_test_arr = np.empty(len(x_test), dtype = object)
    for idx, clip in enumerate(x_test):
        x_test_arr[idx] = clip
else:
    x_test_arr = np.asarray(x_test)

np.save('asr_test', x_test_arr)

In [None]:
# Reload the raw training audio written by this notebook. Variable-length clips are
# stored as an object array, which np.load only reads with allow_pickle=True (harmless
# for a plain numeric array). Only enable this for files you produced yourself:
# unpickling an untrusted file can execute arbitrary code.
x_train = np.load(os.path.join(FEATURES_PATH, 'asr_train.npy'), allow_pickle = True)

In [None]:
# Reload the training labels from ytrain.npy in the current working directory (the
# file written by the np.save('ytrain.npy', ...) cell above).
y_train = np.load('ytrain.npy')

In [None]:
# Waveform of the second training clip, as a visual sanity check.
fig, ax = plt.subplots()
ax.plot(x_train[1])
ax.set_xlabel('Number of samples')
ax.set_ylabel('Amplitude')
ax.set_title('Raw speech signal')
plt.show()

In [None]:
#Data augmentation function
def augment(data, lab2):
    """Create augmented copies of every signal together with matching label rows.

    For each (signal, label) pair the output receives, in this order, a randomly
    scaled copy (``scale_``), a noisy copy (``jitter``) and, with probability 1/2,
    a time-shifted copy (``timeshift``). The original signal itself is not included.

    Parameters
    ----------
    data : iterable of 1-D signals.
    lab2 : iterable of label vectors, one per signal (any width; all the same).

    Returns
    -------
    (x_data, label_) : list of augmented signals and a float64 array of shape
        (len(x_data), n_classes) whose i-th row is the label of ``x_data[i]``.
        For empty input the label array is (0, n_classes), with n_classes taken
        from ``lab2`` when it is 2-D and 35 otherwise.
    """
    x_data, label_rows = [], []

    for signal, target in zip(data, lab2):
        target_row = np.ravel(target)

        x_data.append(scale_(signal))
        label_rows.append(target_row)
        x_data.append(jitter(signal))
        label_rows.append(target_row)

        if np.random.choice([True, False]):
            x_data.append(timeshift(signal))
            label_rows.append(target_row)

    # Build the label matrix once at the end; appending to an array inside the loop
    # copies the whole matrix on every iteration.
    if label_rows:
        label_ = np.asarray(label_rows, dtype = np.float64)
    else:
        n_classes = np.shape(lab2)[1] if np.ndim(lab2) == 2 else 35
        label_ = np.empty((0, n_classes))

    return x_data, label_

In [None]:
# Work on the tail of the training set only, from index 56562 onwards.
# NOTE(review): the offset is unexplained; presumably the start of the third chunk,
# since the results are saved as train_feat3 / ytrain3 - confirm.
x_train1, lab1 = x_train[56562:], y_train[56562:]

In [None]:
# Sizes of the selected clips and labels; augment() pairs them with zip(), so the two
# numbers should be equal.
(len(x_train1), len(lab1))

In [None]:
# Augment the selected slice. The augmentations draw from np.random and no seed is set
# in this notebook, so the result differs from run to run.
aug1, ytrain1 = augment(x_train1, lab1)

In [None]:
# Shape of the label matrix returned by augment().
ytrain1.shape

In [None]:
# Number of augmented clips; should equal ytrain1.shape[0].
len(aug1)

In [None]:
#Returns mfcc features of shape (98, 40)
def mfcc_gen(arr):
    """Compute MFCC features of one signal with python_speech_features.mfcc.

    Uses 30 ms windows, 40 filters and 40 cepstral coefficients; every other setting
    (including the sample rate) is left at the library default. The (98, 40) shape
    quoted above presumably refers to a one-second clip - shorter clips give fewer rows.
    """
    mfcc_options = dict(winlen = 0.03, numcep = 40, nfilt = 40)
    return mfcc(arr, **mfcc_options)

In [None]:
#Zero padding function for data to get all the features in the same input shape
def len_pad(arr, target_len = 98):
    """Force a (frames, coefficients) feature matrix to exactly ``target_len`` rows.

    Parameters
    ----------
    arr : 2-D array of features, one row per frame.
    target_len : number of rows of the result (default 98).

    Returns
    -------
    ``arr`` itself when it already has ``target_len`` rows; its first ``target_len``
    rows when it is longer; otherwise ``arr`` followed by rows of zeros. The number of
    columns is taken from ``arr``.
    """
    n_frames = len(arr)

    if n_frames == target_len:
        return arr

    if n_frames > target_len:
        # Longer input used to request a zero block with a negative row count, which
        # raises ValueError; truncate so the output shape is still uniform.
        return arr[:target_len]

    filler_shape = (target_len - n_frames, np.shape(arr)[1])
    return np.vstack((arr, np.zeros(filler_shape)))

In [None]:
def data_preprocess(x):
    """Convert an iterable of signals into one array of padded MFCC matrices.

    Every signal goes through ``mfcc_gen`` and then ``len_pad``; the results are
    stacked, in input order, with ``np.asarray``.
    """
    return np.asarray([len_pad(mfcc_gen(signal)) for signal in x])

## Load and pre-process each array individually to save memory

In [None]:
# Padded MFCC features of the augmented training clips.
train_feat = data_preprocess(aug1)

In [None]:
# Shape of the training feature array.
train_feat.shape

In [None]:
# Write the training features of this chunk to FEATURES_PATH/train_feat3.npy.
np.save(file = os.path.join(FEATURES_PATH, 'train_feat3.npy'), arr = train_feat)

In [None]:
# Write the labels that belong to those features to FEATURES_PATH/ytrain3.npy.
np.save(file = os.path.join(FEATURES_PATH, 'ytrain3.npy'), arr = ytrain1)

In [None]:
# Padded MFCC features of the validation clips (no augmentation).
# Requires x_val, i.e. the result of wav2np(val_list).
val_feat = data_preprocess(x_val)

In [None]:
# Shape of the validation feature array.
val_feat.shape

In [None]:
# Write the validation features to FEATURES_PATH/val_feat.npy.
np.save(file = os.path.join(FEATURES_PATH, 'val_feat.npy'), arr = val_feat)

In [None]:
# Padded MFCC features of the test clips (no augmentation).
# Requires x_test, i.e. the result of wav2np(test_list).
test_feat = data_preprocess(x_test)

In [None]:
# Shape of the test feature array.
test_feat.shape

In [None]:
# Write the test features to FEATURES_PATH/test_feat.npy.
np.save(file = os.path.join(FEATURES_PATH, 'test_feat.npy'), arr = test_feat)