# 1 - Packages

In [1]:
import librosa
import numpy as np
from os import listdir
from os.path import isdir, join

# 2 - Input 

In [2]:
dir_path = 'speech_recognition_dataset/train/audio' #input()
#speech_recognition_dataset/train/audio

In [5]:
categories = [f for f in listdir(dir_path) if isdir(join(dir_path, f))]
categories.sort()
categories.remove('.ipynb_checkpoints')
print('Number of Categories: ', len(categories[1:]))
print(categories[1:])

Number of Categories:  30
['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']


In [7]:
max_wav_len = 101
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
unknown_list = [d for d in categories if d not in target_list and d != '_background_noise_' ]
print('Target List: ', target_list)
print('\nUnknownsList : ', unknown_list)

Target List:  ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

UnknownsList :  ['bed', 'bird', 'cat', 'dog', 'eight', 'five', 'four', 'happy', 'house', 'marvin', 'nine', 'one', 'seven', 'sheila', 'six', 'three', 'tree', 'two', 'wow', 'zero']


# 3 - Prepare Data

In [8]:
background = [f for f in listdir(join(dir_path, '_background_noise_')) if f.endswith('.wav')]

In [9]:
file_name = 'background_noise_mfcc.memmap'
f = open('background_noise_label.txt','w+')
fp = np.memmap(file_name, dtype='float32', mode='w+', shape=(101,20))

In [10]:
cnt = 0
for wav in background:
    wave, sample_rate = librosa.load(join(join(dir_path,'_background_noise_'),wav))
    wave = librosa.resample(wave, sample_rate, 16000)
    mfcc = librosa.feature.mfcc(wave, hop_length=160, n_mfcc=20).T
    for i in range(0, mfcc.shape[0], max_wav_len):
        x = mfcc[i:i + max_wav_len, :]
        if x.shape[0] != max_wav_len:
            continue
        fpc = np.memmap(file_name, dtype='float32', mode='r+', shape=((cnt + 1) * 101,20))
        fpc[cnt * 101:, :] = x
        f.writelines('silence\n')
        cnt += 1
f.close()

In [11]:
fpc = np.memmap(file_name, dtype='float32', mode='r', shape=((cnt) * 101,20))
print(fpc[0:101, :])

[[-380.8116      27.567669   -49.555122  ...   -8.786341    -3.1839867
    -1.2797297]
 [-330.67773     27.263283   -51.6185    ...   -5.1967373   -5.3206096
     0.8096075]
 [-275.29913     26.223972   -54.35544   ...   -1.5325807   -5.5584164
     2.7326455]
 ...
 [-150.74535     27.805984   -49.365887  ...  -10.94832     -8.642139
     2.9427042]
 [-149.94096     29.052677   -48.31129   ...  -10.869948    -8.6259
     3.4077725]
 [-150.18327     29.38075    -47.790947  ...  -10.256454    -8.068186
     3.9103777]]


In [12]:
file_name = 'all_waves_mfcc.memmap'
f = open('all_waves_label.txt','w+')
fp = np.memmap(file_name, dtype='float32', mode='w+', shape=(101,20))

In [14]:
cnt_all_waves = 0
for i, direct in enumerate(categories[1:]):
    waves = [f for f in listdir(join(dir_path, direct)) if f.endswith('.wav')]
    print(i + 1,":", direct, end=" ")
    for wav in waves:
        samples, sample_rate = librosa.load(join(join(dir_path,direct),wav), sr = 16000)
        samples = librosa.resample(samples, sample_rate, 16000)
        mfcc = librosa.feature.mfcc(samples, n_mfcc=20, hop_length=160).T
        
        if len(samples) != 16000 : 
            continue
        if (max_wav_len > mfcc.shape[0]):
            pad_width = max_wav_len  - mfcc.shape[1]
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_wav_len ]
        fpc = np.memmap(file_name, dtype='float32', mode='r+', shape=((cnt_all_waves + 1) * 101,20))
        fpc[cnt_all_waves * 101:, :] = mfcc
        f.writelines(direct + '\n')
        cnt_all_waves += 1

1 : bed 2 : bird 3 : cat 4 : dog 5 : down 6 : eight 7 : five 8 : four 9 : go 10 : happy 11 : house 12 : left 13 : marvin 14 : nine 15 : no 16 : off 17 : on 18 : one 19 : right 20 : seven 21 : sheila 22 : six 23 : stop 24 : three 25 : tree 26 : two 27 : up 28 : wow 29 : yes 30 : zero 

In [15]:
fpc = np.memmap(file_name, dtype='float32', mode='r', shape=((cnt_all_waves) * 101,20))
print(fpc[0:101, :])

[[-576.63556     16.24293      7.935775  ...    1.9529798    4.878601
     6.4231753]
 [-576.5225      16.417307     7.9297323 ...    1.9303882    4.9848022
     6.5338306]
 [-576.46924     16.597977     7.79028   ...    2.0994377    5.258776
     6.551874 ]
 ...
 [-560.5723      31.39305     18.498745  ...    2.6069539    3.1524115
     5.2859282]
 [-563.82434     29.314718    17.94899   ...    3.6223435    4.2829685
     5.2425528]
 [-564.91187     28.464329    17.488823  ...    3.7690902    4.515863
     5.333643 ]]


In [16]:
print("Number of Silence Files:", cnt)
print("Number of Known and Unknown files:", cnt_all_waves)
print("Total Number of example: ", cnt + cnt_all_waves)

Number of Silence Files: 393
Number of Known and Unknown files: 58252
Total Number of example:  58645
