In [27]:
import numpy as np
import matplotlib.pyplot as plt
import os
import IPython.display as ipd
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers

In [4]:
# Build the archive path relative to the directory named by the HOME environment variable.
home_dir = os.getenv("HOME")
data_path = home_dir + '/aiffel/speech_recognition/data/speech_wav_8000.npz'

# For an .npz file np.load returns a dict-like archive; arrays are fetched by key.
speech_data = np.load(data_path)


In [6]:
# Report the dimensions of the waveform and label arrays stored in the archive.
wav_shape = speech_data['wav_vals'].shape
label_shape = speech_data['label_vals'].shape
print(f"Wave data shape : {wav_shape}")
print(f"Label data shape : {label_shape}")

Wave data shape : (50620, 8000)
Label data shape : (50620, 1)


In [9]:
# Pick one random sample, print its shape and label, and render an audio player.
# random.randrange excludes its upper bound, so every result is a valid index;
# random.randint(0, n) includes n and could index one past the last element.
rand = random.randrange(len(speech_data['wav_vals']))
print(f"rand num : {rand}")

sr = 8000  # sample rate handed to the audio widget
data = speech_data['wav_vals'][rand]
print(f"Wave data shape : {data.shape}")
print(f"label : {speech_data['label_vals'][rand]}")

# Last expression of the cell: displayed as a playable audio widget.
ipd.Audio(data, rate=sr)

rand num : 5155
Wave data shape : (8000,)
label : ['left']


In [12]:
# The ten command words treated as individual target classes.
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

# Build the full class list as a NEW list. Assigning `label_value = target_list`
# and appending would alias the same object and silently add the two extra
# classes to target_list as well.
class_names = target_list + ['unknown', 'silence']

# Map each class name to its integer id (its position in class_names).
label_value = {name: idx for idx, name in enumerate(class_names)}
label_value

{'yes': 0,
 'no': 1,
 'up': 2,
 'down': 3,
 'left': 4,
 'right': 5,
 'on': 6,
 'off': 7,
 'stop': 8,
 'go': 9,
 'unknown': 10,
 'silence': 11}

In [14]:
# Translate every label into its integer id. Each entry of 'label_vals' is
# indexed with [0] to obtain the class name used as the dictionary key.
label_data = np.array([label_value[row[0]] for row in speech_data['label_vals']])
label_data

array([ 3,  3,  3, ..., 11, 11, 11])

In [20]:
sr = 8000  # number of samples per waveform; used as the time axis length below

# Hold out 10% of the samples for testing. A fixed random_state makes the
# split identical on every run; without it the test set changes after each
# kernel restart, so results cannot be reproduced and previously saved weights
# could be evaluated on samples they were trained on.
train_wav, test_wav, train_label, test_label = train_test_split(
    speech_data['wav_vals'],
    label_data,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

print(train_wav)

# Append a trailing axis of size 1 so each sample has shape (sr, 1),
# matching the Input(shape=(sr, 1)) layer of the model.
train_wav = train_wav.reshape([-1, sr, 1])
test_wav = test_wav.reshape([-1, sr, 1])

[[-7.95670412e-03 -1.21364919e-02 -1.19321197e-02 ... -1.38819711e-02
  -1.27821285e-02 -1.29595129e-02]
 [-2.97594324e-05  2.21090624e-04 -9.67003463e-04 ...  1.89499697e-03
   1.49327365e-03  1.17175980e-03]
 [ 3.74587689e-04  1.56902638e-03  2.63484777e-03 ... -9.08147718e-04
  -1.51947758e-03 -3.11840954e-03]
 ...
 [ 2.25460250e-02 -1.10248424e-01 -1.39737293e-01 ...  1.70766771e-01
   3.33424330e-01  1.34063333e-01]
 [-5.21442678e-04 -7.30324886e-04 -6.24099048e-04 ...  1.77421223e-03
   1.39416556e-03  1.12028222e-03]
 [ 3.32810543e-03  4.86725755e-03  1.01086698e-05 ...  2.49759248e-03
   1.81381940e-03 -2.04087840e-03]]


In [21]:
# Display the training waveforms after the reshape to (-1, sr, 1):
# every sample value is now wrapped in its own length-1 innermost axis.
train_wav

array([[[-7.95670412e-03],
        [-1.21364919e-02],
        [-1.19321197e-02],
        ...,
        [-1.38819711e-02],
        [-1.27821285e-02],
        [-1.29595129e-02]],

       [[-2.97594324e-05],
        [ 2.21090624e-04],
        [-9.67003463e-04],
        ...,
        [ 1.89499697e-03],
        [ 1.49327365e-03],
        [ 1.17175980e-03]],

       [[ 3.74587689e-04],
        [ 1.56902638e-03],
        [ 2.63484777e-03],
        ...,
        [-9.08147718e-04],
        [-1.51947758e-03],
        [-3.11840954e-03]],

       ...,

       [[ 2.25460250e-02],
        [-1.10248424e-01],
        [-1.39737293e-01],
        ...,
        [ 1.70766771e-01],
        [ 3.33424330e-01],
        [ 1.34063333e-01]],

       [[-5.21442678e-04],
        [-7.30324886e-04],
        [-6.24099048e-04],
        ...,
        [ 1.77421223e-03],
        [ 1.39416556e-03],
        [ 1.12028222e-03]],

       [[ 3.32810543e-03],
        [ 4.86725755e-03],
        [ 1.01086698e-05],
        ...,
        

In [22]:
# Print the shape of each split, pairing every caption with its array.
split_shapes = (
    ("train data", train_wav),
    ("train label", train_label),
    ("test data", test_wav),
    ("test label", test_label),
)
for caption, array in split_shapes:
    print(f"{caption} : {array.shape}")


train data : (45558, 8000, 1)
train label : (45558,)
test data : (5062, 8000, 1)
test label : (5062,)


In [23]:
# Training hyper-parameters: samples per batch and number of epochs.
batch_size, max_epochs = 128, 10

# Path under the HOME directory; presumably where model checkpoints are
# written (its use is not shown in this cell).
checkpoint_dir = os.getenv("HOME") + "/aiffel/speech_recognition/models/wav"

checkpoint_dir

'/home/aiffel/aiffel/speech_recognition/models/wav'

In [24]:
def one_hot_label(wav, label):
    """Return the waveform unchanged together with its one-hot encoded label.

    Args:
        wav: waveform element, passed through untouched.
        label: integer class id to encode.

    Returns:
        A ``(wav, one_hot)`` tuple where ``one_hot`` is ``tf.one_hot(label, depth=12)``.
    """
    return wav, tf.one_hot(label, depth=12)

In [26]:
# Training pipeline: slice arrays -> one-hot encode labels -> repeat -> batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_wav, train_label))
    .map(one_hot_label)
    .repeat()
    .batch(batch_size=batch_size)
)
print(train_dataset)

# Test pipeline: same sequence of transformations applied to the held-out split.
# Because of repeat(), this dataset never ends on its own.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_wav, test_label))
    .map(one_hot_label)
    .repeat()
    .batch(batch_size=batch_size)
)
print(test_dataset)

<BatchDataset shapes: ((None, 8000, 1), (None, 12)), types: (tf.float32, tf.float32)>
<BatchDataset shapes: ((None, 8000, 1), (None, 12)), types: (tf.float32, tf.float32)>


In [29]:
# 1-D convolutional classifier operating directly on the waveform.
# Each stage is (number of filters, number of stacked Conv1D layers) and is
# followed by one MaxPool1D. Layers are created in the same order as before.
conv_stages = [(32, 2), (64, 2), (128, 3), (256, 3)]

input_tensor = layers.Input(shape=(sr, 1))

x = input_tensor
for n_filters, n_convs in conv_stages:
    for _ in range(n_convs):
        x = layers.Conv1D(n_filters, 9, padding='same', activation='relu')(x)
    x = layers.MaxPool1D()(x)

# Dropout is applied once, after the final pooling layer.
x = layers.Dropout(0.3)(x)

# Dense head: Dense -> BatchNormalization -> ReLU.
x = layers.Flatten()(x)
x = layers.Dense(256)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)

# 12 output units, one per class; no activation is specified on this layer.
output_tensor = layers.Dense(12)(x)

model_wav = tf.keras.Model(input_tensor, output_tensor)

model_wav.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 8000, 1)]         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 8000, 32)          320       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 8000, 32)          9248      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4000, 32)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 4000, 64)          18496     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 4000, 64)          36928     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2000, 64)         