# Deep Learning Models

In this notebook, I use TensorFlow to build models on the podcast audio data produced by the earlier scripts and notebooks.

In [1]:
import glob
import warnings
warnings.filterwarnings('ignore')

# Collect the mp3 paths of the split podcast audio and of the Polly-generated audio.
podcast_audio_files, polly_audio_files = (
    glob.glob(pattern, recursive=True)
    for pattern in ('./audio-split-data/*.mp3', './polly-data/*.mp3')
)

In [2]:
# Work on copies so the original glob results stay untouched, then sort both
# lists of paths alphabetically.
pa = list(polly_audio_files)
pod_a = list(podcast_audio_files)
ordered_pa = sorted(pa)
ordered_pod = sorted(pod_a)
# Pair the files by sorted position: (polly_path, podcast_path).
# NOTE(review): this presumably relies on both folders holding matching file
# names; zip stops at the shorter list -- confirm the pairs line up.
grouped_audio = [pair for pair in zip(ordered_pa, ordered_pod)]

In [3]:
# loading both the AWS polly generated audio and the actual episode audio chunks into librosa
import numpy as np
import librosa
# Waveforms that pass the length filter; index i of both lists belongs to the same pair.
polly_arr = []
episode_arr = []
for polly, episode_chunks in grouped_audio[:10]:
    try:
        # The Polly clip is resampled to 11025 Hz; no sr is passed for the episode
        # chunk, so librosa's default sample rate applies to it.
        y_pol, sr_pol = librosa.load(polly, sr=11025)
        y_episode, sr_episode = librosa.load(episode_chunks)
    except Exception as e:
        # Best-effort loading: skip a pair that cannot be decoded, but report
        # which files failed so the problem can be traced.
        print(f'{polly} / {episode_chunks}: {e}')
        continue
    # Keep only pairs short enough for the fixed-length padding applied later
    # (7000 Polly samples, 15000 episode samples).
    if (len(y_pol) <= 7000) and (len(y_episode) <= 15000):
        polly_arr.append(y_pol)
        episode_arr.append(y_episode)

In [4]:
# The clips have different lengths, so store them as a 1-D object array of
# waveforms; recent NumPy releases refuse to build a ragged array unless
# dtype=object is given explicitly.
polly_arr = np.array(polly_arr, dtype=object)

In [5]:
# Display the shape of the Polly waveform array.
polly_arr.shape

(10,)

In [6]:
# The episode chunks have different lengths, so store them as a 1-D object array
# of waveforms; recent NumPy releases refuse to build a ragged array unless
# dtype=object is given explicitly.
episode_arr = np.array(episode_arr, dtype=object)

In [7]:
# Display the shape of the episode waveform array.
episode_arr.shape

(10,)

In [8]:
# Display the Polly waveforms to eyeball the loaded values.
polly_arr

array([array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -2.6498399e-05,  2.4322753e-05,  3.3664037e-05], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        3.0489087e-05, -2.5245014e-05, -7.3413677e-05], dtype=float32),
       array([ 0.        ,  0.        ,  0.        , ...,  0.00011415,
       -0.00028309, -0.00024886], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        9.1746231e-05,  2.3154249e-05, -6.6433429e-05], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        1.0346445e-04, -2.0771184e-04,  6.7976071e-05], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -1.5773579e-05,  1.3382194e-04, -7.5663374e-05], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -2.3723393e-10,  2.4218669e-10, -2.2643941e-10], dtype=float32),
       array([ 0.0000000e+00,  0.0000000e+00,

In [9]:
import tensorflow as tf
# Pad (or truncate) every Polly waveform to exactly 7000 samples, filling the
# tail with zeros. The row count is inferred with -1 instead of being
# hard-coded, so the cell keeps working when the number of loaded clips changes
# (e.g. when running on the full dataset).
polly_seq = tf.keras.preprocessing.sequence.pad_sequences(
    polly_arr, maxlen=7000, dtype='float32', padding='post', truncating='post',
    value=0.0
).reshape((-1, 7000))

In [10]:
# Display the padded Polly sequences.
polly_seq

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# Persist the padded Polly sequences to disk so later runs can reuse them.
np.save(file='polly_seq.npy', arr=polly_seq)

In [12]:
# Report the total element count and the shape of the padded Polly array.
print(polly_seq.size, polly_seq.shape, sep='\n')

70000
(10, 7000)


In [13]:
# Pad (or truncate) every episode chunk to exactly 15000 samples, filling the
# tail with zeros, and add a trailing feature axis of size 1. The row count is
# inferred with -1 instead of being hard-coded, so the cell keeps working when
# the number of loaded clips changes (e.g. when running on the full dataset).
episode_seq = tf.keras.preprocessing.sequence.pad_sequences(
    episode_arr, maxlen=15000, dtype='float32', padding='post', truncating='post',
    value=0.0
).reshape((-1, 15000, 1))

In [14]:
# Report the total element count and the shape of the padded episode array.
print(episode_seq.size, episode_seq.shape, sep='\n')

150000
(10, 15000, 1)


In [15]:
# Persist the padded episode sequences to disk so later runs can reuse them.
np.save(file='episode_seq.npy', arr=episode_seq)

Now, I will map the episode chunks to the Polly-generated words. In other words, each element of a Polly-generated word array will be paired with a window (a group of consecutive samples) taken from the corresponding episode chunk sequence.

In [16]:
# Take the first episode clip and the first Polly clip as a small working sample.
first_epi_sample = episode_seq[0]
first_polly_sample = polly_seq[0]
print(first_epi_sample.shape, first_polly_sample.shape, sep='\n')

(15000, 1)
(7000,)


In [17]:
def map_epi_to_polly(epi_sample, polly_sample, window):
    """Pair every Polly sample with a sliding window of episode samples.

    The i-th element of ``polly_sample`` is paired with
    ``epi_sample[i:i + window]``. For example, with ``window=3``::

        epi_sample   = [1, 2, 3, 4, 5]
        polly_sample = [6, 7, 8]

        6 --> [1, 2, 3]
        7 --> [2, 3, 4]
        8 --> [3, 4, 5]

    Args:
        epi_sample: Sliceable sequence (list, NumPy array, ...) of episode samples.
        polly_sample: Iterable of Polly samples; one pair is produced per element.
        window: Number of episode samples in each window; must be at least 1.

    Returns:
        A list of ``[window_slice, polly_value]`` pairs in ``polly_sample`` order.

    Raises:
        ValueError: If ``window`` is smaller than 1, or if ``epi_sample`` is too
            short to supply a full window for some element of ``polly_sample``.
    """
    if window < 1:
        raise ValueError(f'window must be at least 1, got {window}')

    mapped_vec = []
    for start, target in enumerate(polly_sample):
        epi_window = epi_sample[start:start + window]
        # Slicing past the end silently yields a shorter window, which would give
        # ragged training data; fail loudly instead.
        if len(epi_window) != window:
            raise ValueError(
                f'epi_sample is too short: the window starting at index {start} '
                f'has {len(epi_window)} samples, expected {window}'
            )
        mapped_vec.append([epi_window, target])
    return mapped_vec


In [18]:
# Build (window, target) pairs from the first sample, then split them into the
# model input X (episode windows) and the target y (Polly samples).
test = map_epi_to_polly(first_epi_sample, first_polly_sample, 3000)
X = np.array([epi_window for epi_window, _ in test]).reshape((7000, 3000, 1))
y = np.array([polly_value for _, polly_value in test])
print(X.shape)
print(y.shape)

(7000, 3000, 1)
(7000,)


In [19]:
# Display the shape of the model input array.
X.shape

(7000, 3000, 1)

In [20]:
# Sanity-check the windowed training data. X holds 3000-sample episode windows
# and y holds one Polly sample per window, so those are the sizes to test for
# (comparing against the un-windowed lengths 15000 / 7000 made every entry False).
X_seq_len = [epi_seq.size == 3000 for epi_seq in X]
y_seq_len = [seq.size == 1 for seq in y]
assert all(X_seq_len), 'every episode window should contain 3000 samples'
assert all(y_seq_len), 'every target should be a single Polly sample'
assert len(X) == len(y), 'X and y must contain the same number of examples'

In [None]:
# making sure that every sequence in polly_seq is the proper length
# pol_seq_len = [len(seq)==7000 for seq in polly_seq]
# assert all(pol_seq_len)

In [None]:
#doing the same test for episode sequences
# epi_seq_len = [len(seqs)==15000 for seqs in episode_seq]
# assert all(epi_seq_len)

In [21]:
#importing libraries to create deep learning model
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [22]:
# Baseline model: one LSTM layer reads a (3000, 1) episode window and a single
# Dense unit predicts the matching Polly sample.
model = keras.Sequential()
epi_input_layer = tf.keras.layers.InputLayer(
    input_shape=(3000,1))
model.add(epi_input_layer)
model.add(layers.LSTM(128,))
# Linear output: the targets are raw waveform samples, which can be negative,
# whereas a ReLU output can only produce values >= 0.
polly_output_layer = tf.keras.layers.Dense(
    1, activation='linear')
model.add(polly_output_layer)

In [23]:
# Print the layer-by-layer summary of the baseline model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 66,689
Trainable params: 66,689
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Configure training: Adam optimizer, mean-squared-error loss.
model.compile(loss='mse', optimizer='adam')


In [25]:
# Display the number of rows (clips) in the padded Polly array.
polly_seq.shape[0]

10

In [26]:
# Train the baseline model on the windowed sample for 10 epochs.
model.fit(X, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fef5842f2e0>

For the first model trained above, the loss only went down between the first and second epochs; after that it stayed the same. I may need to run more epochs or change the model architecture.

In [None]:
# try to run more epochs 
# maybe use more data because the model is not doing well right now
# more research to find ways to improve model 

At the moment, the model is not performing well: its predictions are not accurate and the loss stays the same across epochs. I will do more research on how to improve it.

In [None]:
# epi_train, epi_test, pol_train, pol_test = train_test_split(episode_seq, polly_seq, test_size=0.3)

In [None]:
# A different architecture: the same LSTM encoder over a (3000, 1) episode
# window, followed by a 32-unit hidden Dense layer before the output.
model_2 = keras.Sequential()
epi_input_layer = tf.keras.layers.InputLayer(
    input_shape=(3000,1))
model_2.add(epi_input_layer)
model_2.add(layers.LSTM(128,))
model_2.add(layers.Dense(32, activation='relu'))
# Linear output: the targets are raw waveform samples, which can be negative,
# whereas a ReLU output can only produce values >= 0.
polly_output_layer = tf.keras.layers.Dense(
    1, activation='linear')
model_2.add(polly_output_layer)

In [None]:
# This is a regression on continuous waveform values, so track mean absolute
# error; classification accuracy is not meaningful for such targets.
model_2.compile(
    optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Train the second architecture on the same windowed sample for 5 epochs.
model_2.fit(X, y, epochs=5)

In [None]:
# Using MFCCs to build an RNN
# Collect the MFCC matrix of every Polly clip instead of overwriting a single
# variable on each iteration. The audio is passed as the keyword argument `y`,
# which newer librosa releases require.
polly_mfccs = []
for array in polly_arr[:10]:
    polly_mfcc = librosa.feature.mfcc(y=array, sr=sr_pol)
    polly_mfccs.append(polly_mfcc)

In [None]:
# Collect the MFCC matrix of every episode chunk instead of overwriting a single
# variable on each iteration. The audio is passed as the keyword argument `y`,
# which newer librosa releases require.
episode_mfccs = []
for epi_splice in episode_arr[:10]:
    episode_mfcc = librosa.feature.mfcc(y=epi_splice, sr=sr_episode)
    episode_mfccs.append(episode_mfcc)

In [None]:
# Sequence-level model: an LSTM reads a whole padded episode chunk of shape
# (15000, 1) and a Dense layer emits all 7000 Polly samples at once.
model_2 = keras.Sequential()
epi_input = keras.layers.InputLayer(input_shape=(15000, 1))
model_2.add(epi_input)
model_2.add(layers.LSTM(128))
# Linear output: the targets are raw waveform samples, which can be negative,
# whereas a ReLU output can only produce values >= 0.
polly_output = tf.keras.layers.Dense(
    7000, activation='linear')
model_2.add(polly_output)

In [None]:
# Print the layer-by-layer summary of model_2.
model_2.summary()

In [None]:
# This is a regression on continuous waveform values, so track mean absolute
# error (passed as a list); classification accuracy is not meaningful for such
# targets.
model_2.compile(
    optimizer='adam', loss='mse', metrics=['mae'])


In [None]:
# Train on the padded episode chunks against the padded Polly clips for 100 epochs.
model_2.fit(episode_seq, polly_seq, epochs=100)