In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install librosa



In [0]:
import librosa
import pickle
import math

In [0]:
import numpy as np
import tensorflow as tf

# Read and transform data

Convert raw byte stream into useful feature space. STFT on the signals serves the initial feature extraction process

In [0]:
base_path = './drive/My Drive/Colab Notebooks/hw4/speaker-verification/'
train_path = base_path + 'hw4_trs.pkl'
test_path = base_path + 'hw4_tes.pkl'

base_path_model = base_path + 'model/'

In [0]:
with open(train_path, 'rb') as f:
  train_raw = pickle.load(f)

with open(test_path, 'rb') as f:
  test_raw = pickle.load(f)

trs.pkl contains an 500×16,180 matrix, whose row is a speech signal with 16,180 samples. They are
the returned vectors from the librosa.load function. Similarly, tes.pkl holds a 200×22,631 matrix

In [7]:
train_raw.shape, test_raw.shape

((500, 16180), (200, 22631))

## Convert into audio (.wav file) for checking manually

In [0]:
# for i in range(train_raw.shape[0]):
#   librosa.output.write_wav(base_path + 'trs/trs' + ('0000' + str(i))[-4:] + '.wav', train_raw[i], 16000)

In [0]:
# for i in range(test_raw.shape[0]):
#   librosa.output.write_wav(base_path + 'tes/tes' + ('0000' + str(i))[-4:] + '.wav', test_raw[i], 16000)

## Extract features using STFT

In [0]:
train_complex = np.array([librosa.stft(x, n_fft=1024, hop_length=512).T for x in train_raw])
test_complex = np.array([librosa.stft(x, n_fft=1024, hop_length=512).T for x in test_raw])

In [0]:
train = np.abs(train_complex)
test = np.abs(test_complex)

In [0]:
# a = [[1, 2], [3, 4]]
# np.pad()

In [13]:
train.shape, test.shape

((500, 32, 513), (200, 45, 513))

# Generate mini-batches

## Structure of training data

The training matrix is ordered by speakers. Each speaker has 10 utterances, and there are 50 such
speakers (that’s why there are 500 rows). Similarly, the test set has 20 speakers, each of which is with
10 utterances.

In [0]:
samples_per_class = 10

## Procedure to generate mini-batches

1. Randomly sample L pairs of utterances from the ten utterance of the first speaker. In theory, there are $10 \choose 2$= 45 pairs you can sample from. You can use all 45 of them if you want. These are the positive examples in your first minibatch

2. Randomly sample L utterances from the 49 training speakers. Using them and the ten utterances of
the first speaker, form another set of L pairs. If L > 10, you’ll need to repeatedly use the first speaker’s
utterance (i.e. sampling with replacement). This set is your negative examples, each of whose pair
contains an utterance from the first speaker and a random utterance spoken by a different speaker.

3. In this first minibatch, you have 2L pairs of utterances.

4. Repeat this process for the other training speakers, so that each speaker is represented by L positive
pairs and L negative pairs. By doing so, you can form 50 minibatches with a balanced number of
positive and negative pair

In [0]:
max_length = 50
num_features = 513

In [0]:
def pad_zeros(stft):
  stft_val = np.zeros((max_length, num_features))
  stft_val[:stft.shape[0], :stft.shape[1]] = stft
  return stft_val

In [0]:
'''
data: train/test
batch_size: return batch_size number of positive and another batch_size number of negative pairs
stick: if true, one training class will be common across positive and negative pairs

returns:
x: list of pairs
y: 1 for positive pairs, 0 for negative
'''
def next_batch_triplet(data, batch_size, stick=False):
  xa, xp, xn, l = [], [], [], []
  
  num_classes = len(data) // samples_per_class
  base = np.random.randint(num_classes)
  
  # generate positive pairs
  for _ in range(batch_size):
    # randomly select idx_0 only if stick is false
    if not stick:
      base = np.random.randint(num_classes)
      
    # make sure neg_base is not same as base
    while True:
      neg_base = np.random.randint(num_classes)
      if neg_base != base:
        break
      
    idx_a, idx_p = base * samples_per_class + np.random.choice(np.arange(samples_per_class), size=2, replace=False)
    idx_n = neg_base * samples_per_class + np.random.randint(samples_per_class)
    
#     xa.append(pad_zeros(data[idx_a]))
#     xp.append(pad_zeros(data[idx_p]))
#     xn.append(pad_zeros(data[idx_n]))
    
    xa.append(data[idx_a])
    xp.append(data[idx_p])
    xn.append(data[idx_n])
    l.append(data[idx_a].shape[0])
    
  return np.array(xa), np.array(xp), np.array(xn), np.array(l)

In [0]:
'''
data: train/test
batch_size: return batch_size number of positive and another batch_size number of negative pairs
stick: if true, one training class will be common across positive and negative pairs

returns:
x: list of pairs
y: 1 for positive pairs, 0 for negative
'''
def next_batch(data, batch_size, stick=False):
  x1, x2, y, l1, l2 = [], [], [], [], []
  
  num_classes = len(data) // samples_per_class
  base = np.random.randint(num_classes)
  
  # generate positive pairs
  for _ in range(batch_size):
    # randomly select idx_0 only if stick is false
    if not stick:
      base = np.random.randint(num_classes)
      
    idx_0, idx_1 = base * samples_per_class + np.random.choice(np.arange(samples_per_class), size=2, replace=False)
    x1.append(pad_zeros(data[idx_0]))
    x2.append(pad_zeros(data[idx_1]))

#     x1.append(data[idx_0])
#     x2.append(data[idx_1])

    l1.append(data[idx_0].shape[0])
    l2.append(data[idx_1].shape[0])
    y.append([1])
#     print(idx_0, idx_1, 1)
    
  # generate negative pairs
  for _ in range(batch_size):
    # randomly select idx_0 only if stick is false
    if not stick:
      base = np.random.randint(num_classes)
    
    # make sure neg_base is not same as base
    while True:
      neg_base = np.random.randint(num_classes)
      if neg_base != base:
        break
      
    idx_0 = base * samples_per_class + np.random.randint(samples_per_class)
    idx_1 = neg_base * samples_per_class + np.random.randint(samples_per_class)
    
    x1.append(pad_zeros(train[idx_0]))
    x2.append(pad_zeros(train[idx_1]))

#     x1.append(train[idx_0])
#     x2.append(train[idx_1])
    
    l1.append(train[idx_0].shape[0])
    l2.append(train[idx_1].shape[0])
    y.append([0])
#     print(idx_0, idx_1, 0)
    
  return np.array(x1), np.array(x2), np.array(y), np.array(l1), np.array(l2)

# Create model

In [0]:
num_hidden_rnn = [500, 400]
learning_rate = 0.001

In [0]:
# Using this implementation as reference https://github.com/ardiya/siamesenetwork-tensorflow/blob/master/train.py

def model(inp, seq_len):
  cells = []
  
  for i, units in enumerate(num_hidden_rnn):
    cell_name = 'lstm_' + str(i)
    cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(units, reuse=tf.AUTO_REUSE, name=cell_name, initializer=tf.initializers.he_normal(), activation='relu'))
    cells.append(cell)
  
  multi_rnn = tf.nn.rnn_cell.MultiRNNCell(cells)
  output, state = tf.nn.dynamic_rnn(multi_rnn, inp, dtype=tf.float32, sequence_length=seq_len)
  print(output)
      
  flattened_1 = tf.contrib.layers.flatten(output)
  dense_1 = tf.layers.dense(inputs=flattened_1, activation='relu', name='dense_1', reuse=tf.AUTO_REUSE, units=200)
  dense_2 = tf.layers.dense(inputs=dense_1, activation='relu', name='dense_2', reuse=tf.AUTO_REUSE, units=50)
    
  return dense_2

## Loss function

In [0]:
def contrastive_loss(model1, model2, y, margin):
  # y can take values in {0, 1}

  dotProduct = tf.reduce_sum(tf.multiply(model1, model2), axis = 1, keepdims=True)
  sigmDotProduct = tf.nn.sigmoid(dotProduct)
  lossCalcu = tf.reduce_sum(-y * tf.log(10e-8 + sigmDotProduct) - (1 - y) * tf.log(10e-8 + 1 - sigmDotProduct))

  return lossCalcu

In [0]:
# X_anchor = tf.placeholder(dtype=tf.float32, shape=[None, max_length, num_features])
# X_positive = tf.placeholder(dtype=tf.float32, shape=[None, max_length, num_features])
# X_negative = tf.placeholder(dtype=tf.float32, shape=[None, max_length, num_features])
# margin = 10

In [0]:
# def triplet_loss(anchor, positive, negative, margin):
#   distance_ap = tf.sqrt(tf.reduce_sum(tf.pow(anchor - positive, 2), 1, keepdims=True))
#   distance_an = tf.sqrt(tf.reduce_sum(tf.pow(anchor - negative, 2), 1, keepdims=True))
#   return tf.maximum(display_ap - distance_an + margin, 0)

In [0]:
# output_anchor = model(X_anchor, seq_len)
# output_positive = model(X_positive, seq_len)
# output_negative = model(X_negative, seq_len)

In [0]:
def similar(model1, model2, y):
  distance = tf.sqrt(tf.reduce_sum(tf.pow(model1 - model2, 2), 1, keepdims=True))
  prob = 1 - tf.nn.sigmoid(distance)
  prediction = tf.cast(tf.greater_equal(prob, 0.5), tf.float32)
  accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, y), tf.float32))
  return prob, prediction, accuracy

## Inputs to the model

In [0]:
X1 = tf.placeholder(dtype=tf.float32, shape=[None, max_length, num_features])
X2 = tf.placeholder(dtype=tf.float32, shape=[None, max_length, num_features])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])
seq_len = tf.placeholder(dtype=tf.int32, shape=None)
margin = 0.5

## Plug everything together

In [27]:
model1 = model(X1, seq_len)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Tensor("rnn/transpose_1:0", shape=(?, 50, 400), dtype=float32)

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.


In [28]:
model2 = model(X2, seq_len)

Tensor("rnn_1/transpose_1:0", shape=(?, 50, 400), dtype=float32)


In [0]:
loss = contrastive_loss(model1, model2, y, margin)
# loss = triplet_loss(anchor, positive, negative)
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=loss)

## Train model

In [0]:
batch_size = 100
num_epochs = 50
display_step = 1
save_step = 10

num_samples_tr = train.shape[0]
num_samples_te = test.shape[0]
num_batches_tr = int(math.ceil(num_samples_tr/batch_size))
num_batches_te = int(math.ceil(num_samples_te/batch_size))

train_loss = []
test_loss = []

In [0]:
sess = tf.Session()
saver = tf.train.Saver()

In [32]:
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(base_path + 'graphs', sess.graph)
writer.close()

num_batches_tr = num_samples_tr // batch_size
for epoch in range(num_epochs):
  loss_val = 0
  for i in range(num_batches_tr):
    batch_x1, batch_x2, batch_y, l1, l2 = next_batch(train, batch_size)
    
    _, lv = sess.run([train_op, loss], feed_dict={X1: batch_x1, X2: batch_x2, y: batch_y, seq_len: l1})
    loss_val += lv
  
  loss_val = loss_val / (num_batches_tr * batch_size)
  
  if epoch % display_step == 0:
    print(epoch, loss_val)
  if epoch % save_step == 0:
    saver.save(sess, base_path_model + 'model_' + str(epoch) + '.ckpt')

0 6.889409545898437
1 1.4176284790039062
2 1.3911478881835937
3 1.3837318420410156
4 1.386967041015625
5 1.386114288330078
6 1.3862964172363281
7 1.3861468811035156
8 1.3866118469238282
9 1.3856036376953125
10 1.3851288452148438
11 1.3726382751464843
12 1.3756825866699218
13 1.3854981384277343
14 1.3862897644042969
15 1.3862928161621093
16 1.3862939453125
17 1.3862934875488282
18 1.3862942199707031
19 1.386294952392578
20 1.3862939453125
21 1.3862932739257812
22 1.3862939453125
23 1.386293975830078
24 1.386293975830078
25 1.3862950439453126
26 1.386293975830078
27 1.3862939453125
28 1.3862939453125
29 1.3862915954589843
30 1.3862939453125
31 1.3862939453125
32 1.3862940673828126
33 1.3862939453125
34 1.3862939453125
35 1.3862939453125
36 1.3862942810058594
37 1.3862939453125
38 1.3862903442382812
39 1.38628759765625
40 1.3862939453125
41 1.3862935180664062
42 1.3862952575683594
43 1.3862952575683594
44 1.3862958374023437
45 1.3862874450683593
46 1.3863110961914062
47 1.3862925720214845

## Test model

In [33]:
batch_x1, batch_x2, batch_y = next_batch(test, 10)

ValueError: ignored

In [0]:
batch_x1.shape

In [0]:
prob, prediction, accuracy = similar(model1, model2, y)

In [0]:
pb, pd, ac = sess.run([prob, prediction, accuracy], feed_dict={X1: batch_x1, X2: batch_x2, y: batch_y})

In [0]:
pb

In [0]:
pd, batch_y

In [0]:
ac