In [9]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
!pip install librosa



In [0]:
import librosa
import pickle

In [0]:
import numpy as np
import tensorflow as tf

# Read and transform data

Convert raw byte stream into useful feature space. STFT on the signals serves the initial feature extraction process

In [0]:
base_path = './drive/My Drive/Colab Notebooks/hw4/speaker-verification/'
train_path = base_path + 'hw4_trs.pkl'
test_path = base_path + 'hw4_tes.pkl'

In [0]:
with open(train_path, 'rb') as f:
  train_raw = pickle.load(f)

with open(test_path, 'rb') as f:
  test_raw = pickle.load(f)

trs.pkl contains an 500×16,180 matrix, whose row is a speech signal with 16,180 samples. They are
the returned vectors from the librosa.load function. Similarly, tes.pkl holds a 200×22,631 matrix

In [15]:
train_raw.shape, test_raw.shape

((500, 16180), (200, 22631))

## Convert into audio (.wav file) for checking manually

In [0]:
# for i in range(train_raw.shape[0]):
#   librosa.output.write_wav(base_path + 'trs/trs' + ('0000' + str(i))[-4:] + '.wav', train_raw[i], 16000)

In [0]:
# for i in range(test_raw.shape[0]):
#   librosa.output.write_wav(base_path + 'tes/tes' + ('0000' + str(i))[-4:] + '.wav', test_raw[i], 16000)

## Extract features using STFT

In [0]:
train = np.array([librosa.stft(x, n_fft=1024, hop_length=512).T for x in train_raw])
test = np.array([librosa.stft(x, n_fft=1024, hop_length=512).T for x in test_raw])

In [19]:
train.shape, test.shape

((500, 32, 513), (200, 45, 513))

# Generate mini-batches

## Structure of training data

The training matrix is ordered by speakers. Each speaker has 10 utterances, and there are 50 such
speakers (that’s why there are 500 rows). Similarly, the test set has 20 speakers, each of which is with
10 utterances.

## Procedure to generate mini-batches

1. Randomly sample L pairs of utterances from the ten utterance of the first speaker. In theory, there are $10 \choose 2$= 45 pairs you can sample from. You can use all 45 of them if you want. These are the positive examples in your first minibatch

2. Randomly sample L utterances from the 49 training speakers. Using them and the ten utterances of
the first speaker, form another set of L pairs. If L > 10, you’ll need to repeatedly use the first speaker’s
utterance (i.e. sampling with replacement). This set is your negative examples, each of whose pair
contains an utterance from the first speaker and a random utterance spoken by a different speaker.

3. In this first minibatch, you have 2L pairs of utterances.

4. Repeat this process for the other training speakers, so that each speaker is represented by L positive
pairs and L negative pairs. By doing so, you can form 50 minibatches with a balanced number of
positive and negative pair

In [0]:
L = 10

In [0]:
'''
batch_size: return batch_size number of positive and another batch_size number of negative pairs
stick: if true, one training class will be common across positive and negative pairs

returns:
x: list of pairs
y: 1 for positive pairs, 0 for negative
'''
def next_batch(batch_size, stick=False):
  x1, x2, y = [], [], []
  
  base = np.random.randint(len(train)//10)
  
  # generate positive pairs
  for _ in range(batch_size):
    # randomly select idx_0 only if stick is false
    if not stick:
      base = np.random.randint(len(train)//10)
      
    idx_0, idx_1 = base * 10 + np.random.choice(np.arange(10), size=2, replace=False)
    x1.append(train[idx_0])
    x1.append(train[idx_1])
    y.append(1)
    print(idx_0, idx_1, 1)
    
  # generate negative pairs
  for _ in range(batch_size):
    # randomly select idx_0 only if stick is false
    if not stick:
      base = np.random.randint(len(train)//10)
    
    # make sure neg_base is not same as base
    while True:
      neg_base = np.random.randint(len(train)//10)
      if neg_base != base:
        break
      
    idx_0 = base * 10 + np.random.randint(10)
    idx_1 = neg_base * 10 + np.random.randint(10)
    
    x2.append(train[idx_0])
    x2.append(train[idx_1])
    y.append(0)
    print(idx_0, idx_1, 0)
    
  return np.array(x1), np.array(x2), np.array(y)

# Create model

In [0]:
num_samples_tr = len(train)
batch_size = 20
num_features = 513
num_hidden = 256

learning_rate = 0.001
num_epochs = 100

In [0]:
# Using this implementation as reference https://github.com/ardiya/siamesenetwork-tensorflow/blob/master/train.py

def model(inp, reuse):
  with tf.name_scope('model'):
    with tf.variable_scope('lstm_1') as scope:
      conv_1 = tf.contrib.layers.conv2d()
      
      
      conv_1 = tf.layers.Conv2D(filters=16, kernel_size=(1, 3), strides=(1, 1), padding='same', data_format='channels_last', activation=tf.nn.relu, reuse=reuse)(X)
      pool_1 = tf.layers.MaxPooling2D(pool_size=(1, 2), strides=(1, 2))(conv_1)
      flattened = tf.layers.Flatten(reuse=reuse)(pool_1)
      dropout = tf.layers.dropout(rate=0.1, inputs=flattened)
      dense_1 = tf.layers.Dense(units=513, activation=tf.nn.relu)(dropout)
      output = tf.reshape(dense_1, shape=[-1, 1, 513, 1])
  return dense_1

In [0]:
X1 = tf.placeholder(dtype='float', shape=[None, 1, 513, 1])
X2 = tf.placeholder(dtype='float', shape=[None, 1, 513, 1])

In [29]:
model1 = model(X1, False)

TypeError: ignored