In [1]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import gc
import os
import psutil

In [2]:
# Folder that holds the free-spoken-digit-dataset recordings.
filesurl = "/home/jovyan/work/free-spoken-digit-dataset/recordings/"
# Recording used by the exploratory plots below.
# ("9_yweweler_4.wav" was assigned first and immediately replaced by this one.)
filename = filesurl + "7_theo_11.wav"

## Plot wav audio - time and amplitude 

In [None]:
# Load the sample recording selected above; the two returned values are kept
# as the waveform `x` and the sample rate `sr`.
x, sr = librosa.load(filename)
plt.figure(figsize=(14, 5))
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow` —
# confirm against the installed librosa version before upgrading.
librosa.display.waveplot(x, sr=sr)

## Plot spectrogram audio - time, frequency and amplitude 

In [None]:
# Short-time Fourier transform of the sample, converted to decibels.
X = librosa.stft(x, win_length=256, hop_length=16)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
# specshow derives the time axis from hop_length (default 512). The STFT above
# uses a hop of 16 samples, so the same value must be passed here or the time
# ticks are stretched by a factor of 32.
librosa.display.specshow(Xdb, sr=sr, hop_length=16, x_axis='time', y_axis='hz')
plt.colorbar()

### Plot spectrogram audio - Y frequency axis as log scale

In [None]:
plt.figure(figsize=(14, 5))
# Only the first 800 rows of the spectrogram are shown, on a log frequency axis.
# hop_length must match the STFT hop (16); otherwise specshow falls back to its
# default of 512 and the time axis is wrong.
# NOTE(review): the frequency tick labels are computed by specshow from the
# number of rows it receives; with a truncated matrix they may not be exact —
# confirm if precise frequencies matter.
librosa.display.specshow(Xdb[:800], sr=sr, hop_length=16, x_axis='time', y_axis='log')
plt.colorbar()

In [None]:
# Crop to 800 rows x 513 columns and add a trailing singleton channel axis.
Xdb_ = Xdb[:800, :513][:, :, np.newaxis]
# Repeat the single channel three times and peek at the top-left 2x2 corner.
np.tile(Xdb_, (1, 1, 3))[:2, :2, :]

## Create tensor of audio recordings

### For all users saying "seven", 50 samples each

In [3]:
users = ["jackson", "nicolas", "theo", "yweweler"]
# Mapping of speaker name -> the 50 file names of that speaker saying "7",
# e.g. "7_theo_0.wav" ... "7_theo_49.wav".
filenames = {
    name: ["7_{}_{}.wav".format(name, take) for take in range(50)]
    for name in users
}

In [4]:
def file_to_specgram(filename):
    """Load one recording and return its dB-scaled STFT.

    Parameters
    ----------
    filename : str
        File name relative to the module-level ``filesurl`` folder.

    Returns
    -------
    tuple
        ``(Xdb, sr)``: the spectrogram in decibels and the sample rate
        reported by ``librosa.load`` — in that order.
    """
    samples, rate = librosa.load(filesurl + filename)
    # Short windows (256 samples) with a 16-sample hop.
    magnitude = abs(librosa.stft(samples, win_length=256, hop_length=16))
    return librosa.amplitude_to_db(magnitude), rate

# file_to_specgram returns (Xdb, sr). The sample rate is the SECOND item, so
# index it explicitly; unpacking as `sr, _` would bind sr to the spectrogram.
sr = file_to_specgram(filenames["theo"][0])[1]


In [5]:
# Compute spectrograms with a fixed number of columns (used for
# jackson_specgrams and the other speakers): shorter ones are padded,
# longer ones are cut.
mxT = 513
def file_to_fixed_size_specgram(filename):
    """Return the dB spectrogram of `filename` with exactly `mxT` columns.

    Spectrograms with more than `mxT` columns are truncated; all others are
    right-padded along axis 1 with constant zeros. The sample rate returned
    by `file_to_specgram` is discarded.
    """
    spec, _rate = file_to_specgram(filename)
    n_cols = spec.shape[1]
    if n_cols > mxT:
        return spec[:, 0:mxT]
    return np.pad(spec, ((0, 0), (0, mxT - n_cols)), "constant")

def _speaker_specgram_tensor(speaker, n_bins=800, n_channels=3):
    """Build the 4-D spectrogram tensor for one speaker.

    Every file listed in ``filenames[speaker]`` is converted with
    ``file_to_fixed_size_specgram``; the stack is transposed so the padded
    (fixed-size) axis comes first, the other axis is cropped to its first
    ``n_bins`` entries, and the result is repeated ``n_channels`` times along
    a new trailing axis.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_files, mxT, n_bins, n_channels)``, provided each
        spectrogram has at least ``n_bins`` rows.
    """
    stacked = np.array([file_to_fixed_size_specgram(name)
                        for name in filenames[speaker]])
    cropped = np.transpose(stacked, (0, 2, 1))[:, :, :n_bins]
    return np.tile(np.expand_dims(cropped, axis=3), (1, 1, 1, n_channels))


# One tensor per speaker; identical processing for all four.
jackson_specgrams = _speaker_specgram_tensor("jackson")
nicolas_specgrams = _speaker_specgram_tensor("nicolas")
theo_specgrams = _speaker_specgram_tensor("theo")
yweweler_specgrams = _speaker_specgram_tensor("yweweler")

In [None]:
plt.figure(figsize=(14, 5))
# Show one channel of theo's 12th recording, transposed back so rows are the
# frequency axis for specshow. hop_length must match the hop used by the STFT
# (16); specshow's default of 512 would mis-scale the time axis. `sr` must be
# the numeric sample rate.
librosa.display.specshow(theo_specgrams[11, :, :, 1].T, sr=sr, hop_length=16,
                         x_axis='time', y_axis='log')
plt.colorbar()

## Transfer learning

### Load Keras weights

In [6]:
import keras
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras import optimizers
from keras.layers.advanced_activations import ELU, PReLU, LeakyReLU
from keras.layers import Dense, Dropout, Activation, Flatten

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
# Small CNN over (513, 800, 3) spectrogram tensors. The layers are created in
# the same order they are added, so Keras' auto-generated layer names
# (conv2d_1, flatten_1, ...) are unchanged.
cnn_layers = [
    # Convolution block 1: two 3x3 convs with 8 filters, pooling, dropout.
    Conv2D(8, (3, 3), padding='same', input_shape=(513, 800, 3)),
    Activation('relu'),
    Conv2D(8, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Convolution block 2: two 3x3 convs with 16 filters, pooling
    # (the Dropout(0.25) that used to follow is disabled).
    Conv2D(16, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(16, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # Head: flatten, ReLU, then a 57-unit softmax layer
    # (an intermediate Dense(10) and Dropout(0.5) are disabled).
    Flatten(),
    Activation('relu'),
    Dense(57),
    Activation('softmax'),
]

model = Sequential()
for cnn_layer in cnn_layers:
    model.add(cnn_layer)

model.summary()

# RMSprop optimizer with a small learning rate and decay.
opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)

# Compile for multi-class classification, tracking accuracy.
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 513, 800, 8)       224       
_________________________________________________________________
activation_1 (Activation)    (None, 513, 800, 8)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 511, 798, 8)       584       
_________________________________________________________________
activation_2 (Activation)    (None, 511, 798, 8)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 255, 399, 8)       0         
_________________________________________________________________

In [8]:
# Project root; the pre-trained weights live under neural_networks/.
url_proj = "/home/jovyan/work/VCS/voice-classification/"
weights_path = "{}{}".format(url_proj, 'neural_networks/my_model_weights.h5')
model.load_weights(weights_path)

In [9]:
#from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np
# Feature extractor: same input as `model`, output taken at the Flatten layer.
# The layer is looked up by type, not by its auto-generated name
# ('flatten_1'), because Keras increments that suffix every time the model
# cell is re-run in the same kernel.
flatten_layer = next(layer for layer in model.layers if isinstance(layer, Flatten))
model2 = Model(inputs=model.input, outputs=flatten_layer.output)

### Create training and testing dataset

In [10]:
def permutate_dataset(*args, max_pairs=100):
    """Concatenate rows of two arrays pairwise, in Cartesian-product order.

    Parameters
    ----------
    *args
        Only ``args[0]`` and ``args[1]`` are used; both must expose ``.shape``
        and support integer row indexing.
    max_pairs : int or None, optional
        Maximum number of (row of args[0], row of args[1]) pairs to produce.
        Defaults to 100; ``None`` yields every pair.

    Returns
    -------
    numpy.ndarray
        One row per pair, each being ``args[0][i]`` followed by ``args[1][j]``
        (concatenated along axis 0). Pairs are ordered with ``i`` as the slow
        index and ``j`` as the fast one.
    """
    first, second = args[0], args[1]
    # islice stops after max_pairs pairs, so the full product is never built.
    pair_indices = itertools.islice(
        itertools.product(range(first.shape[0]), range(second.shape[0])),
        max_pairs,
    )
    return np.array([
        np.concatenate((first[i], second[j]), axis=0)
        for i, j in pair_indices
    ])

In [21]:
# Handle on the current kernel process, used below to report memory usage.
process = psutil.Process(pid=os.getpid())

def apply_model(model, data, step=10):
    """Run ``model.predict`` over ``data`` in batches of ``step`` samples.

    Parameters
    ----------
    model
        Object exposing ``predict(batch)``; the model passed in is the one
        used (no global model is consulted).
    data
        Sliceable, sized collection of samples (e.g. a numpy array).
    step : int, optional
        Batch size per ``predict`` call. Defaults to 10.

    Returns
    -------
    numpy.ndarray
        Predictions for every sample, concatenated along axis 0. A trailing
        batch shorter than ``step`` is included.

    Raises
    ------
    ValueError
        If ``step`` is not positive or ``data`` is empty.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError("data must contain at least one sample")
    # Collect the batches and join them once at the end.
    batches = [model.predict(data[start:start + step])
               for start in range(0, n_samples, step)]
    return np.concatenate(batches, axis=0)
        
def stage1_dataset_y1(*args, testing=False):
    """Build the pairwise (features, label) dataset from per-speaker tensors.

    Each positional argument is the input tensor of one speaker. Features are
    extracted once per speaker with ``apply_model(model2, ...)`` and combined
    with ``permutate_dataset``:

    * same voice: a speaker's features paired with themselves (second operand
      shifted by one recording) -> label 1;
    * different voices: features of speaker ``i`` paired with speaker ``j``
      for every ``i < j`` -> label 0.

    Memory usage is printed after every step, as a percentage of a hard-coded
    total.

    Parameters
    ----------
    *args
        One array of model inputs per speaker.
    testing : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xs, ys)``: pair features (same-voice blocks first, then
        different-voice blocks in ``(i, j)`` order) and their labels.

    Raises
    ------
    ValueError
        If no speaker tensor is given.
    """
    if not args:
        raise ValueError("at least one speaker tensor is required")

    k = len(args)
    # NOTE(review): presumably the machine's total memory (4042484 kB) — confirm.
    total_memory_bytes = 4042484 * 1024

    def memory_percent():
        return process.memory_info().rss / total_memory_bytes * 100

    features = []
    xs_chunks = []
    ys_chunks = []

    # dataset for same voice
    for i in range(k):
        speaker_features = apply_model(model2, args[i])
        # Kept so the different-voice loop does not re-run the network.
        features.append(speaker_features)
        pairs = permutate_dataset(speaker_features, speaker_features[1:])
        xs_chunks.append(pairs)
        ys_chunks.append(np.ones(pairs.shape[0]))
        del speaker_features, pairs
        gc.collect()
        print("first loop with i={}, memory usage {}%"\
              .format(i, memory_percent()))

    # dataset for different voices
    for i in range(k - 1):
        for j in range(i + 1, k):
            pairs = permutate_dataset(features[i], features[j])
            xs_chunks.append(pairs)
            # Different speakers are the negative class: label 0, not 1.
            ys_chunks.append(np.zeros(pairs.shape[0]))
            del pairs
            gc.collect()
            print("second loop with (i,j)=({},{}) , memory usage {}%"\
              .format(i, j, memory_percent()))

    del features
    # Join once at the end; avoids re-copying the growing matrix per step.
    return np.concatenate(xs_chunks, axis=0), np.concatenate(ys_chunks)

In [14]:
# Back-of-the-envelope size check for the stage-1 dataset (without applying
# the model). NOTE(review): presumably ~4*10**5 values x 600 rows x 2 halves
# x 8 bytes, expressed in GB — confirm.
4e5 * 600 * 2 * 8 / 1e9

3.84

In [22]:
# Build the pair dataset from the first 20 recordings of three speakers.
xs, ys = stage1_dataset_y1(
    jackson_specgrams[:20],
    nicolas_specgrams[:20],
    theo_specgrams[:20],
)
# xs2, ys2=stage1_dataset_y1(theo_specgrams[0:20], yweweler_specgrams[:20])


first loop with i=0, memory usage 29.50611554677767%
first loop with i=1, memory usage 37.30330163335216%
first loop with i=2, memory usage 45.01529257753401%
second loop with (i,j)=(0,1) , memory usage 49.723981591516505%
second loop with (i,j)=(0,2) , memory usage 57.43607148476036%
second loop with (i,j)=(1,2) , memory usage 65.02684982797705%


In [25]:
# Only the last expression of a cell is displayed, so the output shown for
# this cell is xs.shape; ys.shape is evaluated but not shown.
ys.shape
xs.shape

(600, 798336)

In [24]:
xs.shape  # not displayed: it is not the last expression of the cell
# Drop the raw spectrogram tensors of three speakers now that xs/ys exist
# (yweweler_specgrams is not deleted here).
del jackson_specgrams, nicolas_specgrams, theo_specgrams

In [None]:
# x = np.append(jackson_specgrams, nicolas_specgrams, axis=0)
# del jackson_specgrams, nicolas_specgrams
# x = np.append(x, theo_specgrams, axis=0)
# del theo_specgrams
# x = np.append(x, yweweler_specgrams, axis=0)
# del yweweler_specgrams
# y = np.append(np.ones(50), np.zeros(150))

In [None]:
import random

# NOTE(review): `x` and `y` are only assigned in commented-out code, so this
# cell fails on a fresh kernel unless they are defined first.

# Hold out 10 randomly chosen samples for testing; everything else is training.
test_indices = random.sample(range(len(x)), 10)
held_out = set(test_indices)
train_indices = [idx for idx in range(len(x)) if idx not in held_out]

# Test rows follow the sampled order; training rows keep their original order.
x_test = [x[idx] for idx in test_indices]
y_test = [y[idx] for idx in test_indices]
x_new_train = [x[idx] for idx in train_indices]
y_new_train = [y[idx] for idx in train_indices]
del x, y

x_new_train = np.array(x_new_train)
y_new_train = np.array(y_new_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

### Create dataset for SVM based on NN features

In [None]:
# Run every training sample through the feature extractor one at a time
# (each sample gets a leading batch axis of size 1); labels are copied as-is.
# (preprocess_input is intentionally not applied.)
svm_x2_train = [model2.predict(np.expand_dims(sample, axis=0))
                for sample in x_new_train]
svm_y2_train = [y_new_train[k] for k in range(len(x_new_train))]
del x_new_train, y_new_train

In [None]:
# Same feature extraction for the held-out samples: one predict call per
# sample with a leading batch axis of size 1; labels are copied as-is.
# (preprocess_input is intentionally not applied.)
svm_x2_test = [model2.predict(np.expand_dims(sample, axis=0))
               for sample in x_test]
svm_y2_test = [y_test[k] for k in range(len(x_test))]
del x_test, y_test

In [None]:
# reshape svm features and labels
svm_x2_train = np.array(svm_x2_train)
svm_x2_train=svm_x2_train.reshape((svm_x2_train.shape[0], -1))
svm_y2_train = np.array(svm_y2_train)
svm_y2_train=svm_y2_train.reshape((svm_y2_train.shape[0], -1))

svm_x2_test = np.array(svm_x2_test)
svm_x2_test=svm_x2_test.reshape((svm_x2_test.shape[0], -1))
svm_y2_test = np.array(svm_y2_test)
svm_y2_test=svm_y2_test.reshape((svm_y2_test.shape[0], -1))

In [None]:
from sklearn import svm

# RBF-kernel SVM with class weights balanced by class frequency,
# trained on the network features.
clf = svm.SVC(class_weight='balanced', kernel='rbf')
clf.fit(svm_x2_train, svm_y2_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the SVM on the held-out features.
svm_y2_pred = clf.predict(svm_x2_test)
accuracy_score(svm_y2_test, svm_y2_pred)

### Next
- Consider making the test classes uniformly distributed (0.5 | 0.5)
 - in case I do not change the modeling approach
- Use permutations of same-user recordings and different-user recordings
 - that is another approach