# Data Preparation

In [1]:
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.style as ms
ms.use('seaborn-muted')
%matplotlib inline

import librosa
import librosa.display
import IPython.display

import os
import sys
import logging

In [2]:
# Notebook-wide logger shared by every class and cell below.
logger = logging.getLogger(__name__)
# Ask the root logger to emit everything from DEBUG upwards.
logging.basicConfig(level=logging.DEBUG)

In [3]:
class FeatureExtraction:
    """Load one audio file and compute the spectral features derived from it.

    Intended call order: ``loadFile`` -> ``melspectrogram`` -> ``extractmfcc``
    (which needs ``log_S``) and/or ``extractrmse`` (which needs ``y``).
    Every step stores its result on the instance; an attribute stays ``None``
    until the step that fills it has been run.
    """

    def __init__(self, filename, n_mels=128):
        """Remember the audio file and reset every feature slot to ``None``.

        Args:
            filename: Path of the audio file; must be an existing regular file.
            n_mels: Number of mel bands requested from the mel spectrogram.

        Raises:
            SystemExit: If ``filename`` does not exist or is not a file.  The
                problem is logged first and the exit status is non-zero.
        """
        if not os.path.isfile(filename):
            logger.error("%s does not exists or is not a file", filename)
            sys.exit(1)
        self.filename = filename

        self.n_mels = n_mels
        self.y = None            # audio samples returned by librosa.load
        self.sr = None           # sampling rate returned by librosa.load
        self.S = None            # mel spectrogram
        self.log_S = None        # mel spectrogram on a log scale, relative to its max
        self.mfcc = None         # MFCCs computed from log_S
        self.delta_mfcc = None   # first-order delta of the MFCCs
        self.delta2_mfcc = None  # second-order delta of the MFCCs
        self.M = None            # mfcc, delta_mfcc and delta2_mfcc stacked vertically
        self.rmse = None         # frame-wise RMS energy of y

    def loadFile(self):
        """Read the audio file into ``self.y`` (samples) and ``self.sr`` (rate)."""
        self.y, self.sr = librosa.load(self.filename)
        logger.debug('File loaded %s', self.filename)

    def melspectrogram(self):
        """Compute the mel spectrogram ``self.S`` and its log-scaled form ``self.log_S``."""
        # `y` is passed by keyword: that is accepted by every librosa release,
        # whereas newer releases reject it as a positional argument.
        self.S = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_mels=self.n_mels)

        # `logamplitude` was replaced by `power_to_db` in later librosa
        # releases; prefer the new name and fall back to the old one.
        power_to_db = getattr(librosa, 'power_to_db', None)
        if power_to_db is not None:
            self.log_S = power_to_db(self.S, ref=np.max)
        else:
            self.log_S = librosa.logamplitude(self.S, ref_power=np.max)
        logger.debug("MelSpectrogram calculated for %s", self.filename)

    def plotmelspectrogram(self):
        """Draw ``self.log_S`` on a new figure with time and mel axes."""
        plt.figure(figsize=(12, 4))
        librosa.display.specshow(self.log_S, sr=self.sr, x_axis='time', y_axis='mel')
        plt.title('mel Power Spectrogram')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()

    def extractmfcc(self, n_mfcc=13):
        """Compute MFCCs from ``self.log_S`` plus their first and second deltas.

        Args:
            n_mfcc: Number of MFCC coefficients requested from librosa.

        The three matrices are also stacked vertically into ``self.M``.
        """
        self.mfcc = librosa.feature.mfcc(S=self.log_S, n_mfcc=n_mfcc)
        self.delta_mfcc = librosa.feature.delta(self.mfcc)
        self.delta2_mfcc = librosa.feature.delta(self.mfcc, order=2)
        self.M = np.vstack([self.mfcc, self.delta_mfcc, self.delta2_mfcc])
        logger.debug("MFCC extracted from %s", self.filename)

    def plotmfcc(self):
        """Draw the MFCCs and both deltas as three stacked panels on a new figure."""
        plt.figure(figsize=(12, 6))
        plt.subplot(3, 1, 1)
        librosa.display.specshow(self.mfcc)
        plt.ylabel('MFCC')
        plt.colorbar()

        plt.subplot(3, 1, 2)
        librosa.display.specshow(self.delta_mfcc)
        # Raw strings: "\D" is not a valid escape sequence in a normal literal.
        plt.ylabel(r'MFCC-$\Delta$')
        plt.colorbar()

        plt.subplot(3, 1, 3)
        librosa.display.specshow(self.delta2_mfcc, sr=self.sr, x_axis='time')
        plt.ylabel(r'MFCC-$\Delta^2$')
        plt.colorbar()

        plt.tight_layout()

    def extractrmse(self):
        """Compute the frame-wise RMS energy of ``self.y`` into ``self.rmse``."""
        # `librosa.feature.rmse` was renamed to `rms` in later releases.
        rms = getattr(librosa.feature, 'rms', None)
        if rms is None:
            rms = librosa.feature.rmse
        self.rmse = rms(y=self.y)
        logger.debug("RMSE extracted from %s", self.filename)

In [4]:
class Dataset:
    """Feature matrix ``X`` and one-hot label matrix ``Y`` for the audio clips.

    ``build`` fills the matrices from the audio files named in the label
    file; ``writeToFile`` / ``readFromFile`` cache them as a single text
    array whose first ``n_features`` columns are ``X`` and the rest ``Y``.
    """

    # One-hot targets, exactly as the original code encoded them.
    _LABEL_VECTORS = {"STUTTER": [0, 1], "NORMAL": [1, 0]}

    def __init__(self, datasetDir, datasetLabelFilename, datasetArrayFilename):
        """Validate the input locations and start with empty matrices.

        Args:
            datasetDir: Directory containing the audio files.
            datasetLabelFilename: Text file with one ``<audiofile> <label>``
                pair per line.
            datasetArrayFilename: Default path used by ``writeToFile`` and
                ``readFromFile``.

        Raises:
            SystemExit: If ``datasetDir`` is not a directory or
                ``datasetLabelFilename`` is not a file (logged first,
                non-zero exit status).
        """
        if not os.path.isdir(datasetDir):
            logger.error("%s does not exists or is not a directory", datasetDir)
            sys.exit(1)
        self.datasetDir = datasetDir
        logger.debug("Dataset Directory: %s", self.datasetDir)

        if not os.path.isfile(datasetLabelFilename):
            logger.error("%s does not exists or is not a file", datasetLabelFilename)
            sys.exit(1)
        self.datasetLabelFilename = datasetLabelFilename
        logger.debug("Dataset labels filename: %s", self.datasetLabelFilename)

        self.datasetArrayFilename = datasetArrayFilename
        logger.debug("Dataset array filename: %s", self.datasetArrayFilename)

        # build() emits mean and variance for every row of the MFCC, delta and
        # delta-delta matrices plus mean and variance of the RMS energy; with
        # the default of 13 MFCCs that is presumably 13 * 3 * 2 + 2 = 80.
        self.n_features = 80
        logger.info("Number of features: %s", self.n_features)
        self.X = np.empty(shape=(0, self.n_features))
        self.Y = np.empty(shape=(0, 2))

    @staticmethod
    def _featureVector(features):
        """Return mean/variance statistics of the extracted features as a flat list."""
        vector = []
        for coefficients in (features.mfcc, features.delta_mfcc, features.delta2_mfcc):
            for row in coefficients:
                vector.append(np.mean(row))
                vector.append(np.var(row))
        vector.append(np.mean(features.rmse))
        vector.append(np.var(features.rmse))
        return vector

    def build(self):
        """Extract features for every labelled file and append them to ``X`` / ``Y``.

        Blank lines in the label file are ignored.  Files whose feature
        extraction raises ``ValueError`` are logged and skipped.

        Raises:
            SystemExit: On a line without a label or with a label other than
                ``STUTTER`` / ``NORMAL`` (logged first).
        """
        rows = []
        targets = []
        filesProcessed = 0
        with open(self.datasetLabelFilename, 'r') as datasetLabelFile:
            for line in datasetLabelFile:
                # split() without arguments tolerates tabs, repeated spaces
                # and the trailing newline.
                lineSplit = line.split()
                if not lineSplit:
                    continue
                if len(lineSplit) < 2:
                    logger.error("Malformed label line: %s", line.strip())
                    sys.exit(1)
                audiofilename = lineSplit[0]
                label = lineSplit[1]

                try:
                    features = FeatureExtraction(os.path.join(self.datasetDir, audiofilename))
                    features.loadFile()
                    features.melspectrogram()
                    features.extractmfcc()
                    features.extractrmse()
                except ValueError:
                    logger.error("Error extracting features from file %s", audiofilename)
                    continue

                # Validate the label before recording anything so that X and Y
                # can never get out of step.
                target = self._LABEL_VECTORS.get(label)
                if target is None:
                    logger.error("Unexpected label: %s", label)
                    sys.exit(1)

                rows.append(self._featureVector(features))
                targets.append(target)

                filesProcessed += 1
                if filesProcessed % 1000 == 0:
                    logger.debug("Files processed: %d", filesProcessed)

        # Stack once at the end instead of re-allocating the matrices per file.
        if rows:
            self.X = np.vstack((self.X, rows))
            self.Y = np.vstack((self.Y, targets))

        logger.info("Total files processed: %d", filesProcessed)

    def writeToFile(self, filename=None):
        """Save ``X`` and ``Y`` side by side as one text array.

        Args:
            filename: Destination path; defaults to ``datasetArrayFilename``.
                An existing file at that path is replaced.
        """
        if filename is None:
            filename = self.datasetArrayFilename

        if os.path.exists(filename):
            os.remove(filename)
        np.savetxt(filename, np.hstack((self.X, self.Y)))
        logger.info("Array stored in file %s", filename)

    def readFromFile(self, filename=None):
        """Load ``X`` and ``Y`` from an array written by ``writeToFile``.

        Args:
            filename: Source path; defaults to ``datasetArrayFilename``.

        Raises:
            SystemExit: If the file does not exist (logged first).
        """
        if filename is None:
            filename = self.datasetArrayFilename

        if not os.path.isfile(filename):
            logger.error("%s does not exists or is not a file", filename)
            sys.exit(1)
        # ndmin=2 keeps a single-row file two-dimensional so the column
        # slicing below still works.
        matrix = np.loadtxt(filename, ndmin=2)
        self.X = matrix[:, 0:self.n_features]
        self.Y = matrix[:, self.n_features:]
        logger.info("Array read from file %s", filename)

In [5]:
# Reuse the cached feature array when it is already on disk; otherwise
# extract the features from the audio files and cache them for the next run.
dataset = Dataset('dataset', 'datasetLabels.txt', 'datasetArray80.gz')
if os.path.isfile(dataset.datasetArrayFilename):
    dataset.readFromFile()
else:
    dataset.build()
    dataset.writeToFile()

DEBUG:__main__:Dataset Directory: dataset
DEBUG:__main__:Dataset labels filename: datasetLabels.txt
DEBUG:__main__:Dataset array filename: datasetArray80.gz
INFO:__main__:Number of features: 80
INFO:__main__:Array read from file datasetArray80.gz


# Tensorflow binary classification

In [6]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
import os

In [7]:
class NeuralNetwork:
    """Fully connected ReLU network for two-class classification (TF 1.x graph API).

    The graph is built in the constructor inside the current default graph.
    ``train`` optimises it, logs the test accuracy and saves a checkpoint;
    ``loadFromFile`` restores a checkpoint and logs the test accuracy again.
    """

    def __init__(self, X_train, Y_train, X_test, Y_test):
        """Store the data, set the hyper-parameters and build the graph.

        Args:
            X_train: Training feature matrix, one row per sample.
            Y_train: Training targets, one row per sample with ``n_classes``
                columns.
            X_test: Test feature matrix.
            Y_test: Test targets.
        """
        # Data
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test

        # Learning Parameters
        self.learning_rate = 0.001
        self.training_epochs = 3500
        self.batch_size = 100
        self.display_step = 200

        # Model Parameters
        self.n_hidden = [10, 10]
        self.hiddenLayers = len(self.n_hidden)
        # Take the input width from the data actually passed in rather than
        # from a notebook-level `dataset` global.
        self.n_input = np.shape(X_train)[1]
        self.n_classes = 2

        logger.info("Neural network of depth %d", self.hiddenLayers)
        for i in range(self.hiddenLayers):
            logger.debug("Depth of layer %d is %d", (i + 1), self.n_hidden[i])

        self.x = tf.placeholder("float", [None, self.n_input])
        self.y = tf.placeholder("float", [None, self.n_classes])
        self.layer = None
        self.weights = None
        self.biases = None
        # Model
        self.model = self.network(self.x)
        self.save_path = None

        # Loss function and optimizer
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.model, labels=self.y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

        # Initialize the variables
        self.init = tf.global_variables_initializer()

    def network(self, x):
        """Build the hidden ReLU layers plus a linear output layer on top of ``x``.

        Fills ``self.weights``, ``self.biases`` and ``self.layer`` (one entry
        per hidden layer, then the output layer) and returns the output
        tensor of shape ``[None, n_classes]``.
        """
        self.layer = []
        self.weights = []
        self.biases = []

        # Variables are created in the order w0, b0, w1, b1, ..., w_out, b_out
        # so their auto-generated names match previously saved checkpoints.
        activation = x
        fan_in = self.n_input
        for width in self.n_hidden:
            weight = tf.Variable(tf.random_normal([fan_in, width]))
            bias = tf.Variable(tf.random_normal([width]))
            activation = tf.nn.relu(tf.add(tf.matmul(activation, weight), bias))
            self.weights.append(weight)
            self.biases.append(bias)
            self.layer.append(activation)
            fan_in = width

        # Output layer (no activation: the loss applies the softmax)
        weight = tf.Variable(tf.random_normal([fan_in, self.n_classes]))
        bias = tf.Variable(tf.random_normal([self.n_classes]))
        logits = tf.matmul(activation, weight) + bias
        self.weights.append(weight)
        self.biases.append(bias)
        self.layer.append(logits)

        return logits

    def _evaluate(self):
        """Return the test-set accuracy of the model in the current default session."""
        correct_prediction = tf.equal(tf.argmax(self.model, 1), tf.argmax(self.y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        return accuracy.eval({self.x: self.X_test, self.y: self.Y_test})

    def train(self):
        """Train the network, log its test accuracy and save it under ``models/``.

        Side effects:
            * sets the notebook-level global ``result`` to the predicted
              class index of every test sample;
            * creates ``models/<timestamp>-<accuracy>/`` containing the
              checkpoint ``session.ckpt`` and a ``details.txt`` listing the
              hyper-parameters;
            * stores the checkpoint path in ``self.save_path``.
        """
        global result

        saver = tf.train.Saver()

        # The batches are identical in every epoch, so split once up front.
        # At least one batch is needed even when there are fewer samples than
        # batch_size, otherwise np.array_split is asked for zero sections.
        total_batch = max(1, len(self.X_train) // self.batch_size)
        X_batches = np.array_split(self.X_train, total_batch)
        Y_batches = np.array_split(self.Y_train, total_batch)

        with tf.Session() as sess:
            sess.run(self.init)
            for epoch in range(self.training_epochs):
                avg_cost = 0
                for batch_x, batch_y in zip(X_batches, Y_batches):
                    # Run optimization op (backprop) and cost op (to get loss value)
                    _, c = sess.run([self.optimizer, self.cost], feed_dict={self.x: batch_x, self.y: batch_y})
                    # Compute average loss
                    avg_cost += c / total_batch

                # Display logs per epoch step
                if epoch % self.display_step == 0:
                    logger.debug("Epoch: %04d, cost = %.9f", epoch, avg_cost)
            logger.info("Optimization Finished!")

            evalAccuracy = self._evaluate()
            logger.info("Accuracy: %f", evalAccuracy)

            # Predict on this instance's own test data, not on whatever the
            # notebook globals X_test / Y_test currently hold.
            result = tf.argmax(self.model, 1).eval({self.x: self.X_test})

            timestamp = '{:%Y-%m-%d-%H:%M:%S}'.format(datetime.datetime.now()) + '-' + str(evalAccuracy)
            modelDir = os.path.join("models", timestamp)
            if not os.path.isdir(modelDir):
                os.makedirs(modelDir)
            self.save_path = saver.save(sess, os.path.join(modelDir, 'session.ckpt'))

            with open(os.path.join(modelDir, 'details.txt'), 'w') as details:
                for name in ("learning_rate", "training_epochs", "batch_size",
                             "display_step", "n_hidden", "hiddenLayers",
                             "n_input", "n_classes"):
                    details.write("%s = %s\n" % (name, getattr(self, name)))

            logger.info("Model saved in file: %s", self.save_path)

    def getModelPath(self):
        """Return the checkpoint path written by ``train`` (``None`` before training)."""
        return self.save_path

    def loadFromFile(self, filename):
        """Restore the variables from checkpoint ``filename`` and log the test accuracy."""
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, filename)
            evalAccuracy = self._evaluate()
            logger.info("Accuracy: %f", evalAccuracy)
            

In [8]:
# Hold out part of the data for evaluation. train_test_split is called with
# its defaults, so no split ratio or random seed is fixed here and the split
# can differ between runs.
X_train, X_test, Y_train, Y_test = train_test_split(dataset.X, dataset.Y)

In [9]:
# Build the network on the split data and train it; train() also logs the
# accuracy on the test split and saves a checkpoint under models/.
nn = NeuralNetwork(X_train, Y_train, X_test, Y_test)
nn.train()

INFO:__main__:Neural network of depth 2
DEBUG:__main__:Depth of layer 1 is 10
DEBUG:__main__:Depth of layer 2 is 10
DEBUG:__main__:Epoch: 0000, cost = 1968.440712462
DEBUG:__main__:Epoch: 0200, cost = 0.747153334
DEBUG:__main__:Epoch: 0400, cost = 0.658895474
DEBUG:__main__:Epoch: 0600, cost = 0.616195857
DEBUG:__main__:Epoch: 0800, cost = 0.416466756
DEBUG:__main__:Epoch: 1000, cost = 0.308861214
DEBUG:__main__:Epoch: 1200, cost = 0.265498052
DEBUG:__main__:Epoch: 1400, cost = 0.272061644
DEBUG:__main__:Epoch: 1600, cost = 0.255973265
DEBUG:__main__:Epoch: 1800, cost = 0.257400463
DEBUG:__main__:Epoch: 2000, cost = 0.245317548
DEBUG:__main__:Epoch: 2200, cost = 0.242108824
DEBUG:__main__:Epoch: 2400, cost = 0.237053688
DEBUG:__main__:Epoch: 2600, cost = 0.233611262
DEBUG:__main__:Epoch: 2800, cost = 0.231847076
DEBUG:__main__:Epoch: 3000, cost = 0.230007967
DEBUG:__main__:Epoch: 3200, cost = 0.230492083
DEBUG:__main__:Epoch: 3400, cost = 0.225449742
INFO:__main__:Optimization Finished

In [10]:
# Sanity-check the saved checkpoint: discard the current default graph, build
# the same network again in a fresh one, restore the variables written by
# nn.train() and log the accuracy on the same test split.
tf.reset_default_graph()
nn1 = NeuralNetwork(X_train, Y_train, X_test, Y_test)
nn1.loadFromFile(nn.getModelPath())

INFO:__main__:Neural network of depth 2
DEBUG:__main__:Depth of layer 1 is 10
DEBUG:__main__:Depth of layer 2 is 10


INFO:tensorflow:Restoring parameters from models/2017-11-24-04:09:36-0.885687/session.ckpt


INFO:tensorflow:Restoring parameters from models/2017-11-24-04:09:36-0.885687/session.ckpt
INFO:__main__:Accuracy: 0.885687


# Using the NN model for classification

In [38]:
from pydub import AudioSegment
import os

In [35]:
# Load the recording that the next cell cuts into short overlapping segments.
wavFile = AudioSegment.from_wav('wav/release1/M_1106_25y0m_1.wav')

In [37]:
# Cut the recording into overlapping windows and export each one as its own
# wav file named audio:<start>:<end>.wav (positions in milliseconds).
segmentLength = 250  # window length in ms
segmentHop = 50      # distance between consecutive window starts in ms
outputDir = "/tmp/test"
# Last start position from which a full-length window still fits.
upperLimit = wavFile.duration_seconds * 1000 - segmentLength

# The export below fails if the target directory is missing, so create it.
if not os.path.isdir(outputDir):
    os.makedirs(outputDir)

start = 0
while start < upperLimit:
    end = start + segmentLength
    audio = wavFile[start:end]
    filename = os.path.join(outputDir, "audio:%d:%d.wav" % (start, end))
    audio.export(filename, format="wav")
    print(filename)
    start += segmentHop

# Final window: whatever is left from the current start to the end of the
# recording (it can be shorter than segmentLength).
end = upperLimit + segmentLength
audio = wavFile[start:]
filename = os.path.join(outputDir, "audio:%d:%d.wav" % (start, int(end)))
audio.export(filename, format="wav")
print(filename)

/tmp/test/audio:0:250.wav
/tmp/test/audio:50:300.wav
/tmp/test/audio:100:350.wav
/tmp/test/audio:150:400.wav
/tmp/test/audio:200:450.wav
/tmp/test/audio:250:500.wav
/tmp/test/audio:300:550.wav
/tmp/test/audio:350:600.wav
/tmp/test/audio:400:650.wav
/tmp/test/audio:450:700.wav
/tmp/test/audio:500:750.wav
/tmp/test/audio:550:800.wav
/tmp/test/audio:600:850.wav
/tmp/test/audio:650:900.wav
/tmp/test/audio:700:950.wav
/tmp/test/audio:750:1000.wav
/tmp/test/audio:800:1050.wav
/tmp/test/audio:850:1100.wav
/tmp/test/audio:900:1150.wav
/tmp/test/audio:950:1200.wav
/tmp/test/audio:1000:1250.wav
/tmp/test/audio:1050:1300.wav
/tmp/test/audio:1100:1350.wav
/tmp/test/audio:1150:1400.wav
/tmp/test/audio:1200:1450.wav
/tmp/test/audio:1250:1500.wav
/tmp/test/audio:1300:1550.wav
/tmp/test/audio:1350:1600.wav
/tmp/test/audio:1400:1650.wav
/tmp/test/audio:1450:1700.wav
/tmp/test/audio:1500:1750.wav
/tmp/test/audio:1550:1800.wav
/tmp/test/audio:1600:1850.wav
/tmp/test/audio:1650:1900.wav
/tmp/test/audio:17

/tmp/test/audio:38000:38250.wav
/tmp/test/audio:38050:38300.wav
/tmp/test/audio:38100:38350.wav
/tmp/test/audio:38150:38400.wav
/tmp/test/audio:38200:38450.wav
/tmp/test/audio:38250:38500.wav
/tmp/test/audio:38300:38550.wav
/tmp/test/audio:38350:38600.wav
/tmp/test/audio:38400:38650.wav
/tmp/test/audio:38450:38700.wav
/tmp/test/audio:38500:38750.wav
/tmp/test/audio:38550:38800.wav
/tmp/test/audio:38600:38850.wav
/tmp/test/audio:38650:38900.wav
/tmp/test/audio:38700:38950.wav
/tmp/test/audio:38750:39000.wav
/tmp/test/audio:38800:39050.wav
/tmp/test/audio:38850:39100.wav
/tmp/test/audio:38900:39150.wav
/tmp/test/audio:38950:39200.wav
/tmp/test/audio:39000:39250.wav
/tmp/test/audio:39050:39300.wav
/tmp/test/audio:39100:39350.wav
/tmp/test/audio:39150:39400.wav
/tmp/test/audio:39200:39450.wav
/tmp/test/audio:39250:39500.wav
/tmp/test/audio:39300:39550.wav
/tmp/test/audio:39350:39600.wav
/tmp/test/audio:39400:39650.wav
/tmp/test/audio:39450:39700.wav
/tmp/test/audio:39500:39750.wav
/tmp/tes

/tmp/test/audio:83700:83950.wav
/tmp/test/audio:83750:84000.wav
/tmp/test/audio:83800:84050.wav
/tmp/test/audio:83850:84100.wav
/tmp/test/audio:83900:84150.wav
/tmp/test/audio:83950:84200.wav
/tmp/test/audio:84000:84250.wav
/tmp/test/audio:84050:84300.wav
/tmp/test/audio:84100:84350.wav
/tmp/test/audio:84150:84400.wav
/tmp/test/audio:84200:84450.wav
/tmp/test/audio:84250:84500.wav
/tmp/test/audio:84300:84550.wav
/tmp/test/audio:84350:84600.wav
/tmp/test/audio:84400:84650.wav
/tmp/test/audio:84450:84700.wav
/tmp/test/audio:84500:84750.wav
/tmp/test/audio:84550:84800.wav
/tmp/test/audio:84600:84850.wav
/tmp/test/audio:84650:84900.wav
/tmp/test/audio:84700:84950.wav
/tmp/test/audio:84750:85000.wav
/tmp/test/audio:84800:85050.wav
/tmp/test/audio:84850:85100.wav
/tmp/test/audio:84900:85150.wav
/tmp/test/audio:84950:85200.wav
/tmp/test/audio:85000:85250.wav
/tmp/test/audio:85050:85300.wav
/tmp/test/audio:85100:85350.wav
/tmp/test/audio:85150:85400.wav
/tmp/test/audio:85200:85450.wav
/tmp/tes

/tmp/test/audio:128800:129050.wav
/tmp/test/audio:128850:129100.wav
/tmp/test/audio:128900:129150.wav
/tmp/test/audio:128950:129200.wav
/tmp/test/audio:129000:129250.wav
/tmp/test/audio:129050:129300.wav
/tmp/test/audio:129100:129350.wav
/tmp/test/audio:129150:129400.wav
/tmp/test/audio:129200:129450.wav
/tmp/test/audio:129250:129500.wav
/tmp/test/audio:129300:129550.wav
/tmp/test/audio:129350:129600.wav
/tmp/test/audio:129400:129650.wav
/tmp/test/audio:129450:129700.wav
/tmp/test/audio:129500:129750.wav
/tmp/test/audio:129550:129800.wav
/tmp/test/audio:129600:129850.wav
/tmp/test/audio:129650:129900.wav
/tmp/test/audio:129700:129950.wav
/tmp/test/audio:129750:130000.wav
/tmp/test/audio:129800:130050.wav
/tmp/test/audio:129850:130100.wav
/tmp/test/audio:129900:130150.wav
/tmp/test/audio:129950:130200.wav
/tmp/test/audio:130000:130250.wav
/tmp/test/audio:130050:130300.wav
/tmp/test/audio:130100:130350.wav
/tmp/test/audio:130150:130400.wav
/tmp/test/audio:130200:130450.wav
/tmp/test/audi

/tmp/test/audio:176750:177000.wav
/tmp/test/audio:176800:177050.wav
/tmp/test/audio:176850:177100.wav
/tmp/test/audio:176900:177150.wav
/tmp/test/audio:176950:177200.wav
/tmp/test/audio:177000:177250.wav
/tmp/test/audio:177050:177300.wav
/tmp/test/audio:177100:177350.wav
/tmp/test/audio:177150:177400.wav
/tmp/test/audio:177200:177450.wav
/tmp/test/audio:177250:177500.wav
/tmp/test/audio:177300:177550.wav
/tmp/test/audio:177350:177600.wav
/tmp/test/audio:177400:177650.wav
/tmp/test/audio:177450:177700.wav
/tmp/test/audio:177500:177750.wav
/tmp/test/audio:177550:177800.wav
/tmp/test/audio:177600:177850.wav
/tmp/test/audio:177650:177900.wav
/tmp/test/audio:177700:177950.wav
/tmp/test/audio:177750:178000.wav
/tmp/test/audio:177800:178050.wav
/tmp/test/audio:177850:178100.wav
/tmp/test/audio:177900:178150.wav
/tmp/test/audio:177950:178200.wav
/tmp/test/audio:178000:178250.wav
/tmp/test/audio:178050:178300.wav
/tmp/test/audio:178100:178350.wav
/tmp/test/audio:178150:178400.wav
/tmp/test/audi

In [None]:
# Classify each audio file and find continuous chunks of NORMAL speech, then join them
for wavFile in os.listdir('/tmp/audio')