In [6]:
from __future__ import print_function
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
#np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.utils import np_utils

from torch.utils.data import Dataset, DataLoader
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision
import torchvision.datasets as datasets
from torch.autograd import Variable


In [7]:
X = np.load('X_all.npy')
y = np.load('y_all.npy')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [10]:
class AudioDataset(Dataset):
    def __init__(self, X_train,y_train):  
        self.dataset = X_train,y_train

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = {'mfcc': X_train[idx], 'labels': y_train[idx]}

        return sample

In [11]:
dataset=AudioDataset(X_train,y_train)

In [16]:
dataset.dataset

(array([[[-6.05600928e+02, -4.22348209e+02, -1.95625449e+02, ...,
          -3.01460864e+02, -1.37774933e+02, -6.93089500e+01],
         [ 0.00000000e+00,  8.58180408e+01,  1.25907576e+02, ...,
           7.54483118e+01,  7.19054112e+01,  7.79408023e+01],
         [ 0.00000000e+00, -4.74510561e+01, -8.25658517e+01, ...,
          -5.04117029e+01, -7.51347117e+01, -6.79016794e+01],
         ...,
         [ 0.00000000e+00,  5.24027526e+00,  6.81130687e+00, ...,
           1.72358063e+01,  3.48511555e+01,  2.30152058e+01],
         [ 0.00000000e+00, -1.27973494e+01, -1.74476597e+01, ...,
           7.74626281e+00,  3.96455085e+00, -2.60227025e+00],
         [ 0.00000000e+00,  2.37430803e-01, -6.23233646e+00, ...,
          -1.57488092e-01, -1.09011070e+01, -1.14507291e+01]],
 
        [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
         [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000

In [26]:
train_load = torch.utils.data.DataLoader(dataset=dataset,
                                           batch_size=1,
                                           shuffle=True)

In [30]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3,stride=1, padding=1)
        self.batchnorm1 = nn.BatchNorm2d(8)        #Batch normalization
        self.relu = nn.ReLU()                 #RELU Activation
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)   #Maxpooling reduces the size by kernel size. 64/2 = 32
        
        self.cnn2 = nn.Conv2d(in_channels=8, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)    #Size now is 32/2 = 16
        
        #Flatten the feature maps. You have 32 feature mapsfrom cnn2. Each of the feature is of size 16x16 --> 32*16*16 = 8192
        self.fc1 = nn.Linear(in_features=3360, out_features=4000)   #Flattened image is fed into linear NN and reduced to half size
        self.droput = nn.Dropout(p=0.5)                    #Dropout used to reduce overfitting
        self.fc2 = nn.Linear(in_features=4000, out_features=2000)
        self.droput = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(in_features=2000, out_features=500)
        self.droput = nn.Dropout(p=0.5)
        self.fc4 = nn.Linear(in_features=500, out_features=50)
        self.droput = nn.Dropout(p=0.5)
        self.fc5 = nn.Linear(in_features=50, out_features=10)    #Since there were so many features, I decided to use 45 layers to get output layers. You can increase the kernels in Maxpooling to reduce image further and reduce number of hidden linear layers.
       
        
    def forward(self,x):
        out = self.cnn1(x)
        out = self.batchnorm1(out)
        out = self.relu(out)
        out = self.maxpool1(out)
        out = self.cnn2(out)
        out = self.batchnorm2(out)
        out = self.relu(out)
        out = self.maxpool2(out)
        #Flattening is done here with .view() -> (batch_size, 32*16*16) = (100, 8192)
        out = out.view(-1,3360)   #-1 will automatically update the batchsize as 100; 8192 flattens 32,16,16
        #Then we forward through our fully connected layer 
        out = self.fc1(out)
        out = self.relu(out)
        out = self.droput(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.droput(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.droput(out)
        out = self.fc4(out)
        out = self.relu(out)
        out = self.droput(out)
        out = self.fc5(out)
        return out

In [31]:
model = CNN()
CUDA = torch.cuda.is_available()
if CUDA:
    model = model.cuda()    
loss_fn = nn.CrossEntropyLoss()        
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [38]:
#Training the CNN

import time

num_epochs = 150

#Define the lists to store the results of loss and accuracy
train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []

#Training
for epoch in range(num_epochs): 
    #Reset these below variables to 0 at the begining of every epoch
    start = time.time()
    correct = 0
    iterations = 0
    iter_loss = 0.0
    
    model.train()                   # Put the network into training mode
    
    for i, data in enumerate(train_load):
        
        # Convert torch tensor to Variable
        
        
        inputs = data['mfcc']
        inputs = torch.FloatTensor(np.expand_dims(inputs,axis=1))
        print("inputs",inputs,inputs.shape)
        
        labels = data['labels']
        labels = labels.long()
        print("labels",labels,data['labels'])


        #print("labels",labels)
        # If we have GPU, shift the data to GPU
        CUDA = torch.cuda.is_available()
        if CUDA:
            inputs = inputs.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()            # Clear off the gradient in (w = w - gradient)
        outputs = model(inputs) 
        print("outputs",outputs.shape,"label",labels.shape)
        loss = loss_fn(outputs, labels)  
        iter_loss += loss.item()       # Accumulate the loss
        loss.backward()                 # Backpropagation 
        optimizer.step()                # Update the weights

        # Record the correct predictions for training data 
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum()
        iterations += 1

        # Record the training loss
    train_loss.append(iter_loss/iterations)
    # Record the training accuracy
    train_accuracy.append((100 * correct / len(dataset)))

    print ('Epoch {}/{}, Training Loss: {:.3f}, Training Accuracy: {:.3f}'
            .format(epoch+1, num_epochs, train_loss[-1], train_accuracy[-1]))

    #Testing
    loss = 0.0
    correct = 0
    iterations = 0

#     model.eval()                    # Put the network into evaluation mode
    
#     for i, (inputs, labels) in enumerate(test_load):
        
#         # Convert torch tensor to Variable
#         inputs = Variable(inputs)
#         labels = Variable(labels)
        
#         CUDA = torch.cuda.is_available()
#         if CUDA:
#             inputs = inputs.cuda()
#             labels = labels.cuda()
        
#         outputs = model(inputs)     
#         loss = loss_fn(outputs, labels) # Calculate the loss
#         loss += loss.data[0]
#         # Record the correct predictions for training data
#         _, predicted = torch.max(outputs, 1)
#         correct += (predicted == labels).sum()
        
#         iterations += 1

#     # Record the Testing loss
#     test_loss.append(loss/iterations)
#     # Record the Testing accuracy
#     test_accuracy.append((100 * correct / len(test_dataset)))
#     stop = time.time()
    
#     print ('Epoch {}/{}, Training Loss: {:.3f}, Training Accuracy: {:.3f}, Testing Loss: {:.3f}, Testing Acc: {:.3f}, Time: {}s'
#            .format(epoch+1, num_epochs, train_loss[-1], train_accuracy[-1], test_loss[-1], test_accuracy[-1], stop-start))

      


inputs tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]]) torch.Size([1, 1, 20, 85])
labels tensor([9]) tensor([9], dtype=torch.int32)
outputs torch.Size([1, 10]) label torch.Size([1])
inputs tensor([[[[-6.0560e+02, -4.2235e+02, -1.9563e+02,  ..., -3.0146e+02,
           -1.3777e+02, -6.9309e+01],
          [ 0.0000e+00,  8.5818e+01,  1.2591e+02,  ...,  7.5448e+01,
            7.1905e+01,  7.7941e+01],
          [ 0.0000e+00, -4.7451e+01, -8.2566e+01,  ..., -5.0412e+01,
           -7.5135e+01, -6.7902e+01],
          ...,
          [ 0.0000e+00,  5.2403e+00,  6.8113e+00,  ...,  1.7236e+01,
            3.4851e+01,  2.3015e+01],
          [ 0.0000e+00, -1.2797e+01, -1.7448e+01,  ...,  7.7463e+00,
            3.9646e+00, -2.6023e+00],
          [ 0.0000e+00,  2.3743e-01, -6

inputs tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]]) torch.Size([1, 1, 20, 85])
labels tensor([9]) tensor([9], dtype=torch.int32)
outputs torch.Size([1, 10]) label torch.Size([1])
Epoch 7/150, Training Loss: 2.145, Training Accuracy: 50.000
inputs tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]]) torch.Size([1, 1, 20, 85])
labels tensor([9]) tensor([9], dtype=torch.int32)
outputs torch.Size([1, 10]) label torch.Size([1])
inputs tensor([[[[-6.0560e+02, -4.2235e+02, -1.9563e+02,  ..., -3.0146e+02,
           -1.3777e+02, -6.9309e+01],
          [ 0.000

KeyboardInterrupt: 