#### Import Packages 

In [108]:
import IPython.display as ipd
import librosa
import librosa.display
import os
from os.path import basename
import cv2
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1)
%matplotlib inline

import torch 
import torch.nn as nn
from torch.autograd import Variable
from tensorboardX import SummaryWriter
import torch.utils.data
import torch.utils.data as data_utils

#### Load data with melspectrogram features

In [2]:
def load_data(data_location, width=200, height=140):
    
    #Get number of data files
    num_samples = 0
    for file_ in glob.glob(data_location + '/*.wav'):
        num_samples+=1
        
    features = np.zeros((num_samples, 1, height, width), np.float32)
    labels = np.zeros((num_samples), np.int64)
    
    # For shuffling
    p = np.random.permutation(num_samples)
    
    file_num = 0
    for file_ in glob.glob(data_location + '/*.wav'):
        
        # Record emotion type and intensity
#         labels[p[file_num], int(basename(file_)[:-4].split('-')[2])-1] = 1.0
#         labels[p[file_num], int(basename(file_)[:-4].split('-')[3])+7] = 1.0
        emotionVar = int(basename(file_)[:-4].split('-')[2])
        intensityVar = int(basename(file_)[:-4].split('-')[3])
        labels[p[file_num]] = ((emotionVar-1)*2 + intensityVar - 1)
        # Read file and extract features
        X, sample_rate = librosa.load(file_, res_type='kaiser_fast')
        file_feature = librosa.feature.melspectrogram(X, sample_rate)
        file_feature = cv2.resize(librosa.power_to_db(file_feature, ref=np.max), (width, height))
        features[p[file_num]] = file_feature.reshape((1, height, width))
        
        file_num+=1
        
    return {'features' : features, 'labels' : labels}

In [201]:
data = load_data('./Data', width = 300, height = 210)

In [202]:
val_split = 0.2
features = data['features']
labels = data['labels']
X_train = features[:int((1-val_split)*features.shape[0])]
y_train = labels[:int((1-val_split)*features.shape[0])]
X_val = features[int((1-val_split)*features.shape[0]):]
y_val = labels[int((1-val_split)*features.shape[0]):]

In [203]:
plt.imshow(X_train[0].reshape((140, 200)))
plt.show()

ValueError: cannot reshape array of size 63000 into shape (140,200)

#### PyTorch!! Let's make a model! 

In [219]:
# Hyper Parameters
width = 300
height = 210
num_epochs = 30
batch_size = 6
learning_rate = 0.00005

In [220]:
# Data Loader (Input Pipeline)
train_dataset = data_utils.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size, 
                                           shuffle=True)

val_dataset = data_utils.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                          batch_size=batch_size, 
shuffle=False)

In [221]:
def get_val_acc():
    crnn.eval()
    correct = 0
    total = 0
    for in_features, labels in val_loader:
        in_features = Variable(in_features).cuda()
        labels = labels.cuda()
    #     ids_type = torch.LongTensor([0,1,2,3,4,5,6]).cuda()
    #     ids_intensity = torch.LongTensor([7,8]).cuda()
        outputs = crnn(in_features)
    #     outputs_type = nn.functional.softmax(outputs.data.index_select(1,ids_type))
    #     outputs_intensity = nn.functional.softmax(outputs.data.index_select(1,ids_intensity))
        _, predicted = torch.max(outputs.data, 1)
    #     _, predicted_type = torch.max(outputs_type, 1)
    #     _, predicted_intensity = torch.max(outputs_intensity, 1)
    #     labels_type = labels.index_select(1,ids_type)
    #     labels_intensity = labels.index_select(1,ids_intensity)
        total += labels.size(0)
        correct += (predicted == labels).sum()

    print('Validation Accuracy: %d %%' % (100 * correct / total))

In [222]:
class CRNN(nn.Module):
    def __init__(self):
        super(CRNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.rnn = nn.LSTM(height//16, 256, 1, bidirectional=True)
        self.fc1 = nn.Sequential(
            nn.Linear((width//16)*(height//16)*256, 500),
            nn.Dropout(0.3))
#         self.fc1 = nn.Linear(256, 500)
        self.fc2 = nn.Linear(500, 16)

        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
#         out = out.sum(1)
#         out = out.view(out.size(2), out.size(0), out.size(1))
#         out, (hn, cn) = self.rnn(out)
#         out = out[-1]
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        return out


writer = SummaryWriter()
crnn = CRNN()
crnn.cuda()

CRNN (
  (layer1): Sequential (
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU ()
    (2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (layer2): Sequential (
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU ()
    (2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (layer3): Sequential (
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU ()
    (2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (layer4): Sequential (
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU ()
    (2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
  )
  (rnn): LSTM(13, 256, bidirectional=True)
  (fc1): Sequential (
    (0): Linear (59904 -> 500)
    (1): Dropout (p = 0.3)
  )
  (fc2): Linear (500 -> 16)
)

In [223]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(crnn.parameters(), lr=learning_rate)

In [224]:
# Train the Model
for epoch in range(num_epochs):
    for i, (in_features, labels) in enumerate(train_loader):
        crnn.train()
        in_features = Variable(in_features).cuda()
        labels = Variable(labels).cuda()
        
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = crnn(in_features)
        writer.add_graph(crnn, outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' 
%(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))
    get_val_acc()

# export scalar data to JSON for external processing
writer.close()

Epoch [1/30], Iter [192/192] Loss: 2.5534
Validation Accuracy: 19 %
Epoch [2/30], Iter [192/192] Loss: 1.7051
Validation Accuracy: 25 %
Epoch [3/30], Iter [192/192] Loss: 1.9400
Validation Accuracy: 38 %
Epoch [4/30], Iter [192/192] Loss: 1.5985
Validation Accuracy: 42 %
Epoch [5/30], Iter [192/192] Loss: 0.9775
Validation Accuracy: 50 %
Epoch [6/30], Iter [192/192] Loss: 0.5099
Validation Accuracy: 48 %
Epoch [7/30], Iter [192/192] Loss: 0.2248
Validation Accuracy: 52 %
Epoch [8/30], Iter [192/192] Loss: 0.9128
Validation Accuracy: 52 %
Epoch [9/30], Iter [192/192] Loss: 0.0954
Validation Accuracy: 57 %
Epoch [10/30], Iter [192/192] Loss: 0.0822
Validation Accuracy: 58 %
Epoch [11/30], Iter [192/192] Loss: 0.0444
Validation Accuracy: 60 %
Epoch [12/30], Iter [192/192] Loss: 0.0395
Validation Accuracy: 56 %
Epoch [13/30], Iter [192/192] Loss: 0.0328
Validation Accuracy: 59 %
Epoch [14/30], Iter [192/192] Loss: 0.0294
Validation Accuracy: 57 %
Epoch [15/30], Iter [192/192] Loss: 0.0164


In [200]:
# Test the Model
crnn.eval()
correct = 0
total = 0
for in_features, labels in val_loader:
    in_features = Variable(in_features).cuda()
    labels = labels.cuda()
#     ids_type = torch.LongTensor([0,1,2,3,4,5,6]).cuda()
#     ids_intensity = torch.LongTensor([7,8]).cuda()
    outputs = crnn(in_features)
#     outputs_type = nn.functional.softmax(outputs.data.index_select(1,ids_type))
#     outputs_intensity = nn.functional.softmax(outputs.data.index_select(1,ids_intensity))
    _, predicted = torch.max(outputs.data, 1)
#     _, predicted_type = torch.max(outputs_type, 1)
#     _, predicted_intensity = torch.max(outputs_intensity, 1)
#     labels_type = labels.index_select(1,ids_type)
#     labels_intensity = labels.index_select(1,ids_intensity)
    total += labels.size(0)
    correct += (predicted == labels).sum()

print('Test Accuracy of the model on test images: %d %%' % (100 * correct / total))

Test Accuracy of the model on test images: 61 %
