# Baseline CNNs - Chromagrams, MFCC, Spectrograms

Code References:
- Data loading: https://discuss.pytorch.org/t/input-numpy-ndarray-instead-of-images-in-a-cnn/18797/3
- Train and test functions adapted from discussion sections code

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset, DataLoader

## Chromagrams

In [2]:
## Load data
train_chroma = np.load("Data/train_chroma12.npy")
test_chroma = np.load("Data/test_chroma12.npy")
train_labels = np.load("Data/train_labels_chroma12.npy")
test_labels = np.load("Data/test_labels_chroma12.npy")

In [3]:
train_chroma.shape

(5366, 12, 431)

In [4]:
# from https://discuss.pytorch.org/t/input-numpy-ndarray-instead-of-images-in-a-cnn/18797/3

class MyDataset(Dataset):
    def __init__(self, data, target, transform=None):
        self.data = torch.from_numpy(data).float()
        self.target = torch.from_numpy(target).long()
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        
        if self.transform:
            x = self.transform(x)
            
        return x, y
    
    def __len__(self):
        return len(self.data)

In [5]:
bs = 32

train_set = MyDataset(train_chroma, train_labels)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
test_set = MyDataset(test_chroma, test_labels)
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

## CNN - Chromagram

In [2]:
class CNN_chroma(nn.Module):
    def __init__(self):
        super(CNN_chroma, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 5, kernel_size = (2,5),
                              stride = (2,1), padding = (6,1))
        self.conv2_bn1 = nn.BatchNorm2d(5)
        self.pool1 = nn.MaxPool2d((1, 2))
        self.conv2 = nn.Conv2d(5, 10, kernel_size = (2,4), stride = (2,2), padding = (6,2))
        self.conv2_bn2 = nn.BatchNorm2d(10)
        self.pool2 = nn.MaxPool2d((1, 2))
        self.conv3 = nn.Conv2d(10, 10, kernel_size = (2,4), stride = (2,2), padding = (6,1))
        self.conv2_bn3 = nn.BatchNorm2d(10)
        self.fc1 = nn.Linear(10 * 12 * 27, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 5)

    def forward(self, x):
#         print(x.shape)
        x = self.pool1(F.relu(self.conv2_bn1(self.conv1(x))))
#         print(x.shape)
        x = self.pool2(F.relu(self.conv2_bn2(self.conv2(x))))
#         print(x.shape)
        x = F.relu(self.conv2_bn3(self.conv3(x)))
#         print(x.shape)
        x = x.view(-1, 10*12*27)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x

In [8]:
def train(model, train_loader, criterion, optimizer, epoch):
    train_loss = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(data.shape[0], 1, data.shape[1], data.shape[2])
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if batch_idx % (len(train_loader)//2) == 0:
            print('Train({})[{:.0f}%]: Loss: {:.4f}'.format(
                epoch, 100. * batch_idx / len(train_loader), train_loss/(batch_idx+1)))

def test(model, test_loader, criterion, epoch, batch_size = 32):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.view(data.shape[0], 1, data.shape[1], data.shape[2])
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    
    test_loss = (test_loss*batch_size)/len(test_loader.dataset)
    print('Test({}): Loss: {:.4f}, Accuracy: {:.4f}%'.format(
        epoch, test_loss, 100. * correct / len(test_loader.dataset)))

In [9]:
torch.manual_seed(17)

<torch._C.Generator at 0x7f451247efb0>

In [10]:
model = CNN_chroma() #.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

# optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

num_epochs = 10

In [11]:
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    test(model, test_loader, criterion, epoch)

Train(1)[0%]: Loss: 1.6071
Train(1)[50%]: Loss: 1.5458
Test(1): Loss: 1.5153, Accuracy: 37.4284%
Train(2)[0%]: Loss: 1.4474
Train(2)[50%]: Loss: 1.4666
Test(2): Loss: 1.5178, Accuracy: 36.9599%
Train(3)[0%]: Loss: 1.3891
Train(3)[50%]: Loss: 1.4250
Test(3): Loss: 1.5281, Accuracy: 36.4914%
Train(4)[0%]: Loss: 1.4568
Train(4)[50%]: Loss: 1.3863
Test(4): Loss: 1.5255, Accuracy: 37.4284%
Train(5)[0%]: Loss: 1.2572
Train(5)[50%]: Loss: 1.3771
Test(5): Loss: 1.5244, Accuracy: 37.5846%
Train(6)[0%]: Loss: 1.3005
Train(6)[50%]: Loss: 1.3324
Test(6): Loss: 1.5265, Accuracy: 37.6887%
Train(7)[0%]: Loss: 1.2221
Train(7)[50%]: Loss: 1.3181
Test(7): Loss: 1.5247, Accuracy: 38.0271%
Train(8)[0%]: Loss: 1.3349
Train(8)[50%]: Loss: 1.2981
Test(8): Loss: 1.5316, Accuracy: 37.5846%
Train(9)[0%]: Loss: 1.2951
Train(9)[50%]: Loss: 1.2837
Test(9): Loss: 1.5334, Accuracy: 36.8037%
Train(10)[0%]: Loss: 1.4141
Train(10)[50%]: Loss: 1.2626
Test(10): Loss: 1.5368, Accuracy: 36.8558%


## CNN MFCC

In [12]:
## Load data
train_mfcc = np.load("Data/train_mfcc13.npy")
test_mfcc = np.load("Data/test_mfcc13.npy")
train_labels = np.load("Data/train_labels_mfcc13.npy")
test_labels = np.load("Data/test_labels_mfcc13.npy")

In [13]:
bs = 32

train_set = MyDataset(train_mfcc, train_labels)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
test_set = MyDataset(test_mfcc, test_labels)
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

In [14]:
class CNN_mfcc(nn.Module):
    def __init__(self):
        super(CNN_mfcc, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 5, kernel_size = (4,5),
                              stride = (1,1), padding = (1,1))
        self.conv2_bn1 = nn.BatchNorm2d(5)
        self.pool1 = nn.MaxPool2d((1, 2))
        self.conv2 = nn.Conv2d(5, 10, kernel_size = (2,4), stride = (2,2), padding = (6,2))
        self.conv2_bn2 = nn.BatchNorm2d(10)
        self.pool2 = nn.MaxPool2d((1, 2))
        self.conv3 = nn.Conv2d(10, 10, kernel_size = (2,4), stride = (2,2), padding = (6,1))
        self.conv2_bn3 = nn.BatchNorm2d(10)
        self.fc1 = nn.Linear(10 * 12 * 27, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 5)

    def forward(self, x):
#         print(x.shape)
        x = self.pool1(F.relu(self.conv2_bn1(self.conv1(x))))
#         print(x.shape)
        x = self.pool2(F.relu(self.conv2_bn2(self.conv2(x))))
#         print(x.shape)
        x = F.relu(self.conv2_bn3(self.conv3(x)))
#         print(x.shape)
        x = x.view(-1, 10*12*27)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x

In [15]:
model = CNN_mfcc() #.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

# optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

num_epochs = 10

In [16]:
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    test(model, test_loader, criterion, epoch)

Train(1)[0%]: Loss: 1.6042
Train(1)[50%]: Loss: 1.5356
Test(1): Loss: 1.4782, Accuracy: 41.9053%
Train(2)[0%]: Loss: 1.4521
Train(2)[50%]: Loss: 1.3791
Test(2): Loss: 1.4133, Accuracy: 47.8917%
Train(3)[0%]: Loss: 1.3203
Train(3)[50%]: Loss: 1.3596
Test(3): Loss: 1.4129, Accuracy: 48.5164%
Train(4)[0%]: Loss: 1.3375
Train(4)[50%]: Loss: 1.3264
Test(4): Loss: 1.4032, Accuracy: 49.4014%
Train(5)[0%]: Loss: 1.2834
Train(5)[50%]: Loss: 1.2996
Test(5): Loss: 1.3689, Accuracy: 53.8261%
Train(6)[0%]: Loss: 1.3184
Train(6)[50%]: Loss: 1.2872
Test(6): Loss: 1.3447, Accuracy: 55.4399%
Train(7)[0%]: Loss: 1.3113
Train(7)[50%]: Loss: 1.2583
Test(7): Loss: 1.3785, Accuracy: 52.5508%
Train(8)[0%]: Loss: 1.2779
Train(8)[50%]: Loss: 1.2386
Test(8): Loss: 1.3696, Accuracy: 53.2535%
Train(9)[0%]: Loss: 1.1491
Train(9)[50%]: Loss: 1.1988
Test(9): Loss: 1.3400, Accuracy: 55.8824%
Train(10)[0%]: Loss: 1.1839
Train(10)[50%]: Loss: 1.1960
Test(10): Loss: 1.3824, Accuracy: 52.8371%


## Spectrograms

In [27]:
class CNN_spec(nn.Module):
    def __init__(self):
        super(CNN_spec, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 5, kernel_size = (4,3),
                              stride = (2,2), padding = (1,1))
        self.conv2_bn1 = nn.BatchNorm2d(5)
        self.pool1 = nn.MaxPool2d((1, 2))
        self.conv2 = nn.Conv2d(5, 10, kernel_size = 4, stride = 2, padding = 1)
        self.conv2_bn2 = nn.BatchNorm2d(10)
        self.pool2 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(10, 10, kernel_size = (3,4), stride = 2, padding = 1)
        self.conv2_bn3 = nn.BatchNorm2d(10)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(10 * 4 * 13, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 5)

    def forward(self, x):
#         print(x.shape)
        x = self.pool1(F.relu(self.conv2_bn1(self.conv1(x))))
#         print(x.shape)
        x = self.pool2(F.relu(self.conv2_bn2(self.conv2(x))))
#         print(x.shape)
        x = F.relu(self.conv2_bn3(self.conv3(x)))
#         print(x.shape)
        x = x.view(-1, 10*4*13)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x

In [18]:
## Load data

train_stft1 = np.load("Data/train_stft-dB-1.npy", allow_pickle = True)
test_stft1 = np.load("Data/test_stft-dB-1.npy")
train_labels1 = np.load("Data/train_labels_stft-dB-1.npy")
test_labels1 = np.load("Data/test_labels_stft-dB-1.npy")

## Load new data
train_stft2 = np.load("Data/train_stft-dB-2.npy", allow_pickle = True)
test_stft2 = np.load("Data/test_stft-dB-2.npy")
train_labels2 = np.load("Data/train_labels_stft-dB-2.npy")
test_labels2 = np.load("Data/test_labels_stft-dB-2.npy")


In [19]:
train_stft = np.concatenate((train_stft1, train_stft2))
test_stft = np.concatenate((test_stft1, test_stft2))
train_labels = np.concatenate((train_labels1, train_labels2))
test_labels = np.concatenate((test_labels1, test_labels2))

In [20]:
bs = 32

train_set = MyDataset(train_stft, train_labels)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
test_set = MyDataset(test_stft, test_labels)
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

In [21]:
train_stft.shape

(2147, 64, 431)

In [28]:
model = CNN_spec() #.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)

# optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

num_epochs = 10

In [29]:
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    test(model, test_loader, criterion, epoch)

Train(1)[0%]: Loss: 1.6109
Train(1)[50%]: Loss: 1.5690
Test(1): Loss: 1.5301, Accuracy: 37.0611%
Train(2)[0%]: Loss: 1.4488
Train(2)[50%]: Loss: 1.4555
Test(2): Loss: 1.5499, Accuracy: 35.0455%
Train(3)[0%]: Loss: 1.4551
Train(3)[50%]: Loss: 1.3914
Test(3): Loss: 1.4649, Accuracy: 44.9935%
Train(4)[0%]: Loss: 1.3724
Train(4)[50%]: Loss: 1.3235
Test(4): Loss: 1.4490, Accuracy: 46.7490%
Train(5)[0%]: Loss: 1.3249
Train(5)[50%]: Loss: 1.3092
Test(5): Loss: 1.4141, Accuracy: 50.5202%
Train(6)[0%]: Loss: 1.1366
Train(6)[50%]: Loss: 1.2674
Test(6): Loss: 1.4727, Accuracy: 45.5137%
Train(7)[0%]: Loss: 1.2889
Train(7)[50%]: Loss: 1.2346
Test(7): Loss: 1.4158, Accuracy: 50.1300%
Train(8)[0%]: Loss: 1.2681
Train(8)[50%]: Loss: 1.2169
Test(8): Loss: 1.4544, Accuracy: 47.0091%
Train(9)[0%]: Loss: 1.1763
Train(9)[50%]: Loss: 1.1921
Test(9): Loss: 1.4191, Accuracy: 50.4551%
Train(10)[0%]: Loss: 1.2738
Train(10)[50%]: Loss: 1.1809
Test(10): Loss: 1.4008, Accuracy: 52.5358%


In [30]:
## Load new data
train_stft1 = np.load("Data/train_stft-dB-3.npy", allow_pickle = True)
test_stft1 = np.load("Data/test_stft-dB-3.npy")
train_labels1 = np.load("Data/train_labels_stft-dB-3.npy")
test_labels1 = np.load("Data/test_labels_stft-dB-3.npy")

train_stft2 = np.load("Data/train_stft-dB-4.npy", allow_pickle = True)
test_stft2 = np.load("Data/test_stft-dB-4.npy")
train_labels2 = np.load("Data/train_labels_stft-dB-4.npy")
test_labels2 = np.load("Data/test_labels_stft-dB-4.npy")

train_stft = np.concatenate((train_stft1, train_stft2))
test_stft = np.concatenate((test_stft1, test_stft2))
train_labels = np.concatenate((train_labels1, train_labels2))
test_labels = np.concatenate((test_labels1, test_labels2))

bs = 32

train_set = MyDataset(train_stft, train_labels)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
test_set = MyDataset(test_stft, test_labels)
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

num_epochs = 10
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    test(model, test_loader, criterion, epoch)

Train(1)[0%]: Loss: 1.3467
Train(1)[50%]: Loss: 1.3275
Test(1): Loss: 1.3622, Accuracy: 53.2552%
Train(2)[0%]: Loss: 1.2420
Train(2)[50%]: Loss: 1.2599
Test(2): Loss: 1.3596, Accuracy: 53.7760%
Train(3)[0%]: Loss: 1.1337
Train(3)[50%]: Loss: 1.2373
Test(3): Loss: 1.3748, Accuracy: 52.0833%
Train(4)[0%]: Loss: 1.2189
Train(4)[50%]: Loss: 1.2459
Test(4): Loss: 1.3503, Accuracy: 54.5573%
Train(5)[0%]: Loss: 1.2015
Train(5)[50%]: Loss: 1.2352
Test(5): Loss: 1.3461, Accuracy: 55.0130%
Train(6)[0%]: Loss: 1.2104
Train(6)[50%]: Loss: 1.2025
Test(6): Loss: 1.4152, Accuracy: 48.6328%
Train(7)[0%]: Loss: 1.1580
Train(7)[50%]: Loss: 1.2129
Test(7): Loss: 1.3786, Accuracy: 51.8229%
Train(8)[0%]: Loss: 1.2848
Train(8)[50%]: Loss: 1.1851
Test(8): Loss: 1.3484, Accuracy: 54.4922%
Train(9)[0%]: Loss: 1.2512
Train(9)[50%]: Loss: 1.1884
Test(9): Loss: 1.3381, Accuracy: 56.1198%
Train(10)[0%]: Loss: 1.0830
Train(10)[50%]: Loss: 1.1522
Test(10): Loss: 1.3441, Accuracy: 55.3385%


In [31]:
## Load new data
train_stft = np.load("Data/train_stft-dB-5.npy", allow_pickle = True)
test_stft = np.load("Data/test_stft-dB-5.npy")
train_labels = np.load("Data/train_labels_stft-dB-5.npy")
test_labels = np.load("Data/test_labels_stft-dB-5.npy")


bs = 32

train_set = MyDataset(train_stft, train_labels)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
test_set = MyDataset(test_stft, test_labels)
test_loader = DataLoader(test_set, batch_size=bs, shuffle=False)

num_epochs = 10
for epoch in range(1, num_epochs + 1):
    train(model, train_loader, criterion, optimizer, epoch)
    test(model, test_loader, criterion, epoch)

Train(1)[0%]: Loss: 1.2747
Train(1)[50%]: Loss: 1.3332
Test(1): Loss: 1.3427, Accuracy: 55.0781%
Train(2)[0%]: Loss: 1.4820
Train(2)[50%]: Loss: 1.2824
Test(2): Loss: 1.3350, Accuracy: 56.5104%
Train(3)[0%]: Loss: 1.1846
Train(3)[50%]: Loss: 1.2499
Test(3): Loss: 1.3890, Accuracy: 51.0417%
Train(4)[0%]: Loss: 1.2994
Train(4)[50%]: Loss: 1.2902
Test(4): Loss: 1.4046, Accuracy: 49.7396%
Train(5)[0%]: Loss: 1.4702
Train(5)[50%]: Loss: 1.2386
Test(5): Loss: 1.3667, Accuracy: 51.3021%
Train(6)[0%]: Loss: 1.1716
Train(6)[50%]: Loss: 1.2097
Test(6): Loss: 1.3462, Accuracy: 54.4271%
Train(7)[0%]: Loss: 1.1275
Train(7)[50%]: Loss: 1.1982
Test(7): Loss: 1.3724, Accuracy: 52.6042%
Train(8)[0%]: Loss: 1.2099
Train(8)[50%]: Loss: 1.1538
Test(8): Loss: 1.3562, Accuracy: 53.7760%
Train(9)[0%]: Loss: 1.2043
Train(9)[50%]: Loss: 1.1286
Test(9): Loss: 1.3844, Accuracy: 51.0417%
Train(10)[0%]: Loss: 1.1110
Train(10)[50%]: Loss: 1.1205
Test(10): Loss: 1.3377, Accuracy: 55.7292%
