Setup

In [1]:
# IPython magics: load the autoreload extension and switch it to mode 2, so
# edits to the imported project modules are picked up without restarting.
%load_ext autoreload
%autoreload 2

In [None]:
# Imports

from pipeline.data_read import *
from pipeline.image_prep import *
from pipeline.image_prep_torch import *
from pipeline.results_analysis import *

In [None]:
# Set constants

# Paths
# Which dataset variant to use; must be "small" or "large".
dataset = "small"

if dataset not in ("small", "large"):
    # Fail here instead of with a NameError on the first use of a path below.
    raise ValueError(f'Unknown dataset {dataset!r}; expected "small" or "large".')

# All three locations share one root and differ only by the dataset name.
data_root = "C:/Users/anany/Cambridge/Part II Project/data"
audio_folderpath = f"{data_root}/audio/{dataset}"
image_folderpath = f"{data_root}/images/{dataset}"
csv_filepath = f"{data_root}/{dataset}.csv"

image_type = "spectrogram"

# Image options
clip_length = 10
shift_length = 50
resize_dim = 30
save = False

# Model parameters
generate = False
split_ratio = 0.8
k_folds = 5 # Only relevant if cross_validation is True
cross_validation = True
epochs = 3
gen_batch_size = 100 # Only relevant if generate is True
model_batch_size = 32

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Report whether PyTorch can see a CUDA device (value is shown as the cell output).
torch.cuda.is_available()

In [None]:
# Genre lookup from the project helper, plus two parallel lists: its keys
# and the values stored under those keys.
genre_id = get_genres(audio_folderpath)
genre_list = list(genre_id.keys())
label_list = [genre_id[name] for name in genre_list]

if not save:
    # Saving is off: read the song index back from the existing CSV.
    audio_dict = get_audio_dict_csv(csv_filepath=csv_filepath)
    all_song_ids = list(audio_dict.keys())
else:
    # Saving is on: index the audio folder, write the images, then write the CSV.
    audio_dict = get_audio_dict(audio_folderpath, genre_id) # TODO: CHANGE TO MAKE??
    all_song_ids = list(audio_dict.keys())
    convert_and_save(
        all_song_ids,
        audio_dict=audio_dict,
        image_type=image_type,
        image_folderpath=image_folderpath,
    )
    create_csv(csv_filepath=csv_filepath, audio_dict=audio_dict)

# Song-level split used by the single-run (non cross-validation) path.
train_song_ids, val_song_ids = split_train_test(id_list=all_song_ids, ratio=split_ratio)

Training and testing

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision

from torch.utils.data import DataLoader

In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder

In [None]:
# gendataset

import torch
from torch.utils.data import Dataset, DataLoader

class GenDataset(Dataset):
    """Map-style dataset backed by the chunked ``generate_data_torch`` generator.

    Each chunk pulled from the generator is an ``(ids, images, labels)``
    triple. Only the current chunk of images is kept in memory, while clip ids
    and labels accumulate across chunks.

    Items must be requested in increasing index order starting at 0 (i.e. a
    ``DataLoader`` with ``shuffle=False``): ``__getitem__`` indexes into the
    current chunk relative to ``image_offset`` and only advances to the next
    chunk after the last item of the current one has been served.

    NOTE(review): requesting index 0 clears the cached state but does not
    re-create the generator, so a second pass relies on ``generate_data_torch``
    continuing to yield — confirm against its implementation.
    """

    def __init__(self, song_ids, audio_dict, clip_length, shift_length, resize_dim, label_list, image_type=None, image_folderpath=None, gen_batch_size = 1, mode = "train"):
        """Store the generator parameters and create the generator.

        All arguments are forwarded unchanged to ``generate_data_torch``;
        ``song_ids``, ``audio_dict``, ``clip_length`` and ``shift_length`` are
        additionally used by ``__len__``.
        """
        # generate_data_parameters
        self.song_ids = song_ids
        self.audio_dict = audio_dict
        self.clip_length = clip_length
        self.shift_length = shift_length
        self.resize_dim = resize_dim
        self.label_list = label_list
        self.image_type = image_type
        self.image_folderpath = image_folderpath
        self.gen_batch_size = gen_batch_size
        self.mode = mode

        # generator function keep-track
        self.gen_func = generate_data_torch(song_ids=self.song_ids, audio_dict=self.audio_dict, clip_length=self.clip_length, shift_length=self.shift_length, resize_dim=self.resize_dim, label_list=self.label_list, image_type=self.image_type, image_folderpath=self.image_folderpath, gen_batch_size = self.gen_batch_size, mode = self.mode)
        self._reset_state()

    def _reset_state(self):
        """Forget everything cached from the generator (start of a pass)."""
        self.clip_ids = []
        self.images = None  # current chunk of images, or None when exhausted
        self.labels = np.empty((0, 1))  # one row per clip seen so far
        self.image_offset = 0  # dataset index of the first image in self.images

    def __len__(self):
        """Return the expected number of clips, derived from the wav durations.

        Every song's wav file is opened to read its duration, so this call does
        file I/O proportional to the number of songs.
        """
        total = 0
        for song_id in self.song_ids:
            filepath = self.audio_dict[song_id]
            with wave.open(filepath, "rb") as wav_file:
                frame_count = wav_file.getnframes()
                sample_rate = wav_file.getframerate()
            audio_len = frame_count / sample_rate
            # Number of window positions, doubled.
            # NOTE(review): the factor 2 is presumably one clip per audio
            # channel, and shift_length is treated as being in the same unit
            # as clip_length — confirm against generate_data_torch.
            clip_count = int((audio_len - self.clip_length + self.shift_length) / self.shift_length) * 2
            # A song shorter than one clip must contribute nothing; without the
            # clamp it would subtract from the total (and could make it negative).
            total += max(clip_count, 0)
        return total

    def __getitem__(self, index):
        """Return ``(image, label)`` tensors for ``index``.

        The image has its last axis moved to the front and is cast to float;
        the label is the first element of its label row, cast to long.
        """
        # reset if index = 0
        if index == 0:
            self._reset_state()

        # Pull the next chunk when the previous one has been fully served.
        if self.images is None:
            ids, imgs, labels = next(self.gen_func)

            self.clip_ids.extend(ids)
            self.images = imgs
            self.labels = np.vstack((self.labels, labels))

        # Images are indexed relative to the current chunk, labels absolutely.
        position = index - self.image_offset
        out_image = self.images[position]
        out_label = self.labels[index]

        # Last image of the chunk: advance the offset and drop the chunk so the
        # next call fetches a new one.
        if position == (self.images.shape[0] - 1):
            self.image_offset += self.images.shape[0]
            self.images = None

        # assumes images are stored channels-last (H, W, C) — TODO confirm
        out_image = torch.tensor(out_image).permute(2, 0, 1).float()
        out_label = torch.tensor(out_label[0]).long()

        return out_image, out_label





In [None]:
# cnn

import torch.nn as nn
import torch.nn.functional as F

class Model(torch.nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    Args:
        image_dim: Side length of the square input images. Each of the two
            pooling stages halves it, so the flattened feature size is
            ``64 * (image_dim // 4) ** 2``.
        num_classes: Number of output logits (defaults to 5, the value that
            used to be hard-coded).
        in_channels: Number of input image channels (defaults to 1, the value
            that used to be hard-coded).
    """

    def __init__(self, image_dim, num_classes=5, in_channels=1):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding=1)
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        # padding=1 with a 3x3 kernel keeps the spatial size, so only the two
        # pooling steps shrink it (by a factor of 4 overall).
        self.fc1 = torch.nn.Linear(64 * (image_dim // 4) * (image_dim // 4), 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, num_classes)

    def forward(self, input):
        """Return raw class logits of shape ``(batch, num_classes)``."""
        x = self.pool(F.relu(self.conv1(input)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No softmax here: the logits are returned as-is.
        x = self.fc3(x)
        return x
    

In [None]:
if not cross_validation:
    # Single train/validation run (no k-fold).

    # define model
    model = Model(resize_dim)
    model = model.to(device)

    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # train and validation dataloaders
    # shuffle must stay False: GenDataset serves items in generator order.
    train_dataset = GenDataset(song_ids=train_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length, resize_dim=resize_dim, label_list=label_list, image_type=image_type, image_folderpath=image_folderpath, gen_batch_size=gen_batch_size, mode="train")
    train_dataloader = DataLoader(train_dataset, batch_size=model_batch_size, shuffle=False)

    val_dataset = GenDataset(song_ids=val_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length, resize_dim=resize_dim, label_list=label_list, image_type=image_type, image_folderpath=image_folderpath, gen_batch_size=gen_batch_size, mode="test")
    val_dataloader = DataLoader(val_dataset, batch_size=model_batch_size, shuffle=False)

    # Batch counts are computed once: len() on GenDataset opens every wav file.
    n_train_batches = len(train_dataloader)
    n_val_batches = len(val_dataloader)

    # empty tensors to keep track of loss, accuracy, predicted classes and actual classes
    train_clip_loss = torch.tensor([])
    val_clip_loss = torch.tensor([])

    train_clip_accuracy = torch.tensor([])
    val_clip_accuracy = torch.tensor([])

    train_clip_pred_classes = torch.tensor([], dtype=torch.long).to(device)
    train_clip_actual_classes = torch.tensor([], dtype=torch.long).to(device)

    val_clip_pred_classes = torch.tensor([], dtype=torch.long).to(device)
    val_clip_actual_classes = torch.tensor([], dtype=torch.long).to(device)


    for epoch in range(epochs):  # loop over the dataset multiple times
        print(f"Epoch {epoch + 1}     ", end="")
        # Per-clip predictions are only recorded for the final epoch.
        is_last_epoch = epoch == epochs - 1

        correct_train = 0
        total_train = 0
        running_loss_train = 0.0
        for inputs, labels in train_dataloader:
            # Move the batch to the configured device (works on CPU-only
            # machines too, unlike an unconditional .cuda()).
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output = model(inputs)
            _, predicted = torch.max(output.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            if is_last_epoch:
                train_clip_pred_classes = torch.cat((train_clip_pred_classes, predicted.clone().detach()))
                train_clip_actual_classes = torch.cat((train_clip_actual_classes, labels.clone().detach()))

            loss = loss_func(output, labels)
            loss.backward()

            optimizer.step()

            # accumulate statistics
            running_loss_train += loss.item()

        epoch_train_loss = running_loss_train/n_train_batches
        epoch_train_accuracy = correct_train/total_train
        train_clip_loss = torch.cat((train_clip_loss, torch.tensor([epoch_train_loss])), dim=0)
        train_clip_accuracy = torch.cat((train_clip_accuracy, torch.tensor([epoch_train_accuracy])), dim=0)

        print(f"Train Loss {epoch_train_loss}    Train Accuracy {epoch_train_accuracy}     ", end = "")

        correct_val = 0
        total_val = 0
        running_loss_val = 0.0

        # Validation pass: no gradients needed.
        with torch.no_grad():
            for inputs, labels in val_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                output = model(inputs)
                _, predicted = torch.max(output.data, 1)

                if is_last_epoch:
                    val_clip_pred_classes = torch.cat((val_clip_pred_classes, predicted.clone().detach()))
                    val_clip_actual_classes = torch.cat((val_clip_actual_classes, labels.clone().detach()))

                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()

                loss = loss_func(output, labels)
                running_loss_val += loss.item()

        epoch_val_loss = running_loss_val/n_val_batches
        epoch_val_accuracy = correct_val/total_val
        val_clip_loss = torch.cat((val_clip_loss, torch.tensor([epoch_val_loss])), dim=0)
        val_clip_accuracy = torch.cat((val_clip_accuracy, torch.tensor([epoch_val_accuracy])), dim=0)

        print(f"Val Loss {epoch_val_loss}    Val Accuracy {epoch_val_accuracy}")


    # One-hot "votes" per clip from the final epoch's predictions.
    train_clip_vote_array = F.one_hot(train_clip_pred_classes, num_classes=len(label_list))
    val_clip_vote_array = F.one_hot(val_clip_pred_classes, num_classes=len(label_list))


    print('Finished Training')

In [None]:
#if cross_validation:
# NOTE(review): the guard above is commented out, so this cell runs regardless
# of `cross_validation`. Later cells read `test_dataset` and
# `test_clip_pred_classes` from here even on their non-cross-validation path,
# so confirm the intent before re-enabling it.


# split train+val and test, make batches
train_val_song_ids, test_song_ids = split_train_test(all_song_ids, split_ratio)
batches = split_k_fold(train_val_song_ids, k_folds)

# Histories across folds: one row is appended per fold, one column per epoch.
train_clip_loss_cross = torch.tensor([]).to(device)
train_clip_accuracy_cross = torch.tensor([]).to(device)
val_clip_loss_cross = torch.tensor([]).to(device)
val_clip_accuracy_cross = torch.tensor([]).to(device)

# One test accuracy value is appended per fold.
test_clip_accuracy = torch.tensor([]).to(device)

# test dataloader (shared by all folds)
# shuffle must stay False: GenDataset serves items in generator order.
test_dataset = GenDataset(song_ids=test_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length, resize_dim=resize_dim, label_list=label_list, image_type=image_type, image_folderpath=image_folderpath, gen_batch_size=gen_batch_size, mode="test")
test_dataloader = DataLoader(test_dataset, batch_size=model_batch_size, shuffle=False)

# vote ids and arrays
# Validation votes are concatenated fold by fold.
val_clip_vote_ids = []
val_clip_vote_array = torch.empty((0, len(genre_list))).to(device)

# Test votes are summed over folds: every fold's model votes on every test clip.
test_clip_vote_ids = get_clip_ids(test_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length)
test_clip_vote_array = torch.zeros((len(test_clip_vote_ids), len(genre_list))).to(device)



for b in range(len(batches)):

    # define a fresh model for this fold
    model = Model(resize_dim)
    model = model.to(device)

    # loss function and optimizer
    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # loss and accuracy - keep track (one entry per epoch of this fold)
    train_clip_loss = torch.tensor([]).to(device)
    train_clip_accuracy = torch.tensor([]).to(device)

    val_clip_loss = torch.tensor([]).to(device)
    val_clip_accuracy = torch.tensor([]).to(device)

    # train/val dataloaders: fold b validates, all other folds (flattened) train.
    train_song_ids = [item for batch in batches[:b] + batches[b + 1:] for item in batch]
    val_song_ids = batches[b]

    train_dataset = GenDataset(song_ids=train_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length, resize_dim=resize_dim, label_list=label_list, image_type=image_type, image_folderpath=image_folderpath, gen_batch_size=gen_batch_size, mode="train")
    train_dataloader = DataLoader(train_dataset, batch_size=model_batch_size, shuffle=False)

    val_dataset = GenDataset(song_ids=val_song_ids, audio_dict=audio_dict, clip_length=clip_length, shift_length=shift_length, resize_dim=resize_dim, label_list=label_list, image_type=image_type, image_folderpath=image_folderpath, gen_batch_size=gen_batch_size, mode="test")
    val_dataloader = DataLoader(val_dataset, batch_size=model_batch_size, shuffle=False)

    # Batch counts are computed once per fold: len() on GenDataset opens every wav file.
    n_train_batches = len(train_dataloader)
    n_val_batches = len(val_dataloader)

    # train
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}     ", end="")
        # Validation votes are only recorded for the final epoch.
        is_last_epoch = epoch == epochs - 1

        correct_train = 0
        total_train = 0
        running_loss_train = 0.0

        for inputs, labels in train_dataloader:
            # Move the batch to the configured device (works on CPU-only
            # machines too, unlike an unconditional .cuda()).
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # forward
            output = model(inputs)
            _, predicted = torch.max(output.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            # backward
            loss = loss_func(output, labels)
            loss.backward()
            optimizer.step()
            running_loss_train += loss.item()

        # update train loss and train accuracy
        epoch_train_loss = running_loss_train/n_train_batches
        epoch_train_accuracy = correct_train/total_train
        train_clip_loss = torch.cat((train_clip_loss, torch.tensor([epoch_train_loss]).to(device)), dim=0)
        train_clip_accuracy = torch.cat((train_clip_accuracy, torch.tensor([epoch_train_accuracy]).to(device)), dim=0)

        print(f"Train Loss {epoch_train_loss}    Train Accuracy {epoch_train_accuracy}     ", end = "")


        # validation
        correct_val = 0
        total_val = 0
        running_loss_val = 0.0

        val_clip_pred_classes = torch.tensor([], dtype=torch.long).to(device)
        val_clip_actual_classes = torch.tensor([], dtype=torch.long).to(device)

        with torch.no_grad():
            for inputs, labels in val_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                output = model(inputs)
                _, predicted = torch.max(output.data, 1)

                if is_last_epoch:
                    val_clip_pred_classes = torch.cat((val_clip_pred_classes, predicted.clone().detach()))
                    val_clip_actual_classes = torch.cat((val_clip_actual_classes, labels.clone().detach()))

                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()

                loss = loss_func(output, labels)
                running_loss_val += loss.item()

        # update val loss and val accuracy
        epoch_val_loss = running_loss_val/n_val_batches
        epoch_val_accuracy = correct_val/total_val
        val_clip_loss = torch.cat((val_clip_loss, torch.tensor([epoch_val_loss]).to(device)), dim=0)
        val_clip_accuracy = torch.cat((val_clip_accuracy, torch.tensor([epoch_val_accuracy]).to(device)), dim=0)

        # update val vote ids and array with this fold's final-epoch predictions
        if is_last_epoch:
            val_clip_vote_ids.extend(val_dataset.clip_ids)
            new_val_clip_vote_array = F.one_hot(val_clip_pred_classes, num_classes=len(label_list)).to(device)
            val_clip_vote_array = torch.cat((val_clip_vote_array, new_val_clip_vote_array), dim=0)

        print(f"Val Loss {epoch_val_loss}    Val Accuracy {epoch_val_accuracy}")

    # test: evaluate this fold's trained model on the held-out test songs
    correct_test = 0
    total_test = 0
    running_loss_test = 0.0

    test_clip_pred_classes = torch.tensor([], dtype=torch.long).to(device)
    test_clip_actual_classes = torch.tensor([], dtype=torch.long).to(device)

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            output = model(inputs)
            _, predicted = torch.max(output.data, 1)

            test_clip_pred_classes = torch.cat((test_clip_pred_classes, predicted.clone().detach()))
            test_clip_actual_classes = torch.cat((test_clip_actual_classes, labels.clone().detach()))

            total_test += labels.size(0)
            correct_test += (predicted == labels).sum().item()

            loss = loss_func(output, labels)
            running_loss_test += loss.item()

    # update test accuracy: the fraction of correctly classified clips
    # (previously the mean loss was stored and printed under this name).
    fold_test_accuracy = correct_test/total_test
    test_clip_accuracy = torch.cat((test_clip_accuracy, torch.tensor([fold_test_accuracy]).to(device)), dim=0)

    # update test vote array: add this fold's one-hot votes element-wise
    new_test_clip_vote_array = F.one_hot(test_clip_pred_classes, num_classes=len(label_list)).to(device)
    test_clip_vote_array = torch.add(test_clip_vote_array, new_test_clip_vote_array)

    print(f"Batch {b + 1}/{len(batches)} done.   Test Accuracy {fold_test_accuracy}")

    # Append this fold's per-epoch curves as a new row of the cross-fold histories.
    train_clip_loss_cross = torch.cat((train_clip_loss_cross, train_clip_loss.unsqueeze(0)), dim = 0)
    train_clip_accuracy_cross = torch.cat((train_clip_accuracy_cross, train_clip_accuracy.unsqueeze(0)), dim = 0)

    val_clip_loss_cross = torch.cat((val_clip_loss_cross, val_clip_loss.unsqueeze(0)), dim = 0)
    val_clip_accuracy_cross = torch.cat((val_clip_accuracy_cross, val_clip_accuracy.unsqueeze(0)), dim = 0)



In [None]:
def _normalize_columns(confusion, n_classes):
    """Return a float tensor in which each column of *confusion* is divided by its sum.

    A column whose sum is zero produces NaN entries, as the inline loops this
    helper replaces did.
    """
    normalized = torch.zeros((n_classes, n_classes))
    for col in range(n_classes):
        column = confusion[:, col]
        normalized[:, col] = column / column.sum()
    return normalized


if cross_validation:
    val_clip_confusion = get_confusion_matrix(vote_ids=val_clip_vote_ids, vote_array=val_clip_vote_array, label_list=label_list)
    # The test matrix must be built from the test votes; it previously received
    # the validation vote array, which does not line up with test_clip_vote_ids.
    test_clip_confusion = get_confusion_matrix(vote_ids=test_clip_vote_ids, vote_array=test_clip_vote_array, label_list=label_list)

    norm_val_clip_confusion = _normalize_columns(val_clip_confusion, len(label_list))
    norm_test_clip_confusion = _normalize_columns(test_clip_confusion, len(label_list))
else:
    train_clip_confusion = get_confusion_matrix(vote_ids=train_dataset.clip_ids, vote_array=train_clip_vote_array, label_list=label_list)
    val_clip_confusion = get_confusion_matrix(vote_ids=val_dataset.clip_ids, vote_array=val_clip_vote_array, label_list=label_list)

    norm_train_clip_confusion = _normalize_columns(train_clip_confusion, len(label_list))
    norm_val_clip_confusion = _normalize_columns(val_clip_confusion, len(label_list))

Analysis

In [None]:
# Disabled scratch plot of the fold-averaged training loss. It is kept as a
# bare string literal, so none of it is executed.
'''temp = train_clip_loss_cross.clone().cpu().numpy()

plt.plot(sum(temp)/len(temp), label='Average Training Loss')
#plt.plot(sum(val_clip_loss_cross)/len(val_clip_loss_cross), label='Average Validation Loss')
plt.title('Average Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()'''

In [None]:
# Loss Function

if cross_validation:
    # The cross-fold histories hold one row per fold and one column per epoch;
    # collapse the folds into a single mean curve per split.
    fold_train_loss = train_clip_loss_cross.clone().cpu().numpy()
    fold_val_loss = val_clip_loss_cross.clone().cpu().numpy()

    train_clip_loss = sum(fold_train_loss)/len(fold_train_loss)
    val_clip_loss = sum(fold_val_loss)/len(fold_val_loss)


# Draw both curves on the same axes, training first.
for curve, caption in ((train_clip_loss, 'Training Loss'), (val_clip_loss, 'Validation Loss')):
    plt.plot(curve, label=caption)

plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Accuracy Function

if cross_validation:
    # The cross-fold histories hold one row per fold and one column per epoch;
    # average over the folds to get one curve per split.
    train_fold_accuracy = train_clip_accuracy_cross.clone().cpu().numpy()
    train_clip_accuracy = sum(train_fold_accuracy)/len(train_fold_accuracy)

    # Divide by the number of folds in the accuracy history itself. The
    # previous divisor, len(val_clip_loss), measured an unrelated array whose
    # length is the number of epochs.
    val_fold_accuracy = val_clip_accuracy_cross.clone().cpu().numpy()
    val_clip_accuracy = sum(val_fold_accuracy)/len(val_fold_accuracy)

plt.plot(train_clip_accuracy, label='Training Accuracy')
plt.plot(val_clip_accuracy, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Validation Clip Confusion Matrix
import matplotlib.cm as cm


fig, ax = plt.subplots()
im = ax.imshow(norm_val_clip_confusion)#, cmap = "gray_r")

# Pin one tick per class before labelling it. Relabelling whatever ticks
# matplotlib happened to pick by default silently misaligns the genre names
# whenever the default tick positions are not exactly -1..n.
tick_positions = list(range(len(genre_list)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(genre_list, rotation = 45, ha = "right")
ax.set_yticklabels(genre_list)

ax.set_xlabel("Actual class")
ax.set_ylabel("Predicted class")
ax.set_title("Validation Clip Confusion Matrix")

# Colour scale taken from the drawn image so the bar always matches the plot.
cbar = fig.colorbar(im, ax=ax)

plt.show()

In [None]:
# Test Clip Confusion Matrix

fig, ax = plt.subplots()
im = ax.imshow(norm_test_clip_confusion, cmap = "viridis")

# Pin one tick per class before labelling it. Relabelling whatever ticks
# matplotlib happened to pick by default silently misaligns the genre names
# whenever the default tick positions are not exactly -1..n.
tick_positions = list(range(len(genre_list)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(genre_list, rotation = 45, ha = "right")
ax.set_yticklabels(genre_list)

ax.set_xlabel("Actual class")
ax.set_ylabel("Predicted class")
ax.set_title("Test Clip Confusion Matrix")

# Colour scale taken from the drawn test image. The colorbar was previously
# built from the validation matrix, so its range did not describe this plot.
cbar = fig.colorbar(im, ax=ax)

plt.show()

In [None]:
# Choose which per-clip predictions to inspect: the summed votes across folds
# when cross-validating, otherwise the predictions from the last test pass.
if cross_validation:
    _, test_clip_pred_classes_cross = torch.max(test_clip_vote_array, dim=1)
    clip_predictions = test_clip_pred_classes_cross
else:
    clip_predictions = test_clip_pred_classes

# NOTE(review): 1 and 4 are presumably the two class indices being compared
# (see the genre listing cell) — confirm against classified_as.
c = classified_as(test_dataset.clip_ids, clip_predictions, test_clip_actual_classes, 1, 4)

if len(c) == 0:
    print("No such clips.")
else:
    # Show one randomly chosen match: its id, its audio_dict entry, and a player.
    example = random.randint(0, len(c) - 1)
    print(c[example])
    print(audio_dict[c[example][:4]])
    display(play(c[example], audio_dict=audio_dict))

In [None]:
# Song-level results: clip votes are aggregated per song by get_song_votes and
# the highest-scoring class of each row is taken as the song's prediction.
n_labels = len(label_list)

# Only the ids differ between the two modes; the computation is shared.
if cross_validation:
    val_vote_song_ids = train_val_song_ids
    val_vote_clip_ids = val_clip_vote_ids
else:
    val_vote_song_ids = val_song_ids
    val_vote_clip_ids = val_dataset.clip_ids

val_song_vote_array = get_song_votes(val_vote_song_ids, val_vote_clip_ids, val_clip_vote_array.clone())
val_song_pred_classes = val_song_vote_array.argmax(axis=1)
val_song_actual_classes = get_labels(val_vote_song_ids, label_list)

# Rows index the predicted class, columns the actual class.
val_song_confusion = torch.zeros((n_labels, n_labels))
for song_idx in range(len(val_song_pred_classes)):
    val_song_confusion[val_song_pred_classes[song_idx]][val_song_actual_classes[song_idx]] += 1

# Each column is divided by its own sum.
norm_val_song_confusion = np.zeros((n_labels, n_labels))
for col in range(n_labels):
    norm_val_song_confusion[:, col] = val_song_confusion[:, col]/sum(val_song_confusion[:, col])


if cross_validation:
    # Test-set songs are only evaluated in the cross-validation mode.
    test_song_vote_array = get_song_votes(test_song_ids, test_dataset.clip_ids, test_clip_vote_array)
    test_song_pred_classes = test_song_vote_array.argmax(axis=1)
    test_song_actual_classes = get_labels(test_song_ids, label_list)

    test_song_confusion = np.zeros((n_labels, n_labels))
    for song_idx in range(len(test_song_pred_classes)):
        test_song_confusion[test_song_pred_classes[song_idx]][test_song_actual_classes[song_idx]] += 1

    norm_test_song_confusion = np.zeros((n_labels, n_labels))
    for col in range(n_labels):
        norm_test_song_confusion[:, col] = test_song_confusion[:, col]/sum(test_song_confusion[:, col])


In [None]:
# Validation Song Confusion Matrix
import matplotlib.cm as cm

fig, ax = plt.subplots()
im = ax.imshow(norm_val_song_confusion)#, cmap = "gray_r")

# Pin one tick per class before labelling it. Relabelling whatever ticks
# matplotlib happened to pick by default silently misaligns the genre names
# whenever the default tick positions are not exactly -1..n.
tick_positions = list(range(len(genre_list)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(genre_list, rotation = 45, ha = "right")
ax.set_yticklabels(genre_list)

ax.set_xlabel("Actual class")
ax.set_ylabel("Predicted class")
ax.set_title("Validation Song Confusion Matrix")

# Colour scale taken from the drawn image so the bar always matches the plot.
cbar = fig.colorbar(im, ax=ax)

plt.show()

In [None]:
# Test Song Confusion Matrix

# The test song matrix is only computed in the cross-validation mode.
if cross_validation:
    fig, ax = plt.subplots()
    im = ax.imshow(norm_test_song_confusion, cmap = "viridis")

    # Pin one tick per class before labelling it. Relabelling whatever ticks
    # matplotlib happened to pick by default silently misaligns the genre
    # names whenever the default tick positions are not exactly -1..n.
    tick_positions = list(range(len(genre_list)))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(genre_list, rotation = 45, ha = "right")
    ax.set_yticklabels(genre_list)

    ax.set_xlabel("Actual class")
    ax.set_ylabel("Predicted class")
    ax.set_title("Test Song Confusion Matrix")

    # Colour scale taken from the drawn test image. The colorbar was
    # previously built from the validation matrix, so its range did not
    # describe this plot.
    cbar = fig.colorbar(im, ax=ax)

    plt.show()

# Demo

In [None]:
# List every genre name next to its position in genre_list.
print("GENRES")
for position, genre_name in enumerate(genre_list):
    print(position, genre_name)

In [None]:
# Hand-picked clip ids for the demo cells below; `sample_id` selects which one
# is classified and played.
# NOTE(review): ids look like "<song id>_ch<channel>_<start>_<end>" — confirm
# against the pipeline's clip-id format.
# NOTE(review): the numeric suffix (0-4) presumably matches the genre indices
# printed above — TODO confirm.
sample_id_0 = 'b122_ch0_40.0_50.0'
sample_id_1 = 'h094_ch0_45.0_55.0'
sample_id_2 = 'j011_ch1_50.0_60.0'
sample_id_3 = 'p224_ch0_115.0_125.0'
sample_id_4 = 'r031_ch1_90.0_100.0'

# NOTE(review): the "m" variants are presumably misclassified examples of the
# same classes — TODO confirm.
sample_id_0m = 'b044_ch1_60.0_70.0'
sample_id_1m = 'h071_ch1_75.0_85.0'
sample_id_2m = 'j070_ch1_75.0_85.0'
sample_id_3m = 'p124_ch1_135.0_145.0'
sample_id_4m = 'r167_ch0_130.0_140.0'

# Clip used by the cells that follow.
sample_id = sample_id_3m

In [None]:
# The first four characters of the clip id are used as the song id.
sample_song_id = sample_id[:4]

# Build the data for that one song, then locate the requested clip in it.
sample_song_ids, sample_img_arrays, sample_labels = return_data(
    id_list = [sample_song_id],
    audio_dict=audio_dict,
    clip_length=clip_length,
    shift_length=shift_length,
    resize_dim=resize_dim,
    label_list=label_list,
    image_type = image_type,
    image_folderpath=image_folderpath,
)
index = sample_song_ids.index(sample_id)

# Same conversion GenDataset applies, plus a leading batch axis of size 1.
sample_img = torch.tensor(sample_img_arrays[index]).permute(2, 0, 1).float().unsqueeze(0).to(device)
sample_label = torch.tensor(sample_labels[index][0]).long().unsqueeze(0).to(device)


# Inference only: no gradients needed. `model` is whichever model was trained last.
with torch.no_grad():
    sample_output = model(sample_img)
    sample_pred = torch.max(sample_output.data, 1).indices


print(f"Predicted genre: {genre_list[sample_pred.item()]}")
print(f"Actual genre: {genre_list[sample_label.item()]}")

In [None]:
# Call the project's play helper on the selected clip; its return value is the cell output.
play(sample_id, audio_dict=audio_dict)