Until now we have been training the model with PNG spectrograms, which are colour images and therefore have 3 channels. The colour is just a colour map; it doesn't actually mean anything, so we might as well train with the raw spectrogram values. 

I calculated the spectrograms and saved them all as .npy files, so I'll need to figure out how to get those into the data loader, then change the input channels to 1. 



In [1]:
# Import necessary libraries.
import os
import glob
import imageio
import random, shutil
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as display
import librosa
import librosa.display
import pandas as pd
import seaborn as sns
import requests
from zipfile import ZipFile

In [2]:
fname = "music.zip"
url = "https://osf.io/drjhb/download"

# Switch to the project data folder *before* checking for / downloading the
# archive, so the download and the extraction below both refer to the same
# location. (The backslash before "Music" is now escaped like the others.)
os.chdir('C:\\Users\\ipzac\\Documents\\Project Data\\Music')

if not os.path.isfile(fname):
    try:
        r = requests.get(url)
    except requests.ConnectionError:
        print("!!! Failed to download data !!!")
    else:
        # This branch belongs to the `try`: it runs only when the request
        # itself succeeded, so `r` is always defined here.
        if r.status_code != requests.codes.ok:
            print("!!! Failed to download data !!!")
        else:
            with open(fname, "wb") as fid:
                fid.write(r.content)

if os.path.isfile(fname):
    with ZipFile(fname, 'r') as zipObj:
        # Extract all the contents of the zip file into the current directory
        zipObj.extractall()
else:
    print("!!! " + fname + " is not available, nothing was extracted !!!")

KeyboardInterrupt: 

In [2]:
#Utils

def set_device():
    """Return the name of the torch device to use and print its status.

    Returns:
        "cuda" when a CUDA device is available, otherwise "cpu". A warning
        is printed in the CPU case, a confirmation in the GPU case.
    """
    if torch.cuda.is_available():
        print("GPU is enabled in this notebook.")
        return "cuda"

    # No GPU: tell the user how to enable one, then fall back to the CPU.
    print("WARNING: For this notebook to perform best, "
          "if possible, in the menu under `Runtime` -> "
          "`Change runtime type.`  select `GPU` ")
    return "cpu"


#  Plotting function.

def plot_loss_accuracy(train_loss, train_acc, validation_loss, validation_acc):
    """Plot training/validation loss and accuracy curves side by side.

    Args:
        train_loss: per-epoch training loss values.
        train_acc: per-epoch training accuracy values.
        validation_loss: per-epoch validation loss values.
        validation_acc: per-epoch validation accuracy values.

    The x axis covers ``len(train_loss)`` epochs, so all four sequences are
    expected to have that length.
    """
    epoch_axis = list(range(len(train_loss)))
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2)

    # (axis, training curve, validation curve, labels...) for each panel.
    panels = (
        (loss_ax, train_loss, validation_loss,
         'Training Loss', 'Validation Loss', 'Loss', 'Epoch vs Loss'),
        (acc_ax, train_acc, validation_acc,
         'Training Accuracy', 'Validation Accuracy', 'Accuracy',
         'Epoch vs Accuracy'),
    )
    for axis, train_curve, val_curve, train_label, val_label, y_label, title in panels:
        axis.plot(epoch_axis, train_curve, label=train_label)
        axis.plot(epoch_axis, val_curve, label=val_label)
        axis.set_xlabel('Epochs')
        axis.set_ylabel(y_label)
        axis.set_title(title)
        axis.legend()

    fig.set_size_inches(15.5, 5.5)
    plt.show()
    
def apply_sliding_window(in_dir, out_dir, window_length = 3, hop_length = 1, 
                         save_as_tensor = False):
    """Cut every spectrogram under ``in_dir`` into overlapping windows.

    ``in_dir`` is expected to contain one sub-folder per genre, each holding
    ``.npy`` spectrogram files. For every file, windows are taken along the
    last axis and each window is written to ``out_dir/<genre>/<name>_<i>``.

    IMPORTANT NOTE - the seconds-to-frames conversion uses librosa's default
    sample rate / hop length / n_fft, so the spectrograms must have been made
    with those defaults.

    Args:
        in_dir: folder containing the genre sub-folders.
        out_dir: folder that receives one sub-folder per genre.
        window_length: window length in seconds.
        hop_length: hop between consecutive windows in seconds.
        save_as_tensor: if True save each window with ``torch.save`` as
            ``.pt``; otherwise save it with ``np.save`` as ``.npy``.
    """
    # window and hop length units are in seconds; convert them to frames.
    window_size = int(librosa.time_to_frames(window_length))
    hop_size = int(librosa.time_to_frames(hop_length))

    # get list of genre folders (join, so a missing trailing separator on
    # in_dir cannot turn this into a prefix match on sibling folders)
    genre_folders = glob.glob(os.path.join(in_dir, "*"))

    # loop over genre folders
    for g in genre_folders:
        if not os.path.isdir(g):
            continue
        spec_files = glob.glob(os.path.join(g, "*.npy"))

        # Make sure save directory exists
        out_folder = os.path.join(out_dir, os.path.basename(g))
        os.makedirs(out_folder, exist_ok=True)

        # loop over spectrogram files
        for f in spec_files:
            # load spectrogram from its full path
            spec = np.load(f)

            # file name without the extension (splitext removes only the
            # suffix; str.strip would also eat matching leading characters)
            save_name = os.path.splitext(os.path.basename(f))[0]

            # apply sliding frame to spectrogram, then move the window index
            # to the front so iteration yields one window at a time
            all_frames = librosa.util.frame(
                spec, frame_length=window_size, hop_length=hop_size)
            all_frames = np.moveaxis(all_frames, 2, 0)

            # loop over individual frames
            for i, frame in enumerate(all_frames):
                out_stem = os.path.join(out_folder, save_name + "_" + str(i))
                # Each window is a strided view into `spec`; copy it so only
                # the window itself is serialized.
                frame = np.ascontiguousarray(frame)

                if save_as_tensor:
                    # convert np array to tensor and save it
                    torch.save(torch.from_numpy(frame), out_stem + ".pt")
                else:
                    # save the numpy window itself
                    np.save(out_stem + ".npy", frame)

In [3]:
# Pick the compute device once ("cuda" or "cpu"); later cells move the model
# and the batches to it.
device = set_device()

GPU is enabled in this notebook.


## Make Mel Spectrograms with Librosa

In [4]:
in_folder_path = "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\genres_original\\"
out_folder_path = "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\spectrograms_30s\\"
os.makedirs(out_folder_path, exist_ok=True)

#get list of genre folders
genre_folders = os.listdir(in_folder_path)

#initialize empty list of problem files:
problem_files = []

#loop over genre folders
print('starting loop')
for genre_folder in genre_folders:
    #get list of individual .wav files (full paths are built below, so the
    #working directory is left untouched)
    genre_in_dir = os.path.join(in_folder_path, genre_folder)
    wav_files = os.listdir(genre_in_dir)

    #get output folder path and make it if it doesn't exist
    out_folder = os.path.join(out_folder_path, genre_folder)
    os.makedirs(out_folder, exist_ok=True)

    #loop over list of wav files
    print(genre_folder)

    for wav_file in wav_files:
        #file name without its extension (splitext removes only the suffix)
        file_name = os.path.splitext(os.path.basename(wav_file))[0]
        try:
            #load wav file
            y, sr = librosa.load(os.path.join(genre_in_dir, wav_file))

            #normalize audio
            y = (y - y.mean()) / y.std()

            #calculate mel spectrogram
            spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            # NOTE(review): melspectrogram returns a power spectrogram by
            # default, for which librosa.power_to_db is the usual conversion.
            # amplitude_to_db is kept so existing data stays reproducible -
            # confirm which one is intended.
            spectrogram_db = librosa.amplitude_to_db(spectrogram, ref=np.max)

            #global normalization of the spectrogram (not frequency band normalization...)
            spectrogram_norm = (spectrogram_db - spectrogram_db.mean()) / spectrogram_db.std()

            #a silent / constant file produces NaNs here; treat it as a problem file
            if not np.isfinite(spectrogram_norm).all():
                raise ValueError("non-finite values in normalized spectrogram")

            #save spectrogram (np.save appends the .npy extension)
            np.save(os.path.join(out_folder, file_name), spectrogram_norm)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the loop.
            print("Something is wrong with " + file_name + " so it has been skipped")
            # list.append returns None, so do not re-assign its result
            problem_files.append(wav_file)

starting loop
blues
classical
country
disco
hiphop
jazz




Something is wrong with jazz.00054 so it has been skipped
metal
pop
reggae
rock


## Split training and test data

In [5]:
## EDIT HERE TO CHANGE SIZE OF TEST AND VAL DATASETS
test_prop = 0.15
val_prop = 0.15

train_prop = 1 - test_prop - val_prop

# Create folder with training, testing and validation data.

spectrograms_dir = "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\spectrograms_30s"
folder_names = ["C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\train\\", 
                "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\test\\", 
                "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\val\\"]

genres = list(os.listdir(spectrograms_dir))

train_dir = folder_names[0]
test_dir = folder_names[1]
val_dir = folder_names[2]

# Start every run from empty split folders.
for f in folder_names:
    if os.path.exists(f):
        shutil.rmtree(f)
    os.makedirs(f)

# Loop over all genres
for g in genres:
    # The split folders were just recreated, so the genre sub-folders only
    # need to be created (no exists/rmtree dance required).
    for split_dir in folder_names:
        os.makedirs(os.path.join(split_dir, g), exist_ok=True)

    # Sort so the split is reproducible: glob makes no ordering guarantee.
    src_file_paths = sorted(glob.glob(os.path.join(spectrograms_dir, g, '*')))
    # find all spectrograms & split in train, test, and validation
    # random.shuffle(src_file_paths)

    test_idx = int(len(src_file_paths) * test_prop)
    val_idx = test_idx + int(len(src_file_paths) * val_prop)

    test_files = src_file_paths[0:test_idx]
    val_files = src_file_paths[test_idx:val_idx]
    train_files = src_file_paths[val_idx:]

    # copy each subset into its split folder; a failed copy is reported and
    # the remaining files are still processed
    for split_dir, split_files in ((train_dir, train_files),
                                   (val_dir, val_files),
                                   (test_dir, test_files)):
        for f in split_files:
            try:
                shutil.copy(f, os.path.join(split_dir, g, os.path.basename(f)))
            except OSError as e:
                print(e)

## Apply sliding window to data

In [6]:
# Output folders for the windowed spectrograms, one per split.
cropped_names = ["C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\train_cropped\\", 
                "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\test_cropped\\", 
                "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\val_cropped\\"]

# Remove any previous output so every run starts from empty folders.
for f in cropped_names:
    if os.path.exists(f):
        shutil.rmtree(f)
    os.mkdir(f)

# Cut Data
# Source folders with the full-length spectrograms, listed in the same order
# as cropped_names (train, test, val).
split_sources = ["C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\train\\",
                 "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\test\\",
                 "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\val\\"]
for src_dir, dst_dir in zip(split_sources, cropped_names):
    apply_sliding_window(src_dir, dst_dir, save_as_tensor=True)



## Dataloader - torch version

In [7]:
# Data loading.
# Folders with the windowed spectrogram tensors (one sub-folder per genre).
train_dir = "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\train_cropped\\"
val_dir =  "C:\\Users\\ipzac\\Documents\\Project Data\\Music\\Data\\val_cropped\\"

# Masking augmentation for the training samples only: three time masks and
# three frequency masks of decreasing maximum width.
# NOTE(review): samples are passed to the masks exactly as torch.load returns
# them; if a torchaudio release raises IndexError here, check whether it
# expects a leading channel dimension - confirm.
masking_steps = [
    torchaudio.transforms.TimeMasking(time_mask_param=40, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=40, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=30, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=20, iid_masks=True),
]

# Both loaders share the same batching options.
loader_options = dict(batch_size=100, shuffle=True, num_workers=0, drop_last=True)

train_dataset = datasets.DatasetFolder(
    train_dir,
    loader=torch.load,
    extensions=".pt",
    transform=transforms.Compose(masking_steps),
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Validation data is loaded without augmentation.
val_dataset = datasets.DatasetFolder(val_dir, loader=torch.load, extensions=".pt")
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

## Train Default Network

In [8]:
# Make a CNN & train it to predict genres.

class music_net(nn.Module):
    """Five-block CNN that maps a 1-channel spectrogram to 10 genre scores.

    Each block is conv(3x3, no padding) -> batch norm -> ReLU -> 2x2 max pool.
    The final linear layer expects 512 flattened features, which is what a
    128 x 129 input produces (128 channels x 2 x 2 after the fifth block).

    ``forward`` returns raw logits. The training code uses
    ``nn.CrossEntropyLoss``, which applies log-softmax itself, so no softmax
    is applied here; call ``F.softmax(logits, dim=1)`` when probabilities are
    needed.
    """

    def __init__(self):
        """Initialize neural net layers"""
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=0) # single input channel (raw spectrogram)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=512, kernel_size=3, stride=1, padding=0)
        self.conv4 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, stride=1, padding=0)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=0)
        self.fc1 = nn.Linear(in_features=512, out_features=10) 
        
        self.batchnorm1 = nn.BatchNorm2d(num_features=64)
        self.batchnorm2 = nn.BatchNorm2d(num_features=128)
        self.batchnorm3 = nn.BatchNorm2d(num_features=512)
        self.batchnorm4 = nn.BatchNorm2d(num_features=256)
        self.batchnorm5 = nn.BatchNorm2d(num_features=128)

        # One dropout module, reused before conv3, conv4 and the linear layer.
        self.dropout = nn.Dropout(p=0.5, inplace=False)


    def forward(self, x):
        """Return class logits of shape (batch, 10) for input (batch, 1, H, W)."""
    # Conv layer 1.
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 2.
        x = self.conv2(x)
        x = self.batchnorm2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 3.
        x = self.dropout(x)
        x = self.conv3(x)
        x = self.batchnorm3(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 4.
        x = self.dropout(x)
        x = self.conv4(x)
        x = self.batchnorm4(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 5.
        x = self.conv5(x)
        x = self.batchnorm5(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)

    # Fully connected layer 1.
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        # No softmax: the loss expects logits, and argmax over logits gives
        # the same predicted class as argmax over probabilities.
        x = self.fc1(x)

        return x


def train(model, device, train_loader, validation_loader, epochs):
    """Train ``model`` with Adam and cross-entropy, validating after each epoch.

    Args:
        model: network to optimize; it is called on batches shaped
            (batch, 1, height, width).
        device: device the batches are moved to (e.g. "cuda" or "cpu").
        train_loader: iterable of (data, target) training batches.
        validation_loader: iterable of (data, target) validation batches.
        epochs: number of passes over ``train_loader``.

    Returns:
        (train_loss, train_acc, validation_loss, validation_acc): four lists
        of Python floats with one entry per epoch.
    """
    criterion =  nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)
    train_loss, validation_loss = [], []
    train_acc, validation_acc = [], []
    with tqdm(range(epochs), unit='epoch') as tepochs:
        tepochs.set_description('Training')
        for epoch in tepochs:
            model.train()
            # keep track of the running loss
            running_loss = 0.
            correct, total = 0, 0

            for data, target in train_loader:
                # getting the training set
                data, target = data.to(device), target.to(device)

                # Add the single channel dimension the conv layers expect.
                # The batch and image sizes are read from the batch itself
                # instead of being hard-coded.
                data = data.reshape(data.size(0), 1, data.size(-2), data.size(-1))

                # Zero the gradients out
                optimizer.zero_grad()
                # Get the model output (call the model with the data from this batch)
                output = model(data)
                # Get the Loss
                loss = criterion(output, target)
                # Calculate the gradients
                loss.backward()
                # Update the weights (using the training step of the optimizer)
                optimizer.step()

                tepochs.set_postfix(loss=loss.item())
                # .item() keeps a plain float, so the autograd graph of every
                # batch is not kept alive and the history can be plotted.
                running_loss += loss.item()

                # get accuracy
                _, predicted = torch.max(output, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

            # append the loss for this epoch (once per epoch, after all batches)
            train_loss.append(running_loss / max(len(train_loader), 1))
            train_acc.append(correct / max(total, 1))

            # evaluate on validation data
            model.eval()
            running_loss = 0.
            correct, total = 0, 0

            # No gradients are needed for evaluation.
            with torch.no_grad():
                for data, target in validation_loader:
                    # getting the validation set
                    data, target = data.to(device), target.to(device)
                    data = data.reshape(data.size(0), 1, data.size(-2), data.size(-1))

                    output = model(data)
                    loss = criterion(output, target)
                    tepochs.set_postfix(loss=loss.item())
                    running_loss += loss.item()
                    # get accuracy
                    _, predicted = torch.max(output, 1)
                    total += target.size(0)
                    correct += (predicted == target).sum().item()

            validation_loss.append(running_loss / max(len(validation_loader), 1))
            validation_acc.append(correct / max(total, 1))

    return train_loss, train_acc, validation_loss, validation_acc

In [9]:
# Run training.
# Build a fresh network on the selected device, fit it for 30 epochs and plot
# the resulting learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 30)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

  0%|          | 0/30 [00:00<?, ?epoch/s]

IndexError: tuple index out of range

# Run training.
# Same run as above: fresh network, 30 epochs, then the learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 30)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

Very interesting! We reach high training accuracy in much fewer epochs here than we did with the original png dataset. I should note that I am also using a larger batch size here than there. Could that be having an effect? 

We reach that high accuracy on the training set without any of the 'bumps' that we saw when training on the png dataset for longer. Maybe that's a sign that I should stop focusing so much on the RSA and instead focus on reducing the overfitting with this model. 

Some ideas on reducing the overfitting: 
- Data augmentation
- More dropout
- L1 / L2 regularization (doesn't seem to be included at the moment). 
- Reduce the complexity of the model (fewer layers, narrower layers). 
- early stopping (although based on the plots, that doesn't seem like it would make a difference.) 

The validation accuracy is also very noisy. I already made the validation set larger than before (15% instead of 10%). Is there anything else we can do about that? Is it something we should be worried about? 

# Experimenting with Regularization

Robert suggested that a good place to start would be simplifying the model. Strip it down to as few layers as possible until it can barely improve on the training set, then add complexity back in, looking for a sweet spot with good fit without massive overfitting. 


## Reducing number of layers

# Alex's Advice on Regularization: 

Here are some ideas of changes to the model with Alex's help: 

- Increase the stride to decrease computation time (thanks Zach!)

- Dropout after conv2D layer (thanks Courtnie!)

- Create a bottleneck shape for the channels (wide to narrow to wide) (thanks Alex!)
  - for example, instead of [1, 8, 32, 64, 128, 512] do [1, 64, 512, 128]. 

- Use only a subset of the training data for hyperparameter setting. That will make this iteration process faster. (thanks Alex!)

- Try increasing stride instead of max pooling. (thanks Alex)

- Play with the learning rate a bit, but this is lower priority. (thanks Alex). 
  - Try a slightly lower learning rate. 

- More data augmentation should be priority #1. 

- With optimizers other than adam (simpler ones) learning rate decay usually improves accuracy. With adam it isn't so necessary. 
  - There are some cases where adam doesn't work so great. Hard to predict what those cases will be. Sometimes SGD + learning rate decay is better. It is usually about equally good to adam, but adam is simpler to set up because fewer parameters. 

-  Our next meeting with Alex will be on Thursday- same time. 


Here are some ideas about augmentation in a dataloader with Alex: 

- Most direct option is to write a dataloader that applies the windowing, the augmentation, the spectrograming, all at the dataloader phase. 

- Another option would be to apply the transformation in the frequency domain, so that we apply the augmentations on the spectrograms, not the signal. 
  - this might be the better option, because it will be faster (don't need to fft every time). 


### Bottle Neck Shape of Network

I'm going to use the same number of layers as the original network, but make the filter size get wide --> small --> big. 

In [None]:
# Make a CNN & train it to predict genres.

class music_net(nn.Module):
  """Five-block CNN (channels 1-16-64-128-512-128) producing 10 genre scores.

  Each block is conv(3x3, no padding) -> batch norm -> ReLU -> 2x2 max pool.
  The final linear layer expects 512 flattened features, which is what a
  128 x 129 input produces (128 channels x 2 x 2 after the fifth block).

  ``forward`` returns raw logits. The training code uses
  ``nn.CrossEntropyLoss``, which applies log-softmax itself, so no softmax is
  applied here; call ``F.softmax(logits, dim=1)`` when probabilities are
  needed.
  """

  def __init__(self):
    """Initialize neural net layers"""
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=0) # single input channel (raw spectrogram)
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=64, kernel_size=3, stride=1, padding=0)
    self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=0)
    self.conv4 = nn.Conv2d(in_channels=128, out_channels=512, kernel_size=3, stride=1, padding=0)
    self.conv5 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=0)
    self.fc1 = nn.Linear(in_features=512, out_features=10)

    self.batchnorm1 = nn.BatchNorm2d(num_features=16)
    self.batchnorm2 = nn.BatchNorm2d(num_features=64)
    self.batchnorm3 = nn.BatchNorm2d(num_features=128)
    self.batchnorm4 = nn.BatchNorm2d(num_features=512)
    self.batchnorm5 = nn.BatchNorm2d(num_features=128)

    # Applied once, on the flattened features before the linear layer.
    self.dropout = nn.Dropout(p=0.3, inplace=False)


  def forward(self, x):
    """Return class logits of shape (batch, 10) for input (batch, 1, H, W)."""
    # Conv layer 1.
    x = self.conv1(x)
    x = self.batchnorm1(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 2.
    x = self.conv2(x)
    x = self.batchnorm2(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 3.
    x = self.conv3(x)
    x = self.batchnorm3(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 4.
    x = self.conv4(x)
    x = self.batchnorm4(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 5.
    x = self.conv5(x)
    x = self.batchnorm5(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Fully connected layer 1.
    x = torch.flatten(x, 1)
    x = self.dropout(x)
    # No softmax: the loss expects logits, and argmax over logits gives the
    # same predicted class as argmax over probabilities.
    x = self.fc1(x)

    return x


def train(model, device, train_loader, validation_loader, epochs):
  """Train ``model`` with Adam and cross-entropy, validating after each epoch.

  Args:
    model: network to optimize; it is called on batches shaped
      (batch, 1, height, width).
    device: device the batches are moved to (e.g. "cuda" or "cpu").
    train_loader: iterable of (data, target) training batches.
    validation_loader: iterable of (data, target) validation batches.
    epochs: number of passes over ``train_loader``.

  Returns:
    (train_loss, train_acc, validation_loss, validation_acc): four lists of
    Python floats with one entry per epoch.
  """
  criterion =  nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
  train_loss, validation_loss = [], []
  train_acc, validation_acc = [], []
  with tqdm(range(epochs), unit='epoch') as tepochs:
    tepochs.set_description('Training')
    for epoch in tepochs:
      model.train()
      # keep track of the running loss
      running_loss = 0.
      correct, total = 0, 0

      for data, target in train_loader:
        # getting the training set
        data, target = data.to(device), target.to(device)

        # Add the single channel dimension the conv layers expect. The batch
        # and image sizes are read from the batch instead of being hard-coded.
        data = data.reshape(data.size(0), 1, data.size(-2), data.size(-1))

        # Zero the gradients out
        optimizer.zero_grad()
        # Get the model output (call the model with the data from this batch)
        output = model(data)
        # Get the Loss
        loss = criterion(output, target)
        # Calculate the gradients
        loss.backward()
        # Update the weights (using the training step of the optimizer)
        optimizer.step()

        tepochs.set_postfix(loss=loss.item())
        # .item() keeps a plain float, so the autograd graph of every batch
        # is not kept alive and the history can be plotted directly.
        running_loss += loss.item()

        # get accuracy
        _, predicted = torch.max(output, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()

      # append the loss for this epoch
      train_loss.append(running_loss / max(len(train_loader), 1))
      train_acc.append(correct / max(total, 1))

      # evaluate on validation data
      model.eval()
      running_loss = 0.
      correct, total = 0, 0

      # No gradients are needed for evaluation.
      with torch.no_grad():
        for data, target in validation_loader:
          # getting the validation set
          data, target = data.to(device), target.to(device)
          data = data.reshape(data.size(0), 1, data.size(-2), data.size(-1))

          output = model(data)
          loss = criterion(output, target)
          tepochs.set_postfix(loss=loss.item())
          running_loss += loss.item()
          # get accuracy
          _, predicted = torch.max(output, 1)
          total += target.size(0)
          correct += (predicted == target).sum().item()

      validation_loss.append(running_loss / max(len(validation_loader), 1))
      validation_acc.append(correct / max(total, 1))

  return train_loss, train_acc, validation_loss, validation_acc

In [None]:
# Run training.

# Fresh network on the selected device, 50 epochs, then the learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 50)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

That did help it learn the training set faster, compared to a network with the same number of layers but the size was doubling at every layer, but it didn't change much with the validation set accuracy. Maybe I'll stick with this architecture and try applying other regularization tricks to it. 

### Data Augmentation with SpecAugment

In [None]:
# Data loading.
# Folders with the windowed spectrogram tensors (one sub-folder per genre).
train_dir = "/content/train_cropped/"
val_dir =  "/content/val_cropped/"

# Light masking for the training samples: one narrow time mask and one
# narrow frequency mask.
masking_steps = [
    torchaudio.transforms.TimeMasking(time_mask_param=10, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=5, iid_masks=True),
]

# Both loaders share the same batching options.
loader_options = dict(batch_size=100, shuffle=True, num_workers=0, drop_last=True)

train_dataset = datasets.DatasetFolder(
    train_dir,
    loader=torch.load,
    extensions=".pt",
    transform=transforms.Compose(masking_steps),
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Validation data is loaded without augmentation.
val_dataset = datasets.DatasetFolder(val_dir, loader=torch.load, extensions=".pt")
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
import torchaudio

In [None]:
# Preview only the first few augmented samples: looping over the whole
# dataset would load every file and open one figure per sample.
n_preview = 5
for sample_idx, (data, target) in enumerate(train_dataset):
    if sample_idx >= n_preview:
        break
    print(target)
    print(data.shape)
    plt.figure()
    plt.imshow(data.numpy())

In [None]:
# Run training.

# Fresh network on the selected device, 50 epochs, then the learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 50)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

It seems that that level of masking really didn't have much effect at all. Maybe we need wider masking bands? I don't know how much we can push it really. I'll try again with much larger masks and fewer epochs just to see. 

In [None]:
# Data loading.
# Folders with the windowed spectrogram tensors (one sub-folder per genre).
train_dir = "/content/train_cropped/"
val_dir =  "/content/val_cropped/"

# Wider masking for the training samples: time, frequency, then time again,
# each with a maximum width of 20.
masking_steps = [
    torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=20, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True),
]

# Both loaders share the same batching options.
loader_options = dict(batch_size=100, shuffle=True, num_workers=0, drop_last=True)

train_dataset = datasets.DatasetFolder(
    train_dir,
    loader=torch.load,
    extensions=".pt",
    transform=transforms.Compose(masking_steps),
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Validation data is loaded without augmentation.
val_dataset = datasets.DatasetFolder(val_dir, loader=torch.load, extensions=".pt")
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
# Run training.

# Fresh network on the selected device, 50 epochs, then the learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 50)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

This does seem to have helped. The validation accuracy is now hovering around the mid 70s, not the low 70s it seems. Maybe we can push the masking even further. 

In [None]:
# Data loading.
# Folders with the windowed spectrogram tensors (one sub-folder per genre).
train_dir = "/content/train_cropped/"
val_dir =  "/content/val_cropped/"

# Stronger masking for the training samples: a time/frequency pair with
# maximum width 40 followed by a pair with maximum width 20.
masking_steps = [
    torchaudio.transforms.TimeMasking(time_mask_param=40, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=40, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=20, iid_masks=True),
]

# Both loaders share the same batching options.
loader_options = dict(batch_size=100, shuffle=True, num_workers=0, drop_last=True)

train_dataset = datasets.DatasetFolder(
    train_dir,
    loader=torch.load,
    extensions=".pt",
    transform=transforms.Compose(masking_steps),
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Validation data is loaded without augmentation.
val_dataset = datasets.DatasetFolder(val_dir, loader=torch.load, extensions=".pt")
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
# Run training.

# Fresh network on the selected device, 50 epochs, then the learning curves.
net = music_net()
net = net.to(device)
history = train(net, device, train_loader, val_loader, 50)
train_loss, train_acc, validation_loss, validation_acc = history
plot_loss_accuracy(*history)

Looks like that is going even better! I'm going to increase the augmentation ***even more*** and also increase the learning rate slightly. 

In [None]:
# Data loading.
# Folders with the windowed spectrogram tensors (one sub-folder per genre).
train_dir = "/content/train_cropped/"
val_dir =  "/content/val_cropped/"

# Heaviest masking so far for the training samples: three time/frequency
# pairs with maximum widths 40, 30 and 20.
masking_steps = [
    torchaudio.transforms.TimeMasking(time_mask_param=40, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=40, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=30, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30, iid_masks=True),
    torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=20, iid_masks=True),
]

# Both loaders share the same batching options.
loader_options = dict(batch_size=100, shuffle=True, num_workers=0, drop_last=True)

train_dataset = datasets.DatasetFolder(
    train_dir,
    loader=torch.load,
    extensions=".pt",
    transform=transforms.Compose(masking_steps),
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Validation data is loaded without augmentation.
val_dataset = datasets.DatasetFolder(val_dir, loader=torch.load, extensions=".pt")
val_loader = torch.utils.data.DataLoader(val_dataset, **loader_options)

In [None]:
# Make a CNN & train it to predict genres.

class music_net(nn.Module):
  """CNN classifier for single-channel spectrograms.

  Five blocks of Conv2d(3x3, no padding) -> BatchNorm2d -> ReLU ->
  MaxPool2d(2), followed by dropout and a single linear layer that produces
  10 class scores.

  The linear layer expects 512 flattened features (128 channels x 2 x 2),
  which is what a 128 x 129 input shrinks to after the five blocks.
  """

  def __init__(self):
    """Initialize neural net layers."""
    super().__init__()
    # in_channels=1: the input is a raw single-channel spectrogram.
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=0)
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=64, kernel_size=3, stride=1, padding=0)
    self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=0)
    self.conv4 = nn.Conv2d(in_channels=128, out_channels=512, kernel_size=3, stride=1, padding=0)
    self.conv5 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=0)
    # 512 = 128 output channels of conv5 x 2 x 2 spatial positions.
    self.fc1 = nn.Linear(in_features=512, out_features=10)

    self.batchnorm1 = nn.BatchNorm2d(num_features=16)
    self.batchnorm2 = nn.BatchNorm2d(num_features=64)
    self.batchnorm3 = nn.BatchNorm2d(num_features=128)
    self.batchnorm4 = nn.BatchNorm2d(num_features=512)
    self.batchnorm5 = nn.BatchNorm2d(num_features=128)

    self.dropout = nn.Dropout(p=0.3, inplace=False)

  def forward(self, x):
    """Compute class scores for a batch of spectrograms.

    Args:
      x: float tensor of shape (batch, 1, height, width).

    Returns:
      Tensor of shape (batch, 10) holding raw, unnormalised logits.
      No softmax is applied here: nn.CrossEntropyLoss applies log-softmax
      itself, so feeding it probabilities would apply softmax twice and
      flatten the gradients. Use F.softmax(logits, dim=1) if probabilities
      are needed; argmax predictions are the same either way.
    """
    blocks = (
        (self.conv1, self.batchnorm1),
        (self.conv2, self.batchnorm2),
        (self.conv3, self.batchnorm3),
        (self.conv4, self.batchnorm4),
        (self.conv5, self.batchnorm5),
    )
    # Each block: convolution -> batch norm -> ReLU -> 2x2 max pooling.
    for conv, batchnorm in blocks:
      x = conv(x)
      x = batchnorm(x)
      x = F.relu(x)
      x = F.max_pool2d(x, kernel_size=2)

    # Fully connected layer 1.
    x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
    x = self.dropout(x)
    x = self.fc1(x)

    return x


def _as_single_channel_batch(data):
  """Reshape a batch of spectrograms to (batch, 1, height, width).

  The batch size and the spectrogram size are read from the tensor itself
  (first and last two dimensions), so any batch size works, including a
  smaller final batch.
  """
  return data.reshape(data.size(0), 1, data.size(-2), data.size(-1))


def train(model, device, train_loader, validation_loader, epochs):
  """Train `model` with Adam (lr=0.001) and cross-entropy loss.

  After every training epoch the model is evaluated on `validation_loader`.

  Args:
    model: network to train; called with (batch, 1, height, width) tensors.
      nn.CrossEntropyLoss applies log-softmax internally, so the model
      should output unnormalised class scores.
    device: device the batches are moved to before the forward pass.
    train_loader: iterable of (data, target) training batches.
    validation_loader: iterable of (data, target) validation batches.
    epochs: number of passes over the training data.

  Returns:
    Tuple (train_loss, train_acc, validation_loss, validation_acc); each is
    a list with one Python float per epoch (mean batch loss / fraction of
    correctly classified samples).

  Raises:
    ZeroDivisionError: if a loader yields no batches.
  """
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
  train_loss, validation_loss = [], []
  train_acc, validation_acc = [], []
  with tqdm(range(epochs), unit='epoch') as tepochs:
    tepochs.set_description('Training')
    for _ in tepochs:
      # ---- training pass ----
      model.train()
      running_loss = 0.
      correct, total = 0, 0

      for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        data = _as_single_channel_batch(data)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # .item() detaches the value: accumulating the tensor itself would
        # keep every batch's autograd graph alive and leave tensors (not
        # floats) in the returned history.
        batch_loss = loss.item()
        tepochs.set_postfix(loss=batch_loss)
        running_loss += batch_loss

        # get accuracy
        predicted = output.argmax(dim=1)
        total += target.size(0)
        correct += (predicted == target).sum().item()

      # append the loss for this epoch
      train_loss.append(running_loss / len(train_loader))
      train_acc.append(correct / total)

      # ---- validation pass ----
      model.eval()
      running_loss = 0.
      correct, total = 0, 0

      # No gradients are needed for evaluation; skipping them saves memory.
      with torch.no_grad():
        for data, target in validation_loader:
          data, target = data.to(device), target.to(device)
          data = _as_single_channel_batch(data)

          output = model(data)
          loss = criterion(output, target)
          tepochs.set_postfix(loss=loss.item())
          running_loss += loss.item()

          # get accuracy
          predicted = output.argmax(dim=1)
          total += target.size(0)
          correct += (predicted == target).sum().item()

      validation_loss.append(running_loss / len(validation_loader))
      validation_acc.append(correct / total)

  return train_loss, train_acc, validation_loss, validation_acc

In [None]:
# Run training: build a fresh network on the selected device, train it for
# 50 epochs, then plot the loss / accuracy curves returned by train().

net = music_net()
net = net.to(device)  # nn.Module.to returns the module itself

train_loss, train_acc, validation_loss, validation_acc = train(
    model=net,
    device=device,
    train_loader=train_loader,
    validation_loader=val_loader,
    epochs=50)

plot_loss_accuracy(
    train_loss=train_loss,
    train_acc=train_acc,
    validation_loss=validation_loss,
    validation_acc=validation_acc)