# Imports / paths

In [1]:
! pip install torch-summary

Collecting torch-summary
  Downloading https://files.pythonhosted.org/packages/ca/db/93d18c84f73b214acfa4d18051d6f4263eee3e044c408928e8abe941a22c/torch_summary-1.4.5-py3-none-any.whl
Installing collected packages: torch-summary
Successfully installed torch-summary-1.4.5


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import json
from torchsummary import summary
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
import librosa.display
import math
import os
from textwrap import dedent

# Google Drive locations used by this notebook (Colab-only absolute paths).
# IRMAS_PATH is not referenced by any code visible in this notebook section.
IRMAS_PATH = "/content/drive/MyDrive/ITCS 5156 project/IRMAS dataset/IRMAS-TrainingData/"
# Directory holding the pre-computed MFCC JSON files; IRMASDataset joins it with `filename`.
JSON_PATH = "/content/drive/MyDrive/ITCS 5156 project/IRMAS dataset/json_files/"
# Default feature file; the name suggests 13 MFCCs, hop_length=256, n_fft=2048 -- TODO confirm against the file's metadata.
filename = "irmas_data_mfcc13_hop_length256_n_fft2048.json"

# Mount Google Drive so the paths above resolve; force_remount refreshes an existing mount.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


# Dataset / Preprocessing

In [3]:
class IRMASDataset(Dataset):
  """Map-style dataset over a JSON file of pre-computed MFCCs.

  The JSON file is expected to hold one 'metadata' entry plus one entry per
  sample keyed by the stringified sample index ("0", "1", ...). Each sample
  entry must provide 'mfccs' and 'primary_instrument' (one of the 11
  instrument codes listed in `self.instruments`).
  """

  def __init__(self, JSON_PATH=JSON_PATH, filename=filename, transform=None):
    """Load the whole JSON file into memory and fit the label encoder.

    :param JSON_PATH: directory containing the JSON file
    :param filename: name of the JSON file inside JSON_PATH
    :param transform: optional callable applied to each sample dict before it
      is returned from __getitem__ (None leaves samples untouched)
    """
    # os.path.join works whether or not JSON_PATH ends with a separator
    with open(os.path.join(JSON_PATH, filename), "r") as f:
      self.irmas_data = json.load(f)
    # metadata is removed from the dict so that len() counts samples only
    self.metadata = self.irmas_data.pop('metadata')
    self.transform = transform
    self.instruments = ["cel", "cla", "flu", "gac", "gel", "org", "pia", "sax",
                        "tru", "vio", "voi"]
    self.encoder = LabelEncoder()
    self.encoder.fit(self.instruments)

  def __len__(self):
    """Number of samples (the metadata entry is excluded)."""
    return len(self.irmas_data)

  def __getitem__(self, idx):
    """Return one sample as a dict.

    :param idx: integer index (or a scalar tensor holding one)
    :return: dict with
      'mfccs'      - numpy array with a leading channel axis of size 1
      'instrument' - numpy array of shape (1,) holding the encoded class id
      'metadata'   - the dataset-level metadata dict
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()

    # samples are keyed by the stringified index in the JSON file
    record = self.irmas_data[str(idx)]

    # add the channel axis expected by Conv2d
    mfccs = np.array(record['mfccs'])[np.newaxis, ...]
    # kept as a one-element array: the training loop squeezes dim=1 after batching
    primary_instrument = self.encoder.transform([record['primary_instrument']])

    sample = {'mfccs': mfccs, 'instrument': primary_instrument, 'metadata': self.metadata}
    if self.transform is not None:
      sample = self.transform(sample)
    return sample

def prep_dataset(filename=filename, val_split=0.2, batch_size=1):
  """Load the IRMAS JSON dataset and split it into train/validation loaders.

  :param filename: JSON file (inside JSON_PATH) to load
  :param val_split: fraction of the samples assigned to the validation set
  :param batch_size: batch size used by both loaders
  :return: (train_loader, val_loader, dataset) where `dataset` is the full,
    unsplit IRMASDataset
  """
  dataset = IRMASDataset(JSON_PATH=JSON_PATH, filename=filename)

  # Derive the train size from the validation size so the two always sum to
  # len(dataset). Rounding both sides independently can be off by one (e.g.
  # 5 samples at val_split=0.5 rounds to 2 + 2), which makes random_split raise.
  n_val = round(len(dataset) * val_split)
  n_train = len(dataset) - n_val
  train_set, val_set = random_split(dataset, [n_train, n_val])

  train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=1)
  # validation order does not affect the metrics; keep it fixed so per-batch
  # loss statistics are comparable from epoch to epoch
  val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=1)

  return train_loader, val_loader, dataset

# Architectures

In [16]:
################################################################################
#################################### BLOCKS ####################################
################################################################################

class ConvBlock(nn.Module):
	"""Conv2d -> ReLU -> (optional MaxPool2d) -> BatchNorm2d."""

	def __init__(self, in_channels, out_channels, conv_kernel_size=3,
			conv_stride=1, conv_padding=0,
			inc_pool=True, pool_kernel_size=2, pool_stride=2):
		"""Assemble the block as a single nn.Sequential stored in `conv_block`.

		:param in_channels: channels of the incoming feature map
		:param out_channels: channels produced by the convolution
		:param conv_kernel_size: kernel size of the convolution
		:param conv_stride: stride of the convolution
		:param conv_padding: padding of the convolution
		:param inc_pool: when truthy, a max pooling layer is placed between the
			ReLU and the batch norm

		Only used when inc_pool is truthy:
		:param pool_kernel_size: kernel size of the max pooling layer
		:param pool_stride: stride of the max pooling layer
		"""
		super().__init__()

		# Layer order fixes the Sequential indices (and the state_dict keys):
		# conv=0, relu=1, then either pool=2 / bn=3 or bn=2.
		stages = [
			nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
				kernel_size=conv_kernel_size, stride=conv_stride,
				padding=conv_padding),
			nn.ReLU(),
		]
		if inc_pool:
			stages.append(
				nn.MaxPool2d(kernel_size=pool_kernel_size, stride=pool_stride))
		stages.append(nn.BatchNorm2d(num_features=out_channels))

		self.conv_block = nn.Sequential(*stages)

	def forward(self, x):
		"""Apply the block to a (batch, channels, height, width) tensor."""
		return self.conv_block(x)

class LinearBlock(nn.Module):
	"""Fully connected block: Linear -> ReLU -> BatchNorm1d -> Dropout."""

	def __init__(self, in_features, out_features, dropout_prob=0):
		"""Assemble the block as a single nn.Sequential stored in `linear_block`.

		:param in_features: size of each input vector
		:param out_features: size of each output vector
		:param dropout_prob: dropout probability; 0 makes the dropout layer a no-op
		"""
		super().__init__()

		dense = nn.Linear(in_features=in_features, out_features=out_features)
		activation = nn.ReLU()
		norm = nn.BatchNorm1d(num_features=out_features)
		drop = nn.Dropout(p=dropout_prob)

		self.linear_block = nn.Sequential(dense, activation, norm, drop)

	def forward(self, x):
		"""Apply the block to a (batch, in_features) tensor."""
		return self.linear_block(x)

class HeadBlock(nn.Module):
	"""Output head: a single linear layer producing 11 raw class scores."""

	def __init__(self, in_features):
		"""Build the classification head.

		No softmax is applied here; it is left to the loss function
		(CrossEntropyLoss applies it internally).

		:param in_features: size of each input vector
		out_features is fixed to 11 to correspond to the number of classes
		"""
		super().__init__()
		classifier = nn.Linear(in_features=in_features, out_features=11)
		self.head_block = nn.Sequential(classifier)

	def forward(self, x):
		"""Map (batch, in_features) to (batch, 11) unnormalised scores."""
		return self.head_block(x)

################################################################################
################################### NETWORKS ###################################
################################################################################
class Conv1Layer(nn.Module):
	"""CNN with one ConvBlock, two LinearBlocks and an 11-class HeadBlock."""

	def __init__(self, single_sample, channels=[8],
			conv_kernel_sizes=[3],
			conv_strides=[1],
			conv_paddings=[0],
			pool_masks=[True],
			pool_kernel_sizes=[2],
			pool_strides=[2],
			linear_features=[128, 64],
			dropout_probs=[0, 0]):
		"""Convolutional neural network with 1 conv layer and 3 linear layers
		(two LinearBlocks plus the output head).

		All hyperparams are lists (or array-likes); the nth entry configures
		the nth layer. Only the first entry of each conv/pool list is read, so
		longer lists are accepted.

		:param single_sample: one un-batched sample as a numpy array with a
			single leading channel, e.g. shape (1, n_mfcc, n_frames). It is run
			through the conv layer once to size the first linear layer.
		:param channels: output channels of each conv layer
		:param conv_kernel_sizes: kernel size of each conv layer
		:param conv_strides: stride of each conv layer
		:param conv_paddings: padding of each conv layer
		:param pool_masks: booleans controlling max pooling per conv layer.
			The pooling lists must stay aligned with this mask: an entry is
			required for every layer even where the mask is False (it is
			ignored there).
		:param pool_kernel_sizes: max-pool kernel size per conv layer
		:param pool_strides: max-pool stride per conv layer
		:param linear_features: output sizes of the two LinearBlocks (the input
			size of the first one is inferred from single_sample)
		:param dropout_probs: dropout probability of each LinearBlock
		"""
		super(Conv1Layer, self).__init__()

		# convolutional block
		self.conv1 = ConvBlock(in_channels=1, out_channels=channels[0],
			conv_kernel_size=conv_kernel_sizes[0],
			conv_stride=conv_strides[0],
			conv_padding=conv_paddings[0],
			inc_pool=pool_masks[0],
			pool_kernel_size=pool_kernel_sizes[0],
			pool_stride=pool_strides[0])

		# size of the flattened conv output, measured with a dry run
		# https://discuss.pytorch.org/t/convolution-and-pooling-layers-need-a-method-to-calculate-output-size/21895
		flat_features = self._probe_flat_features(single_sample)

		# linear blocks
		self.linear1 = LinearBlock(in_features=flat_features,
			out_features=linear_features[0],
			dropout_prob=dropout_probs[0])
		self.linear2 = LinearBlock(in_features=linear_features[0],
			out_features=linear_features[1],
			dropout_prob=dropout_probs[1])
		self.head = HeadBlock(in_features=linear_features[1])

	def _probe_flat_features(self, single_sample):
		"""Return the per-sample feature count after the conv layer and flatten.

		Runs in eval mode without autograd so the dry run neither updates the
		BatchNorm running statistics nor builds a graph.
		"""
		probe = torch.from_numpy(
			single_sample[np.newaxis, ...].astype(np.float32))
		was_training = self.training
		self.eval()
		try:
			with torch.no_grad():
				probe = self.conv1(probe)
		finally:
			self.train(was_training)
		return probe.flatten(start_dim=1).shape[1]

	def forward(self, x):
		"""Map a (batch, 1, H, W) tensor to (batch, 11) class scores."""
		x = self.conv1(x)
		x = x.flatten(start_dim=1)
		x = self.linear1(x)
		x = self.linear2(x)
		x = self.head(x)
		return x

class Conv3Layer(nn.Module):
	"""CNN with three ConvBlocks, two LinearBlocks and an 11-class HeadBlock."""

	def __init__(self, single_sample, channels=[8, 16, 32],
			conv_kernel_sizes=[3, 3, 3],
			conv_strides=[1, 1, 1],
			conv_paddings=[0, 0, 1,],
			pool_masks=[True, False, False],
			pool_kernel_sizes=[2, 2, 2],
			pool_strides=[2, 2, 2],
			linear_features=[128, 64],
			dropout_probs=[0, 0]):
		"""Convolutional neural network with 3 conv layers and 3 linear layers
		(two LinearBlocks plus the output head).

		All hyperparams are lists (or array-likes); the nth entry configures
		the nth layer. Only the first three entries of each conv/pool list are
		read, so longer lists are accepted.

		:param single_sample: one un-batched sample as a numpy array with a
			single leading channel, e.g. shape (1, n_mfcc, n_frames). It is run
			through the conv layers once to size the first linear layer.
		:param channels: output channels of each conv layer
		:param conv_kernel_sizes: kernel size of each conv layer
		:param conv_strides: stride of each conv layer
		:param conv_paddings: padding of each conv layer
		:param pool_masks: booleans controlling max pooling per conv layer,
			ex: [False, True, False] pools only after the second layer. The
			pooling lists must stay aligned with this mask: an entry is
			required for every layer even where the mask is False (it is
			ignored there).
		:param pool_kernel_sizes: max-pool kernel size per conv layer
		:param pool_strides: max-pool stride per conv layer
		:param linear_features: output sizes of the two LinearBlocks (the input
			size of the first one is inferred from single_sample)
		:param dropout_probs: dropout probability of each LinearBlock
		"""
		super(Conv3Layer, self).__init__()

		def make_block(layer, in_channels):
			# one ConvBlock configured from the `layer`-th entry of every list
			return ConvBlock(in_channels=in_channels,
				out_channels=channels[layer],
				conv_kernel_size=conv_kernel_sizes[layer],
				conv_stride=conv_strides[layer],
				conv_padding=conv_paddings[layer],
				inc_pool=pool_masks[layer],
				pool_kernel_size=pool_kernel_sizes[layer],
				pool_stride=pool_strides[layer])

		# convolutional blocks (attribute names are the state_dict keys)
		self.conv1 = make_block(0, 1)
		self.conv2 = make_block(1, channels[0])
		self.conv3 = make_block(2, channels[1])

		# size of the flattened conv output, measured with a dry run
		flat_features = self._probe_flat_features(single_sample)

		# linear blocks
		self.linear1 = LinearBlock(in_features=flat_features,
			out_features=linear_features[0],
			dropout_prob=dropout_probs[0])
		self.linear2 = LinearBlock(in_features=linear_features[0],
			out_features=linear_features[1],
			dropout_prob=dropout_probs[1])
		self.head = HeadBlock(in_features=linear_features[1])

	def _conv_features(self, x):
		"""Run the three conv blocks in order."""
		x = self.conv1(x)
		x = self.conv2(x)
		x = self.conv3(x)
		return x

	def _probe_flat_features(self, single_sample):
		"""Return the per-sample feature count after the conv stack and flatten.

		Runs in eval mode without autograd so the dry run neither updates the
		BatchNorm running statistics nor builds a graph.
		"""
		probe = torch.from_numpy(
			single_sample[np.newaxis, ...].astype(np.float32))
		was_training = self.training
		self.eval()
		try:
			with torch.no_grad():
				probe = self._conv_features(probe)
		finally:
			self.train(was_training)
		return probe.flatten(start_dim=1).shape[1]

	def forward(self, x):
		"""Map a (batch, 1, H, W) tensor to (batch, 11) class scores."""
		x = self._conv_features(x)
		x = x.flatten(start_dim=1)
		x = self.linear1(x)
		x = self.linear2(x)
		x = self.head(x)
		return x

class Conv5Layer(nn.Module):
	"""CNN with five ConvBlocks, two LinearBlocks and an 11-class HeadBlock."""

	def __init__(self, single_sample, channels= [8, 8, 32, 32, 64],
			conv_kernel_sizes=[3, 3, 3, 3, 3],
			conv_strides=[1, 1, 1, 1, 1],
			conv_paddings=[0, 0, 1, 1, 1],
			pool_masks=[True, False, False, False, False],
			pool_kernel_sizes=[2, 2, 2, 2, 2],
			pool_strides=[2, 2, 2, 2, 2],
			linear_features=[128, 64],
			dropout_probs=[0, 0]):
		"""Convolutional neural network with 5 conv layers and 3 linear layers
		(two LinearBlocks plus the output head).

		All hyperparams are lists (or array-likes); the nth entry configures
		the nth layer.

		:param single_sample: one un-batched sample as a numpy array with a
			single leading channel, e.g. shape (1, n_mfcc, n_frames). It is run
			through the conv layers once to size the first linear layer.
		:param channels: output channels of each conv layer
		:param conv_kernel_sizes: kernel size of each conv layer
		:param conv_strides: stride of each conv layer
		:param conv_paddings: padding of each conv layer
		:param pool_masks: booleans controlling max pooling per conv layer,
			ex: [False, True, ...] means no pooling after the 1st layer but
			pooling after the 2nd. The pooling lists must stay aligned with
			this mask: an entry is required for every layer even where the
			mask is False (it is ignored there).
		:param pool_kernel_sizes: max-pool kernel size per conv layer
		:param pool_strides: max-pool stride per conv layer
		:param linear_features: output sizes of the two LinearBlocks (the input
			size of the first one is inferred from single_sample)
		:param dropout_probs: dropout probability of each LinearBlock
		"""
		super(Conv5Layer, self).__init__()

		def make_block(layer, in_channels):
			# one ConvBlock configured from the `layer`-th entry of every list
			return ConvBlock(in_channels=in_channels,
				out_channels=channels[layer],
				conv_kernel_size=conv_kernel_sizes[layer],
				conv_stride=conv_strides[layer],
				conv_padding=conv_paddings[layer],
				inc_pool=pool_masks[layer],
				pool_kernel_size=pool_kernel_sizes[layer],
				pool_stride=pool_strides[layer])

		# convolutional blocks (attribute names are the state_dict keys)
		self.conv1 = make_block(0, 1)
		self.conv2 = make_block(1, channels[0])
		self.conv3 = make_block(2, channels[1])
		self.conv4 = make_block(3, channels[2])
		self.conv5 = make_block(4, channels[3])

		# size of the flattened conv output, measured with a dry run
		flat_features = self._probe_flat_features(single_sample)

		# linear blocks
		self.linear1 = LinearBlock(in_features=flat_features,
			out_features=linear_features[0],
			dropout_prob=dropout_probs[0])
		self.linear2 = LinearBlock(in_features=linear_features[0],
			out_features=linear_features[1],
			dropout_prob=dropout_probs[1])
		self.head = HeadBlock(in_features=linear_features[1])

	def _conv_features(self, x):
		"""Run the five conv blocks in order."""
		x = self.conv1(x)
		x = self.conv2(x)
		x = self.conv3(x)
		x = self.conv4(x)
		x = self.conv5(x)
		return x

	def _probe_flat_features(self, single_sample):
		"""Return the per-sample feature count after the conv stack and flatten.

		Runs in eval mode without autograd so the dry run neither updates the
		BatchNorm running statistics nor builds a graph.
		"""
		probe = torch.from_numpy(
			single_sample[np.newaxis, ...].astype(np.float32))
		was_training = self.training
		self.eval()
		try:
			with torch.no_grad():
				probe = self._conv_features(probe)
		finally:
			self.train(was_training)
		return probe.flatten(start_dim=1).shape[1]

	def forward(self, x):
		"""Map a (batch, 1, H, W) tensor to (batch, 11) class scores."""
		x = self._conv_features(x)
		x = x.flatten(start_dim=1)
		x = self.linear1(x)
		x = self.linear2(x)
		x = self.head(x)
		return x

### Generic variant of the fixed-depth networks above: any number of conv and
### linear layers, registered through nn.ModuleList so their parameters are trained.
class ConvNLayer(nn.Module):
	def __init__(self, single_sample,

		num_conv_layers=2,
		channels=[8, 16],
		conv_kernel_sizes=[3, 3],
		conv_strides=[1, 1],
		conv_paddings=[1, 1],
		pool_masks=[True, True],
		pool_kernel_sizes=[2, 2],
		pool_strides=[2, 2],

		num_linear_layers=2,
		linear_features=[128, 64],
		dropout_probs=[0, 0]
		):
		"""Convolutional neural net with an arbitrary number of convolutional
		and linear layers, followed by the 11-class HeadBlock.

		:param single_sample: one un-batched sample as a numpy array with a
			single leading channel, e.g. shape (1, n_mfcc, n_frames). It is run
			through the conv layers once to size the first linear layer.
		:param num_conv_layers: number of ConvBlocks to build
		:param channels: output channels of each conv layer (the single input
			channel is added internally)
		:param conv_kernel_sizes: kernel size of each conv layer
		:param conv_strides: stride of each conv layer
		:param conv_paddings: padding of each conv layer
		:param pool_masks: booleans controlling max pooling per conv layer
		:param pool_kernel_sizes: max-pool kernel size per conv layer (an entry
			is needed even where the mask is False)
		:param pool_strides: max-pool stride per conv layer (same alignment)
		:param num_linear_layers: number of LinearBlocks to build
		:param linear_features: output size of each LinearBlock (the input size
			of the first one is inferred from single_sample)
		:param dropout_probs: dropout probability of each LinearBlock

		None of the list arguments is modified.
		"""
		super(ConvNLayer, self).__init__()

		self.num_conv_layers = num_conv_layers
		self.num_linear_layers = num_linear_layers

		# Prepend the single input channel on a copy: inserting into the
		# argument itself would corrupt the shared default list (and the
		# caller's list) for every later instantiation.
		conv_channels = [1] + list(channels)

		# nn.ModuleList registers the blocks as sub-modules; a plain Python
		# list would hide their parameters from .parameters(), .to() and
		# state_dict(), so they would never be trained, moved or saved.
		self.conv_layers = nn.ModuleList([
			ConvBlock(
				in_channels = conv_channels[i],
				out_channels = conv_channels[i+1],
				conv_kernel_size = conv_kernel_sizes[i],
				conv_stride = conv_strides[i],
				conv_padding = conv_paddings[i],
				inc_pool = pool_masks[i],
				pool_kernel_size = pool_kernel_sizes[i],
				pool_stride = pool_strides[i])
			for i in range(self.num_conv_layers)])

		# Measure the flattened conv output with a dry run. eval mode and
		# no_grad keep the BatchNorm running statistics untouched.
		probe = torch.from_numpy(
			single_sample[np.newaxis,...].astype(np.float32)
		)
		was_training = self.training
		self.eval()
		try:
			with torch.no_grad():
				for conv_layer in self.conv_layers:
					probe = conv_layer(probe)
		finally:
			self.train(was_training)
		flat_features = probe.flatten(start_dim=1).shape[1]

		# layer sizes of the dense stack: [flattened input, out_1, out_2, ...]
		linear_sizes = [flat_features] + list(linear_features)

		self.linear_layers = nn.ModuleList([
			LinearBlock(
				in_features = linear_sizes[i],
				out_features = linear_sizes[i+1],
				dropout_prob = dropout_probs[i])
			for i in range(self.num_linear_layers)
		])

		# The head consumes whatever the last LinearBlock actually produces,
		# which differs from linear_features[-1] when fewer linear layers are
		# built than sizes were supplied.
		self.head = HeadBlock(in_features=linear_sizes[self.num_linear_layers])

	def forward(self, x):
		"""Map a (batch, 1, H, W) tensor to (batch, 11) class scores."""
		for conv_layer in self.conv_layers:
			x = conv_layer(x)

		x = x.flatten(start_dim=1)

		for linear_layer in self.linear_layers:
			x = linear_layer(x)

		x = self.head(x)
		return x

# Registry mapping the `model_id` strings accepted by train_model to the
# network classes defined above. train_model instantiates the selected class
# as models_dict[model_id](one_mfcc, **model_args).
models_dict = {
	"Conv_1_layer": Conv1Layer,
	"Conv_3_layer": Conv3Layer,
	"Conv_5_layer": Conv5Layer,
	"Conv_N_layer": ConvNLayer,
}

# Get a sample & test

In [5]:
# Build the loaders once to pull a single sample out of the full dataset.
train_loader, val_loader, dataset = prep_dataset(
      filename=filename, batch_size=5, val_split=0.2)
# dataset[0] is a dict with 'mfccs', 'instrument' and 'metadata' entries
single_sample = dataset[0]

# The MFCC array of one sample; the networks use an array like this to size
# their first linear layer. The cell displays its shape.
one_mfcc = np.array(single_sample['mfccs'])
one_mfcc.shape

(1, 13, 517)

# Train Loop

In [32]:
def _forward_batch(model, sample, criterion, device):
  """Run one collated batch through `model` and score it on the CPU.

  :return: (loss, predicted class ids, target class ids); the loss still
    carries the autograd graph, the two id tensors are detached CPU tensors
  """
  inputs = torch.as_tensor(sample['mfccs'], dtype=torch.float32).to(device)
  # labels arrive as (batch, 1) because the dataset yields one-element arrays
  targets = torch.squeeze(torch.as_tensor(sample['instrument']), dim=1)
  # errors raised by the model propagate on purpose: swallowing them would
  # leave the loss to be computed on the previous batch's predictions
  logits = model(inputs).to('cpu')
  loss = criterion(logits, targets)
  predicted = logits.detach().argmax(dim=1)
  return loss, predicted, targets


def _plot_history(epoch_hist, history):
  """Plot loss (with per-batch std error bars) and accuracy against epoch."""
  plt.close("all")
  fig, ax = plt.subplots(ncols=2, figsize=[15, 5])

  ax[0].plot(epoch_hist, history['avg_train_loss_hist'], 'ro--', label="train loss", )
  ax[0].errorbar(x=epoch_hist, y=history['avg_train_loss_hist'],
                 yerr=history['std_train_loss_hist'],
                 capsize=5, ls='none', color='r')
  ax[0].plot(epoch_hist, history['avg_val_loss_hist'], 'ko--', label="val loss", )
  ax[0].errorbar(x=epoch_hist, y=history['avg_val_loss_hist'],
                 yerr=history['std_val_loss_hist'],
                 capsize=5, ls='none', color='k')
  ax[0].set_xlabel("epoch")
  ax[0].set_ylabel("loss")
  ax[0].legend()

  ax[1].plot(epoch_hist, history['train_acc_hist'], 'r-.', label="train accuracy",
             marker='s')
  ax[1].plot(epoch_hist, history['val_acc_hist'], 'k-.', label="val accuracy",
             marker='s')
  ax[1].set_ylabel("accuracy")
  ax[1].set_xlabel("epoch")
  ax[1].set_ylim([0, 1])
  ax[1].legend()
  fig.tight_layout(pad=1)
  plt.show(block=False)


def _save_checkpoint(path, filename, epoch_hist, model_id, model, model_args,
                     history, dataset_info, notes, summary_str,
                     experiment_params):
  """Write the model weights plus all run bookkeeping to `path`."""
  torch.save({
      'filename': filename,
      'epochs': epoch_hist,
      'model_id': model_id,
      'model_state_dict': model.state_dict(),
      'model_args': model_args,
      'metrics': history,
      'dataset_info': dataset_info,
      'notes': notes,
      'summary': summary_str,
      'experiment_params': experiment_params,
  }, path)


def train_model(filename="irmas_data_mfcc13_hop_length256_n_fft2048", model_id="TestModel",
                num_epochs=2, interval=16, lr=0.001, batch_size=64,
                val_split=0.2, save_checkpoint=False, checkpoint_path="",
                notes="", checkpoint_name="utitled.pt", criterion=torch.nn.NLLLoss(),
                patience=None, min_epochs=5, buffer=0.05, dropout_prob=None,
                model_args={}, experiment_params={}):
  """Model training loop for music analysis project. Currently, this loop only supports
  models that take input in the shape [mini_batch, channels, L, W].

  Each epoch trains on the training split, evaluates on the validation split,
  plots the loss/accuracy history and (optionally) overwrites the checkpoint.

  :param filename: JSON feature file handed to prep_dataset
  :param model_id: key into models_dict selecting the network class
  :param num_epochs: maximum number of epochs
  :param interval: print running training statistics every `interval` batches
  :param lr: learning rate of the Adam optimizer
  :param batch_size: batch size of both data loaders
  :param val_split: fraction of the data used for validation
  :param save_checkpoint: when True, save a checkpoint after every epoch
  :param checkpoint_path: prefix prepended verbatim to checkpoint_name
  :param notes: free-text notes stored in the checkpoint
  :param checkpoint_name: file name of the checkpoint
  :param criterion: loss applied to (model output, class ids). The networks in
    models_dict emit raw scores, so CrossEntropyLoss is the matching choice;
    the NLLLoss default expects log-probabilities.
  :param patience: currently unused
  :param min_epochs: the early-stopping test is skipped while epoch <= min_epochs
  :param buffer: slack added to the train/val loss gap in the early-stopping test
  :param dropout_prob: currently unused (dropout is configured via model_args)
  :param model_args: keyword arguments forwarded to the network constructor
  :param experiment_params: arbitrary dict stored in the checkpoint
  :return: dict of per-epoch metric lists (the same object that is stored
    under 'metrics' in the checkpoint)

  Early stopping: training stops once the overfitting test (validation loss
  exceeding training loss by more than the combined per-batch standard
  deviations minus `buffer`, or being more than 5x the training loss) holds
  for two consecutive epochs.
  """

  # Initialize device
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print("device: ", device)

  # get train and validation set, print metadata
  train_loader, val_loader, dataset = prep_dataset(
      filename=filename, batch_size=batch_size, val_split=val_split)

  print("dataset metadata: ", dataset.metadata)

  # True when the previous epoch already failed the overfitting test
  stop_next = False

  # actual split sizes (denominators of the epoch accuracies)
  train_samples = len(train_loader.dataset)
  val_samples = len(val_loader.dataset)

  # per-epoch history; this dict is what gets stored in the checkpoint
  epoch_hist = []
  history = {
      'avg_train_loss_hist': [],
      'std_train_loss_hist': [],
      'avg_val_loss_hist': [],
      'std_val_loss_hist': [],
      'train_acc_hist': [],
      'train_prec_hist': [],
      'train_recall_hist': [],
      'train_f1_hist': [],
      'val_acc_hist': [],
      'val_prec_hist': [],
      'val_recall_hist': [],
      'val_f1_hist': [],
  }

  # get one sample to load initial shape for neural net
  single_sample = dataset[0]
  one_mfcc = np.array(single_sample['mfccs'])
  print("train model: data loaders initialized")
  print("sample shape = ", one_mfcc.shape)

  # initialize model
  model = models_dict[model_id](one_mfcc, **model_args).to(device)
  print("model loaded")
  summary_str = str(summary(model, one_mfcc.shape, verbose=0))

  print(summary_str)

  # initialize optimizer and criterion
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  print("criterion: ", criterion)

  n_train_steps = len(train_loader)

  ### loop epochs
  for epoch in range(num_epochs):
    print("\n\ntraining epoch: ", epoch)
    epoch_hist.append(epoch+1)
    epoch_time_start = time.time()
    interval_time_start = time.time()
    # the model is parked on the CPU at the end of every epoch (see below)
    model.to(device)

    # epoch-level accumulators
    train_losses, train_preds, train_targets = [], [], []
    train_num_correct = 0

    # interval-level accumulators: printed every `interval` batches, not saved
    interval_losses = []
    interval_num_correct = 0
    interval_num_seen = 0

    ### Training loop
    model.train()
    print("model set to train")
    for i, sample in enumerate(train_loader):
      loss, predicted, targets = _forward_batch(model, sample, criterion, device)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      batch_correct = int((predicted == targets).sum())
      train_losses.append(loss.item())
      train_num_correct += batch_correct
      train_preds.extend(predicted.tolist())
      train_targets.extend(targets.tolist())

      interval_losses.append(loss.item())
      interval_num_correct += batch_correct
      interval_num_seen += len(targets)

      # print step info
      if i % interval == 0:
        interval_time_end = time.time()

        inter_epoch_loss_avg = np.mean(interval_losses)
        inter_epoch_loss_std = np.std(interval_losses)

        # divide by the samples actually seen since the last report, so the
        # first report (one batch) and short final batches are scored correctly
        acc = interval_num_correct / interval_num_seen
        print(f"Epoch [{epoch+1}/{num_epochs}], step [{i+1}/{n_train_steps}], ",
              f"Loss: {inter_epoch_loss_avg:.4f} +/- {inter_epoch_loss_std:.4f}, ",
              f"accuracy: {acc}, "
              f"time elapsed = {interval_time_end-interval_time_start}s")
        interval_time_start = time.time()

        interval_losses = []
        interval_num_correct = 0
        interval_num_seen = 0

    ### training loop finished
    train_acc = train_num_correct / train_samples
    # NOTE(review): with average='micro' on single-label data, precision,
    # recall and F1 all equal the accuracy -- consider 'macro' if per-class
    # balance is what these are meant to show.
    train_prec, train_recall, train_f1, _ = precision_recall_fscore_support(
        train_targets, train_preds, average='micro')

    ### Validation loop
    model.eval()
    print("model set to eval")
    val_losses, val_preds, val_targets = [], [], []
    val_num_correct = 0
    with torch.no_grad():
      for sample in val_loader:
        loss, predicted, targets = _forward_batch(model, sample, criterion, device)
        val_losses.append(loss.item())
        val_num_correct += int((predicted == targets).sum())
        val_preds.extend(predicted.tolist())
        val_targets.extend(targets.tolist())

    ### validation loop finished. prep model and metrics for saving
    val_acc = val_num_correct / val_samples
    val_prec, val_recall, val_f1, _ = precision_recall_fscore_support(
        val_targets, val_preds, average='micro')

    # mean and standard deviation of the per-batch losses
    avg_train_loss = np.mean(train_losses)
    std_train_loss = np.std(train_losses)
    avg_val_loss = np.mean(val_losses)
    std_val_loss = np.std(val_losses)

    history['avg_train_loss_hist'].append(avg_train_loss)
    history['std_train_loss_hist'].append(std_train_loss)
    history['avg_val_loss_hist'].append(avg_val_loss)
    history['std_val_loss_hist'].append(std_val_loss)
    history['train_acc_hist'].append(train_acc)
    history['train_prec_hist'].append(train_prec)
    history['train_recall_hist'].append(train_recall)
    history['train_f1_hist'].append(train_f1)
    history['val_acc_hist'].append(val_acc)
    history['val_prec_hist'].append(val_prec)
    history['val_recall_hist'].append(val_recall)
    history['val_f1_hist'].append(val_f1)

    ### epoch training finished, output results and save checkpoint

    # text output
    epoch_time_end = time.time()
    print("\nEPOCH FINISHED: , ",
          f"training: acc = {train_acc:.3f}, ",
          f"precision = {train_prec:.3f}",
          f"recall = {train_recall:.3f}",
          f"f1 = {train_f1:.3f}",
          f"::: val: acc = {val_acc:.3f}, ",
          f"precision = {val_prec:.3f}",
          f"recall = {val_recall:.3f}",
          f"time elapsed = {epoch_time_end-epoch_time_start}s")

    _plot_history(epoch_hist, history)

    # park the model on the CPU so the checkpoint holds CPU tensors
    model.to('cpu')

    # overfitting test: validation loss has pulled away from training loss
    gap_exceeded = (avg_val_loss - avg_train_loss + buffer) > (std_train_loss + std_val_loss)
    val_loss_blown_up = avg_val_loss > 5*avg_train_loss
    overfitting = (epoch > min_epochs) and (gap_exceeded or val_loss_blown_up)

    # stop only on the second consecutive failing epoch
    stopping_now = overfitting and stop_next
    stop_next = overfitting

    # save model (once per epoch; the early-stop note is added only to the
    # checkpoint of the epoch that really ends training)
    if save_checkpoint:
      _save_checkpoint(
          checkpoint_path+checkpoint_name,
          filename=filename,
          epoch_hist=epoch_hist,
          model_id=model_id,
          model=model,
          model_args=model_args,
          history=history,
          dataset_info=dataset.metadata,
          notes=(notes + "\n\n stopped early") if stopping_now else notes,
          summary_str=summary_str,
          experiment_params=experiment_params)
      print("model saved")

    if stopping_now:
      print("stopping early")
      break

  return history

# Initialize params and run the experiment sweep

In [34]:
# Directory (with trailing slash) that receives one checkpoint per sweep run.
CHECKPOINT_ROOT = "/content/drive/MyDrive/ITCS 5156 project/trained_models/layers/"

# Dropout probability applied to both LinearBlocks in every sweep run.
DROPOUT_PROB = 0.3

# Baseline architecture config. The sweep below replaces it for every run.
model_args = {
  "channels": [8, 8, 32, 32, 64],
  "conv_kernel_sizes": [3, 3, 3, 3, 3],
  "conv_strides": [1, 1, 1, 1, 1],
  "conv_paddings": [1, 1, 1, 1, 1],
  "pool_masks": [True, True, True, True, True],
  "pool_kernel_sizes": [2, 2, 2, (1, 2), (1, 2)],
  "pool_strides": [2, 2, 2, (1, 2), (1, 2)],
  "linear_features": [128, 64],
  "dropout_probs": [0, 0],
}

# Keyword arguments for train_model shared by every run; the per-run entries
# (model_id, model_args, experiment_params, notes, checkpoint_name) are
# filled in inside the loop.
args_dict = {
  "filename": filename,
  "model_id": "Conv_5_layer",
  "num_epochs": 100,
  "interval": 16,
  "batch_size": 32,
  "val_split": 0.2,
  "save_checkpoint": True,
  "criterion": nn.CrossEntropyLoss(),
  "patience": 2,
  "min_epochs": 3,
  "buffer": 0.05,
  "lr": 0.01,
  "model_args": model_args,
  'checkpoint_path': CHECKPOINT_ROOT,
}

# Sweep grid. The first index of `pool_strides` and `channels` follows
# `model_ids`; the second index of `pool_strides` follows `pool_types`.
model_ids = ["Conv_1_layer", "Conv_3_layer", "Conv_5_layer"]
pool_types = ["sym_stride", "asym_stride"]
pool_strides = [
  [
    [(2, 2)], [(1, 2)]
  ],
  [
    [(2, 2), (2, 2), (2, 2)], [(1, 2), (1, 2), (1, 2)]
  ],
  [
    [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2)],
    [(1, 2), (1, 2), (1, 2), (1, 2), (1, 2)]
  ]
]
channels = [
  [
    [8], [16], [32], [64], [128]
  ],
  [
    [8, 16, 32], [8, 16, 64], [8, 32, 64], [8, 32, 128],
    [16, 32, 64], [16, 64, 128], [128, 128, 128]
  ],
  [
    [8, 16, 32, 64, 128], [8, 16, 16, 32, 64], [8, 32, 64, 64, 128,],
    [16, 16, 32, 64, 128], [16, 32, 32, 64, 128], [32, 32, 64, 64, 128]
  ]
]

# Notes stored in every checkpoint. They are generated from the values that
# are actually passed to train_model so the text cannot drift from the config.
sweep_notes = dedent(f"""
  varying stride and channel depth with dropout prob={DROPOUT_PROB}.

  Other hyperparams:
  lr = {args_dict['lr']}
  interval: {args_dict['interval']}
  batch_size: {args_dict['batch_size']}
  criterion: {type(args_dict['criterion']).__name__}

  """)

# `model_id` rather than `id`: a global named `id` would shadow the builtin
# for the rest of the notebook session.
for i, model_id in enumerate(model_ids):
  for k, channel in enumerate(channels[i]):
    # one entry per conv layer of the network being built
    n_conv_layers = len(channel)
    for j, stride in enumerate(pool_types):
      print("model id: ", model_id)
      print("stride: ", pool_strides[i][j])
      print("channel: ", channel)
      model_args = {
        "channels": channel,
        "conv_kernel_sizes": [3] * n_conv_layers,
        "conv_strides": [1] * n_conv_layers,
        "conv_paddings": [1] * n_conv_layers,
        "pool_masks": [True] * n_conv_layers,
        # pooling windows do not overlap: kernel size equals stride
        "pool_kernel_sizes": pool_strides[i][j],
        "pool_strides": pool_strides[i][j],
        "linear_features": [128, 64],
        "dropout_probs": [DROPOUT_PROB, DROPOUT_PROB],
      }
      args_dict['model_id'] = model_id
      args_dict['model_args'] = model_args
      args_dict['experiment_params'] = {
        'channels': channel,
        'pool_kernel_sizes': pool_strides[i][j],
        'stride_type': stride,
      }
      args_dict['notes'] = sweep_notes
      # k is the position of the channel config within channels[i]
      args_dict['checkpoint_name'] = "{}_channel{}_{}.pt".format(
        model_id, k, stride)
      train_model(**args_dict)

Output hidden; open in https://colab.research.google.com to view.