In [1]:
import os

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torchaudio
import librosa
import pandas as pd
import numpy as np

#Initialize Dataset

In [2]:
#800 audio samples in training, will split into 640 for train, 160 for validation. 200 test data

annotations = '/content/drive/MyDrive/elec378/final_project_data/train.csv' #directory that annotation file is in
train_data = '/content/drive/MyDrive/elec378/final_project_data/train/' #directory that train data are in
test_data = '/content/drive/MyDrive/elec378/final_project_data/test/' #directory that test data are in

audio_directory = '/content/drive/MyDrive/elec378/final_project_data/train/'
#extract sampling rate
dummy, sr = librosa.load('/content/drive/MyDrive/elec378/final_project_data/train/train333.wav')

In [51]:
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

queef = mel_spectrogram(torch.from_numpy(dummy))
sliced_queef = queef[:,:1293]
print(queef.shape)

torch.Size([64, 1293])


In [3]:
xy = pd.read_csv(annotations)

filenames = xy['ID']
labels = xy['Genre']
print(filenames[0])
bong = "bongo"
print(f'/content/drive/MyDrive/elec378/final_project_data/train/{bong}')

train000.wav
/content/drive/MyDrive/elec378/final_project_data/train/bongo


In [46]:
xy.iloc[0,1]

'pop'

In [11]:
class TrainDataset(Dataset): #dataset class to create dataloader object
  def __init__(self,audio_dir,transformation):
    self.annotations = xy
    self.audio_dir = audio_dir
    self.transformation = transformation

  def __getitem__(self, index):
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = self._chop_down_(signal)
    signal = self.transformation(signal)
    print('the label is:', type(label))
    return signal, label

  def _get_audio_sample_label(self,index):
    return self.annotations.iloc[index,1]

  def _chop_down_(self, signal):
    return signal[:,:1293] #max length

  def __len__(self):
    return len(self.annotations)

  def _get_audio_sample_path(self,index):
    return f'/content/drive/MyDrive/elec378/final_project_data/train/{filenames[index]}'


#Convolutional Net

In [5]:
#code and architecture partially from musikalkemist on github, modified to fit our data. Inspired by CNN videos by far1din
#I want final layer with 128 features
class MGenreCNN(nn.Module):
  def __init__(self):
    super().__init__()

    self.conv1 = nn.Sequential(
        nn.Conv2d(
            in_channels = 1, #mel spectrograms are in grayscale
            out_channels = 16, #number of convolutional filters ("features learned")
            kernel_size = 3, #3x3 filter for every convolution
            stride = 1, #step size
            padding = 2 #prevents the data's dimensionality from decreasing
        ),
        nn.ReLU(), #activation function

        #get maximum out of the result mtx of convolution layer depending on kernel size (pooling)
        nn.MaxPool2d(kernel_size=2) #2x2 window for each pooling
    )
    self.conv2 = nn.Sequential(
        nn.Conv2d(
            in_channels = 16, #output of conv1
            out_channels = 32, #double input
            kernel_size = 3,
            stride = 1,
            padding = 2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.conv3 = nn.Sequential(
        nn.Conv2d(
            in_channels = 32, #output of conv2
            out_channels = 64, #double input
            kernel_size = 3,
            stride = 1,
            padding = 2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.conv4 = nn.Sequential(
        nn.Conv2d(
            in_channels = 64, #output of conv3
            out_channels = 128, #double input
            kernel_size = 3,
            stride = 1,
            padding = 2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.flatten = nn.Flatten()
    self.linear = nn.Linear(128 * 5 * 4, 10)
    self.softmax = nn.Softmax(dim=1)

  def forward(self,input_data):
    x = self.conv1(input_data)
    x = self.conv2(x)
    x = self.conv3(x)
    x = self.conv4(x)
    x = self.flatten(x)
    logits = self.linear(x)
    predictions = self.softmax(logits)
    return predictions

#Training Pipeline

In [9]:
def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        print('input: ', type(input))
        print('target: ', type(target))
        input = input.to(device)
        target = target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


In [14]:
batch_size = 128
epochs = 10 #plot loss function over time to check for overfitting
learning_rate = 0.001 #lambda/alpha value



if __name__ == "__main__":
  if torch.cuda.is_available():
    device = "cuda"
  else:
    device = "cpu"
  print(f"Using {device}")

  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

  training_set = TrainDataset(train_data, mel_spectrogram)
  data_loader = DataLoader(training_set, batch_size=batch_size)

  model = MGenreCNN().to(device)
  print(model)

  loss_fn = nn.CrossEntropyLoss()
  optimiser = torch.optim.Adam(model.parameters(),lr=learning_rate)
  train(model, data_loader, loss_fn, optimiser, device, epochs)

  torch.save(model.state_dict(), "genrenet.py")

Using cpu
MGenreCNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
the label is: <class '

AttributeError: 'tuple' object has no attribute 'to'