In [1]:
import os
import math
import torch
from torch.autograd import Variable
from torch.optim import Adam
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, random_split, Dataset
from scipy.io import wavfile
import scipy.signal
import numpy as np

In [2]:
NUM_CLASSES = 3
BATCH_SIZE = 1
SONG_LENGTH_SECONDS = 2

In [3]:
class MusicDataset(Dataset):
    def __init__(self, directory, genres, add_dimension, downsample=None, noise=False):
        self.directory = directory
        self.files = []
        self.downsample = downsample
        self.add_dimension = add_dimension
        self.noise = noise
        for label, genre in enumerate(genres):
            genre_path = os.path.join(directory, genre)
            self.files.extend([(os.path.join(genre_path, f), label) for f in os.listdir(genre_path)])

    def __getitem__(self, index):
        song, label = self.files[index]
        rate, data = wavfile.read(f'{self.directory}/{song}')
        
        data = data[:44100*SONG_LENGTH_SECONDS]
        
        if self.downsample:
            data = scipy.signal.resample(data, self.downsample * SONG_LENGTH_SECONDS)

        if self.noise:
            gauss = np.random.normal(0.01, 0.001, (len(data),))
            data = data + gauss
        
        tensor = torch.Tensor(data) / (2**15)
        # add an input dimension to the data [441000] => [1, 441000]. Conv1d expects data in this format.
        if self.add_dimension:
            tensor.unsqueeze_(0)
        return tensor, torch.tensor(label, dtype=torch.long)
    
    def input_size(self):
        if self.add_dimension:
            return len(self[0][0][0])
        else:
            return len(self[0][0])
    
    def __len__(self):
        return len(self.files)



def load_dataset(add_dimension, downsample=None, noise=False):
    d = MusicDataset('.', ['rock', 'electro', 'classic'], add_dimension, downsample=downsample, noise=noise)
    train, validate = random_split(d, [900, 300])

    loader = DataLoader(train, batch_size=BATCH_SIZE)
    validation_loader = DataLoader(validate, batch_size=BATCH_SIZE)
    return d.input_size(), loader, validation_loader

In [4]:
class Model1Linear(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.h1 = nn.Linear(input_size, hidden_size)
        self.h2 = nn.Linear(hidden_size, hidden_size)
        self.h3 = nn.Linear(hidden_size, hidden_size)
        self.h4 = nn.Linear(hidden_size, hidden_size)
        self.h5 = nn.Linear(hidden_size, hidden_size)
        self.h6 = nn.Linear(hidden_size, hidden_size)
        self.h7 = nn.Linear(hidden_size, hidden_size)
        self.h8 = nn.Linear(hidden_size, hidden_size)
        self.h9 = nn.Linear(hidden_size, NUM_CLASSES)
    
    def forward(self, x):
        x = x.data.view(-1, input_size)
         
        x = self.h1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h2(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h3(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h4(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h5(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h6(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h7(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h8(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h9(x)
        x = F.softmax(x, dim=1)
        return x

In [5]:
class ModelConv1(nn.Module):
    def __init__(self, input_size, kernel_size=5, conv_out_channels=5, linear_size=50):
        super().__init__()

        if kernel_size % 2 != 1:
            raise Exception('Only odd kernel_size are supported')
        self.conv_out_channels = conv_out_channels
        self.conv1 = nn.Conv1d(1, conv_out_channels, kernel_size=kernel_size)
        conv_layer_output_size = int(input_size - (kernel_size - 1))

        self.pooled_samples = int(conv_layer_output_size / 2)
        self.h1 = nn.Linear(self.pooled_samples * conv_out_channels, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)
        self.h4 = nn.Linear(linear_size, linear_size)
        self.h9 = nn.Linear(linear_size, NUM_CLASSES)
    
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 2)

        x = x.view(-1, self.pooled_samples * self.conv_out_channels)

        x = self.h1(x)
        x = F.relu(x)
        
        x = self.h2(x)
        x = F.relu(x)
        
        x = self.h3(x)
        x = F.relu(x)
        
        x = self.h4(x)
        x = F.relu(x)

        x = self.h9(x)
        x = F.softmax(x, dim=1)
        return x

In [6]:
class ModelConv2(nn.Module):
    def __init__(self, input_size, kernel_size=5, conv_out_channels=5, linear_size=50):
        super().__init__()

        if kernel_size % 2 != 1:
            raise Exception('Only odd kernel_size are supported')
        self.conv_out_channels = conv_out_channels
        self.conv1 = nn.Conv1d(1, conv_out_channels, kernel_size=kernel_size)
        self.conv2 = nn.Conv1d(conv_out_channels, conv_out_channels, kernel_size=kernel_size)
        self.conv3 = nn.Conv1d(conv_out_channels, conv_out_channels, kernel_size=kernel_size)

        #conv_layer_output_size = int(input_size - (kernel_size - 1))
        x = input_size
        x = x - (kernel_size - 1)
        x = int(x / 5)
        
        x = x - (kernel_size - 1)
        x = int(x / 5)
        
        x = x - (kernel_size - 1)
        x = int(x / 5)
        self.pooled_samples = x * conv_out_channels

        self.h1 = nn.Linear(self.pooled_samples, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)
        self.h4 = nn.Linear(linear_size, linear_size)
        self.h9 = nn.Linear(linear_size, NUM_CLASSES)
    
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)
        
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)
        
        x = self.conv3(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)

        x = x.view(BATCH_SIZE, self.pooled_samples)
        x = self.h1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h2(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h3(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h4(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h9(x)
        x = F.softmax(x, dim=1)
        return x

In [7]:
class ModelConv2BatchNorm(nn.Module):
    def __init__(self, input_size, kernel_size=5, conv_out_channels=5, linear_size=50):
        super().__init__()

        if kernel_size % 2 != 1:
            raise Exception('Only odd kernel_size are supported')
        self.conv_out_channels = conv_out_channels
        self.conv1 = nn.Conv1d(1, conv_out_channels, kernel_size=kernel_size)
        self.batch_norm1 = nn.BatchNorm1d(conv_out_channels)
        self.conv2 = nn.Conv1d(conv_out_channels, conv_out_channels, kernel_size=kernel_size)
        self.batch_norm2 = nn.BatchNorm1d(conv_out_channels)
        self.conv3 = nn.Conv1d(conv_out_channels, conv_out_channels, kernel_size=kernel_size)
        self.batch_norm3 = nn.BatchNorm1d(conv_out_channels)

        #conv_layer_output_size = int(input_size - (kernel_size - 1))
        x = input_size
        x = x - (kernel_size - 1)
        x = int(x / 5)
        
        x = x - (kernel_size - 1)
        x = int(x / 5)
        
        x = x - (kernel_size - 1)
        x = int(x / 5)
        self.pooled_samples = x * conv_out_channels

        self.h1 = nn.Linear(self.pooled_samples, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)
        self.h4 = nn.Linear(linear_size, linear_size)
        #self.h5 = nn.Linear(linear_size, linear_size)
        #self.h6 = nn.Linear(linear_size, linear_size)
        #self.h7 = nn.Linear(linear_size, linear_size)
        #self.h8 = nn.Linear(linear_size, linear_size)
        self.h9 = nn.Linear(linear_size, NUM_CLASSES)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)
        
        x = self.conv2(x)
        x = self.batch_norm2(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)
        
        x = self.conv3(x)
        x = self.batch_norm3(x)
        x = F.relu(x)
        x = F.max_pool1d(x, 5)

        x = x.view(BATCH_SIZE, self.pooled_samples)
        x = self.h1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h2(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h3(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.h4(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.h9(x)
        x = F.softmax(x, dim=1)
        return x

In [8]:
from datetime import datetime

def evalulate(model, validation_loader):
    model.eval()
    loss = 0.0
    correct = 0
    for data, labels in validation_loader:
        labels = labels.cuda()
        predictions_per_class = model(data.cuda())
        _, highest_prediction_class = predictions_per_class.max(1)
        loss += F.nll_loss(predictions_per_class, labels)
        correct += torch.sum(highest_prediction_class == labels)
    return loss/len(validation_loader), correct.item()/len(validation_loader)

def learn(model, loader, validation_loader, epochs=30, learning_rate=0.001):
    torch.cuda.empty_cache()
    optimizer = Adam(params=model.parameters(), lr=learning_rate)

    f = open(f'{datetime.now().isoformat()}.txt', 'w', buffering=1)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        total_correct = 0
        for data, labels in loader:
            labels = labels.cuda()
            predictions_per_class = model(data.cuda())
            highest_prediction, highest_prediction_class = predictions_per_class.max(1)

            # how good are we? compare output with the target classes
            loss = F.nll_loss(predictions_per_class, labels)
            total_loss += loss.item()
            total_correct += torch.sum(highest_prediction_class == labels)

            model.zero_grad()
            loss.backward()
            optimizer.step()
        
        accuracy = total_correct.item()/len(loader)
        train_loss = total_loss/len(loader)
        validation_loss, validation_accuracy = evalulate(model, validation_loader)
        stats = f'Epoch: {epoch}, TL: {train_loss}, VL: {validation_loss.item()}, TA: {accuracy}, VA: {validation_accuracy}'
        print(stats)
        f.write(f'{stats}\n')
        
    return model

In [9]:
#input_size, loader, validation_loader = load_dataset(add_dimension=False, noise=False)
#model = Model1Linear(input_size, 500).cuda()
#learn(model, loader, validation_loader, 10000, learning_rate=0.0001)

In [10]:
input_size, loader, validation_loader = load_dataset(add_dimension=True, downsample=22050, noise=True)
model = ModelConv2(input_size, kernel_size=5, conv_out_channels=10, linear_size=500).cuda()
learn(model, loader, validation_loader, epochs=10000, learning_rate=0.0001)

Epoch: 0, TL: -0.3649764043216904, VL: -0.4614359736442566, TA: 0.3522222222222222, VA: 0.64
Epoch: 1, TL: -0.6230933185807319, VL: -0.7087326645851135, TA: 0.6322222222222222, VA: 0.7533333333333333
Epoch: 2, TL: -0.686960410416758, VL: -0.7406435608863831, TA: 0.6966666666666667, VA: 0.74
Epoch: 3, TL: -0.7380014009080672, VL: -0.7406772971153259, TA: 0.7444444444444445, VA: 0.74
Epoch: 4, TL: -0.7407940070184649, VL: -0.7743238210678101, TA: 0.7477777777777778, VA: 0.7766666666666666
Epoch: 5, TL: -0.7707625677954436, VL: -0.75943922996521, TA: 0.7722222222222223, VA: 0.7633333333333333
Epoch: 6, TL: -0.7690609264652284, VL: -0.7727417349815369, TA: 0.7722222222222223, VA: 0.7733333333333333
Epoch: 7, TL: -0.8005458617900321, VL: -0.747424304485321, TA: 0.8055555555555556, VA: 0.7466666666666667
Epoch: 8, TL: -0.8067662760784916, VL: -0.7575620412826538, TA: 0.8077777777777778, VA: 0.7566666666666667
Epoch: 9, TL: -0.8010284138400926, VL: -0.7590256333351135, TA: 0.8055555555555556,

Epoch: 82, TL: -0.9007519820001412, VL: -0.7655062675476074, TA: 0.9011111111111111, VA: 0.7666666666666667
Epoch: 83, TL: -0.909693901619159, VL: -0.8455739617347717, TA: 0.91, VA: 0.8466666666666667
Epoch: 84, TL: -0.9219435848167752, VL: -0.8537136316299438, TA: 0.9222222222222223, VA: 0.8533333333333334
Epoch: 85, TL: -0.9109116214528837, VL: -0.8413394689559937, TA: 0.9111111111111111, VA: 0.8433333333333334
Epoch: 86, TL: -0.5941891447834128, VL: -0.496910959482193, TA: 0.5944444444444444, VA: 0.49666666666666665
Epoch: 87, TL: -0.8634561198220497, VL: -0.8454908728599548, TA: 0.8633333333333333, VA: 0.8433333333333334
Epoch: 88, TL: -0.9267281784867735, VL: -0.8370979428291321, TA: 0.9266666666666666, VA: 0.8366666666666667
Epoch: 89, TL: -0.9146617701213428, VL: -0.8607524037361145, TA: 0.9144444444444444, VA: 0.86
Epoch: 90, TL: -0.9147202455406662, VL: -0.8460326790809631, TA: 0.9144444444444444, VA: 0.8466666666666667
Epoch: 91, TL: -0.9354147641384927, VL: -0.79293680191040

Epoch: 163, TL: -0.9287210460765681, VL: -0.8366678953170776, TA: 0.9288888888888889, VA: 0.8366666666666667
Epoch: 164, TL: -0.93645261405177, VL: -0.840001106262207, TA: 0.9366666666666666, VA: 0.84
Epoch: 165, TL: -0.9268232147125367, VL: -0.8527682423591614, TA: 0.9266666666666666, VA: 0.8533333333333334
Epoch: 166, TL: -0.9527531723254894, VL: -0.8486665487289429, TA: 0.9533333333333334, VA: 0.8466666666666667
Epoch: 167, TL: -0.9490482899322119, VL: -0.8551359176635742, TA: 0.9488888888888889, VA: 0.8533333333333334
Epoch: 168, TL: -0.9549675599642099, VL: -0.8470048904418945, TA: 0.9555555555555556, VA: 0.8466666666666667
Epoch: 169, TL: -0.9544306262550328, VL: -0.8391308188438416, TA: 0.9544444444444444, VA: 0.8366666666666667
Epoch: 170, TL: -0.9531985682625487, VL: -0.8550567626953125, TA: 0.9533333333333334, VA: 0.8566666666666667
Epoch: 171, TL: -0.9181636585316941, VL: -0.7587472796440125, TA: 0.9188888888888889, VA: 0.76
Epoch: 172, TL: -0.9407381905174607, VL: -0.852015

Epoch: 243, TL: -0.9486573676600579, VL: -0.8149665594100952, TA: 0.9488888888888889, VA: 0.8133333333333334
Epoch: 244, TL: -0.945485077397175, VL: -0.8424011468887329, TA: 0.9455555555555556, VA: 0.8433333333333334
Epoch: 245, TL: -0.9381842186377845, VL: -0.7936537265777588, TA: 0.9377777777777778, VA: 0.7933333333333333
Epoch: 246, TL: -0.9582747430085404, VL: -0.8433337211608887, TA: 0.9577777777777777, VA: 0.8433333333333334
Epoch: 247, TL: -0.957843022168947, VL: -0.8420405387878418, TA: 0.9577777777777777, VA: 0.8433333333333334
Epoch: 248, TL: -0.9599999982780949, VL: -0.8420293927192688, TA: 0.96, VA: 0.8433333333333334
Epoch: 249, TL: -0.9611111175067006, VL: -0.8433529138565063, TA: 0.9611111111111111, VA: 0.8433333333333334
Epoch: 250, TL: -0.9229529881683028, VL: -0.8332869410514832, TA: 0.9233333333333333, VA: 0.8333333333333334
Epoch: 251, TL: -0.9372153850903528, VL: -0.7643394470214844, TA: 0.9366666666666666, VA: 0.7633333333333333
Epoch: 252, TL: -0.941111741673503,

Epoch: 322, TL: -0.8878107238123236, VL: -0.7703158259391785, TA: 0.8877777777777778, VA: 0.77
Epoch: 323, TL: -0.9134271996255668, VL: -0.8153831362724304, TA: 0.9133333333333333, VA: 0.8166666666666667
Epoch: 324, TL: -0.9502631583407881, VL: -0.8294571042060852, TA: 0.95, VA: 0.83
Epoch: 325, TL: -0.9111735822161137, VL: -0.7574597597122192, TA: 0.9111111111111111, VA: 0.7566666666666667
Epoch: 326, TL: -0.921291844518972, VL: -0.8266666531562805, TA: 0.9211111111111111, VA: 0.8266666666666667
Epoch: 327, TL: -0.9507696522734619, VL: -0.8344448804855347, TA: 0.9511111111111111, VA: 0.8333333333333334
Epoch: 328, TL: -0.9554500818010648, VL: -0.8399080634117126, TA: 0.9555555555555556, VA: 0.84
Epoch: 329, TL: -0.9490059713720543, VL: -0.8333337903022766, TA: 0.95, VA: 0.8333333333333334
Epoch: 330, TL: -0.9444442834049348, VL: -0.8368630409240723, TA: 0.9444444444444444, VA: 0.8366666666666667
Epoch: 331, TL: -0.9490496711432962, VL: -0.8066275715827942, TA: 0.9488888888888889, VA: 

KeyboardInterrupt: 