### Task3: Deep Learning Model

- Train a deep learning model (e.g., a CNN or an attention-based model) using Mel-spectrograms extracted from the audio as input

- Need to compare 2 different kinds of inputs: Mel-spectrograms with or without taking the log

- You can choose whatever FFT window size and hop length you like

- You can choose whatever deep learning model you like

- Clearly report how the model is implemented

- Need to report the testing result (not validation result) with confusion matrix, top1 accuracy, and top3 accuracy

- You can use any music tagging model. For a novice, the short-chunk CNN in this repo is recommended. (You need to replace the BCE loss with cross-entropy loss.)

In [3]:
# Root directory of the training set. The cells below read
# `examples.json` and `audio/<key>.wav` from inside this directory.
# Example value: 'nsynth-subtrain'
traning_data_path = "<PUT THE PATH TO THE TRAINING DATA HERE>"


In [4]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
from torch.utils.data import DataLoader
import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix

import pickle

In [5]:
# load the json file
def load_json(json_file):
    """Read *json_file* and return the parsed JSON content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding, which is not UTF-8 everywhere.
    with open(json_file, encoding='utf-8') as f:
        return json.load(f)

# The example metadata lives in `examples.json`, directly under the
# training data directory.
json_path = os.path.join(traning_data_path, 'examples.json')

# Parse the metadata file
data = load_json(json_path)

In [6]:
# Collect every key of the metadata mapping, in its iteration order
keys = list(data)

In [7]:
def feature_extraction(key, file_path, n_fft=512, hop_length=512):
    """Compute the Mel-spectrogram of one audio file, with and without log scaling.

    Args:
        key: Identifier of the example. It is not used in the computation and
            is kept only so that existing callers keep working.
        file_path: Path of the audio file to load.
        n_fft: FFT window size handed to librosa.
        hop_length: Number of samples between successive frames.

    Returns:
        A two-element list ``[mel_spectrogram, log_mel_spectrogram]``.
    """
    # No `sr=` is passed, so librosa loads the audio at its own default
    # sampling rate and returns that rate as `sr`.
    y, sr = librosa.load(file_path)

    # Mel-spectrogram without log scaling.
    # The defaults (n_fft=512, hop_length=512) are the values this code has
    # always passed to librosa.
    # NOTE(review): an earlier comment described the FFT window as 2048 while
    # the call used 512 -- confirm which window size is actually intended.
    mel_spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # Same spectrogram converted to a log (dB) scale
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

    # Both variants are returned together so they can be compared later
    features = [mel_spectrogram, log_mel_spectrogram]

    return features

In [6]:
# Run the feature extraction over every audio file listed in `keys`.
# Each entry of `features` is the list returned by feature_extraction().

# All clips live in the `audio` sub-directory and are named `<key>.wav`
audio_dir = os.path.join(traning_data_path, 'audio')

features = []
for key in tqdm.tqdm(keys):
    wav_path = os.path.join(audio_dir, f'{key}.wav')
    features.append(feature_extraction(key, wav_path))

  0%|          | 0/48037 [00:00<?, ?it/s]

100%|██████████| 48037/48037 [02:50<00:00, 281.81it/s]


In [7]:
# Split the per-file feature pairs into two parallel lists:
# index 0 is the plain Mel-spectrogram, index 1 the log-scaled one.
mel_spectrogram = []
log_mel_spectrogram = []
for pair in features:
    mel_spectrogram.append(pair[0])
    log_mel_spectrogram.append(pair[1])

In [8]:
# Encode the instrument-family labels as integer class ids

# Extract labels from the data
labels = [data[key]["instrument_family_str"] for key in keys]

# Fit the encoder on the string labels and convert them to integer labels
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels)

# Dump the encoder only AFTER it has been fitted: an encoder pickled before
# fit_transform() has learned no classes and cannot map labels when it is
# loaded again later.
with open('label_encoder_mel.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Reshape to an (N, 1) column; the training loop flattens each batch of
# labels back to 1-D before computing the loss.
integer_encoded = integer_encoded.reshape(-1, 1)

In [10]:
# Model inputs: the Mel-spectrograms (without log scaling) as one array.
# Targets: the integer-encoded labels.
x, y = np.array(mel_spectrogram), integer_encoded

In [19]:
class Conv_2d(nn.Module):
    """Convolution -> batch norm -> ReLU -> max pooling building block.

    The convolution is padded with ``kernel_size // 2`` on every side; the
    pooling layer uses ``pooling`` as its kernel size.
    """

    def __init__(self, input_channels, output_channels, kernel_size=3, stride=1, pooling=2):
        super().__init__()
        half_kernel = kernel_size // 2
        self.conv = nn.Conv2d(
            input_channels,
            output_channels,
            kernel_size,
            stride=stride,
            padding=half_kernel,
        )
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool2d(pooling)

    def forward(self, x):
        """Apply the four stages in order and return the pooled feature map."""
        feature_map = self.conv(x)
        feature_map = self.bn(feature_map)
        feature_map = self.relu(feature_map)
        return self.mp(feature_map)

class ShortChunkCNN(nn.Module):
    '''
    Short-chunk CNN architecture.
    So-called VGG-like model with a small receptive field.
    Deeper layers, smaller pooling (2x2).

    Args:
        n_channels: base number of feature maps of the convolutional stack.
            The network input itself always has a single channel.
        n_class: number of output classes.

    forward() returns raw, unnormalised class scores (logits) of shape
    (batch_size, n_class). nn.CrossEntropyLoss applies log-softmax itself, so
    the logits must be passed to it directly. To obtain probabilities at
    inference time, call ``model.softmax(logits)``.
    '''
    def __init__(self,
                 n_channels=1,
                 n_class=11):
        super(ShortChunkCNN, self).__init__()

        # CNN Layers: seven conv blocks, each halving height and width
        self.layer1 = Conv_2d(1, n_channels, pooling=2)
        self.layer2 = Conv_2d(n_channels, n_channels, pooling=2)
        self.layer3 = Conv_2d(n_channels, n_channels*2, pooling=2)
        self.layer4 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer5 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer6 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer7 = Conv_2d(n_channels*2, n_channels*4, pooling=2)

        # Fully Connected Layers
        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
        self.bn1 = nn.BatchNorm1d(n_channels*4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(n_channels*4, n_class)
        # Not applied inside forward(); kept so callers can turn the returned
        # logits into probabilities when they need them.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # x: (batch_size, 1, H, W), e.g. (batch_size, 1, 128, 137).
        # Seven 2x2 poolings follow, so H and W must each be at least 128.

        # CNN Forward Pass
        x = self.layer1(x)  # -> (batch_size, n_channels, H/2, W/2)
        x = self.layer2(x)  # -> (batch_size, n_channels, H/4, W/4)
        x = self.layer3(x)  # -> (batch_size, n_channels*2, H/8, W/8)
        x = self.layer4(x)  # -> (batch_size, n_channels*2, H/16, W/16)
        x = self.layer5(x)  # -> (batch_size, n_channels*2, H/32, W/32)
        x = self.layer6(x)  # -> (batch_size, n_channels*2, H/64, W/64)
        x = self.layer7(x)  # -> (batch_size, n_channels*4, H/128, W/128)

        # Collapse whatever is left of the last (width) axis with a max pool
        if x.size(3) != 1:
            x = nn.MaxPool2d(kernel_size=(1, x.size(3)))(x)
        x = x.squeeze(3)  # -> (batch_size, n_channels*4, H/128)

        # Collapse whatever is left of the remaining (height) axis
        if x.size(2) != 1:
            x = nn.MaxPool1d(x.size(2))(x)
        x = x.squeeze(2)  # -> (batch_size, n_channels*4)

        x = self.dense1(x)          # -> (batch_size, n_channels*4)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)          # -> (batch_size, n_class)

        # Return the logits as they are: applying softmax here and then
        # training with nn.CrossEntropyLoss would normalise the scores twice.
        return x


In [37]:
# Training configuration (uses the GPU when one is available)
batch_size = 16
num_epochs = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load the x and y into the data loader
train_loader = DataLoader(dataset=list(zip(x, y)), batch_size=batch_size, shuffle=True)

# NOTE(review): n_channels sets the width of the conv stack, not the number
# of input channels; n_channels=1 gives a very small network. Confirm this is
# intended before changing it, since it determines the checkpoint's shapes.
model = ShortChunkCNN(n_channels=1, n_class=11)
model.to(device)

# nn.CrossEntropyLoss applies log-softmax internally and therefore expects
# raw logits from the model.
# NOTE(review): verify that ShortChunkCNN.forward does not apply its own
# softmax before this loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch_idx, (inputs, integer_labels) in enumerate(train_loader):
        # inputs are assumed to have shape (batch_size, 128, 137)
        inputs = inputs.unsqueeze(1).float()  # -> (batch_size, 1, 128, 137)
        # labels: from 0 to 10.
        # reshape(-1) instead of squeeze(): squeeze() would also drop the
        # batch axis when a batch holds a single sample.
        labels = integer_labels.reshape(-1).long()  # -> (batch_size)

        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # -> (batch_size, n_class)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

        # print batch number
        print(f'batch number: {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')

    # Report the mean loss over the epoch rather than the last batch's loss
    epoch_loss = running_loss / max(num_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


In [None]:
# Write the trained weights (the model's state_dict) to disk
trained_weights = model.state_dict()
torch.save(trained_weights, 'mel_model.pth')