### Task3: Deep Learning Model

- Train a deep learning model (e.g. CNN or attention based model) with Mel-spectrograms extracted from the audio as input

- Need to compare 2 different kinds of inputs: Mel-spectrograms with or without taking the log

- You can choose whatever FFT window size and hop length you like

- You can choose whatever deep learning model you like

- Need to clearly report how the model is implemented

- Need to report the testing result (not validation result) with confusion matrix, top1 accuracy, and top3 accuracy

- You can use any music tagging model. For a novice, the short-chunk CNN in this repo is recommended. (You need to replace the BCE loss with cross-entropy loss.)

In [68]:
import os
import json
import time
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn import metrics
import librosa
import torchaudio
from torch.utils import data
from torch.utils.data import DataLoader
# from torch.utils.tensorboard import SummaryWriter



In [57]:
# load the json file
def load_json(json_file):
    """Parse the JSON document stored at ``json_file`` and return the result."""
    with open(json_file) as handle:
        return json.load(handle)

# Parse the example metadata; it is indexed below as data[key]["instrument_family_str"].
data = load_json('nsynth-subtrain/examples.json')

# Gather the distinct "instrument_family_str" values and expose them as a list
instrument_family_str = list(
    {entry["instrument_family_str"] for entry in data.values()}
)
print(instrument_family_str)

['synth_lead', 'vocal', 'reed', 'organ', 'bass', 'flute', 'string', 'mallet', 'guitar', 'brass', 'keyboard']


In [58]:
# Materialise the top-level keys of ``data`` as a list (one entry per example)
keys = [*data]

In [59]:
def feature_extraction(key, file_path, n_fft=512, hop_length=512):
    """Compute the Mel-spectrogram of one audio file, with and without log scaling.

    Args:
        key: Identifier of the example. It is not used by the computation and
            is kept only so existing ``feature_extraction(key, path)`` calls
            keep working.
        file_path: Path of the audio file passed to ``librosa.load``.
        n_fft: FFT window size in samples (defaults to the previously
            hard-coded 512).
        hop_length: Hop between successive frames in samples (defaults to the
            previously hard-coded 512).

    Returns:
        A two-element list ``[mel_spectrogram, log_mel_spectrogram]``; the
        second entry is ``librosa.power_to_db`` applied to the first.
    """
    # No ``sr`` is passed, so librosa decides the sampling rate itself
    # (NOTE(review): librosa resamples to its default rate unless sr=None is
    # given - confirm this is intended for this dataset).
    y, sr = librosa.load(file_path)

    # Mel-spectrogram (power) with the requested window size and hop length
    mel_spectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # Same spectrogram on a decibel (log) scale
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

    return [mel_spectrogram, log_mel_spectrogram]

In [60]:
import tqdm

# Run feature extraction over every example key, showing a progress bar.
# Each entry of ``features`` is whatever feature_extraction returns for that file.
features = [
    feature_extraction(key, 'nsynth-subtrain/audio/' + key + '.wav')
    for key in tqdm.tqdm(keys)
]

100%|██████████| 48037/48037 [03:06<00:00, 257.79it/s]


In [61]:
# Split the per-file [mel, log-mel] pairs into two parallel lists
mel_spectrogram = [mel for mel, _ in features]
log_mel_spectrogram = [log_mel for _, log_mel in features]

# save the features
# np.save('mel_spectrogram.npy', mel_spectrogram)
# np.save('log_mel_spectrogram.npy', log_mel_spectrogram)

In [62]:
# # read the features
# mel_spectrogram = np.load('mel_spectrogram.npy', allow_pickle=True)
# log_mel_spectrogram = np.load('log_mel_spectrogram.npy', allow_pickle=True)

In [63]:
# Show the dimensions of the first extracted Mel-spectrogram
print(np.shape(mel_spectrogram[0]))

(128, 173)


In [64]:
# Encode the instrument-family labels as integer ids and as one-hot vectors
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One family name per example, in the same order as ``keys``
labels = [data[example_id]["instrument_family_str"] for example_id in keys]

# Encoders: strings -> integer ids, integer ids -> dense one-hot rows
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Integer ids are reshaped to a single column because OneHotEncoder needs 2-D input
integer_encoded = label_encoder.fit_transform(labels).reshape(-1, 1)

# One-hot rows derived from the integer column
one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)

# Report all three representations
print("Original labels:", labels)
print("Integer encoded:", integer_encoded.flatten())
print("One-hot encoded:\n", one_hot_encoded)

Original labels: ['bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass', 'bass'

In [65]:
# Model inputs (stacked Mel-spectrograms) and targets (integer-encoded labels, one column)
x, y = np.array(mel_spectrogram), integer_encoded

In [66]:
class Conv_2d(nn.Module):
    """Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d building block.

    Args:
        input_channels: Number of channels of the incoming feature map.
        output_channels: Number of channels produced by the convolution.
        kernel_size: Side of the square convolution kernel; the padding is
            ``kernel_size // 2``.
        stride: Stride of the convolution.
        pooling: Kernel size passed to ``nn.MaxPool2d``.
    """

    def __init__(self, input_channels, output_channels, kernel_size=3, stride=1, pooling=2):
        super(Conv_2d, self).__init__()
        self.conv = nn.Conv2d(input_channels, output_channels, kernel_size, stride=stride, padding=kernel_size//2)
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool2d(pooling)

    def forward(self, x):
        """Apply convolution, batch norm, ReLU and max pooling, in that order."""
        return self.mp(self.relu(self.bn(self.conv(x))))


class ShortChunkCNN(nn.Module):
    '''
    Short-chunk CNN architecture.
    So-called VGG-like model with a small receptive field.
    Deeper layers, smaller pooling (2x2).

    Seven ``Conv_2d`` blocks are followed by a global max pool and a
    two-layer classifier head. ``forward`` returns raw class scores (logits):
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so applying softmax
    inside the model as well would squash the scores twice. Use
    ``model.softmax(model(x))`` when probabilities are needed.

    Args:
        n_channels: Base channel width of the convolution stack.
        n_class: Number of output classes.
    '''
    def __init__(self,
                 n_channels=128,
                 n_class=11):
        super(ShortChunkCNN, self).__init__()

        # CNN layers; every block halves both spatial dimensions (2x2 pooling)
        self.layer1 = Conv_2d(1, n_channels, pooling=2)
        self.layer2 = Conv_2d(n_channels, n_channels, pooling=2)
        self.layer3 = Conv_2d(n_channels, n_channels*2, pooling=2)
        self.layer4 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer5 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer6 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer7 = Conv_2d(n_channels*2, n_channels*4, pooling=2)

        # Fully connected layers
        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
        self.bn1 = nn.BatchNorm1d(n_channels*4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.dense2 = nn.Linear(n_channels*4, n_class)
        # Not applied in forward(); kept so callers can turn logits into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape (batch_size, n_class).

        Args:
            x: Tensor of shape (batch_size, 1, H, W), e.g. (batch_size, 1, 128, 173)
                for the Mel-spectrograms extracted in this notebook. Both H and W
                must be at least 128 so that seven 2x2 poolings leave a non-empty map.
        """
        # CNN forward pass
        x = self.layer1(x)  # -> (batch_size, n_channels, H/2, W/2)
        x = self.layer2(x)  # -> (batch_size, n_channels, H/4, W/4)
        x = self.layer3(x)  # -> (batch_size, n_channels*2, H/8, W/8)
        x = self.layer4(x)  # -> (batch_size, n_channels*2, H/16, W/16)
        x = self.layer5(x)  # -> (batch_size, n_channels*2, H/32, W/32)
        x = self.layer6(x)  # -> (batch_size, n_channels*2, H/64, W/64)
        x = self.layer7(x)  # -> (batch_size, n_channels*4, H/128, W/128)

        # Global max pooling over whatever height/width remains
        x = x.amax(dim=(2, 3))      # -> (batch_size, n_channels*4)

        # Fully connected head
        x = self.dense1(x)          # -> (batch_size, n_channels*4)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)          # -> (batch_size, n_class), raw logits

        return x


In [72]:
# with GPU
# The loader previously hard-coded 32 while this variable said 16; the value
# actually used (32) is kept and the variable is now the single source of truth.
batch_size = 32
num_epochs = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load x and y into the data loader. ``DataLoader`` is used directly because the
# name ``data`` is rebound to the parsed JSON metadata earlier in this file, so
# ``data.DataLoader`` would be looked up on a dict.
train_loader = DataLoader(dataset=list(zip(x, y)), batch_size=batch_size, shuffle=True)

model = ShortChunkCNN(n_channels=128, n_class=11)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # inputs: (batch_size, n_mels, n_frames) -> add a channel axis for Conv2d
        inputs = inputs.unsqueeze(1).float()

        # ``y`` holds integer class ids in a single column, so argmax over dim=1
        # would turn every label into 0. Only one-hot targets need argmax;
        # a column of ids is simply flattened.
        if targets.dim() > 1 and targets.size(1) > 1:
            class_ids = torch.argmax(targets, dim=1)
        else:
            class_ids = targets.reshape(-1).long()

        inputs, class_ids = inputs.to(device), class_ids.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # -> (batch_size, n_class)
        loss = criterion(outputs, class_ids)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # print batch number
        print(f'batch number: {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')

    # Report the mean loss over the epoch rather than the last batch only
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')


batch number: 1/1502, Loss: 2.3917


KeyboardInterrupt: 

In [None]:
# Persist the trained weights (the model's state dict) to 'mel_model.pth'
torch.save(obj=model.state_dict(), f='mel_model.pth')

In [None]:
# # without GPU

# batch_size = 16
# num_epochs = 10


# # load the x and y into the data loader
# train_loader = data.DataLoader(dataset=list(zip(x, y)), batch_size=32, shuffle=True)

# model = ShortChunkCNN(n_channels=128, n_class=11)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# # 訓練迴圈
# for epoch in range(num_epochs):
#     model.train()
#     batch_idx = 0
#     for inputs, one_hot_labels in train_loader:
#         # 假設 inputs 的形狀為 (batch_size, 128, 137)
#         inputs = inputs.unsqueeze(1)  # -> (batch_size, 1, 128, 137)
#         labels = torch.argmax(one_hot_labels, dim=1)  # -> (batch_size)

#         optimizer.zero_grad()
#         outputs = model(inputs)  # -> (batch_size, n_class)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         # print batch number 
#         print(f'batch number: {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')
#         batch_idx += 1

#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


In [None]:
# save model checkpoint
import joblib

# This cell used to dump ``knn``, which is not the CNN and is not defined in this
# section. Save the trained CNN's weights instead, copied to CPU so the file
# can be loaded on a machine without a GPU.
cnn_state = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
joblib.dump(cnn_state, 'short_cnn_model.pkl')


In [None]:
# # get the knn model
# knn = joblib.load('short_cnn_model.pkl')

In [None]:
class Conv_2d(nn.Module):
    """Convolution block: Conv2d, then BatchNorm2d, ReLU and MaxPool2d.

    ``shape`` is the side of the square convolution kernel and the padding is
    ``shape // 2``; ``pooling`` is the kernel size given to ``nn.MaxPool2d``.
    """

    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
        super().__init__()
        half_kernel = shape // 2
        self.conv = nn.Conv2d(
            input_channels,
            output_channels,
            shape,
            stride=stride,
            padding=half_kernel,
        )
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool2d(pooling)

    def forward(self, x):
        """Run ``x`` through conv -> batch norm -> ReLU -> max pool."""
        normalized = self.bn(self.conv(x))
        activated = self.relu(normalized)
        return self.mp(activated)

In [None]:
class ShortChunkCNN(nn.Module):
    '''
    Short-chunk CNN architecture.
    So-called vgg-ish model with a small receptive field.
    Deeper layers, smaller pooling (2x2).

    This variant computes the Mel-spectrogram inside the model with
    torchaudio, converts it to decibels and batch-normalises it before the
    convolution stack. ``forward`` returns raw class scores (logits) so the
    output can be fed to ``nn.CrossEntropyLoss`` directly; the loss applies
    log-softmax itself, and squashing the scores with a sigmoid first would
    confine them to (0, 1).

    Args:
        n_channels: Base channel width of the convolution stack.
        sample_rate, n_fft, f_min, f_max, n_mels: Forwarded to
            ``torchaudio.transforms.MelSpectrogram``.
        n_class: Number of output classes.
    '''
    def __init__(self,
                n_channels=128,
                sample_rate=16000,
                n_fft=512,
                f_min=0.0,
                f_max=8000.0,
                n_mels=128,
                n_class=11):
        super(ShortChunkCNN, self).__init__()

        # Spectrogram front-end
        self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
                                                         n_fft=n_fft,
                                                         f_min=f_min,
                                                         f_max=f_max,
                                                         n_mels=n_mels)
        self.to_db = torchaudio.transforms.AmplitudeToDB()
        self.spec_bn = nn.BatchNorm2d(1)

        # CNN; every block halves both spatial dimensions (2x2 pooling)
        self.layer1 = Conv_2d(1, n_channels, pooling=2)
        self.layer2 = Conv_2d(n_channels, n_channels, pooling=2)
        self.layer3 = Conv_2d(n_channels, n_channels*2, pooling=2)
        self.layer4 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer5 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer6 = Conv_2d(n_channels*2, n_channels*2, pooling=2)
        self.layer7 = Conv_2d(n_channels*2, n_channels*4, pooling=2)

        # Dense
        self.dense1 = nn.Linear(n_channels*4, n_channels*4)
        self.bn = nn.BatchNorm1d(n_channels*4)
        self.dense2 = nn.Linear(n_channels*4, n_class)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits of shape (batch_size, n_class).

        Args:
            x: Input handed to the Mel-spectrogram transform
                (assumes a batch of raw waveforms shaped (batch_size, n_samples)
                - TODO confirm against the data loader).
        """
        # Spectrogram
        x = self.spec(x)
        x = self.to_db(x)
        x = x.unsqueeze(1)  # add the channel axis expected by BatchNorm2d / Conv2d
        x = self.spec_bn(x)

        # CNN
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)

        # Global max pooling over both remaining spatial axes. This no longer
        # relies on the frequency axis having collapsed to exactly 1.
        x = x.amax(dim=(2, 3))

        # Dense
        x = self.dense1(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)  # raw logits; no sigmoid/softmax here

        return x

In [None]:
# Module-level settings: input length, number of epochs and batch size
# (input_length is presumably the number of audio samples per chunk - TODO confirm).
input_length, epochs, batch_size = 59049, 200, 16

In [None]:
class Solver(object):
    """Training / validation driver around ``ShortChunkCNN``.

    Args:
        data_loader: Iterable yielding ``(x, y)`` batches used for training.
        valid_loader: Optional iterable yielding ``(x, y)`` batches used for
            validation. When omitted, the training loader is reused, which is
            what this class did before the parameter existed.

    Targets may be integer class ids (shape ``(B,)`` or ``(B, 1)``) or
    per-class rows (shape ``(B, n_class)``).
    """

    def __init__(self, data_loader, valid_loader=None):
        # Data and training settings
        self.data_loader = data_loader
        self.valid_loader = data_loader if valid_loader is None else valid_loader
        self.n_epochs = 200
        self.lr = 1e-4
        self.batch_size = 16
        # path: current directory
        self.model_save_path = ""
        self.model_load_path = ""

        # Check for CUDA availability
        self.is_cuda = torch.cuda.is_available()

        # No TensorBoard writer is created here; scalar logging is skipped while
        # this is None. Assign a SummaryWriter-like object to enable it.
        self.writer = None

        # Build and load model
        self.build_model()

    def build_model(self):
        """Create the model, move it to the GPU if available, and set up Adam."""
        self.model = ShortChunkCNN()
        if self.is_cuda:
            self.model.cuda()

        # Load pretrained model if specified
        if self.model_load_path:
            self.load(self.model_load_path)

        # Adam optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), self.lr, weight_decay=1e-4)

    def load(self, filename):
        """Load a state dict from ``filename`` into the model."""
        # Map to CPU when no GPU is present so GPU-saved checkpoints still load.
        S = torch.load(filename, map_location=None if self.is_cuda else 'cpu')
        if 'spec.mel_scale.fb' in S.keys():
            self.model.spec.mel_scale.fb = S['spec.mel_scale.fb']
        self.model.load_state_dict(S)

    def to_var(self, x):
        """Return ``x`` on the GPU when CUDA is in use, otherwise unchanged."""
        return x.cuda() if self.is_cuda else x

    def get_loss_function(self):
        """Return the training criterion (cross-entropy over class logits)."""
        return nn.CrossEntropyLoss()

    def _prepare_target(self, y):
        """Move targets to the model's device and flatten a (B, 1) id column."""
        y = self.to_var(y)
        if y.dim() == 2 and y.size(1) == 1:
            y = y.squeeze(1)
        if not torch.is_floating_point(y):
            # CrossEntropyLoss needs int64 class ids
            y = y.long()
        return y

    def _log_scalar(self, tag, value, step):
        """Forward a scalar to the writer, if one has been configured."""
        writer = getattr(self, 'writer', None)
        if writer is not None:
            writer.add_scalar(tag, value, step)

    def train(self):
        """Train for ``n_epochs`` epochs, validating after each one."""
        start_t = time.time()
        reconst_loss = self.get_loss_function()
        step = 0  # global batch counter so per-batch losses get distinct steps

        for epoch in range(self.n_epochs):
            self.model.train()
            for x, y in self.data_loader:
                x, y = self.to_var(x), self._prepare_target(y)
                out = self.model(x)

                # Backpropagation
                loss = reconst_loss(out, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Log loss
                self._log_scalar('Loss/train', loss.item(), step)
                step += 1

            # Validation after each epoch
            loss_avg, roc_auc, pr_auc = self.validation(epoch)
            print(f"Epoch [{epoch + 1}/{self.n_epochs}] valid loss: {loss_avg:.4f}, "
                  f"ROC-AUC: {roc_auc:.4f}, PR-AUC: {pr_auc:.4f}")

        print(f"Training finished. Elapsed: {time.time() - start_t:.2f} seconds")

    def save(self, filename):
        """Write the model's state dict to ``filename``."""
        torch.save(self.model.state_dict(), filename)

    def validation(self, epoch):
        """Evaluate on the validation loader.

        Returns:
            ``(mean_loss, roc_auc, pr_auc)`` for the epoch. The same values are
            sent to the writer when one is configured.
        """
        self.model.eval()
        est_array, gt_array = [], []
        losses = []
        reconst_loss = self.get_loss_function()

        # No gradients are needed for evaluation
        with torch.no_grad():
            for x, y in self.valid_loader:
                # Targets must live on the same device as the model output
                x, y = self.to_var(x), self._prepare_target(y)
                out = self.model(x)
                losses.append(reconst_loss(out, y).item())
                est_array.append(out.cpu().numpy())
                gt_array.append(y.cpu().numpy())

        # concatenate (not vstack) so 1-D id batches of unequal length still join
        est_array = np.concatenate(est_array, axis=0)
        gt_array = np.concatenate(gt_array, axis=0)
        loss_avg = np.mean(losses)
        roc_auc, pr_auc = self.get_auc(est_array, gt_array)

        # Logging validation metrics
        self._log_scalar('Loss/valid', loss_avg, epoch)
        self._log_scalar('AUC/ROC', roc_auc, epoch)
        self._log_scalar('AUC/PR', pr_auc, epoch)
        return loss_avg, roc_auc, pr_auc

    def get_auc(self, est_array, gt_array):
        """Return macro-averaged ``(roc_auc, pr_auc)``.

        Args:
            est_array: Model scores, one row per example.
            gt_array: Ground truth, either per-class indicator rows or integer
                class ids (1-D or a single column). Ids are expanded to
                indicator rows so both metrics score each class separately.
        """
        est_array = np.asarray(est_array)
        gt_array = np.asarray(gt_array)
        multi_class_scores = est_array.ndim == 2 and est_array.shape[1] > 1
        id_targets = gt_array.ndim == 1 or gt_array.shape[1] == 1
        if multi_class_scores and id_targets:
            class_ids = gt_array.reshape(-1).astype(int)
            gt_array = np.eye(est_array.shape[1])[class_ids]
        roc_auc = metrics.roc_auc_score(gt_array, est_array, average='macro')
        pr_auc = metrics.average_precision_score(gt_array, est_array, average='macro')
        return roc_auc, pr_auc
