In [None]:
# multi-label classification
# using MERT model as pre-trained model

In [107]:
import json
import numpy as np
import os

# huggingface
# from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
from torch import nn
import torchaudio.transforms as T
from datasets import load_dataset
import nnAudio

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

import torch
from torch.utils.data import Dataset, DataLoader


In [89]:
SAMPLE_RATE = 24000

In [55]:
# read index-label mapping
with open('hw1/class_idx2MIDIClass.json', 'r') as f:
    class_idx2MIDIClass = json.load(f)

print(class_idx2MIDIClass)
# print the mapping
print(class_idx2MIDIClass['0'])


{'0': 'Piano', '1': 'Percussion', '2': 'Organ', '3': 'Guitar', '4': 'Bass', '5': 'Strings', '6': 'Voice', '7': 'Wind Instruments', '8': 'Synth'}
Piano


In [22]:
# define a function cast np array file back into wav file
# sampling rate 24kHz
# use scipy.io.wavfile
import scipy.io.wavfile as wavfile
def np2wav(np_array, filename):
    wavfile.write(filename, 24000, np_array)

In [54]:
# read train dataset
train_audio_path = 'hw1/slakh/train/'

# get all the audio file names
train_audio_files = []
for root, dirs, files in os.walk(train_audio_path):
    for file in files:
        if file.endswith('.npy'):
            train_audio_files.append(file)

# for all the file in the dataset(under the train audio path), store the audio-file name pair in a list
train_audio = []
for file in range(len(train_audio_files)):
    audio = np.load(train_audio_path + train_audio_files[file])
    train_audio.append((train_audio_files[file], audio))



In [60]:
# read train label
train_label_path = 'hw1/slakh/train_labels.json'

# read the label file
with open(train_label_path, 'r') as f:
    train_label = json.load(f)

# for every key in the label file, find the corresponding label in train_audio, and append it in the tuple
train_data = []
for key in train_label:
    for audio in train_audio:
        if key == audio[0]:
            train_data.append((audio[0], audio[1], train_label[key]))


In [61]:
# train data 
# (filename, audio, label)
print(train_data[0])

('Track00001_17.npy', array([-0.18277621, -0.21051419, -0.16272902, ...,  0.06085217,
        0.04477724,  0.04972876], dtype=float32), [1, 0, 1, 1, 1, 0, 0, 0, 0])


In [103]:
# device = torch.device("mps")
device = torch.device("cpu")

# loading our model weights
model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).to(device)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M",trust_remote_code=True)

# happen to be 24kHz, the same as the dataset
resample_rate = processor.sampling_rate

# (label, embedding)
train_embedding_label = []

# use tqdm to show the progress
# for(filename, audio, label) in tqdm(train_data[:2]): # only use the first 2 data for test
for(filename, audio, label) in tqdm(train_data):
    input_audio = torch.tensor(audio).float().to(device)
    inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
    train_embedding_label.append((label, all_layer_hidden_states))

# save train_embedding_label
with open('train_embedding_label.json', 'w') as f:
    json.dump(train_embedding_label, f)

Some weights of the model checkpoint at m-a-p/MERT-v1-95M were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v1-95M and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
10

In [106]:
print(train_embedding_label[0][1].shape)

torch.Size([13, 374, 768])


In [112]:
class EmbeddingDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, embedding = self.data[idx]
        return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

# class MultiClassClassifier(nn.Module):
#     def __init__(self, input_size, num_classes):
#         super(MultiClassClassifier, self).__init__()
#         self.pool = nn.AdaptiveAvgPool1d(1)  # Pooling along the time axis
#         self.fc1 = nn.Linear(input_size[2], 512)  # Adjusted input size
#         self.fc2 = nn.Linear(512, num_classes)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         # x shape: [batch_size, 13, 374, 768]
#         x = x.mean(dim=1)  # Pooling along the time axis, resulting shape: [batch_size, 374, 768]
#         x = self.pool(x.permute(0, 2, 1)).squeeze(-1)  # Shape: [batch_size, 768]
#         x = torch.relu(self.fc1(x))  # Fully connected layer with ReLU activation
#         x = self.fc2(x)  # Output layer
#         return self.sigmoid(x)  # Apply sigmoid to get multi-hot output

class MultiClassClassifier(nn.Module):
    def __init__(self, input_size, num_classes, threshold=0.5):
        super(MultiClassClassifier, self).__init__()
        self.pool = nn.AdaptiveAvgPool1d(1)  # Pooling along the time axis
        self.fc1 = nn.Linear(input_size[2], 512)  # Adjusted input size
        self.fc2 = nn.Linear(512, num_classes)
        self.sigmoid = nn.Sigmoid()
        self.threshold = threshold

    def forward(self, x):
        # x shape: [batch_size, 13, 374, 768]
        x = x.mean(dim=1)  # Pooling along the time axis, resulting shape: [batch_size, 374, 768]
        x = self.pool(x.permute(0, 2, 1)).squeeze(-1)  # Shape: [batch_size, 768]
        x = torch.relu(self.fc1(x))  # Fully connected layer with ReLU activation
        x = self.fc2(x)  # Output layer
        return self.sigmoid(x)  # Apply sigmoid to get probabilities

    def predict(self, x):
        with torch.no_grad():
            probabilities = self.forward(x)
            return (probabilities >= self.threshold).float()  # Apply threshold to get binary output

def train(model, criterion, optimizer, train_loader, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [113]:
batch_size = 16
num_epochs = 3
input_size = (13, 374, 768)
num_classes = 9

# Instantiate the model, loss function, and optimizer
model = MultiClassClassifier(input_size, num_classes)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

dataset = EmbeddingDataset(train_embedding_label)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

train(model, criterion, optimizer, train_loader, num_epochs)

Epoch [1/10], Loss: 0.6957
Epoch [2/10], Loss: 0.6825
Epoch [3/10], Loss: 0.6704
Epoch [4/10], Loss: 0.6584
Epoch [5/10], Loss: 0.6455
Epoch [6/10], Loss: 0.6316
Epoch [7/10], Loss: 0.6172
Epoch [8/10], Loss: 0.6028
Epoch [9/10], Loss: 0.5889
Epoch [10/10], Loss: 0.5760


  return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


In [None]:
torch.save(model.state_dict(), 'model.pth')

In [115]:
# # inference

# sample_input = torch.randn(1, *input_size)
# binary_output = model.predict(sample_input)
# print(binary_output)

tensor([[1., 0., 1., 1., 1., 0., 0., 0., 0.]])
