In [155]:
# multi-label classification
# using MERT model as pre-trained model

In [156]:
class_idx2MIDIClass_path = "hw1/class_idx2MIDIClass.json"

In [157]:
train_audio_path = "hw1/slakh/train/"
train_label_path = "hw1/slakh/train_labels.json"

In [158]:
validation_audio_path = "hw1/slakh/validation/"
validation_label_path = "hw1/slakh/validation_labels.json"

In [159]:
test_audio_path = "hw1/slakh/test/"
test_label_path = "hw1/slakh/test_labels.json"

In [160]:
import json
import numpy as np
import os

# huggingface
# from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import AutoModel
import torch
from torch import nn
import torchaudio.transforms as T
from datasets import load_dataset
import nnAudio

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [161]:
device = torch.device("mps" if torch.cuda.is_available() else "cpu")

In [162]:
SAMPLE_RATE = 24000

In [163]:
# read index-label mapping
with open(class_idx2MIDIClass_path) as f:
    class_idx2MIDIClass = json.load(f)

print(class_idx2MIDIClass)
# print the mapping
print(class_idx2MIDIClass['0'])


{'0': 'Piano', '1': 'Percussion', '2': 'Organ', '3': 'Guitar', '4': 'Bass', '5': 'Strings', '6': 'Voice', '7': 'Wind Instruments', '8': 'Synth'}
Piano


In [164]:

# get all the audio file names
train_audio_files = []
for root, dirs, files in os.walk(train_audio_path):
    for file in files:
        if file.endswith('.npy'):
            train_audio_files.append(file)

# for all the file in the dataset(under the train audio path), store the audio-file name pair in a list
train_audio = []
for file in range(len(train_audio_files)):
    audio = np.load(train_audio_path + train_audio_files[file])
    train_audio.append((train_audio_files[file], audio))



In [165]:

# read the label file
with open(train_label_path, 'r') as f:
    train_label = json.load(f)

# for every key in the label file, find the corresponding label in train_audio, and append it in the tuple
train_data = []
for key in train_label:
    for audio in train_audio:
        if key == audio[0]:
            train_data.append((audio[0], audio[1], train_label[key]))


In [166]:
# if GPU is available, use it, otherwise use CPU
device = torch.device("mps" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# torch.cuda.empty_cache()

# loading our model weights
# model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
MERT_model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).to(device)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M",trust_remote_code=True)

# happen to be 24kHz, the same as the dataset
resample_rate = processor.sampling_rate

# (label, embedding)
train_embedding_label = []

# use tqdm to show the progress
# for(filename, audio, label) in tqdm(train_data):
# process the data in batches, or the kernel will die
# total: 14994

for(filename, audio, label) in tqdm(train_data):
    input_audio = torch.tensor(audio).float().to(device)
    # input_audio = torch.tensor(audio).float()
    inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt").to(device)
    # inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
    with torch.no_grad():
        outputs = MERT_model(**inputs, output_hidden_states=True)
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze() # (13, 374, 768)
    time_reduced_hidden_states = all_layer_hidden_states.mean(-2) # (13, 768)
    
    train_embedding_label.append((label, time_reduced_hidden_states))

# Convert tensors to lists
serializable_train_embedding_label = [
    (label, embedding.tolist()) for label, embedding in train_embedding_label
]

# save train_embedding_label
with open('train_embedding_label.json', 'w') as f:
    json.dump(serializable_train_embedding_label, f)

In [167]:
# read the label file back
# Load all JSON file
embedded_data_filename = ["train_embedding_label.json"]

train_embedding_label = []
for filename in embedded_data_filename:
    with open(filename, 'r') as f:
        loaded_train_embedding_label = json.load(f)
        loaded_train_embedding_label = [
            (label, torch.tensor(embedding)) for label, embedding in loaded_train_embedding_label
        ]
        train_embedding_label.extend(loaded_train_embedding_label)

In [215]:
class EmbeddingDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, embedding = self.data[idx]
        embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
        return embedding, torch.tensor(label, dtype=torch.float32)

class MultiClassClassifier(nn.Module):
    def __init__(self, input_size, num_classes, thresholds=None):
        super(MultiClassClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size[0] * input_size[1], 512)  # Adjusted input size
        self.fc2 = nn.Linear(512, num_classes)
        self.sigmoid = nn.Sigmoid()
        self.thresholds = thresholds if thresholds is not None else [0.5] * num_classes

    def forward(self, x):
        # x shape: [batch_size, 13, 768]
        x = x.view(x.size(0), -1)  # Flatten, shape: [batch_size, 13 * 768]
        x = torch.relu(self.fc1(x))  # Fully connected layer with ReLU activation
        x = self.fc2(x)  # Output layer
        return self.sigmoid(x)  # Apply sigmoid to get probabilities

    def predict(self, x, thresholds=None):
        if thresholds is None:
            thresholds = self.thresholds
        with torch.no_grad():
            probabilities = self.forward(x)
            return (probabilities >= torch.tensor(thresholds).to(probabilities.device)).float()  # Apply thresholds to get binary output

In [216]:
def train(model, criterion, optimizer, train_loader, num_epochs):
    model.train()
    total_batches = len(train_loader)
    for epoch in range(num_epochs):
        for batch_num, (inputs, labels) in enumerate(train_loader, 1):
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to the device
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_num}/{total_batches}], Loss: {loss.item():.4f}')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [228]:
def evaluate(model, val_loader, thresholds):
    num_classes = len(thresholds)
    model.eval()
    best_thresholds = [0.5] * num_classes
    best_scores = [0] * num_classes
    
    for i in range(num_classes):
        for threshold in thresholds:
            all_labels = []
            all_preds = []
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                preds = model.predict(inputs, thresholds=[threshold] * num_classes)
                all_labels.append(labels.cpu().numpy()[:, i])
                all_preds.append(preds.cpu().numpy()[:, i])
            all_labels = np.concatenate(all_labels, axis=0)
            all_preds = np.concatenate(all_preds, axis=0)
            score = f1_score(all_labels, all_preds)  # Use F1-score for evaluation
            if score > best_scores[i]:
                best_scores[i] = score
                best_thresholds[i] = threshold
    
    return best_thresholds, best_scores

In [218]:
def test(model, test_loader, thresholds, class_idx2MIDIClass):
    model.eval()
    all_labels = []
    all_preds = []
    
    # Collect predictions and true labels
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model.predict(inputs, thresholds)
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())
    
    # Concatenate all predictions and labels
    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    
    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    
    # Generate classification report
    target_names = [class_idx2MIDIClass[str(i)] for i in range(len(class_idx2MIDIClass))]
    report = classification_report(all_labels, all_preds, target_names=target_names, zero_division=0)
    
    # Print accuracy and classification report
    print(f'Accuracy: {accuracy:.4f}')
    print(report)
    
    return accuracy, report

In [219]:
batch_size = 16
num_epochs = 10
input_size = (13, 768)
num_classes = 9

# Instantiate the model, loss function, and optimizer
model = MultiClassClassifier(input_size, num_classes).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_dataset = EmbeddingDataset(train_embedding_label)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

train(model, criterion, optimizer, train_loader, num_epochs)

  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]


Epoch [1/10], Batch [1/938], Loss: 0.7474
Epoch [1/10], Batch [2/938], Loss: 0.6523
Epoch [1/10], Batch [3/938], Loss: 0.6986
Epoch [1/10], Batch [4/938], Loss: 0.6303
Epoch [1/10], Batch [5/938], Loss: 0.5842
Epoch [1/10], Batch [6/938], Loss: 0.6325
Epoch [1/10], Batch [7/938], Loss: 0.5874
Epoch [1/10], Batch [8/938], Loss: 0.5947
Epoch [1/10], Batch [9/938], Loss: 0.6132
Epoch [1/10], Batch [10/938], Loss: 0.6147
Epoch [1/10], Batch [11/938], Loss: 0.6220
Epoch [1/10], Batch [12/938], Loss: 0.6012
Epoch [1/10], Batch [13/938], Loss: 0.6176
Epoch [1/10], Batch [14/938], Loss: 0.6216
Epoch [1/10], Batch [15/938], Loss: 0.6081
Epoch [1/10], Batch [16/938], Loss: 0.5874
Epoch [1/10], Batch [17/938], Loss: 0.6045
Epoch [1/10], Batch [18/938], Loss: 0.6012
Epoch [1/10], Batch [19/938], Loss: 0.6152
Epoch [1/10], Batch [20/938], Loss: 0.6126
Epoch [1/10], Batch [21/938], Loss: 0.6012
Epoch [1/10], Batch [22/938], Loss: 0.6163
Epoch [1/10], Batch [23/938], Loss: 0.6150
Epoch [1/10], Batch 

In [220]:
torch.save(model.state_dict(), 'MERT_model_different_threshold.pth')

In [221]:
# Instantiate the model architecture
model = MultiClassClassifier(input_size, num_classes)

# Load the state dictionary
model.load_state_dict(torch.load('MERT_model_different_threshold.pth'))

# Move the model to the appropriate device
model.to(device)

  model.load_state_dict(torch.load('MERT_model_different_threshold.pth'))


MultiClassClassifier(
  (fc1): Linear(in_features=9984, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=9, bias=True)
  (sigmoid): Sigmoid()
)

In [222]:

# get all the validation audio file names
validation_audio_files = []
for root, dirs, files in os.walk(validation_audio_path):
    for file in files:
        if file.endswith('.npy'):
            validation_audio_files.append(file)

# for all the file in the dataset(under the validation audio path), store the audio-file name pair in a list
validation_audio = []
for file in range(len(validation_audio_files)):
    audio = np.load(validation_audio_path + validation_audio_files[file])
    validation_audio.append((validation_audio_files[file], audio))



In [223]:

# read the label file
with open(validation_label_path, 'r') as f:
    validation_label = json.load(f)

# for every key in the label file, find the corresponding label in train_audio, and append it in the tuple
validation_data = []
for key in validation_label:
    for audio in validation_audio:
        if key == audio[0]:
            validation_data.append((audio[0], audio[1], validation_label[key]))

In [224]:
# if GPU is available, use it, otherwise use CPU
device = torch.device("mps" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# torch.cuda.empty_cache()

# loading our model weights
# model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
MERT_model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).to(device)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M",trust_remote_code=True)

# happen to be 24kHz, the same as the dataset
resample_rate = processor.sampling_rate

# (label, embedding)
validation_embedding_label = []

# use tqdm to show the progress
# process the data in batches, or the kernel will die
# total: 3747

for(filename, audio, label) in tqdm(validation_data):
    input_audio = torch.tensor(audio).float().to(device)
    # input_audio = torch.tensor(audio).float()
    inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt").to(device)
    # inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
    with torch.no_grad():
        outputs = MERT_model(**inputs, output_hidden_states=True)
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze() # (13, 374, 768)
    time_reduced_hidden_states = all_layer_hidden_states.mean(-2) # (13, 768)
    
    validation_embedding_label.append((label, time_reduced_hidden_states))

# Convert tensors to lists
serializable_validation_embedding_label = [
    (label, embedding.tolist()) for label, embedding in validation_embedding_label
]

# save train_embedding_label
with open('validation_embedding_label.json', 'w') as f:
    json.dump(serializable_validation_embedding_label, f)

In [225]:
# read the label file back
# Load all JSON file
embedded_data_filename = ["validation_embedding_label"]

validation_embedding_label = []
for filename in embedded_data_filename:
    with open(filename, 'r') as f:
        loaded_validation_embedding_label = json.load(f)
        loaded_validation_embedding_label = [
            (label, torch.tensor(embedding)) for label, embedding in loaded_validation_embedding_label
        ]
        validation_embedding_label.extend(loaded_validation_embedding_label)

In [226]:
val_dataset = EmbeddingDataset(validation_embedding_label)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [229]:
# Evaluate the model on the validation set to find the best threshold
thresholds = np.arange(0.1, 1.0, 0.1)
best_threshold, best_score = evaluate(model, val_loader, thresholds)
print(f'Best Threshold: {best_threshold}, Best Score: {best_score}')

  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]
  embedding = torch.tensor(embedding, dtype=torch.fl

Best Threshold: [0.1, 0.5, 0.5, 0.1, 0.1, 0.1, 0.5, 0.1, 0.5], Best Score: [0.9224782067247821, 0, 0, 0.9315109642535296, 0.9726918075422627, 0.8211448918104629, 0, 0.6764447051921385, 0]


In [230]:
test_audio_files = []
for root, dirs, files in os.walk(test_audio_path):
    for file in files:
        if file.endswith('.npy'):
            test_audio_files.append(file)

# for all the file in the dataset(under the test audio path), store the audio-file name pair in a list
test_audio = []
for file in range(len(test_audio_files)):
    audio = np.load(test_audio_path + test_audio_files[file])
    test_audio.append((test_audio_files[file], audio))



In [231]:

# read the label file
with open(test_label_path, 'r') as f:
    test_label = json.load(f)

# for every key in the label file, find the corresponding label in train_audio, and append it in the tuple
test_data = []
for key in test_label:
    for audio in test_audio:
        if key == audio[0]:
            test_data.append((audio[0], audio[1], test_label[key]))

In [232]:
# if GPU is available, use it, otherwise use CPU
device = torch.device("mps" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# torch.cuda.empty_cache()

# loading our model weights
# model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
MERT_model = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).to(device)
# loading the corresponding preprocessor config
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-95M",trust_remote_code=True)

# happen to be 24kHz, the same as the dataset
resample_rate = processor.sampling_rate

# (label, embedding)
test_embedding_label = []

# use tqdm to show the progress
# process the data in batches, or the kernel will die
for(filename, audio, label) in tqdm(test_data):
    input_audio = torch.tensor(audio).float().to(device)
    # input_audio = torch.tensor(audio).float()
    inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt").to(device)
    # inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
    with torch.no_grad():
        outputs = MERT_model(**inputs, output_hidden_states=True)
    all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze() # (13, 374, 768)
    time_reduced_hidden_states = all_layer_hidden_states.mean(-2) # (13, 768)
    
    test_embedding_label.append((label, time_reduced_hidden_states))

# Convert tensors to lists
serializable_test_embedding_label = [
    (label, embedding.tolist()) for label, embedding in test_embedding_label
]

# save train_embedding_label
with open('test_embedding_label.json', 'w') as f:
    json.dump(serializable_test_embedding_label, f)

In [233]:
# read the label file back
# Load all JSON file
embedded_data_filename = ["test_embedding_label.json"]

test_embedding_label = []
for filename in embedded_data_filename:
    with open(filename, 'r') as f:
        loaded_test_embedding_label = json.load(f)
        loaded_test_embedding_label = [
            (label, torch.tensor(embedding)) for label, embedding in loaded_test_embedding_label
        ]
        test_embedding_label.extend(loaded_test_embedding_label)

In [234]:
test_dataset = EmbeddingDataset(test_embedding_label)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [235]:
# Test the model on the test set using the best threshold
test_metrics = test(model, test_loader, best_threshold, class_idx2MIDIClass)
print(f'Test Metrics: {test_metrics}')

# Set the best threshold in the model
model.threshold = best_threshold

  embedding = torch.tensor(embedding, dtype=torch.float32)  # Shape: [13, 768]


Accuracy: 0.1762
                  precision    recall  f1-score   support

           Piano       0.91      0.93      0.92      1889
      Percussion       0.00      0.00      0.00       243
           Organ       0.00      0.00      0.00       461
          Guitar       0.90      0.97      0.93      1943
            Bass       0.97      0.97      0.97      2076
         Strings       0.88      0.70      0.78      1235
           Voice       0.00      0.00      0.00       485
Wind Instruments       0.57      0.71      0.63       889
           Synth       0.00      0.00      0.00       647

       micro avg       0.87      0.73      0.79      9868
       macro avg       0.47      0.48      0.47      9868
    weighted avg       0.72      0.73      0.72      9868
     samples avg       0.87      0.73      0.78      9868

Test Metrics: (0.17623497997329773, '                  precision    recall  f1-score   support\n\n           Piano       0.91      0.93      0.92      1889\n      Percu

In [236]:
print(best_threshold)

[0.1, 0.5, 0.5, 0.1, 0.1, 0.1, 0.5, 0.1, 0.5]


In [238]:
torch.save(model.state_dict(), 'MERT_model_different_threshold.pth')
# save the threshold
with open('best_threshold.json', 'w') as f:
    json.dump(best_threshold, f)

In [148]:
# # Inference with the best threshold
# sample_input = torch.randn(1, *input_size).to(device)  # Ensure the input is on the same device as the model
# binary_output = model.predict(sample_input)
# print(binary_output)