### Task2: Traditional ML Model

- Train a traditional ML model (e.g. k-NN, SVM, random forest) with any features
extracted from the audio

- Clearly report how the model is implemented

- Report the testing results (not the validation results), including the confusion matrix,
top-1 accuracy, and top-3 accuracy

- Remember to utilize standardization (e.g. mean, std), pooling, and normalization
to ensure consistent feature scales, reduce overfitting, and improve model
stability and performance during training

In [20]:
# Training dataset directory. Later cells read `examples.json` and the
# `audio/` folder from under this path.
# NOTE: the variable name is misspelled ("traning"), but later cells refer to
# it by this exact name, so it is kept as is.
traning_data_path = '<PUT THE PATH TO THE TRAINING DATA HERE>'

# Overrides the placeholder above with the directory used for this run.
traning_data_path = 'nsynth-subtrain'

In [21]:
# validation dataset file path:
# NOTE: the commented-out validation code at the end of the file uses the
# hard-coded 'nsynth-valid' directory rather than this variable.
validation_data_path = '<PUT THE PATH TO THE VALIDATION DATA HERE>'

# The validation code is commented out at the end of the file.

In [22]:
import sklearn
import numpy as np
import os
import json
import librosa
import torchaudio
import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

import joblib

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix



In [23]:
def load_json(json_file):
    """Read a JSON file and return its parsed content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (a dict for the
        ``examples.json`` metadata file used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so the same file parses identically on every machine.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)

# The metadata file `examples.json` is expected directly under `traning_data_path`.
json_path = os.path.join(traning_data_path, 'examples.json')

# `data` maps each example key (also used as the audio file stem in later
# cells) to that example's metadata dictionary.
data = load_json(json_path)

In [24]:
# Materialize the example keys once so features and labels can be built in
# the same order.
keys = [*data.keys()]

In [25]:
def feature_extraction(key, file_path):
    """Compute the frame-level features of one audio file.

    The file is loaded with ``librosa.load`` using its default arguments,
    then spectral contrast and MFCCs are computed from the waveform.

    Args:
        key: Example identifier. Not used by the computation; kept so the
            call sites do not change.
        file_path: Path of the audio file to load.

    Returns:
        A two-element list ``[spectral_contrast, mfcc]``. For the first
        training file the notebook output shows shapes ``(7, 173)`` and
        ``(20, 173)`` respectively.
    """
    waveform, sample_rate = librosa.load(file_path)

    contrast_matrix = librosa.feature.spectral_contrast(y=waveform, sr=sample_rate)
    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=sample_rate)

    # Order matters: downstream cells index [0] as contrast and [1] as MFCC.
    return [contrast_matrix, mfcc_matrix]

In [26]:
# Run feature extraction over every training example, in `keys` order, so
# that `features[i]` corresponds to `keys[i]`.
features = []

for key in tqdm.tqdm(keys):
    # Audio files live in the `audio` sub-folder and are named `<key>.wav`.
    wav_path = os.path.join(traning_data_path, 'audio', f'{key}.wav')
    features.append(feature_extraction(key, wav_path))

100%|██████████| 48037/48037 [13:24<00:00, 59.69it/s]


In [27]:
# Sanity check on the first extracted entry only: number of feature groups,
# then the shape of the spectral contrast and MFCC matrices.
for i in features[:1]:
    print(len(i))
    print(i[0].shape)
    print(i[1].shape)

2
(7, 173)
(20, 173)


In [28]:
# Collapse each per-clip 2-D feature matrix into a 1-D vector so every clip
# becomes one row for the scalers and the classifier.
spectral_contrast_features = [contrast_matrix.flatten() for contrast_matrix, _ in features]
mfcc_features = [mfcc_matrix.flatten() for _, mfcc_matrix in features]

In [29]:

# Standardize each feature group with its own scaler, fitted on the training
# features only.
scaler_spectral_contrast = StandardScaler().fit(spectral_contrast_features)
scaler_mfcc = StandardScaler().fit(mfcc_features)

spectral_contrast_normalized = scaler_spectral_contrast.transform(spectral_contrast_features)
mfcc_normalized = scaler_mfcc.transform(mfcc_features)

# Persist the fitted scalers so the same statistics can be reapplied to
# other data splits later.
joblib.dump(scaler_spectral_contrast, 'scaler_spectral_contrast.pkl')
joblib.dump(scaler_mfcc, 'scaler_mfcc.pkl')

['scaler_mfcc.pkl']

In [30]:
# Place the two standardized groups side by side: one row per clip, spectral
# contrast columns first, MFCC columns after.
features_combined = np.hstack((spectral_contrast_normalized, mfcc_normalized))

In [31]:
# Print the shape of the combined feature matrix:
# (number of clips, number of feature values per clip).
print(features_combined.shape)

(48037, 4671)


In [32]:
# Instrument-family label of every example, in the same order as `keys`
# (and therefore as the rows of the feature matrix).
labels = [data[key]["instrument_family_str"] for key in keys]


In [33]:
# Spot-check one label. The index 40000 appears to be an arbitrary position;
# this raises IndexError if there are fewer than 40001 examples.
print(labels[40000])

synth_lead


In [34]:
# Fit a k-nearest-neighbours classifier (k = 3) on the standardized training
# features and their instrument-family labels.
# NOTE(review): the commented-out top-3 accuracy code ranks `predict_proba`
# outputs; with only 3 neighbours (and default uniform weights) at most 3
# classes can receive a non-zero probability. Confirm this is what is
# intended for the top-3 metric.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(features_combined, labels)

In [35]:
# Save the fitted model to disk so it can be restored with `joblib.load`
# without retraining.
joblib.dump(knn, 'knn_model.pkl')


['knn_model.pkl']

In [36]:
# # get the knn model
# knn = joblib.load('knn_model.pkl')

In [37]:
# # validate the model
# val_data = load_json('nsynth-valid/examples.json')
# val_keys = list(val_data.keys())

# val_features = []
# for key in tqdm.tqdm(val_keys):
#     file = 'nsynth-valid/audio/' + key + '.wav'
#     feature = feature_extraction(key, file)
#     val_features.append(feature)

# val_spectral_contrast_features = [f[0].flatten() for f in val_features]
# val_mfcc_features = [f[1].flatten() for f in val_features]
# # normalize
# val_spectral_contrast_normalized = scaler_spectral_contrast.transform(val_spectral_contrast_features)
# val_mfcc_normalized = scaler_mfcc.transform(val_mfcc_features)
# val_features_combined = np.concatenate((val_spectral_contrast_normalized, val_mfcc_normalized), axis=1)

# val_labels = []
# for key in val_keys:
#     val_labels.append(val_data[key]["instrument_family_str"])

# # predict the labels of the validation data
# val_pred = knn.predict(val_features_combined)

# # calculate the accuracy
# accuracy = accuracy_score(val_labels, val_pred)
# print(accuracy)

# # precision, recall, f1-score
# print(classification_report(val_labels, val_pred))

In [38]:

# # val_labels and val_pred are the true and predicted labels from the validation cell
# # Compute the confusion matrix with an explicit class order so the tick labels line up
# cm = confusion_matrix(val_labels, val_pred, labels=knn.classes_)

# # Plot confusion matrix using seaborn's heatmap
# plt.figure(figsize=(8, 6))  # Set the figure size
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, 
#             xticklabels=knn.classes_,
#             yticklabels=knn.classes_)
#             # xticklabels=['Predicted Class 0', 'Predicted Class 1'], 
#             # yticklabels=['Actual Class 0', 'Actual Class 1'])

# # Add labels and title
# plt.ylabel('True label')
# plt.xlabel('Predicted label')
# plt.title('Confusion Matrix')

# # Show the plot
# plt.show()



In [39]:
# # top 1 accuracy
# top1 = 0
# for i in range(len(val_labels)):
#     if val_labels[i] == val_pred[i]:
#         top1 += 1
# top1 /= len(val_labels)
# print(top1)

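# # top 3 accuracy
# # compute the class probabilities once; calling predict_proba inside the loop
# # would repeat the full prediction for every sample
# val_proba = knn.predict_proba(val_features_combined)
# top3 = 0
# for i in range(len(val_labels)):
#     if val_labels[i] in knn.classes_[np.argsort(val_proba[i])[-3:]]:
#         top3 += 1
# top3 /= len(val_labels)
# print(top3)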