### Task2: Traditional ML Model

- Train a traditional ML model (e.g. k-NN, SVM, random forest) with any features
extracted from the audio

- Clearly report how the model is implemented

- Report the test results (not the validation results), including the confusion matrix,
top-1 accuracy, and top-3 accuracy

- Remember to apply standardization (e.g. mean, std), pooling, and normalization
to ensure consistent feature scales, reduce overfitting, and improve model
stability and performance during training

In [2]:
import sklearn
import numpy as np
import os
import json
import librosa
import torchaudio

In [3]:
# load the json file
def load_json(json_file):
    """Read a JSON document from disk and return the parsed object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file's top-level element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # JSON text is UTF-8 by specification; pinning the encoding keeps the
    # result independent of the platform's locale default.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)

data = load_json('nsynth-subtrain/examples.json')

# Collect the distinct instrument-family names found in the training metadata.
instrument_family_str = {entry["instrument_family_str"] for entry in data.values()}

print(instrument_family_str)

{'bass', 'mallet', 'reed', 'string', 'synth_lead', 'keyboard', 'vocal', 'brass', 'flute', 'organ', 'guitar'}


In [4]:
# Freeze the sample identifiers in a list; the feature and label loops below
# both iterate over it, so they share one ordering.
keys = list(data)

In [5]:
# def feature_extraction(key, file_path):
#     # feature extraction from audio file
#     y, sr = librosa.load(file_path)
#     # extract the chroma feature
#     chroma = librosa.feature.chroma_stft(y=y, sr=sr)
#     # extract the spectral contrast feature
#     spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
#     # extract the Mel-frequency Cepstral Coefficients
#     mfcc = librosa.feature.mfcc(y=y, sr=sr)
#     # extract the Zero-Crossing Rate
#     zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
#     # extract the Spectral Centroid
#     spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

#     # put all features into a list
#     features = [chroma, spectral_contrast, mfcc, zero_crossing_rate, spectral_centroid]

#     return features
    

def feature_extraction(key, file_path):
    """Compute spectral-contrast and MFCC matrices for one audio file.

    Args:
        key: Identifier of the sample. It is accepted for symmetry with the
            callers but is not used inside this function.
        file_path: Path of the audio file, passed to ``librosa.load`` with
            that function's default settings.

    Returns:
        A two-element list ``[spectral_contrast, mfcc]`` of 2-D arrays. The
        first training clip inspected in this notebook produced shapes
        ``(7, 173)`` and ``(20, 173)``.
    """
    waveform, sample_rate = librosa.load(file_path)

    contrast = librosa.feature.spectral_contrast(y=waveform, sr=sample_rate)
    cepstral = librosa.feature.mfcc(y=waveform, sr=sample_rate)

    return [contrast, cepstral]

In [6]:
import tqdm

# Walk every training clip (with a progress bar) and collect its
# [spectral_contrast, mfcc] pair, in the same order as `keys`.
features = []
for key in tqdm.tqdm(keys):
    file = f'nsynth-subtrain/audio/{key}.wav'
    features.append(feature_extraction(key, file))

 14%|█▍        | 6804/48037 [01:35<09:36, 71.50it/s]


KeyboardInterrupt: 

In [6]:
# Sanity check: show how many feature matrices the first sample carries and
# the shape of each one. Nothing is printed when no features were extracted.
if features:
    first_sample = features[0]
    print(len(first_sample))
    print(first_sample[0].shape, first_sample[1].shape, sep="\n")

2
(7, 173)
(20, 173)


In [7]:
# Collapse each 2-D feature matrix into a single 1-D vector per clip.
spectral_contrast_features = [contrast.flatten() for contrast, _ in features]
mfcc_features = [mfcc.flatten() for _, mfcc in features]

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature family separately. The scalers are fitted on the
# training vectors only and kept, so the validation and test cells can apply
# the same statistics.
scaler_spectral_contrast = StandardScaler().fit(spectral_contrast_features)
spectral_contrast_normalized = scaler_spectral_contrast.transform(spectral_contrast_features)

scaler_mfcc = StandardScaler().fit(mfcc_features)
mfcc_normalized = scaler_mfcc.transform(mfcc_features)

In [9]:
# Join the two standardized blocks column-wise: one row per clip.
features_combined = np.hstack((spectral_contrast_normalized, mfcc_normalized))

In [10]:
# Report the shape of the combined training matrix: (number of clips,
# number of feature columns after concatenating both feature families).
print(features_combined.shape)

(48037, 4671)


In [None]:
# Persist the combined, standardized training matrix to 'features.npy' in the
# current working directory. NOTE(review): no cell visible here loads this
# file back, so re-running the notebook still repeats the extraction.
np.save('features.npy', features_combined)

In [11]:
# Integer instrument-family id of every training clip, in the order of `keys`
# (and therefore aligned with the rows of the feature matrix).
labels = [data[key]["instrument_family"] for key in keys]


In [1]:
# Inputs prepared by the earlier cells:
#   features_combined - one standardized feature row per training clip
#   labels            - integer instrument-family id of each clip
#   keys              - clip identifiers, in the same order as the rows

# A linear-kernel SVM is kept here as a disabled alternative:
# from sklearn.svm import SVC
# model = SVC(kernel='linear')
# model.fit(features_combined, labels)

# Fit a 3-nearest-neighbour classifier on the complete training split.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3).fit(features_combined, labels)
knn

NameError: name 'features_combined' is not defined

In [None]:
# Save the fitted k-NN classifier to 'knn_model.pkl' in the current working
# directory so it can be restored later without refitting.
import joblib
joblib.dump(knn, 'knn_model.pkl')


In [None]:
# # get the knn model
# knn = joblib.load('knn_model.pkl')

In [17]:
# Evaluate the fitted classifier on the validation split.
from sklearn.metrics import accuracy_score, classification_report

val_data = load_json('nsynth-valid/examples.json')
val_keys = list(val_data)

# Same extraction as for training: one [spectral_contrast, mfcc] pair per clip.
val_features = []
for key in tqdm.tqdm(val_keys):
    file = f'nsynth-valid/audio/{key}.wav'
    val_features.append(feature_extraction(key, file))

val_spectral_contrast_features = [contrast.flatten() for contrast, _ in val_features]
val_mfcc_features = [mfcc.flatten() for _, mfcc in val_features]

# Apply the scalers fitted on the training data (transform only, no refit).
val_spectral_contrast_normalized = scaler_spectral_contrast.transform(val_spectral_contrast_features)
val_mfcc_normalized = scaler_mfcc.transform(val_mfcc_features)
val_features_combined = np.hstack((val_spectral_contrast_normalized, val_mfcc_normalized))

# Ground-truth family ids, aligned with the rows of val_features_combined.
val_labels = [val_data[key]["instrument_family"] for key in val_keys]

val_pred = knn.predict(val_features_combined)

# Overall accuracy first, then per-class precision / recall / F1.
accuracy = accuracy_score(val_labels, val_pred)
print(accuracy)

print(classification_report(val_labels, val_pred))

100%|██████████| 12678/12678 [03:08<00:00, 67.16it/s]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# The labels are integer family ids, and confusion_matrix orders its rows and
# columns by those ids. Build an id -> name lookup from the training metadata
# so each tick shows the name that belongs to its row/column. (A plain set of
# names has no defined order and must not be used for tick labels.)
family_names = {
    entry["instrument_family"]: entry["instrument_family_str"]
    for entry in data.values()
}
class_ids = sorted(family_names)
tick_labels = [family_names[class_id] for class_id in class_ids]

# Passing `labels` pins the matrix to one row/column per training family, even
# if some family is absent from this split's true or predicted labels.
cm = confusion_matrix(val_labels, val_pred, labels=class_ids)

# Plot the confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=tick_labels,
            yticklabels=tick_labels)

# Rows are the true classes, columns the predicted classes.
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')

plt.show()



In [None]:
# Top-1 accuracy: fraction of validation clips whose predicted family equals
# the true family.
val_true = np.asarray(val_labels)
top1 = float(np.mean(val_true == np.asarray(val_pred)))
print(top1)

# Top-3 accuracy: fraction of clips whose true family is among the three
# classes with the highest predicted probability.
# predict_proba scores the whole split in one call, so it is computed once
# here instead of once per sample.
val_proba = knn.predict_proba(val_features_combined)

# Column indices of the three largest probabilities in every row. The stable
# sort makes tie-breaking deterministic: among equal probabilities the classes
# with the highest column index are kept.
# NOTE: the classifier is created with n_neighbors=3, so a row has at most
# three non-zero probabilities; when fewer than three classes receive votes,
# the remaining top-3 slots are filled by zero-probability classes.
top3_columns = np.argsort(val_proba, axis=1, kind="stable")[:, -3:]
top3_hits = (knn.classes_[top3_columns] == val_true[:, None]).any(axis=1)
top3 = float(np.mean(top3_hits))
print(top3)

In [None]:
# Evaluate the fitted classifier on the held-out test split.
test_data = load_json('nsynth-test/examples.json')
test_keys = list(test_data)

# Same extraction as for training: one [spectral_contrast, mfcc] pair per clip.
test_features = []
for key in tqdm.tqdm(test_keys):
    file = f'nsynth-test/audio/{key}.wav'
    test_features.append(feature_extraction(key, file))

test_spectral_contrast_features = [contrast.flatten() for contrast, _ in test_features]
test_mfcc_features = [mfcc.flatten() for _, mfcc in test_features]

# Apply the scalers fitted on the training data (transform only, no refit).
test_spectral_contrast_normalized = scaler_spectral_contrast.transform(test_spectral_contrast_features)
test_mfcc_normalized = scaler_mfcc.transform(test_mfcc_features)
test_features_combined = np.hstack((test_spectral_contrast_normalized, test_mfcc_normalized))

# Ground-truth family ids, aligned with the rows of test_features_combined.
test_labels = [test_data[key]["instrument_family"] for key in test_keys]

test_pred = knn.predict(test_features_combined)

# Overall accuracy.
accuracy = accuracy_score(test_labels, test_pred)
print(accuracy)

# Per-class precision / recall / F1, followed by the raw confusion matrix.
print(classification_report(test_labels, test_pred))
print(confusion_matrix(test_labels, test_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# The labels are integer family ids, and confusion_matrix orders its rows and
# columns by those ids. Build an id -> name lookup from the training metadata
# so each tick shows the name that belongs to its row/column. (A plain set of
# names has no defined order and must not be used for tick labels.)
family_names = {
    entry["instrument_family"]: entry["instrument_family_str"]
    for entry in data.values()
}
class_ids = sorted(family_names)
tick_labels = [family_names[class_id] for class_id in class_ids]

# Confusion matrix of the test split (test_labels vs. test_pred). Passing
# `labels` pins it to one row/column per training family, even if a family
# never occurs in the test labels or predictions.
cm = confusion_matrix(test_labels, test_pred, labels=class_ids)

# Plot the confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=tick_labels,
            yticklabels=tick_labels)

# Rows are the true classes, columns the predicted classes.
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')

plt.show()



In [None]:
# Top-1 accuracy: fraction of test clips whose predicted family equals the
# true family.
test_true = np.asarray(test_labels)
top1 = float(np.mean(test_true == np.asarray(test_pred)))
print(top1)

# Top-3 accuracy: fraction of clips whose true family is among the three
# classes with the highest predicted probability.
# predict_proba scores the whole split in one call, so it is computed once
# here instead of once per sample.
test_proba = knn.predict_proba(test_features_combined)

# Column indices of the three largest probabilities in every row. The stable
# sort makes tie-breaking deterministic: among equal probabilities the classes
# with the highest column index are kept.
# NOTE: the classifier is created with n_neighbors=3, so a row has at most
# three non-zero probabilities; when fewer than three classes receive votes,
# the remaining top-3 slots are filled by zero-probability classes.
top3_columns = np.argsort(test_proba, axis=1, kind="stable")[:, -3:]
top3_hits = (knn.classes_[top3_columns] == test_true[:, None]).any(axis=1)
top3 = float(np.mean(top3_hits))
print(top3)