In [1]:
import os
import torchaudio
import librosa
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import torch

In [2]:
SAMPLE_RATE = 16000
N_MFCC = 13
N_FFT = 400
HOP_LENGTH = 160
N_MELS = 128
DATA_PATH = '/scratch/as20482/ML_Final_Proj/AudioSet-classification/Data'

In [3]:
def extract_mfcc_features(waveforms):
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=N_MFCC,
        melkwargs={"n_fft": N_FFT, "hop_length": HOP_LENGTH, "n_mels": N_MELS}
    )
    return mfcc_transform(waveforms).numpy()

In [4]:
def extract_mel_spectrogram_features(waveforms):
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS
    )
    return mel_spectrogram_transform(waveforms).numpy()

In [5]:
def extract_chroma_features(waveforms):
    def process_waveform(waveform):
        return librosa.feature.chroma_stft(y=waveform.numpy(), sr=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH)
    with ThreadPoolExecutor() as executor:
        chroma_features = list(executor.map(process_waveform, waveforms))
    return np.array(chroma_features)

In [6]:
def extract_zero_crossing_rate(waveform, sample_rate):
    with ThreadPoolExecutor() as executor:
        zero_crossing_rate = list(executor.map(librosa.feature.zero_crossing_rate, waveforms))
    return np.array(zero_crossing_rate)

In [7]:
data = torch.load(os.path.join(DATA_PATH, 'train', 'resampled_waveforms.pt'))

  data = torch.load(os.path.join(DATA_PATH, 'train', 'resampled_waveforms.pt'))


In [8]:
labels = torch.load(os.path.join(DATA_PATH, 'train', 'labels.pt'))

  labels = torch.load(os.path.join(DATA_PATH, 'train', 'labels.pt'))


In [9]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [32]:
forest = RandomForestClassifier(random_state=1, verbose=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=8)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', multi_target_forest)
], verbose=True)

In [22]:
X_tr = extract_mfcc_features(data).squeeze()

In [10]:
y_tr = labels.numpy()

In [28]:
X_tr_mean = X_tr.mean(axis=-1)

In [None]:
pipeline.fit(X_tr_mean, y_tr)

In [11]:
from sklearn.metrics import average_precision_score

In [27]:
def calculate_map(y_true, y_pred):
    n_classes = y_true.shape[1]
    average_precisions = []
    for i in range(n_classes):
        y_true_class = y_true[:, i]
        y_pred_class = y_pred[:, i]
        ap = average_precision_score(y_true_class, y_pred_class)
        average_precisions.append(ap)
    average_precisions.sort(reverse=True)
    print(average_precisions[:5])
    return np.mean(average_precisions)

In [None]:
y_tr_mean_pred = pipeline.predict(X_tr_mean)

In [41]:
calculate_map(y_tr, y_tr_mean_pred)

np.float64(0.9972722680120194)

In [42]:
average_precision_score(y_tr, y_tr_mean_pred)

np.float64(0.9972722680120194)

In [13]:
test_data = torch.load(os.path.join(DATA_PATH, 'test', 'resampled_waveforms.pt'))

  test_data = torch.load(os.path.join(DATA_PATH, 'test', 'resampled_waveforms.pt'))


In [14]:
test_labels = torch.load(os.path.join(DATA_PATH, 'test', 'labels.pt'))

  test_labels = torch.load(os.path.join(DATA_PATH, 'test', 'labels.pt'))


In [54]:
X_te = extract_mfcc_features(test_data).squeeze()



In [15]:
y_te = test_labels.numpy()

In [55]:
X_te_mean = X_te.mean(axis=-1)

In [None]:
y_te_mean_pred = pipeline.predict(X_te_mean)

In [57]:
calculate_map(y_te, y_te_mean_pred)

np.float64(0.006678865944385803)

In [58]:
average_precision_score(y_te, y_te_mean_pred)

np.float64(0.006678865944385803)

In [59]:
X_tr_max = X_tr.max(axis=-1)

In [62]:
forest2 = RandomForestClassifier(random_state=1)
multi_target_forest2 = MultiOutputClassifier(forest2, n_jobs=8)
pipeline2 = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', multi_target_forest2)
])

In [63]:
pipeline2.fit(X_tr_max, y_tr)

In [64]:
X_te_max = X_te.max(axis=-1)

In [65]:
y_te_max_pred = pipeline2.predict(X_te_max)

In [66]:
calculate_map(y_te, y_te_max_pred)

np.float64(0.0062313718063227285)

In [18]:
X_mel_tr = extract_mel_spectrogram_features(data).squeeze()



In [15]:
X_mel_tr.shape

(20550, 128, 1002)

In [16]:
X_mel_tr_mean = X_mel_tr.mean(axis=-1)

In [18]:
forest3 = RandomForestClassifier(random_state=1)
multi_target_forest3 = MultiOutputClassifier(forest3, n_jobs=8)
pipeline3 = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', multi_target_forest3)
])

In [21]:
pipeline3.fit(X_mel_tr_mean, y_tr)

In [19]:
X_mel_te = extract_mel_spectrogram_features(test_data).squeeze()

In [24]:
X_mel_te_mean = X_mel_te.mean(axis=-1)

In [25]:
y_mel_te_mean_pred = pipeline3.predict(X_mel_te_mean)

In [26]:
calculate_map(y_te, y_mel_te_mean_pred)

np.float64(0.007762111188551886)

In [16]:
from sklearn.neural_network import MLPClassifier

In [17]:
mlp = MLPClassifier(hidden_layer_sizes=(50, 30),
                    activation='relu',
                    solver='adam',
                    max_iter=500,
                    random_state=1)

In [20]:
X_mel_tr_flat = X_mel_tr.reshape(X_mel_tr.shape[0],-1)

In [21]:
X_mel_te_flat = X_mel_te.reshape(X_mel_te.shape[0],-1)

In [22]:
mlp.fit(X_mel_tr_flat, y_tr)

In [23]:
y_mel_nn_te_flat_pred = mlp.predict(X_mel_te_flat)

In [28]:
calculate_map(y_te, y_mel_nn_te_flat_pred)

[np.float64(0.27353595255744995), np.float64(0.25913374986762683), np.float64(0.05781987752285035), np.float64(0.03939426029863391), np.float64(0.034311129937519856)]


np.float64(0.004938648791470139)