# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

* This notebook evaluates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on both audio and spectrograms.

In [9]:
import time
import os

import IPython.display as ipd
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier

import utils

In [10]:
# Directory holding the audio files, read from the environment
# (None when the AUDIO_DIR variable is not set).
AUDIO_DIR = os.getenv('AUDIO_DIR')

# Read the four metadata / feature tables through the project helper
# `utils.load` (presumably it takes care of the FMA-specific CSV layout such
# as multi-level headers -- its implementation is not part of this notebook).
tracks   = utils.load('data/fma_metadata/tracks.csv')
genres   = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

# Sanity checks: `features` must be indexed exactly like `tracks`, and every
# Echonest row must refer to a known track.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Summarize the size of each table (rows x columns).
print("\n".join([
    "tracks shape:   {} (tracks x metadata columns)".format(tracks.shape),
    "features shape: {} (tracks x audio features)".format(features.shape),
    "echonest shape: {} (tracks x Echonest features)".format(echonest.shape),
    "genres shape:   {} (genres x genre metadata)".format(genres.shape),
]))

tracks shape:   (106574, 52) (tracks x metadata columns)
features shape: (106574, 518) (tracks x audio features)
echonest shape: (13129, 249) (tracks x Echonest features)
genres shape:   (163, 4) (genres x genre metadata)


## Subset

In [11]:
# Select the tracks of the "small" subset. The ('set', 'subset') column is
# compared with `<=`, so it is presumably an ordered categorical
# (small < medium < ...) -- TODO confirm against utils.load.
# Compare against 'medium' instead to also include the medium subset.
subset = tracks.index[tracks['set', 'subset'] <= 'small']

# Every selected track must exist in both the metadata and the feature table.
assert subset.isin(tracks.index).all(), "Subset contains tracks not found in the tracks DataFrame."
assert subset.isin(features.index).all(), "Subset contains tracks not found in the features DataFrame."

# Informational only: size of the feature table restricted to tracks that also
# have Echonest data (inner join), columns sorted. The result gets its own name
# so that it is not mistaken for the `features_all` table used for training.
features_echonest = features.join(echonest, how='inner').sort_index(axis=1)
print('Joined features (features + Echonest) shape: {}'.format(features_echonest.shape))

# Restrict the metadata and the (non-Echonest) features to the selected subset.
# NOTE: `features_all` deliberately contains no Echonest columns; filter
# `features_echonest` instead if those are needed.
tracks = tracks.loc[subset]
features_all = features.loc[subset]

# Final shapes, for verification.
print("Subset tracks shape: {} (tracks x metadata)".format(tracks.shape))
print("Subset features shape: {} (tracks x audio features)".format(features_all.shape))


Joined features (features + Echonest) shape: (13129, 767)
Subset tracks shape: (8000, 52) (tracks x metadata)
Subset features shape: (8000, 518) (tracks x audio features)


In [12]:
# Index of the tracks belonging to each official split, read from the
# ('set', 'split') column of the metadata.
train, val, test = (
    tracks.index[tracks['set', 'split'] == part]
    for part in ('training', 'validation', 'test')
)

print('{} training examples, {} validation examples, {} testing examples'.format(
    len(train), len(val), len(test)
))

# --- Top genres (single-label) -------------------------------------------------
# LabelEncoder collects the sorted unique values of ('track', 'genre_top').
top_genre_encoder = LabelEncoder().fit(tracks['track', 'genre_top'])
top_genres_list = list(top_genre_encoder.classes_)

print('Top genres ({}): {}'.format(len(top_genres_list), top_genres_list))

# --- All genres (multi-label) --------------------------------------------------
# ('track', 'genres_all') holds a collection of genre IDs per track;
# MultiLabelBinarizer gathers every distinct ID. The IDs are then translated to
# titles through the `genres` table, falling back to "Unknown(<id>)".
genre_map = genres['title'].to_dict()

mlb = MultiLabelBinarizer().fit(tracks['track', 'genres_all'])
all_genre_ids = list(mlb.classes_)  # numeric IDs (np.int64)
all_genres_names = [
    genre_map[int(genre_id)] if int(genre_id) in genre_map else f"Unknown({genre_id})"
    for genre_id in all_genre_ids
]

print('All genres ({}): {}'.format(len(all_genres_names), all_genres_names))


6400 training examples, 800 validation examples, 800 testing examples
Top genres (8): ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']
All genres (114): ['Avant-Garde', 'International', 'Novelty', 'Pop', 'Rock', 'Electronic', 'Sound Effects', 'Folk', 'Soundtrack', 'Hip-Hop', 'Audio Collage', 'Punk', 'Post-Rock', 'Lo-Fi', 'Field Recordings', 'Metal', 'Noise', 'Psych-Folk', 'Krautrock', 'Experimental', 'Electroacoustic', 'Ambient Electronic', 'Loud-Rock', 'Latin America', 'Drone', 'Free-Folk', 'Noise-Rock', 'Psych-Rock', 'Electro-Punk', 'Indie-Rock', 'Industrial', 'No Wave', 'Experimental Pop', 'French', 'Reggae - Dub', 'Afrobeat', 'Nerdcore', 'Garage', 'Indian', 'New Wave', 'Post-Punk', 'Sludge', 'African', 'Freak-Folk', 'Progressive', 'Alternative Hip-Hop', 'Death-Metal', 'Middle East', 'Singer-Songwriter', 'Ambient', 'Hardcore', 'Power-Pop', 'Space-Rock', 'Polka', 'Balkan', 'Unclassifiable', 'Europe', 'Black-Metal', 'Brazilian', 'Asia-F

## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [13]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    """Build encoded labels and standardized feature matrices for each split.

    The split membership comes from the module-level `train`, `val` and `test`
    indices defined in an earlier cell; they are not parameters of this
    function.

    Parameters
    ----------
    tracks : DataFrame
        Track metadata. Reads ('track', 'genre_top') for single-label targets
        or ('track', 'genres_all') for multi-label targets.
    features : DataFrame
        Feature table indexed like `tracks`.
    columns : label or list of labels
        Column selection passed to `features.loc[rows, columns]`.
    multi_label : bool, default False
        False: targets are integer-encoded with LabelEncoder.
        True: targets are an indicator matrix built with MultiLabelBinarizer.
    verbose : bool, default False
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    y_train, y_val, y_test, X_train, X_val, X_test
        Encoded labels and standardized feature arrays. The training pair is
        shuffled (random_state=42); validation and test keep their order.
    """
    # 1. Pick the label column and its encoder.
    if multi_label:
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
    else:
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']

    # 2. Encode the labels; the encoder is fitted on the training split only.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])

    # 3. Extract the requested feature columns as NumPy arrays.
    X_train = features.loc[train, columns].to_numpy()
    X_val = features.loc[val, columns].to_numpy()
    X_test = features.loc[test, columns].to_numpy()

    # 4. Shuffle the training examples reproducibly.
    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # 5. Standardize (zero mean, unit variance) using training statistics.
    #    The transformed arrays are assigned explicitly instead of relying on
    #    StandardScaler(copy=False) modifying its input in place: in-place
    #    scaling is skipped whenever scikit-learn has to copy the input (e.g.
    #    non-float dtype) and fails on read-only arrays.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # 6. Same return order as before: labels first, then features.
    return y_train, y_val, y_test, X_train, X_val, X_test


### 1.2 Single genre

In [14]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    """Fit and score every classifier on every feature set.

    Uses the module-level `tracks` and `features_all` tables and the
    `pre_process` function defined earlier in the notebook.

    Parameters
    ----------
    classifiers : dict
        Maps a short name to an estimator exposing `fit` and `score`.
        Estimators are fitted in place and re-fitted for each feature set.
    feature_sets : dict
        Maps a short name to the column selection handed to `pre_process`.
    multi_label : bool, default False
        Forwarded to `pre_process`.

    Returns
    -------
    scores : DataFrame
        One row per feature set; column 'dim' holds the number of features,
        followed by one test-set score column per classifier.
    times : DataFrame
        Same layout without 'dim'; CPU time (time.process_time) spent fitting
        and scoring each classifier.
    """
    clf_names = list(classifiers.keys())
    fset_names = list(feature_sets.keys())
    # list.insert() returns None, so build the column list by concatenation to
    # actually get 'dim' followed by the classifier names.
    scores = pd.DataFrame(columns=['dim'] + clf_names, index=fset_names)
    times = pd.DataFrame(columns=clf_names, index=fset_names)
    for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            # The recorded time covers both fitting and scoring.
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [22]:
# Classifiers compared on the single-genre task, keyed by the short name used
# as column header in the result tables.
classifiers = dict(
    LR=LogisticRegression(max_iter=500),
    kNN=KNeighborsClassifier(n_neighbors=200),
    SVCrbf=SVC(kernel='rbf'),
    SVCpoly1=SVC(kernel='poly', degree=1),
    linSVC1=SVC(kernel="linear"),
    linSVC2=LinearSVC(),
    DT=DecisionTreeClassifier(max_depth=5),
    RF=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoost=AdaBoostClassifier(n_estimators=10),
    MLP1=MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    MLP2=MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    NB=GaussianNB(),
    QDA=QuadraticDiscriminantAnalysis(),
)

# Reduced selection for quick experiments (independent estimator instances;
# not used by the run below).
single_classifier = dict(
    LR=LogisticRegression(max_iter=500),
    kNN=KNeighborsClassifier(n_neighbors=200),
    SVCrbf=SVC(kernel='rbf'),
    SVCpoly1=SVC(kernel='poly', degree=1),
)

# Combinations of feature groups; also usable on their own as a reduced
# collection of feature sets (not used by the run below).
small_feature_sets = {
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

# Full collection: each top-level feature group on its own, then the
# combinations above (as separate list objects), then all groups together.
# Echonest-based sets are not included.
feature_sets = {}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({key: list(groups) for key, groups in small_feature_sets.items()})
feature_sets['all_non-echonest'] = list(features.columns.levels[0])

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

features:   0%|          | 0/18 [00:00<?, ?it/s]



Unnamed: 0,dim,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
chroma_cens,84.0,25.00%,22.88%,32.00%,26.25%,26.25%,25.25%,18.25%,21.50%,14.88%,26.25%,25.37%,18.62%,23.75%
chroma_cqt,84.0,28.12%,22.25%,29.25%,26.88%,28.62%,27.25%,22.38%,24.12%,17.75%,26.50%,27.00%,17.25%,14.37%
chroma_stft,84.0,33.25%,30.88%,37.62%,32.75%,32.75%,34.62%,26.50%,29.12%,22.12%,32.00%,30.63%,16.00%,15.25%
mfcc,140.0,41.00%,36.88%,46.38%,42.50%,41.62%,43.12%,29.25%,37.00%,20.88%,38.75%,38.38%,36.00%,39.50%
rmse,7.0,21.00%,21.00%,23.12%,21.75%,21.75%,23.38%,25.37%,22.50%,19.50%,22.12%,24.38%,19.50%,17.75%
spectral_bandwidth,7.0,32.12%,30.50%,31.37%,31.75%,32.00%,31.37%,28.50%,31.75%,23.38%,31.13%,28.25%,29.00%,28.00%
spectral_centroid,7.0,30.75%,30.63%,33.12%,32.25%,32.25%,31.25%,29.75%,32.12%,24.75%,31.00%,30.12%,25.75%,26.00%
spectral_contrast,49.0,35.88%,34.75%,40.00%,37.00%,37.38%,34.50%,26.25%,31.37%,25.50%,33.50%,31.00%,35.12%,33.88%
spectral_rolloff,7.0,28.62%,30.50%,31.50%,31.37%,31.87%,30.12%,28.62%,31.87%,22.25%,32.25%,29.88%,24.62%,23.88%
tonnetz,42.0,26.88%,21.75%,27.50%,27.00%,26.25%,27.50%,21.12%,23.38%,14.75%,23.62%,23.50%,22.88%,22.50%


Unnamed: 0,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
chroma_cens,0.1033,0.3751,4.8304,2.7448,8.903,1.1363,0.2759,0.0356,0.6296,12.635,5.9458,0.0056,0.0395
chroma_cqt,0.1369,0.4002,4.6515,2.7364,7.8578,5.346,0.2574,0.0328,0.5929,9.466,6.4039,0.0046,0.039
chroma_stft,0.1856,0.3853,4.5513,2.7381,6.9454,7.3311,0.2461,0.0361,0.5849,7.4734,8.48,0.0047,0.0397
mfcc,0.2534,0.4253,4.2165,2.5186,13.164,3.3296,0.5044,0.0406,1.1702,5.2536,3.1074,0.0073,0.0769
rmse,0.0456,0.0791,1.3699,1.0249,1.195,0.0418,0.0256,0.0353,0.0699,3.6527,5.7304,0.0019,0.0024
spectral_bandwidth,0.0387,0.0734,1.3312,1.0238,1.2376,0.041,0.0261,0.0368,0.068,4.5657,11.5463,0.0019,0.0023
spectral_centroid,0.044,0.0662,1.2618,0.9829,1.1958,0.0367,0.0256,0.0372,0.0683,4.1366,7.7004,0.0019,0.0024
spectral_contrast,0.1278,0.3883,3.5875,1.6906,4.039,0.8047,0.1785,0.0379,0.4137,8.9253,6.502,0.0036,0.0158
spectral_rolloff,0.0532,0.0712,1.3326,1.0201,1.221,0.0411,0.0214,0.0328,0.0579,5.1688,8.7576,0.0019,0.0024
tonnetz,0.0692,0.3865,3.1915,1.8317,4.6269,0.3608,0.1528,0.0374,0.3609,8.4805,7.4615,0.0033,0.0137


In [26]:
# Persist the result tables of the last experiment run (whatever `scores` and
# `times` currently hold) as CSV files in the current working directory.
scores.to_csv("scores_output.csv")
times.to_csv("times_output.csv")


### 1.3 Multiple genres

Todo:
* Ignore rare genres? Count them higher up in the genre tree? On the other hand, they do not account for many tracks.

In [8]:
# Classifiers for the multi-label task. LogisticRegression and SVC are wrapped
# in OneVsRestClassifier (one binary classifier per genre); MLPClassifier is
# given the indicator matrix directly.
classifiers = {
    # max_iter matches the single-genre experiment: the default iteration
    # budget stops before convergence here (see the "TOTAL NO. of ITERATIONS
    # REACHED LIMIT" warnings).
    'LR': OneVsRestClassifier(LogisticRegression(max_iter=500)),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

# Feature sets evaluated on the multi-label task (Echonest sets not included).
feature_sets = {
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

features:   0%|          | 0/3 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

## 2 Deep learning on raw audio

Other architectures:
* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade.

In [None]:
# One-hot encoding of the top genre of every track, kept in a DataFrame that
# shares the index of `tracks` so rows can be looked up by track ID.
labels_onehot = pd.DataFrame(
    LabelBinarizer().fit_transform(tracks['track', 'genre_top']),
    index=tracks.index,
)

Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and performing optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:
* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg
    * resampling is very slow --> use `kaiser_fast`
    * does not work with multi-processing, for keras `fit_generator()`
* pydub is a high-level interface for audio modification, uses ffmpeg to load
    * store a temporary `.wav`
* directly pipe ffmpeg output
    * fastest method
* [pyAV](https://github.com/mikeboers/PyAV) may be a faster alternative by linking to ffmpeg libraries

In [None]:
# Smoke test before involving multiprocessing (which is tricky to debug):
# decode one track directly, then pull a single batch of two training samples
# and display the shape of its first element.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
next(SampleLoader(train, batch_size=2))[0].shape

In [None]:
# Keras data-loading parameters, shared by the *_generator() calls below.
try:
    # Number of CPUs this process is allowed to run on.
    NB_WORKER = len(os.sched_getaffinity(0))
except AttributeError:
    # os.sched_getaffinity() is not available on every platform; fall back to
    # the total CPU count (os.cpu_count() may return None).
    NB_WORKER = os.cpu_count() or 1
# Keras 2 argument names (formerly pickle_safe / nb_worker / max_q_size).
params = {'use_multiprocessing': True, 'workers': NB_WORKER, 'max_queue_size': 10}

### 2.1 Fully connected neural network

* Two layers with 10 hidden units are no better than random, ~11%.

Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches mean reduced training time, so increase the batch size until memory is exhausted. The number of workers and the queue size have no influence on speed.

In [None]:
# Fully connected network on raw audio decoded at 2 kHz.
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

# Two hidden ReLU layers followed by a softmax over the genres.
# `units` is the Keras 2 name of the former `output_dim` argument.
model = keras.models.Sequential()
model.add(Dense(units=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(units=100))
model.add(Activation("relu"))
model.add(Dense(units=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 2 counts *batches* per epoch / evaluation (Keras 1 counted samples),
# so convert the split sizes into a number of batches. This assumes that
# SampleLoader keeps yielding batches indefinitely -- TODO confirm in utils.
batch_size = 64
train_steps = int(np.ceil(train.size / batch_size))
val_steps = int(np.ceil(val.size / batch_size))
test_steps = int(np.ceil(test.size / batch_size))

model.fit_generator(SampleLoader(train, batch_size=batch_size), train_steps, epochs=2, **params)
# Keep the validation result under its own name instead of overwriting it.
val_loss = model.evaluate_generator(SampleLoader(val, batch_size=batch_size), val_steps, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=batch_size), test_steps, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=batch_size), test_steps, **params);

# Displayed value: [loss, accuracy] on the test split.
loss

### 2.2 Convolutional neural network

* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.
* Missing: track segmentation and class averaging (majority voting)
* Compared with log-scaled mel-spectrograms instead of strided convolution as first layer.
* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html

In [None]:
# 1-D convolutional network on raw audio decoded at 16 kHz.
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
# Add a trailing channel axis, as expected by Conv1D.
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

# Strided convolution acting as a learned front-end: 128 filters of length 512
# applied every 512 samples. `strides` is the Keras 2 name of the former
# `subsample_length` argument.
model.add(Conv1D(128, 512, strides=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 2 counts *batches* per epoch / evaluation (Keras 1 counted samples),
# so convert the split sizes into a number of batches. This assumes that
# SampleLoader keeps yielding batches indefinitely -- TODO confirm in utils.
batch_size = 10
train_steps = int(np.ceil(train.size / batch_size))
val_steps = int(np.ceil(val.size / batch_size))
test_steps = int(np.ceil(test.size / batch_size))

model.fit_generator(SampleLoader(train, batch_size=batch_size), train_steps, epochs=20, **params)
# Keep the validation result under its own name instead of overwriting it.
val_loss = model.evaluate_generator(SampleLoader(val, batch_size=batch_size), val_steps, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=batch_size), test_steps, **params)

# Displayed value: [loss, accuracy] on the test split.
loss

### 2.3 Recurrent neural network

## 3 Deep learning on extracted audio features

Look at:
* Pre-processing in Keras: https://github.com/keunwoochoi/kapre
* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017
* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras
* Pre-processor: https://github.com/bmcfee/pumpp

### 3.1 ConvNet on MFCC

* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. Chun
* Missing: track segmentation and majority voting.
* Best seen: 17.6%

In [None]:
class MfccLoader(utils.Loader):
    """Loader returning the MFCCs of a track instead of its waveform.

    The audio is decoded at 22050 Hz by an FfmpegLoader shared by all
    instances, then turned into 13 MFCCs per frame.
    """
    # Decoder shared at class level.
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    # Declared output shape (n_mfcc, n_frames); presumably matches full-length
    # clips -- TODO confirm against the value displayed below.
    shape = (13, 2582)

    def load(self, filename):
        """Decode `filename` and return its MFCC matrix (13 coefficients per frame)."""
        # Imported locally, as in the original cell.
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        # The signal is passed by keyword (`y=`): recent librosa releases no
        # longer accept it positionally.
        mfcc = librosa.feature.mfcc(y=x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
# Quick check on one track: shape of the first row of the returned matrix.
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape

In [None]:
keras.backend.clear_session()

# 2-D convolutional network on the MFCC matrices produced by `loader`.
model = keras.models.Sequential()
# Add a trailing channel axis, as expected by Conv2D.
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

# Keras 2 signature: Conv2D(filters, kernel_size, strides=...). The kernel size
# must be a single (rows, cols) tuple -- a third positional argument would be
# read as `strides` -- and `strides` replaces the former `subsample` argument.
# The first layer spans all 13 MFCC rows; the following ones slide along time.
model.add(Conv2D(3, (13, 10), strides=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, (1, 10), strides=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, (1, 10), strides=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Keras 2 counts *batches* per epoch / evaluation (Keras 1 counted samples),
# so convert the split sizes into a number of batches. This assumes that
# SampleLoader keeps yielding batches indefinitely -- TODO confirm in utils.
batch_size = 16
train_steps = int(np.ceil(train.size / batch_size))
val_steps = int(np.ceil(val.size / batch_size))
test_steps = int(np.ceil(test.size / batch_size))

model.fit_generator(SampleLoader(train, batch_size=batch_size), train_steps, epochs=20, **params)
# Keep the validation result under its own name instead of overwriting it.
val_loss = model.evaluate_generator(SampleLoader(val, batch_size=batch_size), val_steps, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=batch_size), test_steps, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=batch_size), test_steps, **params)

# Displayed value: [loss, accuracy] on the test split.
loss