In [2]:
import ast
import os
import pickle

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skl
import sklearn.cluster
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
from scipy.spatial import distance

In [3]:
def get_audio_path(audio_dir, track_id):
    """
    Build the path of a track's mp3 file inside the audio directory.

    The track ID is zero-padded to six digits; the first three digits name
    the sub-directory and the full six digits name the file.

    Parameters
    ----------
    audio_dir : str
        Directory under which the audio files are stored.
    track_id : int
        Numeric track identifier.

    Returns
    -------
    str
        ``<audio_dir>/<first 3 digits>/<6 digits>.mp3``

    Examples
    --------
    >>> get_audio_path('../data/fma_small', 2)
    '../data/fma_small/000/000002.mp3'
    """
    padded_id = format(track_id, '06d')
    folder = padded_id[:3]
    return os.path.join(audio_dir, folder, padded_id + '.mp3')

In [4]:
def load(filepath):
    """
    Read one of the metadata CSV files, choosing the parser from its name.

    The file's base name decides how it is read:

    * contains ``'features'`` or ``'echonest'``: three header rows, first
      column as index;
    * contains ``'genres'``: single header row, first column as index;
    * contains ``'tracks'``: two header rows, first column as index, after
      which list-like columns are parsed with ``ast.literal_eval``, date
      columns are converted with ``pd.to_datetime`` and several text
      columns are turned into categoricals.

    Parameters
    ----------
    filepath : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or ``None`` when the file name matches none of
        the known kinds.
    """
    filename = os.path.basename(filepath)

    # Both tables share the same three-level column header layout.
    if 'features' in filename or 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' not in filename:
        # Unknown kind of file: the caller receives None.
        return None

    table = pd.read_csv(filepath, index_col=0, header=[0, 1])

    # Columns stored as the text form of Python literals.
    literal_columns = (
        ('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
        ('track', 'genres'), ('track', 'genres_all'),
    )
    for key in literal_columns:
        table[key] = table[key].map(ast.literal_eval)

    date_columns = (
        ('track', 'date_created'), ('track', 'date_recorded'),
        ('album', 'date_created'), ('album', 'date_released'),
        ('artist', 'date_created'), ('artist', 'active_year_begin'),
        ('artist', 'active_year_end'),
    )
    for key in date_columns:
        table[key] = pd.to_datetime(table[key])

    # Ordered categorical so that subsets can be compared (e.g. <= 'small').
    subset_dtype = pd.api.types.CategoricalDtype(
        categories=('small', 'medium', 'large'), ordered=True)
    table['set', 'subset'] = table['set', 'subset'].astype(subset_dtype)

    categorical_columns = (
        ('track', 'genre_top'), ('track', 'license'),
        ('album', 'type'), ('album', 'information'),
        ('artist', 'bio'),
    )
    for key in categorical_columns:
        table[key] = table[key].astype(pd.api.types.CategoricalDtype())

    return table


Extract features from audio files

In [5]:
def save_pickle(file_name, file_data):
    """Serialize ``file_data`` with pickle into the file ``file_name``.

    The file is opened in binary write mode, so an existing file of the
    same name is overwritten.
    """
    with open(file_name, "wb") as sink:
        pickle.Pickler(sink).dump(file_data)

In [6]:
def load_pickle(file_name):
    """Return the object stored in the pickle file ``file_name``.

    Unpickling can execute arbitrary code, so only open files that were
    produced by a trusted source.
    """
    with open(file_name, "rb") as source:
        return pickle.Unpickler(source).load()

In [7]:
def extract_feature_from_files(track_id, feature_extraction_func):
    """
    Run a feature extractor on the first 10 seconds of every listed track.

    Parameters
    ----------
    track_id : iterable
        Track IDs; each is resolved to a file under the global ``AUDIO_DIR``.
    feature_extraction_func : callable
        Called as ``feature_extraction_func(samples, sample_rate)``.

    Returns
    -------
    list
        One extractor result per track, in input order.
    """
    per_track_features = []
    for position, tid in enumerate(track_id):
        # Progress message every 100 tracks (printed before the track is read).
        if position % 100 == 0:
            print("Processed {} tracks".format(position))
        # sr=None keeps the file's native sampling rate.
        samples, sample_rate = librosa.load(
            get_audio_path(AUDIO_DIR, tid), sr=None, mono=True)
        clip = samples[:10 * sample_rate]
        per_track_features.append(feature_extraction_func(clip, sample_rate))
    return per_track_features

In [8]:
def mfcc_extraction_func(x, sr):
    """
    Compute 20 MFCCs from a waveform.

    Pipeline: STFT magnitude (``n_fft=2048``, ``hop_length=512``), squared
    to a power spectrogram, mapped to a mel spectrogram, converted to dB,
    then reduced to ``n_mfcc=20`` coefficients.

    Parameters
    ----------
    x : array
        Audio samples.
    sr : number
        Sampling rate of ``x``.

    Returns
    -------
    The result of ``librosa.feature.mfcc`` (presumably one column per
    frame, as the rest of the notebook stacks these along axis 1).
    """
    magnitude = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
    power_spectrogram = magnitude ** 2
    mel_power = librosa.feature.melspectrogram(sr=sr, S=power_spectrogram)
    log_mel = librosa.power_to_db(mel_power)
    return librosa.feature.mfcc(S=log_mel, n_mfcc=20)

In [9]:
def chroma_extraction_func(x, sr):
    """
    Compute 12-bin CENS chroma features from a waveform.

    A constant-Q transform spanning 7 octaves at 12 bins per octave
    (``hop_length=512``) is taken first; its magnitude is then passed to
    ``librosa.feature.chroma_cens``.

    Parameters
    ----------
    x : array
        Audio samples.
    sr : number
        Sampling rate of ``x``.

    Returns
    -------
    The result of ``librosa.feature.chroma_cens``.
    """
    octave_count = 7
    bins_per_octave = 12
    constant_q = librosa.cqt(x, sr=sr, hop_length=512,
                             bins_per_octave=bins_per_octave,
                             n_bins=octave_count * bins_per_octave,
                             tuning=None)
    cqt_magnitude = np.abs(constant_q)
    return librosa.feature.chroma_cens(C=cqt_magnitude, n_chroma=12,
                                       n_octaves=octave_count)

In [11]:
# Load metadata and features.
# Track IDs removed from both the track metadata and the feature table.
# NOTE(review): presumably these are tracks whose audio cannot be used - confirm.
BAD_TRACK_ID = [98565, 98567, 98569, 99134, 108925, 133297]

tracks = load('fma_metadata/tracks.csv').drop(labels=BAD_TRACK_ID)
genres = load('fma_metadata/genres.csv')
features = load('fma_metadata/features.csv').drop(labels=BAD_TRACK_ID)

# Both tables must list exactly the same tracks in the same order.
np.testing.assert_array_equal(features.index, tracks.index)

In [12]:
# Root directory of the audio files; read as a global by the functions that
# call get_audio_path(AUDIO_DIR, ...).
AUDIO_DIR = "fma_small"
# NOTE(review): no visible cell reads this empty list before the name is
# reassigned further down - looks like a leftover; confirm before removing.
extracted_features = []
# Boolean mask of the tracks whose ordered 'subset' category is at most 'small'.
small = tracks['set', 'subset'] <= 'small'
# Index labels (track IDs) of the selected tracks.
track_id = tracks.loc[small].index

In [None]:
# features = extract_feature_from_files(track_id, mfcc_extraction_func)
# save_pickle('preprocessed_data/mfcc.pkl', features)

In [None]:
# features = extract_feature_from_files(track_id, chroma_extraction_func)
# save_pickle('preprocessed_data/chroma.pkl', features)

# Split data

In [13]:
def select_track_features_from_id(track_id, selected_track_id, feature_list):
    """
    Pick the features of the selected tracks out of a parallel feature list.

    ``track_id`` and ``feature_list`` are parallel: ``feature_list[k]``
    belongs to ``track_id[k]``. Both ID sequences are walked once from left
    to right, so a selected ID is only found if it appears in ``track_id``
    after the previously matched one (i.e. both sequences must share the
    same ordering). Scanning stops as soon as either sequence is exhausted.

    Parameters
    ----------
    track_id : array-like with ``.shape`` and positional indexing
        All track IDs.
    selected_track_id : array-like with ``.shape`` and positional indexing
        IDs to select.
    feature_list : sequence
        Features aligned with ``track_id``.

    Returns
    -------
    list
        Features of the matched tracks, in order of appearance.
    """
    wanted_total = selected_track_id.shape[0]
    picked = []
    cursor = 0  # position of the next selected ID still to be matched
    for position in range(track_id.shape[0]):
        if cursor >= wanted_total:
            break
        if track_id[position] == selected_track_id[cursor]:
            picked.append(feature_list[position])
            cursor += 1
    return picked

In [14]:
def split_data(feature_list):
    """
    Split per-track features into training and test parts.

    Uses the global ``tracks`` table: only tracks whose ``('set', 'subset')``
    is at most ``'small'`` are considered, and ``('set', 'split')`` decides
    whether a track is ``'training'`` or ``'test'``.

    Parameters
    ----------
    feature_list : sequence
        One entry per track of the small subset, in the order of
        ``tracks.loc[small].index``.

    Returns
    -------
    tuple
        ``(train_track_id, test_track_id, train_feature_list,
        test_feature_list)``.
    """
    subset_column = tracks['set', 'subset']
    split_column = tracks['set', 'split']

    is_train = split_column == 'training'
    is_test = split_column == 'test'
    in_small = subset_column <= 'small'

    all_ids = tracks.loc[in_small].index
    train_track_id = tracks.loc[in_small & is_train].index
    test_track_id = tracks.loc[in_small & is_test].index

    train_feature_list = select_track_features_from_id(
        all_ids, train_track_id, feature_list)
    test_feature_list = select_track_features_from_id(
        all_ids, test_track_id, feature_list)
    return train_track_id, test_track_id, train_feature_list, test_feature_list

In [15]:
# Stack the features of all tracks and record, for every stacked row, which position in train_track_id / test_track_id it corresponds to
def vectorize_data(feature_list):
    """
    Stack per-track feature matrices into one matrix of rows.

    Each entry of ``feature_list`` is a 2-D array whose columns are
    concatenated horizontally; the result is transposed so that every
    original column becomes one row.

    Parameters
    ----------
    feature_list : sequence of 2-D arrays
        All arrays must have the same number of rows.

    Returns
    -------
    feature_array : numpy.ndarray
        Stacked matrix with one row per original column.
    feature_idx : numpy.ndarray
        Column vector (float) giving, for every row of ``feature_array``,
        the position in ``feature_list`` it came from.
    """
    owner_columns = [
        np.ones((track_features.shape[1], 1)) * position
        for position, track_features in enumerate(feature_list)
    ]
    stacked_rows = np.hstack(feature_list).T
    row_owner = np.vstack(owner_columns)
    return stacked_rows, row_owner

In [16]:
# Build the training matrix from the cached MFCC features (only the training split is vectorized here)
feature_list = load_pickle('preprocessed_data/mfcc.pkl')
(train_track_id,
 test_track_id,
 train_feature_list,
 test_feature_list) = split_data(feature_list)
train_feature_array, train_feature_idx = vectorize_data(train_feature_list)
train_track_count = len(train_feature_list)

# One line per shape: first the frame-owner column, then the stacked frames.
print(train_feature_idx.shape, train_feature_array.shape, sep='\n')

(5534576, 1)
(5534576, 20)


# Vector quantization

In [17]:
def pca(feature_array, feature_name, n_components=8):
    """
    Mean-center the features, fit a PCA and persist both fitted models.

    Parameters
    ----------
    feature_array : array-like
        Feature matrix with one sample per row.
    feature_name : str
        Tag used in the names of the saved model files.
    n_components : int, optional
        Number of principal components to keep (default 8, the value
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        ``feature_array`` centered and projected onto the principal
        components.

    Side effects
    ------------
    Writes ``sklearn_models/<feature_name>_pca.pkl`` and
    ``sklearn_models/<feature_name>_zero_mean.pkl``; the directory has to
    exist already.
    """
    # with_std=False: subtract the mean only, leave the variance untouched.
    centerer = skl.preprocessing.StandardScaler(with_std=False)
    centered = centerer.fit_transform(feature_array)

    # Named `reducer` so the local no longer shadows this function's own name.
    reducer = skl.decomposition.PCA(n_components=n_components)
    reduced_train_feature_array = reducer.fit_transform(centered)

    save_pickle("sklearn_models/{}_pca.pkl".format(feature_name), reducer)
    save_pickle("sklearn_models/{}_zero_mean.pkl".format(feature_name), centerer)
    return reduced_train_feature_array

In [18]:
# load pca model
# mfcc_pca = load_pickle('mfcc_pca.pkl')
# reduced_train_feature_array = loaded_pca.transform(train_feature_array)

In [19]:
def minibatch_kmeans(feature_array, feature_name, cluster_count, random_state=None):
    """
    Fit a mini-batch k-means codebook and persist the fitted model.

    Parameters
    ----------
    feature_array : array-like
        Samples to cluster, one per row.
    feature_name : str
        Tag used in the name of the saved model file.
    cluster_count : int
        Number of clusters.
    random_state : int or None, optional
        Seed forwarded to ``MiniBatchKMeans``; pass an integer to make the
        clustering reproducible. ``None`` (default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    sklearn.cluster.MiniBatchKMeans
        The fitted model.

    Side effects
    ------------
    Writes ``sklearn_models/<feature_name>_kmeans.pkl``; the directory has
    to exist already.
    """
    kmeans = skl.cluster.MiniBatchKMeans(n_clusters=cluster_count, verbose=1,
                                         batch_size=3000,
                                         random_state=random_state)
    kmeans.fit(feature_array)
    # The former `kmeans.transform(feature_array)` call was dropped: its
    # result (a samples x clusters distance matrix) was discarded, and
    # computing it does not alter the fitted model.
    save_pickle("sklearn_models/{}_kmeans.pkl".format(feature_name), kmeans)
    return kmeans

In [20]:
# cluster_count = 1000
# reduced_train_feature_array = pca(train_feature_array, "mfcc")
# vector_quantization_result = minibatch_kmeans(reduced_train_feature_array, "mfcc", cluster_count)

# Ranking

In [21]:
def tf(raw_document_vector, document_length):
    """Term frequency: raw term counts divided by the document length."""
    term_frequency = raw_document_vector / document_length
    return term_frequency

In [22]:
def compute_idf(documents):
    """
    Inverse document frequency of every term.

    Parameters
    ----------
    documents : array-like, shape (n_documents, n_terms)
        Term-count matrix with one document per row.

    Returns
    -------
    numpy.ndarray, shape (n_terms,)
        ``log(n_documents / document_frequency)`` per term. Terms that
        occur in no document get an IDF of 0 instead of ``inf``, so that
        multiplying by a zero term frequency cannot produce NaN.
    """
    documents = np.asarray(documents)
    # Documents are the rows; the previous code used shape[1], which is the
    # number of terms.
    num_documents = documents.shape[0]
    document_frequency = np.count_nonzero(documents, axis=0)

    idf = np.zeros(document_frequency.shape, dtype=float)
    seen = document_frequency > 0
    idf[seen] = np.log(num_documents / document_frequency[seen])
    return idf

In [23]:
def normalize_document_vector(raw_document_vector):
    """
    Convert a raw term-count vector into a TF-IDF vector.

    The counts are divided by their sum (term frequency) and multiplied
    element-wise by the module-level ``idf`` vector, which must have been
    computed beforehand (see ``compute_idf``).

    Parameters
    ----------
    raw_document_vector : array-like
        Term counts of one document.

    Returns
    -------
    numpy.ndarray
        TF-IDF weights. A document with no counts at all yields an all-zero
        vector rather than NaN from a 0/0 division.
    """
    raw_document_vector = np.asarray(raw_document_vector)
    document_length = np.sum(raw_document_vector)
    if document_length == 0:
        # Empty document: every term frequency is zero by definition.
        return np.zeros(raw_document_vector.shape, dtype=float)
    return np.multiply(tf(raw_document_vector, document_length), idf)

In [24]:
def normalize_documents(documents):
    """Apply ``normalize_document_vector`` to every row of ``documents``."""
    return np.apply_along_axis(func1d=normalize_document_vector, axis=1,
                               arr=documents)

In [25]:
def ranking(query, documents, document_ids, dist_func='cos', num_results=10):
    """
    Rank documents by their distance to a query.

    Parameters
    ----------
    query : array-like
        RAW term counts of the query; the vector is TF-IDF normalized here
        via ``normalize_document_vector``, so do not pass an already
        normalized vector.
    documents : numpy.ndarray, shape (n_documents, n_terms)
        Already normalized document vectors, one per row.
    document_ids : array-like
        IDs aligned with the rows of ``documents``; must support indexing
        with an integer array.
    dist_func : {'cos', 'norm'}, optional
        ``'cos'`` uses the cosine distance, ``'norm'`` the Euclidean norm
        of the difference.
    num_results : int, optional
        Number of closest documents to return.

    Returns
    -------
    The ``num_results`` entries of ``document_ids`` with the smallest
    distance, closest first.

    Raises
    ------
    ValueError
        If ``dist_func`` is not one of the supported names. Previously an
        unknown name left every distance at 0 and silently returned the
        first IDs.
    """
    metrics = {
        'cos': distance.cosine,
        'norm': lambda q, d: np.linalg.norm(d - q),
    }
    if dist_func not in metrics:
        raise ValueError(
            "Unknown dist_func {!r}; expected 'cos' or 'norm'".format(dist_func))
    measure = metrics[dist_func]

    query = normalize_document_vector(query)
    distances = np.array([measure(query, document) for document in documents],
                         dtype=float)
    ranking_result_index = np.argsort(distances)
    return document_ids[ranking_result_index[:num_results]]

In [34]:
# def addRandomNoiseToFile(filename, file_id):
#     x, sr = librosa.load(filename, sr=None, mono=True)
#     x += 0.1 * np.random.rand(len(x))
#     librosa.output.write_wav('noised_{}.wav'.format(file_id), x, sr)
    
def addRandomNoiseToWave(x, noise_level):
    """
    Return a copy of a waveform with uniform random noise added.

    Every sample receives an independent value drawn by ``np.random.rand``
    (uniform on [0, 1)) and scaled by ``noise_level``.

    Parameters
    ----------
    x : array-like, 1-D
        Audio samples. The input is NOT modified (the previous version
        added the noise in place, altering the caller's array).
    noise_level : float
        Scale of the noise.

    Returns
    -------
    numpy.ndarray
        New array holding the noisy samples; a floating-point input keeps
        its dtype.
    """
    x = np.asarray(x)
    noisy = x + noise_level * np.random.rand(len(x))
    if np.issubdtype(x.dtype, np.floating):
        # Same rounding as the former in-place addition into x's dtype.
        noisy = noisy.astype(x.dtype, copy=False)
    return noisy

def musicToVector(filename, feature_extraction_func, data_processors, noise_level, cluster_size):
    """
    Turn an audio file into a histogram of cluster assignments.

    The first 10 seconds of the file are loaded at the native sampling
    rate, distorted with ``addRandomNoiseToWave``, converted to features,
    mean-centered, PCA-projected and assigned to k-means clusters; the
    result counts how often each cluster was hit.

    Parameters
    ----------
    filename : str
        Path of the audio file.
    feature_extraction_func : callable
        Called as ``feature_extraction_func(samples, sample_rate)``; its
        result is transposed before being fed to the models.
    data_processors : tuple
        ``(zero_mean, pca, kmeans)`` fitted models.
    noise_level : float
        Noise scale passed to ``addRandomNoiseToWave``.
    cluster_size : int
        Length of the returned histogram.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``cluster_size`` with per-cluster counts.
    """
    zero_mean, pca, kmeans = data_processors

    samples, sample_rate = librosa.load(filename, sr=None, mono=True)
    # Distort the 10-second excerpt before extracting features.
    clip = addRandomNoiseToWave(samples[:10 * sample_rate], noise_level)

    rows = feature_extraction_func(clip, sample_rate).T
    reduced_rows = pca.transform(zero_mean.transform(rows))

    cluster_vector = np.zeros(cluster_size)
    for label in kmeans.predict(reduced_rows):
        cluster_vector[label] += 1
    return cluster_vector

def loadDataProcessors(name):
    """
    Load the three fitted models saved for the feature type ``name``.

    Returns
    -------
    tuple
        ``(zero_mean, pca, kmeans)`` read from ``sklearn_models/``; the
        k-means model's ``verbose`` attribute is set to 0 before returning.
    """
    path_templates = (
        'sklearn_models/{}_zero_mean.pkl',
        'sklearn_models/{}_pca.pkl',
        'sklearn_models/{}_kmeans.pkl',
    )
    zero_mean, pca, kmeans = [
        load_pickle(template.format(name)) for template in path_templates
    ]
    # Silence the model's logging.
    kmeans.verbose = 0

    return zero_mean, pca, kmeans

def loadTrainTrackClusters(name, cluster_count):
    """
    Build the per-track cluster histograms of the training data.

    Loads the fitted k-means model for ``name`` and counts, for every
    training track, how many of its rows were assigned to each cluster
    during fitting (``kmeans.labels_``).

    Relies on two module-level variables: ``train_feature_idx`` (for every
    stacked row, the position of its track) and ``train_track_count``. They
    must describe the same feature set the k-means model was fitted on.

    Parameters
    ----------
    name : str
        Feature tag used in the model file name.
    cluster_count : int
        Number of clusters (columns of the result).

    Returns
    -------
    numpy.ndarray, shape (train_track_count, cluster_count)
        Raw count matrix, one row per training track.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in
        ``train_feature_idx``, which means the globals and the model do not
        belong to the same feature set.
    """
    kmeans = load_pickle('sklearn_models/{}_kmeans.pkl'.format(name))
    labels = np.asarray(kmeans.labels_)
    # Flatten the (n, 1) float column into integer row positions in one step,
    # instead of calling int() on a 1-element array per frame.
    track_rows = np.asarray(train_feature_idx).reshape(-1).astype(int)
    if track_rows.shape[0] != labels.shape[0]:
        raise ValueError(
            "k-means model '{}' has {} labels but train_feature_idx has {} rows".format(
                name, labels.shape[0], track_rows.shape[0]))

    documents = np.zeros((train_track_count, cluster_count))
    # Unbuffered add so that repeated (track, cluster) pairs accumulate.
    np.add.at(documents, (track_rows, labels), 1)
    return documents

def evaluate(feature_name, documents, feature_extract_func, noise_level, dist_func='cos', num_queries=100, cluster_size=1000):
    """
    Query the index with randomly chosen training tracks and report top-1 accuracy.

    ``num_queries`` distinct IDs are drawn from the global
    ``train_track_id``. Each track is turned into a query vector with
    ``musicToVector`` (including noise of scale ``noise_level``) and ranked
    against ``documents``; a query counts as correct when the best match is
    the track itself. The accuracy is printed, not returned.

    Parameters
    ----------
    feature_name : str
        Tag of the saved models to load.
    documents : numpy.ndarray
        Normalized document vectors aligned with ``train_track_id``.
    feature_extract_func : callable
        Feature extractor handed to ``musicToVector``.
    noise_level : float
        Noise scale applied to each query waveform.
    dist_func : str, optional
        Distance name forwarded to ``ranking``.
    num_queries : int, optional
        Number of query tracks.
    cluster_size : int, optional
        Length of the query histogram.

    Returns
    -------
    tuple
        ``(chosen_tracks, ranking_result_list)``: the sampled track IDs and,
        for each of them, the IDs returned by ``ranking``.
    """
    data_processors = loadDataProcessors(feature_name)
    chosen_tracks = np.random.choice(train_track_id, num_queries, replace=False)

    ranking_result_list = []
    hits = 0
    for query_track in chosen_tracks:
        audio_file = get_audio_path(AUDIO_DIR, query_track)
        query = musicToVector(audio_file, feature_extract_func, data_processors,
                              noise_level, cluster_size)
        top_matches = ranking(query, documents, train_track_id, dist_func)
        ranking_result_list.append(top_matches)
        if top_matches[0] == query_track:
            hits += 1

    accuracy = hits / num_queries
    print ('Accuracy for {} method on {} queries: '.format(feature_name, num_queries), accuracy)
    return chosen_tracks, ranking_result_list

# Build the bag-of-audio-words index for the chroma features: raw per-track
# cluster counts, the global IDF vector, then the TF-IDF normalized documents.
# NOTE(review): loadTrainTrackClusters reads the globals train_feature_idx and
# train_track_count; the only numbered cell that sets them before this one
# loads 'preprocessed_data/mfcc.pkl', while the labels used here come from the
# chroma k-means model - confirm both describe the same rows.
raw_documents = loadTrainTrackClusters("chroma", 1000)
# `idf` must be a module-level name: normalize_document_vector reads it as a global.
idf = compute_idf(raw_documents)
normalized_documents = normalize_documents(raw_documents)

# Disabled MFCC experiments. NOTE(review): evaluate() returns
# (chosen_tracks, ranking_result_list), not an accuracy value, and these calls
# would rank against the chroma documents built above - rebuild raw_documents
# for 'mfcc' before re-enabling them.
# mfcc_cos_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0, dist_func='cos', num_queries=100)
# mfcc_cos_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0.01, dist_func='cos', num_queries=100)
# mfcc_cos_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0.1, dist_func='cos', num_queries=100)

# mfcc_norm_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0, dist_func='norm', num_queries=100)
# mfcc_norm_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0.01, dist_func='norm', num_queries=100)
# mfcc_norm_acc = evaluate('mfcc', normalized_documents, mfcc_extraction_func, noise_level=0.1, dist_func='norm', num_queries=100)

# Active experiment: chroma features, cosine distance, no added noise.
test_track_ids, ranking_result_list = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0, dist_func='cos', num_queries=100)
# chroma_cos_acc = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0.01, dist_func='cos', num_queries=100)
# chroma_cos_acc = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0.1, dist_func='cos', num_queries=100)

# chroma_norm_acc = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0, dist_func='norm', num_queries=100)
# chroma_norm_acc = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0.01, dist_func='norm', num_queries=100)
# chroma_norm_acc = evaluate('chroma', normalized_documents, chroma_extraction_func, noise_level=0.1, dist_func='norm', num_queries=100)

# Persist the sampled query IDs and their ranking results.
save_pickle("boaw_test_track_ids.pkl", test_track_ids)
save_pickle("boaw_ranking_result_list.pkl", ranking_result_list)

Accuracy for chroma method on 100 queries:  1.0


In [None]:
# Sanity check: querying the index with a training track's own histogram
# should return that same track first. Mismatches are collected as
# (expected_id, returned_id) pairs.
false_ranking = []
for test_index in range(1):
    print("Testing track id:", train_track_id[test_index])
    # ranking() TF-IDF normalizes the query itself, so it must receive the RAW
    # counts; feeding an already normalized row would apply the weighting twice.
    original_query = raw_documents[test_index]
    ranking_result = ranking(original_query, normalized_documents, train_track_id, dist_func='cos')
    if ranking_result[0] != train_track_id[test_index]:
        false_ranking.append((train_track_id[test_index], ranking_result[0]))
print(false_ranking)

In [None]:
# Resolve the mp3 path of track 108037; the Audio object is the cell's last
# expression, so the notebook renders it as the cell output.
filename = get_audio_path(AUDIO_DIR, 108037)
ipd.Audio(filename)

In [None]:
# Reload the chroma models (loadDataProcessors also silences the k-means
# logging) and keep the labels assigned while the model was fitted.
zero_mean, pca, kmeans = loadDataProcessors("chroma")
old_labels = kmeans.labels_

# Rebuild the training matrix from the cached chroma features.
feature_list = load_pickle('preprocessed_data/chroma.pkl')
(train_track_id,
 test_track_id,
 train_feature_list,
 test_feature_list) = split_data(feature_list)
train_feature_array, train_feature_idx = vectorize_data(train_feature_list)
train_track_count = len(train_feature_list)

# Push the first 2100 rows through the saved pipeline and predict their clusters.
processed_features = pca.transform(zero_mean.transform(train_feature_array[:2100]))
new_labels = kmeans.predict(processed_features)


In [None]:
# Count how many of the first 2100 rows get a different cluster from predict()
# than the label stored on the fitted model (0 means they agree everywhere).
print(np.sum(old_labels[:2100] != new_labels))

In [None]:
# Show the clusters predicted for the first 2100 training rows.
print(new_labels)

In [None]:
def test(filename, feature_extraction_func, data_processors, noise_level, cluster_size):
    """
    Debug variant of ``musicToVector`` that exposes every intermediate result.

    Loads the first 10 seconds of ``filename`` at its native sampling rate
    and runs it through feature extraction, mean-centering, PCA and
    k-means prediction. No noise is added and no histogram is built, so
    ``noise_level`` and ``cluster_size`` are accepted but not used.

    Parameters
    ----------
    filename : str
        Path of the audio file.
    feature_extraction_func : callable
        Called as ``feature_extraction_func(samples, sample_rate)``.
    data_processors : tuple
        ``(zero_mean, pca, kmeans)`` fitted models.
    noise_level, cluster_size
        Unused; kept for signature parity with ``musicToVector``.

    Returns
    -------
    tuple
        ``(extracted_features, zero_features, pca_features, kmeans_labels)``.
    """
    zero_mean, pca, kmeans = data_processors

    samples, sample_rate = librosa.load(filename, sr=None, mono=True)
    clip = samples[:10 * sample_rate]

    extracted_features = feature_extraction_func(clip, sample_rate).T
    zero_features = zero_mean.transform(extracted_features)
    pca_features = pca.transform(zero_features)
    return extracted_features, zero_features, pca_features, kmeans.predict(pca_features)

# Run the debug pipeline on track 2 with the chroma models.
filename = get_audio_path(AUDIO_DIR, 2)
data_processors = loadDataProcessors('chroma')
(extracted_features,
 zero_features,
 pca_features,
 kmeans_labels) = test(filename, chroma_extraction_func, data_processors,
                       noise_level=0.01, cluster_size=1000)

In [227]:

# Center and project the first 2582 rows of the cached training matrix.
processed_features = pca.transform(zero_mean.transform(train_feature_array[:2582]))

In [229]:
# Print the projection computed from the audio file, then the one computed
# from the cached features, for a visual comparison.
print(pca_features, processed_features, sep='\n')

[[-0.03396339 -0.23078826 -0.1691253  ... -0.09072542 -0.09195164
  -0.05509767]
 [-0.03010589 -0.24011316 -0.16976384 ... -0.08827809 -0.0906359
  -0.05278084]
 [-0.02562837 -0.24946558 -0.16951365 ... -0.08589326 -0.08933992
  -0.05045947]
 ...
 [ 0.22702525 -0.11454669 -0.05175217 ...  0.05255807 -0.09923446
  -0.02833947]
 [ 0.2274325  -0.12803563 -0.04436911 ...  0.05567943 -0.09602342
  -0.02975243]
 [ 0.22830835 -0.14097878 -0.03720105 ...  0.0586235  -0.09259355
  -0.03104894]]
[[-0.03396339 -0.23078826 -0.1691253  ... -0.09072542 -0.09195164
  -0.05509767]
 [-0.03010589 -0.24011316 -0.16976384 ... -0.08827809 -0.0906359
  -0.05278084]
 [-0.02562837 -0.24946558 -0.16951365 ... -0.08589326 -0.08933992
  -0.05045947]
 ...
 [ 0.17321961  0.09735506 -0.06236397 ...  0.07582098 -0.10686997
  -0.04551584]
 [ 0.1814824   0.09410527 -0.0617262  ...  0.07613659 -0.10546319
  -0.04520431]
 [ 0.18900968  0.09059635 -0.06124072 ...  0.07672863 -0.10402947
  -0.04516938]]
