In [1]:
# This notebook is a kind of cheat sheet for the flask app that takes in an mp3 and outputs the model's prediction.
# It's not technically part of my workflow anymore but still useful to be able to run the prediction on an
# arbitrary sound file within a jupyter notebook.

import os
import multiprocessing
import warnings

from tqdm import tqdm
import numpy as np
from scipy import stats
import pandas as pd
import librosa

In [2]:
def columns():
    """Build the sorted index that labels every value produced by the feature extraction.

    Each entry is a ``(feature, statistics, number)`` tuple: one entry per
    feature dimension (numbered from '01') and per summary statistic.

    Returns
    -------
    pandas.MultiIndex
        Index with levels named 'feature', 'statistics' and 'number',
        sorted with ``sort_values()``.
    """
    # Number of dimensions each feature contributes.
    dimensions = {
        'chroma_stft': 12,
        'chroma_cqt': 12,
        'chroma_cens': 12,
        'tonnetz': 6,
        'mfcc': 20,
        'rmse': 1,
        'zcr': 1,
        'spectral_centroid': 1,
        'spectral_bandwidth': 1,
        'spectral_contrast': 7,
        'spectral_rolloff': 1,
    }
    statistic_names = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max')

    # One label per (feature, statistic, dimension); dimensions are 1-based
    # and zero-padded to two digits.
    labels = [
        (feature, statistic, '{:02d}'.format(position))
        for feature, width in dimensions.items()
        for statistic in statistic_names
        for position in range(1, width + 1)
    ]

    index = pd.MultiIndex.from_tuples(
        labels, names=('feature', 'statistics', 'number'))

    # More efficient to slice if indexes are sorted.
    return index.sort_values()

In [3]:
def compute_features(songfile):
    """Compute summary statistics of librosa audio features for one audio file.

    Parameters
    ----------
    songfile
        Path of the audio file; passed unchanged to ``librosa.load``.

    Returns
    -------
    pandas.Series
        float32 series named ``1`` and indexed by ``columns()``. For every
        feature it holds the mean, std, skew, kurtosis, median, min and max
        taken along axis 1 of the feature matrix.

    Notes
    -----
    This is best-effort: any exception (including a librosa warning, which
    is promoted to an exception while the function runs) is printed and the
    series is returned as it stands at that point, so features that were not
    reached keep their initial missing value.
    """

    features = pd.Series(index=columns(), dtype=np.float32, name=1)

    def feature_stats(name, values):
        # Store the seven statistics of `values` under features[name, <statistic>].
        features[name, 'mean'] = np.mean(values, axis=1)
        features[name, 'std'] = np.std(values, axis=1)
        features[name, 'skew'] = stats.skew(values, axis=1)
        features[name, 'kurtosis'] = stats.kurtosis(values, axis=1)
        features[name, 'median'] = np.median(values, axis=1)
        features[name, 'min'] = np.min(values, axis=1)
        features[name, 'max'] = np.max(values, axis=1)

    try:
        # Catch warnings as exceptions (audioread leaks file descriptors).
        # The filter is installed inside catch_warnings() so the process-wide
        # warning configuration is restored when the function returns instead
        # of gaining one more 'error' entry on every call.
        with warnings.catch_warnings():
            warnings.filterwarnings('error', module='librosa')

            x, sr = librosa.load(songfile, sr=None, mono=True)  # kaiser_fast

            f = librosa.feature.zero_crossing_rate(x, frame_length=2048, hop_length=512)
            feature_stats('zcr', f)

            cqt = np.abs(librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12,
                                     n_bins=7*12, tuning=None))
            assert cqt.shape[0] == 7 * 12
            assert np.ceil(len(x)/512) <= cqt.shape[1] <= np.ceil(len(x)/512)+1

            f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7)
            feature_stats('chroma_cqt', f)
            f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7)
            feature_stats('chroma_cens', f)
            # tonnetz is derived from the chroma_cens matrix computed just above.
            f = librosa.feature.tonnetz(chroma=f)
            feature_stats('tonnetz', f)

            # Large intermediates are dropped as soon as they are no longer needed.
            del cqt
            stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
            assert stft.shape[0] == 1 + 2048 // 2
            assert np.ceil(len(x)/512) <= stft.shape[1] <= np.ceil(len(x)/512)+1
            del x

            f = librosa.feature.chroma_stft(S=stft**2, n_chroma=12)
            feature_stats('chroma_stft', f)

            # librosa renamed feature.rmse to feature.rms; use whichever this
            # installation provides. The column keeps its 'rmse' label.
            rms = getattr(librosa.feature, 'rms', None)
            if rms is None:
                rms = librosa.feature.rmse
            f = rms(S=stft)
            feature_stats('rmse', f)

            f = librosa.feature.spectral_centroid(S=stft)
            feature_stats('spectral_centroid', f)
            f = librosa.feature.spectral_bandwidth(S=stft)
            feature_stats('spectral_bandwidth', f)
            f = librosa.feature.spectral_contrast(S=stft, n_bands=6)
            feature_stats('spectral_contrast', f)
            f = librosa.feature.spectral_rolloff(S=stft)
            feature_stats('spectral_rolloff', f)

            mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
            del stft
            f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
            feature_stats('mfcc', f)

    except Exception as e:
        # Deliberately best-effort: report which file failed and why, then
        # return whatever was computed so far.
        print('{}: {}'.format(songfile, repr(e)))

    return features

In [4]:
# Extract the feature statistics for one uploaded song.
features = compute_features("UPLOAD_FOLDER/01 This Unkind World.mp3")

# Turn the series into a one-column frame. Missing values (anything
# compute_features did not fill in) are replaced by 0.
features_df = features.to_frame().fillna(0)
features_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1
feature,statistics,number,Unnamed: 3_level_1
chroma_cens,kurtosis,1,-1.069031
chroma_cens,kurtosis,2,-0.832866
chroma_cens,kurtosis,3,-0.47453
chroma_cens,kurtosis,4,-1.141648
chroma_cens,kurtosis,5,-0.550345


In [5]:
# compute_features returns 518 values as a single column, but we need them all in one row, so we transpose
# the frame. We also need to insert the other five columns that our model takes in; for each of them I'm using
# the median value from the X_train dataset. (These columns had essentially no impact on the predictions.)

# Extra model inputs, listed in the column order they are inserted in
# (positions 0-4), each with the fixed stand-in value used for it.
median_metadata = (
    ('track_id', 67706),
    ('album_tracks', 11),
    ('track_duration', 212),
    ('track_listens', 495),
    ('track_number', 5),
)

# One row holding all feature statistics, with the metadata columns in front.
temp = features_df.T
for position, (column_name, median_value) in enumerate(median_metadata):
    temp.insert(position, column_name, median_value)
temp.head()

feature,track_id,album_tracks,track_duration,track_listens,track_number,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,01,02,03,04,05,...,04,05,06,01,01,01,01,01,01,01
1,67706,11,212,495,5,-1.069031,-0.832866,-0.47453,-1.141648,-0.550345,...,0.154764,0.033174,0.02519,199.804062,0.728027,0.042039,0.039062,0.001953,11.729385,0.037483


In [6]:
import pickle
from sklearn.decomposition import PCA

# Load the pickled PCA transform. The file is opened in a `with` block so
# the handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code, so only load a
# PCA_transform.pkl that comes from a trusted source.
with open("PCA_transform.pkl", "rb") as pca_file:
    pca = pickle.load(pca_file)

# Project the single feature row onto the PCA components.
temp_pca = pd.DataFrame(pca.transform(temp))
print(temp_pca.head())

            0           1           2            3          4           5   \
0 -5127.782761 -980.334707 -305.704939 -2813.966368  217.66804  669.574418   

           6           7           8           9          10         11  \
0 -432.333094 -109.080125 -165.479859 -128.560891  19.897444 -93.272515   

          12        13        14  
0  29.283751 -32.67651  2.476235  


In [7]:
# Relabel the 15 component columns with the strings '0' .. '14'.
temp_pca.columns = [str(component) for component in range(15)]

temp_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-5127.782761,-980.334707,-305.704939,-2813.966368,217.66804,669.574418,-432.333094,-109.080125,-165.479859,-128.560891,19.897444,-93.272515,29.283751,-32.67651,2.476235


In [8]:
import xgboost as xgb

# Load the pickled classifier. The file is opened in a `with` block so the
# handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code, so only load a model file
# that comes from a trusted source.
with open("gradient_boost_oversmp_genre.pkl", "rb") as model_file:
    gbm_oversmp = pickle.load(model_file)

# Class probabilities for the PCA-transformed row.
prediction = gbm_oversmp.predict_proba(temp_pca)
# Print probabilities with three decimals and no scientific notation.
np.set_printoptions(precision=3, suppress=True)
print(prediction)
type(prediction)

[[0.093 0.234 0.099 0.023 0.367 0.16  0.01  0.015 0.   ]]


numpy.ndarray

In [9]:
# Genre labels, indexed by position in the probability array.
genres = [
    'rock',
    'experimental',
    'electronic',
    'hip-hop',
    'folk',
    'pop',
    'instrumental',
    'international',
    'classical',
]

# Pick the most probable genre and express its probability as a whole percent.
best_index = np.argmax(prediction)
predicted_genre = genres[best_index]
best_probability = prediction.max()
predicted_prob = int(round(best_probability * 100))

message = "I am {} percent sure that your song is {}!".format(
    predicted_prob, predicted_genre)

print(message)

I am 37 percent sure that your song is folk!
