In [None]:
import sys, os, re, gzip, json, pickle, shutil, random

import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# One-time setup: uncomment the two lines below to pip-install the audio
# dependencies with the interpreter that runs this kernel (sys.executable).
#!{sys.executable} -m pip install librosa
#!{sys.executable} -m pip install pydub
# Tab-completion settings for this notebook session.
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

In [None]:
from pydub.utils import mediainfo

import librosa
import librosa.display

In [None]:
# Locations inside the project's data folder (relative to the notebook).
data_path = '../data'
local_mp3s_path = '{}/mp3s'.format(data_path)
metadata_path = '{}/metadata.json.gz'.format(data_path)
genre_map_path = '{}/genre_map.txt'.format(data_path)

# Machine-specific location of the mp3 archive (absolute Windows drive path).
mp3_path = 'D:/mp3_dot_com'
mp3s_path = '{}/mp3'.format(mp3_path)

In [None]:
def load_metadata(metadata_path):
    '''Reads the gzip-compressed, UTF-8 encoded JSON metadata file.

    Args:
        metadata_path: path of the .json.gz file to read.

    Returns:
        The decoded JSON document. Its length is printed as the record count.
    '''
    with gzip.open(metadata_path, 'rt', encoding='utf-8') as fz:
        # json.load parses from the open handle; json.dump serialises and
        # requires two arguments, so calling it here raised a TypeError.
        data = json.load(fz)
    print('loaded metadata for %d records' % len(data))
    return data

def read_genre_map(genre_map_path):
    '''Reads a tab-separated genre mapping file into a dict.

    Each non-blank line must hold exactly two tab-separated fields: the
    source genre and the genre it maps to. Blank or whitespace-only lines
    are skipped.

    Args:
        genre_map_path: path of the mapping text file.

    Returns:
        dict mapping the first field of each line to the second.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    '''
    genre_map = {}
    with open(genre_map_path, 'r') as f:
        for line in f:
            # Lines yielded by the file still carry their newline, so the
            # emptiness check has to happen after stripping it.
            line = line.rstrip('\n')
            if not line.strip():
                continue
            g, gm = line.split('\t')
            genre_map[g] = gm

    return genre_map

In [None]:
# Load the metadata and the genre mapping from the configured paths.
data = load_metadata(metadata_path)
genre_map = read_genre_map(genre_map_path)

In [None]:
def extract_genre_features(data, genre, local_mp3s_path, sample_rate=44100, max_mb=5,
                     features=('waveform', 'spectrogram', 'melspectrogram', 'spectral_centroids',
                               'spectral_rolloff', 'spectral_bandwidth_2', 'spectral_bandwidth_3',
                               'spectral_bandwidth_4', 'zero_crossings', 'mfcc', 'chromagram')):
    '''Extracts features from the mp3 files in a genre folder.

    Args:
        data: metadata mapping keyed by mp3 file name; an entry may carry a
            'size_mb' value.
        genre: name of the folder to scan, resolved as
            '<local_mp3s_path>/<genre>'.
        local_mp3s_path: parent folder of the genre folder.
        sample_rate: accepted for interface compatibility.
            NOTE(review): it is not forwarded to the loader, so it currently
            has no effect on the extracted features.
        max_mb: files whose metadata 'size_mb' is greater than this are skipped.
        features: names of the features to compute; passed unchanged to
            extract_features_from_file.

    Returns:
        dict mapping each processed file name to the feature dict returned by
        extract_features_from_file. Files that are not '.mp3', are missing
        from the metadata, or exceed max_mb are left out.
    '''
    file_feats = {}

    gp = '%s/%s' % (local_mp3s_path, genre)
    # os.listdir order is arbitrary; sorting keeps the result order reproducible.
    for fn in sorted(os.listdir(gp)):
        if not fn.endswith('.mp3'):
            continue

        if fn not in data:
            print('File %s not in metadata!' % fn)
            continue

        # Entries without a 'size_mb' value are never filtered by size.
        if 'size_mb' in data[fn] and data[fn]['size_mb'] > max_mb:
            continue

        fp = '%s/%s' % (gp, fn)
        file_feats[fn] = extract_features_from_file(fp, features)

    return file_feats
            
            
def extract_features_from_file(fp, features):
    '''Loads one audio file and computes the requested librosa features.

    Args:
        fp: path of the audio file, handed to librosa.load.
        features: collection of feature names to compute. Recognised names:
            'waveform', 'spectrogram', 'melspectrogram', 'spectral_centroids',
            'spectral_rolloff', 'spectral_bandwidth_2', 'spectral_bandwidth_3',
            'spectral_bandwidth_4', 'zero_crossings', 'mfcc', 'chromagram'.
            Any other name is ignored.

    Returns:
        dict mapping each computed feature name to its array.
    '''
    feats = {}
    # No sr is passed, so librosa.load resamples to its default rate
    # (22050 Hz per the librosa documentation).
    x, sr = librosa.load(fp)

    # The audio is passed as y=... throughout: librosa >= 0.10 makes it
    # keyword-only for the librosa.feature.* functions.

    if 'waveform' in features:
        # The raw samples; this name was accepted but never produced before.
        feats['waveform'] = x

    if 'spectrogram' in features:
        x_spectrogram = librosa.stft(x)
        # Magnitude of the complex STFT, converted to decibels.
        x_spectrogram = librosa.amplitude_to_db(abs(x_spectrogram))
        feats['spectrogram'] = x_spectrogram

    if 'melspectrogram' in features:
        x_melspectrogram = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=128, fmin=20, fmax=16000)
        x_melspectrogram = librosa.power_to_db(x_melspectrogram, ref=np.max)
        feats['melspectrogram'] = x_melspectrogram

    if 'spectral_centroids' in features:
        # [0] keeps the first row of the result.
        x_centroids = librosa.feature.spectral_centroid(y=x, sr=sr)[0]
        # Previously normalised an undefined name (spectral_centroids).
        x_centroids = normalize(x_centroids)
        feats['spectral_centroids'] = x_centroids

    if 'spectral_rolloff' in features:
        # The 0.01 offset is added to the signal before analysis, as in the
        # bandwidth features below.
        x_rolloff = librosa.feature.spectral_rolloff(y=x + 0.01, sr=sr)[0]
        # Previously normalised an undefined name (spectral_rolloff).
        x_rolloff = normalize(x_rolloff)
        feats['spectral_rolloff'] = x_rolloff

    if 'spectral_bandwidth_2' in features:
        # p is the power to raise the deviation from the spectral centroid;
        # this call leaves it at librosa's default.
        x_bandwidth_2 = librosa.feature.spectral_bandwidth(y=x + 0.01, sr=sr)[0]
        feats['spectral_bandwidth_2'] = x_bandwidth_2

    if 'spectral_bandwidth_3' in features:
        x_bandwidth_3 = librosa.feature.spectral_bandwidth(y=x + 0.01, sr=sr, p=3)[0]
        feats['spectral_bandwidth_3'] = x_bandwidth_3

    if 'spectral_bandwidth_4' in features:
        x_bandwidth_4 = librosa.feature.spectral_bandwidth(y=x + 0.01, sr=sr, p=4)[0]
        feats['spectral_bandwidth_4'] = x_bandwidth_4

    if 'zero_crossings' in features:
        x_crossings = librosa.zero_crossings(x, pad=False)
        feats['zero_crossings'] = x_crossings

    # Mel-Frequency Cepstral Coefficients (MFCCs)
    if 'mfcc' in features:
        x_mfcc = librosa.feature.mfcc(y=x, sr=sr)
        feats['mfcc'] = x_mfcc

    if 'chromagram' in features:
        hop_length = 512
        x_chromagram = librosa.feature.chroma_stft(y=x, sr=sr, hop_length=hop_length)
        feats['chromagram'] = x_chromagram

    return feats

        
# Min-max normalisation, used on the spectral centroid / rolloff for visualisation
def normalize(x, axis=0):
    '''Rescales x along the given axis with scikit-learn's minmax_scale.

    Args:
        x: array-like data to rescale.
        axis: axis along which to scale; forwarded to minmax_scale.

    Returns:
        The rescaled data as returned by minmax_scale.
    '''
    # Imported explicitly: a bare `import sklearn` does not guarantee that
    # the sklearn.preprocessing submodule has been loaded.
    from sklearn.preprocessing import minmax_scale
    return minmax_scale(x, axis=axis)