In [1]:
import sys, os, re, gzip, json, pickle, shutil, random

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Uncomment and run to install

#!{sys.executable} -m pip install librosa
#!{sys.executable} -m pip install matplotlib
#!{sys.executable} -m pip install numpy

In [3]:
import librosa
import librosa.display

import soundfile as sf

In [4]:
# Base data directory; every other path below is derived from it
data_path = '../data'

# MySpace mp3s: numbered folders whose songs are sorted into genre folders
myspace_mp3s_path = '/'.join((data_path, 'myspace_mp3s'))

# Gzipped JSON file with the song metadata
metadata_path = '/'.join((myspace_mp3s_path, 'metadata.json.gz'))

# Pickled mapping of raw genres to general genres
genre_map_path = '/'.join((myspace_mp3s_path, 'genre_map.pkl'))

# Extracted audio features are written under this directory
features_path = '/'.join((data_path, 'audio_features'))

#### General helper methods

In [5]:
def load_metadata(metadata_path):
    '''Return the object parsed from the gzip-compressed, UTF-8 encoded JSON file at metadata_path.'''
    with gzip.open(metadata_path, mode='rt', encoding='utf-8') as handle:
        return json.load(handle)


def load_genre_map(genre_map_path):
    '''Return the object unpickled from the file at genre_map_path.

    Note: unpickling can execute arbitrary code, so only point this at a
    pickle file you trust.
    '''
    with open(genre_map_path, mode='rb') as handle:
        return pickle.load(handle)

#### Load song metadata and genre map

In [6]:
# Read both lookup structures from disk once; later cells pass them to the extraction function
metadata, genre_map = (
    load_metadata(metadata_path),
    load_genre_map(genre_map_path),
)

#### Extract and store features for the MySpace mp3s, which have been re-sorted into genre folders by majority mapped genre.

In [20]:
def extract_features(mp3_path, zip_folders, features_path, metadata, genre_map, feat_type,
                     max_samples=8000000):
    '''Use librosa to extract one type of feature from the mp3s and store it in the features folder.

    Walks ``mp3_path/<zip_folder>/<genre>/*.mp3`` (skipping the ``unknown``
    genre folder) and writes one array per song to
    ``features_path/<genre>/msp_<zip_folder>_<name>.npy``. Songs whose output
    file already exists are skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    mp3_path : str
        Directory containing the numbered zip folders.
    zip_folders : iterable of str
        Names of the numbered folders under ``mp3_path`` to process.
    features_path : str
        Output directory; genre sub-folders are created as needed.
    metadata : object
        Currently unused by this function.
    genre_map : dict
        Its values seed the per-genre counters; not otherwise used here.
    feat_type : str
        ``'mfcc'`` or ``'melspec'``.
    max_samples : int, optional
        Waveforms longer than this are clipped before feature extraction.
        The default of 8M samples is ~6 min at librosa's default 22050 Hz.

    Returns
    -------
    (not_in_metadata, bad_mp3s) : tuple of lists
        ``bad_mp3s`` holds a ``(zip_folder, filename)`` tuple for every file
        that could not be loaded or processed. ``not_in_metadata`` is never
        populated by this version and is always empty.
        NOTE(review): the saved cell outputs show non-zero "Not found" counts,
        so a metadata lookup presumably existed in another version - confirm.

    Raises
    ------
    ValueError
        If ``feat_type`` is not one of the supported feature types.
    '''
    # Fail fast: an unknown type used to surface only as a NameError at np.save
    supported = ('mfcc', 'melspec')
    if feat_type not in supported:
        raise ValueError('Unsupported feat_type %r; expected one of: %s'
                         % (feat_type, ', '.join(supported)))

    #various metrics to report
    not_in_metadata = []
    bad_mp3s = []
    over_6 = []

    # Per-genre tally of newly extracted songs (kept for debugging; not reported or returned)
    genre_cts = {g: 0 for g in set(genre_map.values())}
    ttl = 0

    for zip_folder in zip_folders:
        zip_path = '%s/%s' % (mp3_path, zip_folder)
        print('Zip folder', zip_folder)

        for genre in os.listdir(zip_path):
            genre_path = '%s/%s' % (zip_path, genre)
            # Skip unlabeled songs and any stray files sitting next to the genre folders
            if genre == 'unknown' or not os.path.isdir(genre_path):
                continue

            gp = '%s/%s' % (features_path, genre)

            for fn in os.listdir(genre_path):
                if not fn.endswith('.mp3'):
                    continue

                fp = '%s/%s' % (genre_path, fn)

                # makedirs also creates features_path itself if it is missing
                os.makedirs(gp, exist_ok=True)

                ttl += 1
                if ttl % 100 == 0:
                    # over_6 may still be empty at this point, so guard the [-1] lookup
                    print('Total: %d, Not found: %d, Bad mp3s: %d, Over 6 min: %d, %s' % (
                        ttl, len(not_in_metadata), len(bad_mp3s), len(over_6),
                        over_6[-1] if over_6 else None))

                arr_path = '%s/msp_%s_%s.npy' % (gp, zip_folder, fn.replace('.mp3', ''))
                if os.path.exists(arr_path):
                    continue

                try:
                    #the waveform and the sample_rate (22050 Hz - samples per second)
                    x, sr = librosa.load(fp)

                    #clip overly long songs (default 8M samples, ~6min)
                    if x.shape[0] > max_samples:
                        x = x[:max_samples]
                        over_6.append((genre, fn))

                    #add more types if desired (and list them in `supported` above)
                    # The audio is passed as y=...: recent librosa releases
                    # no longer accept it as a positional argument.
                    if feat_type == 'mfcc':
                        x_feat = librosa.feature.mfcc(y=x, sr=sr)
                    else:  # 'melspec'
                        x_feat = librosa.feature.melspectrogram(
                            y=x, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
                except Exception as ex:
                    # Best effort: report the unreadable file and keep going
                    print(ex)
                    bad_mp3s.append((zip_folder, fn))
                    continue

                np.save(arr_path, x_feat)
                # .get tolerates genre folders that are absent from genre_map
                genre_cts[genre] = genre_cts.get(genre, 0) + 1

    return not_in_metadata, bad_mp3s

##### Extract Mel Frequency Cepstral Coefficients (mfcc)

In [22]:
# Extract MFCC features for the selected numbered folder(s)
feat_type = 'mfcc'

mfcc_features_path = '%s/mfcc' % features_path
# makedirs (rather than mkdir) also creates the parent features_path when it
# does not exist yet, and exist_ok keeps the cell safe to re-run.
os.makedirs(mfcc_features_path, exist_ok=True)

# There are around 4500 songs per numbered folder
#zip_folders = [zf for zf in os.listdir(myspace_mp3s_path) if zf.isdigit()]
zip_folders = ['78']

not_in_metadata, bad_mp3s = extract_features(myspace_mp3s_path, zip_folders, mfcc_features_path,
                                             metadata, genre_map, feat_type)
# (songs not in metadata, unreadable mp3s); "1852, 6" is presumably from an earlier run - TODO confirm
len(not_in_metadata), len(bad_mp3s) #1852, 6

Zip folder 78
Total: 100, Not found: 0, Bad mp3s: 0, Over 6 min: 5, ('alternative', 'std_05a2df5d3cdf28181c3fa12914aaf483.mp3')
Total: 200, Not found: 0, Bad mp3s: 0, Over 6 min: 13, ('alternative', 'std_1e21759c8c989d6f67d152079e5c6c1f.mp3')
Total: 300, Not found: 0, Bad mp3s: 0, Over 6 min: 18, ('alternative', 'std_3ee5860515934e7aaf2e267f6c5309d7.mp3')
Total: 400, Not found: 0, Bad mp3s: 0, Over 6 min: 23, ('alternative', 'std_5308e9b61391b93b45d9208723a7a935.mp3')
Total: 500, Not found: 0, Bad mp3s: 0, Over 6 min: 29, ('alternative', 'std_73eb1b3e722e5a242446d4cf93c81e8a.mp3')
Total: 600, Not found: 0, Bad mp3s: 0, Over 6 min: 32, ('alternative', 'std_8954bf8da355853de2138345871b057d.mp3')
Total: 700, Not found: 0, Bad mp3s: 0, Over 6 min: 37, ('alternative', 'std_ad1d9ae91f69f5d9bc08d8f0496e1382.mp3')
Total: 800, Not found: 0, Bad mp3s: 0, Over 6 min: 44, ('alternative', 'std_c1f94e2fff71e6a61da1c575113d5a56.mp3')
Total: 900, Not found: 0, Bad mp3s: 0, Over 6 min: 47, ('alternativ

Total: 7500, Not found: 0, Bad mp3s: 0, Over 6 min: 685, ('rock', 'std_e479982e31b99cfb97610ab4774e5bf3.mp3')
Total: 7600, Not found: 0, Bad mp3s: 0, Over 6 min: 697, ('rock', 'std_fd91e955df22ba9e2eb40e79eb68ddb5.mp3')
Total: 7700, Not found: 323, Bad mp3s: 0, Over 6 min: 702, ('world', 'std_27a46a6a135b7f4df20b335b0e373cea.mp3')
Total: 7800, Not found: 323, Bad mp3s: 0, Over 6 min: 708, ('world', 'std_908f758c14d1058ec6e1dfcefbbe3566.mp3')
Total: 7900, Not found: 323, Bad mp3s: 0, Over 6 min: 715, ('world', 'std_e6d83f9bfb01056e71dc77581726dc98.mp3')


(323, 0)

##### Extract Mel Spectrograms (melspec)

In [26]:
# Extract mel spectrogram features for the selected numbered folder(s)
feat_type = 'melspec'

melspec_features_path = '%s/melspec' % features_path

# makedirs (rather than mkdir) also creates the parent features_path when it
# does not exist yet, and exist_ok keeps the cell safe to re-run.
os.makedirs(melspec_features_path, exist_ok=True)

#zip_folders = [zf for zf in os.listdir(myspace_mp3s_path) if zf.isdigit()]
zip_folders = ['81']
# The returned report lists are not needed here, so they are discarded
_, _ = extract_features(myspace_mp3s_path, zip_folders, melspec_features_path,
                        metadata, genre_map, feat_type)

Zip folder 81
Total: 100, Not found: 0, Bad mp3s: 0, Over 6 min: 2, ('alternative', 'std_252cfdca9da60c689372d0d101272668.mp3')
Total: 200, Not found: 0, Bad mp3s: 0, Over 6 min: 6, ('alternative', 'std_58e7ce3e52219fd534b8f6d41d4c7b5b.mp3')
Total: 300, Not found: 0, Bad mp3s: 0, Over 6 min: 9, ('alternative', 'std_8b52778b9f4b9f07ab4ea610853e2380.mp3')
Total: 400, Not found: 0, Bad mp3s: 0, Over 6 min: 13, ('alternative', 'std_ba05e8685ab31957a957e16b9859db99.mp3')
Total: 500, Not found: 0, Bad mp3s: 0, Over 6 min: 16, ('alternative', 'std_ebbf131932ccb794654a1741530db0a0.mp3')
Total: 600, Not found: 0, Bad mp3s: 0, Over 6 min: 33, ('blues', 'std_32842abfe2e37538ee7984a4626433f4.mp3')
Total: 700, Not found: 0, Bad mp3s: 0, Over 6 min: 36, ('classical', 'std_8106a1266281dc4ae06b931d3d389b92.mp3')
Total: 800, Not found: 0, Bad mp3s: 0, Over 6 min: 60, ('dance', 'std_4bd158332d3141d6776a999ece3e0cec.mp3')
Total: 900, Not found: 0, Bad mp3s: 0, Over 6 min: 89, ('dance', 'std_9e3777e15de65