In [89]:
import numpy as np
import pandas as pd
import os
import glob
import json

from progressbar import ProgressBar
import cPickle

In [2]:
# Root directory of the extracted features. The loading cell scans every
# sub-directory of this path for *.json files, and the per-feature-space
# .h5 files are written into this directory as well.
FEATURES_PATH = "E:/temp/eu_sounds_features/"

## Create feature spaces

### Load extracted features from JSON files

In [24]:
# Names of the feature spaces that every JSON feature file has to provide.
feature_space_names = [u'rp', u'mfcc', u'chroma', u'rmse', u'tempo', u'tonnetz',
                       u'spectral_centroid', u'zero_crossing_rate', u'ssd']

# features[fs_name][k] holds the feature vector of the item named indexes[k],
# so every per-feature-space list must stay exactly as long as `indexes`.
features = dict((fs_name, []) for fs_name in feature_space_names)
indexes = []

idx = 0

for collection_path in glob.glob(os.path.join(FEATURES_PATH, "*")):

    # Only sub-directories are treated as collections; plain files are skipped.
    if not os.path.isdir(collection_path):
        continue

    collection_name = os.path.basename(collection_path)

    for feature_file_path in glob.glob(os.path.join(collection_path, "*.json")):

        with open(feature_file_path, 'r') as input_file:
            data = json.load(input_file)

        # Validate BEFORE appending anything: a file that lacks a feature space
        # would otherwise shift all later rows of that space against `indexes`
        # without any error being raised.
        missing = [fs_name for fs_name in feature_space_names if fs_name not in data]
        if missing:
            raise ValueError("feature file %s lacks feature space(s): %s"
                             % (feature_file_path, ", ".join(missing)))

        # Only the expected feature spaces are collected; additional keys in
        # the file are ignored.
        for fs_name in feature_space_names:
            features[fs_name].append(data[fs_name])

        # Item name: "<collection directory>/<file name without .mp3.json>"
        current_index = "%s/%s" % (collection_name, os.path.basename(feature_file_path).replace(".mp3.json", ""))

        indexes.append(current_index)

        idx += 1

        # progress report every 1000 files
        if idx % 1000 == 0:
            print(idx)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000


### Store feature-spaces to disk

In [26]:
# Write each feature space to its own HDF5 file "<fs_name>.h5" below
# FEATURES_PATH, stored under the key "features".
for fs_name in features:
    h5_path = "%s/%s.h5" % (FEATURES_PATH, fs_name)
    pd.DataFrame(features[fs_name]).to_hdf(h5_path, "features")
    

**Convert to NumPy arrays**

In [39]:
# Display the names of the loaded feature spaces (the keys of `features`).
features.keys()

[u'rp',
 u'rmse',
 u'tempo',
 u'mfcc',
 u'chroma',
 u'tonnetz',
 u'spectral_centroid',
 u'zero_crossing_rate',
 u'ssd']

In [27]:
# Replace every per-feature-space list by a NumPy array (same keys, in place).
for fs_name in list(features):
    features[fs_name] = np.asarray(features[fs_name])

# Give "tempo" an explicit second axis of length 1 so that it is
# two-dimensional: shape (number of entries, 1).
n_tempo_rows = features["tempo"].shape[0]
features["tempo"] = features["tempo"].reshape((n_tempo_rows, 1))

In [28]:
def canberra_distance(data, query):
    """Canberra distance between `query` and every row of `data`.

    Parameters
    ----------
    data : two-dimensional array; the sum runs over axis 1, so the result
        has one value per row.
    query : array that broadcasts against the rows of `data`.

    Returns
    -------
    One-dimensional array holding, for each row of `data`,
    sum(|data - query| / (|data| + |query|)).
    Terms where both values are zero (0/0) contribute 0.
    """
    # 0/0 evaluates to NaN, which nansum then treats as zero. That is the
    # intended result, so the "invalid value" RuntimeWarning NumPy would emit
    # for those terms is silenced locally instead of flooding the output.
    with np.errstate(invalid='ignore'):
        return np.nansum(np.abs(data - query) / (np.abs(data) + np.abs(query)), axis=1)

In [51]:
# Default (feature space, weight) pairs, listed in the order in which the
# weighted terms are added up.
_DEFAULT_FEATURE_WEIGHTS = (
    ("ssd",                0.9),
    ("rp",                 1.99),
    ("mfcc",               2.5),
    ("chroma",             1.3),
    ("tempo",              0.8),
    ("rmse",               1.0),
    ("spectral_centroid",  0.9),
    ("tonnetz",            1.3),
    ("zero_crossing_rate", 0.2),
)


def calc_similar_items(features, query_idx, weights=None):
    """Combine per-feature-space Canberra distances from one item to all items.

    For every feature space the Canberra distance between row `query_idx` and
    all rows is computed, scaled by its maximum, multiplied by the weight of
    that feature space and added to the total.

    Parameters
    ----------
    features : dict mapping feature space name to a two-dimensional array.
    query_idx : row index of the query item.
    weights : optional sequence of (feature space name, weight) pairs or a
        dict of the same. Defaults to the built-in weighting. With a dict the
        summation order follows the dict's iteration order.

    Returns
    -------
    sims : array with the weighted sum of the scaled distances for every row.
        Despite the name these are distances: smaller means more similar.
    nn : row indices that sort `sims` in ascending order.
    """
    if weights is None:
        weights = _DEFAULT_FEATURE_WEIGHTS
    elif hasattr(weights, "items"):
        weights = list(weights.items())

    sims = 0.0

    for fs_name, weight in weights:
        fs_data = features[fs_name]

        dists = canberra_distance(fs_data, fs_data[query_idx])

        # Scale to [0, 1]. If every distance is zero (for example all items
        # share the same value) the maximum is 0 and dividing would turn the
        # whole result into NaN, so the zeros are kept as they are.
        max_dist = dists.max()
        if max_dist > 0:
            dists = dists / max_dist

        sims = sims + dists * weight

    nn = np.argsort(sims)

    return sims, nn

In [87]:
# For every item compute the combined distance to all items. Only the
# distance vector (element 0 of the returned pair) is kept.
pbar = ProgressBar()
n_items = features["ssd"].shape[0]

distances = [calc_similar_items(features, idx)[0] for idx in pbar(range(n_items))]

100% (39844 of 39844) |#################################################################################################| Elapsed Time: 18:26:46 Time: 18:26:46


In [90]:
# save as pickle as a precaution
# HIGHEST_PROTOCOL selects the binary pickle format. The default (protocol 0)
# is an ASCII format that is much larger and slower for numeric arrays.
# The file is placed in FEATURES_PATH instead of repeating the literal path.
with open(os.path.join(FEATURES_PATH, "distances.pickle"), "wb") as output_file:
    cPickle.dump(distances, output_file, cPickle.HIGHEST_PROTOCOL)

In [91]:
# Remove the name `features`; the cells shown after this one only use
# `indexes`, `distances` and DST_DIR.
del features

In [107]:
# Output directory: the index list and the per-item distance files
# (*.csv.gz) are written below this path.
DST_DIR = "G:/_eus_distances"

In [111]:
# The per-item writer further down creates sub-directories on demand, but this
# file goes directly into DST_DIR, so make sure that directory exists first.
if not os.path.isdir(DST_DIR):
    os.makedirs(DST_DIR)

# Write every entry of `indexes` on its own line, in list order.
with open("%s/indexes_for_distance_resultfiles.csv" % DST_DIR, 'w') as out_file:
    for index_name in indexes:
        out_file.write("%s\n" % index_name)

In [116]:
# distances[i] is written to the file named after indexes[i], so both lists
# have to describe the same items.
if len(distances) != len(indexes):
    raise ValueError("got %d distance vectors for %d index entries"
                     % (len(distances), len(indexes)))

pbar = ProgressBar()

for i in pbar(range(len(indexes))):

    # Each vector must hold one distance per indexed item.
    if len(distances[i]) != len(indexes):
        raise ValueError("distance vector %d has %d entries, expected %d"
                         % (i, len(distances[i]), len(indexes)))

    # indexes[i] has the form "<collection>/<item>", so every collection
    # gets its own sub-directory below DST_DIR.
    store_path = "%s/%s.csv.gz" % (DST_DIR, indexes[i])
    store_dir = os.path.dirname(store_path)

    if not os.path.exists(store_dir):
        os.makedirs(store_dir)

    # One distance value per line, without header row and without index
    # column; line k of the file is the distance to item indexes[k].
    pd.DataFrame(distances[i]).to_csv(store_path,
                                      header=False, index=False, compression='gzip')
    

100% (39844 of 39844) |###################################################################################################| Elapsed Time: 5:28:02 Time: 5:28:02
