## Statistical Significance

In [1]:
## Import
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import collections, numpy
from scipy import spatial
import numpy as np
import librosa
import math
import re
import os, sys
import pandas as pd
import IPython.display as ipd
import pickle
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial.distance import cdist
from sklearn.manifold import TSNE
import copy

In [2]:
# Load metadata and features.
tracks = pd.read_csv('fma_metadata/tracks.csv')
r_tracks = pd.read_csv('fma_metadata/raw_tracks.csv')
genres = pd.read_csv('fma_metadata/genres.csv')
features = pd.read_csv('fma_metadata/features.csv')
echonest = pd.read_csv('fma_metadata/echonest.csv')
r_artists = pd.read_csv('fma_metadata/raw_artists.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# overall subset
tracks_sm = tracks.loc[(tracks['set.1'] == 'small') & (tracks['set'] == 'training')]

#### Eval Baseline Model

In [4]:
def get_features(opt):
    """Select one of three feature subsets for the tracks listed in ``tracks_sm``.

    Reads the module-level ``features`` and ``tracks_sm`` DataFrames.

    Parameters
    ----------
    opt : int
        1 -> the seven ``spectral_centroid*`` columns plus positional
             columns 393:400;
        2 -> positional columns 400:470 ("spectral bunch");
        3 -> positional columns 253:393 (MFCC).

    Returns
    -------
    (overall_fs, fs)
        ``fs`` is the selected DataFrame (first column ``'feature'``) restricted
        to rows whose ``'feature'`` value appears in ``tracks_sm['Unnamed: 0']``;
        ``overall_fs`` is ``fs.values`` without that first column.
        For any other ``opt`` the integer ``0`` is returned instead of a tuple
        (kept for backward compatibility; unpacking that result will fail).
    """
    # Presumably the track id column -- it is what gets matched against
    # tracks_sm['Unnamed: 0'] below. TODO confirm against the CSV layout.
    f1 = features['feature']

    if opt == 1:
        f2 = features[[
            'spectral_centroid',
            'spectral_centroid.1',
            'spectral_centroid.2',
            'spectral_centroid.3',
            'spectral_centroid.4',
            'spectral_centroid.5',
            'spectral_centroid.6',
        ]]
        # A previous `features.iloc[:, 512:]` selection was overwritten
        # immediately and never used, so only the live slice is kept.
        f3 = features.iloc[:, 393:400]
        fs = pd.concat([f1, f2, f3], axis=1)

    elif opt == 2:
        f2 = features.iloc[:, 400:470] # spectral bunch
        fs = pd.concat([f1, f2], axis=1)

    elif opt == 3:
        f2 = features.iloc[:, 253:393] # mfcc
        fs = pd.concat([f1, f2], axis=1)

    else:
        return 0

    # Keep only the rows belonging to the current track subset.
    fs = fs.loc[fs['feature'].isin(tracks_sm['Unnamed: 0'])]

    # Drop the leading 'feature' column to obtain the raw feature matrix.
    overall_fs = fs.values[:, 1:]
    return overall_fs, fs

In [78]:
overall_fs, fs = get_features(2)
overall_fs.shape

(5634, 70)

In [6]:
def get_bog_model(data, num_clusters, random_state=None):
    """Fit a K-means model on ``data`` and return it.

    Parameters
    ----------
    data : array-like
        Samples to cluster, one per row (passed straight to ``KMeans.fit``).
    num_clusters : int
        Number of clusters to fit.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        (unseeded) behaviour; pass an integer to make the fit reproducible.

    Returns
    -------
    The fitted ``KMeans`` instance.
    """
    model = KMeans(n_clusters=num_clusters, random_state=random_state)
    model.fit(data)
    return model

In [79]:
model = get_bog_model(overall_fs, 70) # fit the clustering model with 70 clusters (the num_clusters value passed here)

In [8]:
# load artists Full Set
def load_spot_rel_artists(name):
    """Return the object unpickled from the file at path ``name``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files you created yourself.
    """
    with open(name, 'rb') as pickled_file:
        return pickle.load(pickled_file)

In [59]:
# dont run this here
res = load_spot_rel_artists("saved_data/results_eval.pickle")

In [80]:
artist_rel = load_spot_rel_artists("saved_data/artist_rel_small_training.pickle")

# Creating Condensed Artist Map, and Master 2D Table
artist_fs = {} # artists in Full Set

for key in artist_rel:
    artist_fs[key] = set()

# Keep only relations whose other end is itself a key of artist_rel,
# and record each relation in both directions.
for key in artist_rel:
    for elem in artist_rel[key]:
        if artist_rel.get(elem, None) is not None:
            artist_fs[key].add(elem)
            artist_fs[elem].add(key)

# assert len(artist_all) == len(artist_rel)
# deleting artists with no related artist in set
# Iterate over a snapshot of the keys: deleting entries while iterating a
# live keys() view raises RuntimeError on Python 3. On Python 2 keys() was
# already a list, so behaviour (and key order) is unchanged there.
keys = list(artist_fs.keys())
for key in keys:
    if not artist_fs[key]:
        del artist_fs[key]
    else:
        artist_fs[key] = list(artist_fs[key])

# Artist Index Map: artist key -> row/column position used by the matrices.
artist_index = {}
for i, a in enumerate(artist_fs.keys()):
    artist_index[a] = i


In [11]:
# Input is num_clips * feature_dimension
# Output is 1 * num_clusters 
def get_histogram_song(song_features, model, num_clusters):
    """Count how many rows of ``song_features`` fall into each cluster.

    ``model.predict`` assigns a cluster label to every row; the result is
    the per-label count vector, padded to at least ``num_clusters`` entries.
    """
    return numpy.bincount(model.predict(song_features), minlength=num_clusters)

In [12]:
# Input is num_songs * num_clusters 
# Output is 1 * num_clusters 
def get_artist_feature(all_songs_y):
    """Sum per-song histograms and scale the result to unit L2 norm.

    Parameters
    ----------
    all_songs_y : array-like
        One histogram per row (num_songs x num_clusters).

    Returns
    -------
    The column-wise sum divided by its Euclidean norm. If the sum is all
    zeros the norm is 0, so a float zero vector is returned instead of
    dividing by zero (which previously produced NaN entries).
    """
    artist_feature = numpy.sum(all_songs_y, axis=0)
    norm = numpy.linalg.norm(artist_feature)
    if norm == 0:
        return numpy.zeros(numpy.shape(artist_feature), dtype=float)
    return artist_feature / norm

In [13]:
# Output is num_artists * num_clusters 
def construct_artist_similarity_matrix(model, num_clusters, num_artists, artist_index):
    """Build the (num_artists x num_clusters) per-artist feature matrix.

    Reads the module-level ``fs`` and ``tracks_sm`` frames. For every artist
    key in ``artist_index`` the songs whose ``tracks_sm['artist.12']`` equals
    that key are looked up in ``fs``, each song row is turned into a cluster
    histogram, and the normalised sum is stored in the row given by
    ``artist_index[artist]``. Artists with no matching song keep a zero row.
    """
    result = numpy.zeros((num_artists, num_clusters))

    for artist_name in artist_index:
        row = artist_index[artist_name]

        # Track ids of this artist, then the matching feature rows
        # (minus the leading 'feature' column).
        track_ids = tracks_sm.loc[tracks_sm['artist.12'] == artist_name]['Unnamed: 0']
        song_rows = fs.loc[fs['feature'].isin(track_ids)].values[:, 1:]

        # Each song is a single feature vector; predict() needs it as 1 x F.
        histograms = [
            get_histogram_song(vec.reshape(1, len(vec)), model, num_clusters)
            for vec in song_rows
        ]

        if histograms:
            result[row, :] = get_artist_feature(numpy.vstack(histograms))

    return result


In [81]:
artist_clusters_matrix = construct_artist_similarity_matrix(model, 70, len(artist_index), artist_index)

In [15]:
def get_similarity(i, j, artist_similarity_matrix):
    """Return the Euclidean distance between rows ``i`` and ``j``.

    Despite the name this is a distance (0 for identical rows), not a
    similarity score.
    """
    row_i = artist_similarity_matrix[i, :]
    row_j = artist_similarity_matrix[j, :]
    return spatial.distance.euclidean(row_i, row_j)

In [16]:
def scale(mat):
    """Row-wise rescale: each row becomes ``1 - row / max(row)``.

    Rows whose maximum is not positive are left as zeros. A new array is
    returned; ``mat`` is not modified.
    """
    scaled = np.zeros(mat.shape)
    for idx, row in enumerate(mat):
        peak = np.amax(row)
        if peak > 0:
            scaled[idx, :] = 1 - row / peak
    return scaled

In [82]:
artist_sim_matrix = np.zeros((len(artist_index), len(artist_index)))

# The distance is symmetric, so compute each unordered pair once
# (upper triangle incl. diagonal) and mirror it, instead of evaluating
# every ordered pair and writing both cells twice.
artist_positions = sorted(artist_index.values())
for offset, a_i_1 in enumerate(artist_positions):
    for a_i_2 in artist_positions[offset:]:
        sim = get_similarity(a_i_1, a_i_2, artist_clusters_matrix)
        artist_sim_matrix[a_i_1, a_i_2] = sim
        artist_sim_matrix[a_i_2, a_i_1] = sim


In [83]:
# scaling and getting actual similarity rather than distance
artist_sim_matrix = scale(artist_sim_matrix)

In [84]:
# get the ground truth
ground = np.load('saved_data/ground.npy')

In [85]:
# Thresholding
# model_out = artist_sim_matrix > 0.5
ground = ground > 0.5

In [86]:
for i in range(ground.shape[0]):
    ground[i, i] = 1.0

In [65]:
def norm_model_out(artist_sim_matrix):
    """Binarise a similarity matrix with a per-row mean threshold.

    Parameters
    ----------
    artist_sim_matrix : numpy.ndarray
        Square 2-D matrix of similarity scores.

    Returns
    -------
    numpy.ndarray of floats with the same shape: entry (i, j) is 1.0 when
    ``artist_sim_matrix[i, j]`` is strictly greater than the mean of row
    ``i`` and 0.0 otherwise; the diagonal is always 1.0.
    """
    sim_means = np.mean(artist_sim_matrix, axis=1)

    # Means Method: compare every row against its own mean in one shot.
    # (An alternative "top K" selection per row was sketched previously
    # but is not used.)
    model_out = (artist_sim_matrix > sim_means[:, np.newaxis]).astype(float)

    # Set the diagonal once. It used to be rewritten inside the row loop,
    # re-using the loop variable and doing O(n^2) redundant assignments.
    np.fill_diagonal(model_out, 1.0)

    return model_out

In [23]:
def calc_metrics(artist_sim_matrix, ground):
    """Score a similarity matrix against the ground-truth matrix.

    Returns the tuple
    ``(ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr)`` where the
    confusion counts come from the binarised matrix produced by
    ``norm_model_out`` and ``corr`` is ``np.corrcoef`` of the flattened
    ground truth against the flattened raw (un-thresholded) scores.
    """
    # Correlation is taken on the raw scores, before thresholding.
    corr = np.corrcoef(ground.flatten(), artist_sim_matrix.flatten())

    # Threshold the scores, then build the four confusion-matrix masks.
    predicted = norm_model_out(artist_sim_matrix)
    pred_pos = (predicted == 1)
    pred_neg = (predicted == 0)
    truth_pos = (ground == 1)
    truth_neg = (ground == 0)

    ntp = numpy.sum(pred_pos & truth_pos)
    nfn = numpy.sum(pred_neg & truth_pos)
    ntn = numpy.sum(pred_neg & truth_neg)
    nfp = numpy.sum(pred_pos & truth_neg)

    # True positive rate (recall) and true negative rate.
    tpr = ntp*1.0/(ntp+nfn)
    tnr = ntn*1.0/(ntn+nfp)

    # Accuracy
    acc = (ntp+ntn)*1.0/(ntp+nfn+ntn+nfp)

    # Precision, and F1 as the harmonic mean of precision and recall.
    precision = ntp*1.0/(ntp+nfp)
    p = precision
    r = tpr
    f1 = 2 * (p * r)/(p + r)

    return ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr

In [87]:
ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr = calc_metrics(artist_sim_matrix, ground)

In [67]:
len(model_out[model_out>0.5])

NameError: name 'model_out' is not defined

In [68]:
model_out[model_out>0]

NameError: name 'model_out' is not defined

In [88]:
ntp, nfn, ntn, nfp

(636, 650, 117819, 36131)

#### Statistical Tests

In [35]:
asm2 = np.copy(artist_sim_matrix) # Spectral stuff

In [45]:
asm3 = np.copy(artist_sim_matrix) # MFCC

In [24]:
asm1 = np.copy(artist_sim_matrix) # Rest

In [25]:
# Call-style print works on both Python 2 and Python 3.
print(ground.shape)
print(type(artist_sim_matrix))

(394, 394)
<type 'numpy.ndarray'>


In [47]:
orig_ground = np.copy(ground)

In [48]:
print(orig_ground)

[[ True False False ..., False False False]
 [False  True False ..., False False False]
 [False False  True ..., False False False]
 ..., 
 [False False False ...,  True False False]
 [False False False ..., False  True False]
 [False False False ..., False False  True]]


In [131]:
# randomizing ground
numpy.random.shuffle(ground)

In [138]:
assert numpy.array_equal(ground, orig_ground)

AssertionError: 

In [49]:
# Collect the metric tuples of the three similarity matrices (asm1, asm2,
# asm3) against `ground`, re-shuffling `ground` after every round.
#
# Things to keep in mind when reading or re-running this cell:
# - `ground` is shuffled IN PLACE, so after this cell it no longer holds the
#   original ground truth (a copy is kept in `orig_ground`).
# - numpy.random.shuffle on a 2-D array only permutes along the first axis,
#   i.e. whole rows are reordered; columns stay where they are.
# - The first round scores `ground` in whatever state it is in on entry;
#   the shuffle happens at the END of each round.
# - NOTE(review): no random seed is set in the visible cells, so the
#   collected distributions will differ from run to run -- confirm.
# - The module-level names ntp, nfn, ..., corr are overwritten and end up
#   holding the values of the last asm3 evaluation.
m1_results = []
m2_results = []
m3_results = []

for i in range(25000):
    
    ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr = calc_metrics(asm1, ground)
    m1_results.append([ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr])

    ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr = calc_metrics(asm2, ground)
    m2_results.append([ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr])

    ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr = calc_metrics(asm3, ground)
    m3_results.append([ntp, nfn, ntn, nfp, tpr, tnr, acc, f1, precision, corr])
    
    numpy.random.shuffle(ground)
    
    

In [110]:
len(m3_results)

25000

In [92]:
# [elem[9][0][1] for elem in m1_results] -- for correlation
m1_metric = [elem[4] for elem in m1_results]
m2_metric = [elem[4] for elem in m2_results]
m3_metric = [elem[4] for elem in m3_results]

In [93]:
# diff = [a[1] - a[0] for a in zip(m2_metric, m3_metric)]

# ans = [1 if a[1]==max(a) else 0 for a in zip(m1_metric, m2_metric, m3_metric)] 
ans = [[a[1]-a[0], a[1]-a[2]] for a in zip(m1_metric, m2_metric, m3_metric)]

In [94]:
ans[0]

[0.00077760497667189732, 0.0038880248833592645]

In [101]:
# a = zip(m1_metric, m2_metric, m3_metric)
ans2 = [1 if (a[0]>0.006299 and a[1]>0.00700) else 0 for a in ans]

In [102]:
# Fraction of rounds flagged in ans2; divide by the actual number of rounds
# rather than a hard-coded copy of the loop count.
sum(ans2)/float(len(ans2))

0.03008

In [57]:
np.max(m3_metric)

0.29948109710896964

In [87]:
# NOTE: `diff` is only assigned in a commented-out line of an earlier cell
# (`diff = [a[1] - a[0] for a in zip(m2_metric, m3_metric)]`); it has to be
# defined before this cell can run on a fresh kernel.
# Call-style print works on both Python 2 and Python 3.
print(np.mean(diff))
print(np.std(diff))
print(np.max(diff))

0.000171732084079
0.00309199143325
0.0113048405761


In [325]:
-2.11013765423e-05 + (0.00262842985705 * 3)

0.0078641881946077

In [88]:
final = 0.2819 - 0.2873
# final = 0.3008 - 0.2947
# final = 406 - 398
# final = 
# Call-style print works on both Python 2 and Python 3.
print(final)

-0.0054


In [96]:
0.3157 - 0.3087

0.007000000000000006

In [114]:
sum(ans) / (len(ans) * 1.0)

0.33512

In [89]:
sum([d>final for d in diff])/(len(diff) * 1.0)

0.96309999999999996

In [69]:
sum([d>final for d in diff])

152