In [1]:
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np
import configparser
from nltk import ngrams
import hashlib
import numpy.matlib
from sklearn import preprocessing
from math import cos, sqrt, pi, ceil


# $Step 1$Read parameters from the config file

In [2]:
g65_configFileLocation = './config'
Config = configparser.ConfigParser()
Config.read(g65_configFileLocation)
def ConfigSectionMap(section):
    dict1 = {}
    options = Config.options(section)
    for option in options:
        try:
            dict1[option] = Config.get(section, option)
            if dict1[option] == -1:
                DebugPrint("skip: %s" % option)
        except:
            print("exception on %s!" % option)
            dict1[option] = None
    return dict1

In [3]:
# set pathes
msd_subset_path = ConfigSectionMap("MainSection")['dataset_location']
msd_code_path = ConfigSectionMap("MainSection")['code_location']
msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
assert os.path.isdir(msd_subset_path),'wrong path'
assert os.path.isdir(msd_code_path),'wrong path'
sys.path.append( os.path.join(msd_code_path,'PythonSrc') )

# g65_selctedFeatures stores all the features
g65_selectedFeatures =ConfigSectionMap("MainSection")['features'].split(",")
num_feature = len(g65_selectedFeatures)

# g65_n is number of songs to process
g65_n = int(ConfigSectionMap("MainSection")['number_of_songs_to_process'])

# g65_r is number of rows in each band
g65_r = int(ConfigSectionMap("MainSection")['number_of_rows_each_band'])

# g65_b is number of bands
g65_b = int(ConfigSectionMap("MainSection")['number_of_bands'])

# g65_epsilon is the tolerant difference
g65_epsilon = float(ConfigSectionMap("MainSection")['tolerance_diff'])
epsilon = cos(g65_epsilon*pi/180)

#signiture is the raw value of each song
signiture = np.zeros([num_feature,g65_n])

#the hashing algorithm used for hashing bands pieces
g65_hashalgorithm = ConfigSectionMap("MainSection")['hashalgorithm']
print (epsilon, g65_n, num_feature)

0.9996573249755573 100000 9


In [4]:
import hdf5_getters as GETTERS

In [5]:
# extract all query features into the signiture matrix
s_counter = 0
songsIds = [0]*g65_n
string_f = 0
for root, dirs, files in os.walk(msd_subset_data_path):
    files = glob.glob(os.path.join(root,'*'+'.h5'))
    for f in files :
        if s_counter == g65_n:
            break
        else:
            h5 = GETTERS.open_h5_file_read(f)
            f_counter = 0
            songsIds[s_counter] = GETTERS.get_song_id(h5)
            for features in g65_selectedFeatures:
                temp = str(getattr(GETTERS, 'get_'+features)(h5))
                signiture[f_counter,s_counter] = temp
                f_counter += 1
            s_counter += 1
            h5.close()


In [6]:
#print signiture.shape, g65_n
#signiture = preprocessing.scale(signiture, axis=1) 
#along each row i.e standardizing the features values of all songs
#ss = signiture
#S2 = signiture*signiture
#sNorm = np.sqrt(S2.sum(axis=0)/len(S2))
#sNorm = np.matlib.repmat(sNorm,num_feature,1)
#signiture = (ss-np.mean(ss,axis=0))/sNorm
#print (signiture.shape)



######old code:
#
signiture = preprocessing.scale(signiture, axis=1) ll songs
ss = signiture
S2 = signiture*signiture
sNorm = np.sqrt(S2.sum(axis=0))
sNorm = np.matlib.repmat(sNorm,num_feature,1)
signiture = ss/sNorm
#print signiture.shape

# $Step2$Construct a new signiture matrix M for LSH with cosin distance

In [7]:
vec = np.random.randn(g65_b*g65_r,num_feature)
vec2 = vec*vec
vecNorm = np.sqrt(vec2.sum(axis=1))
vecNorm = np.matlib.repmat(vecNorm,num_feature,1)
print (vecNorm.shape)
hashV = vec/vecNorm.T
print (hashV.shape)
M = np.sign(hashV.dot(signiture))
print (M.shape)

(9, 189)
(189, 9)
(189, 100000)


# $Step3$LSH

In [8]:
epsilon = cos(1.7*pi/180)
print (epsilon)

0.999559860119384


In [9]:
candidates = 0
duplicate_songs = set()
num_hash = 1000
#amplify = 10000
for b in range(g65_b):
    
    # construct the hashing vector
    v = (1+np.arange(g65_r))*(1+np.arange(g65_r))

    start = b*g65_r
    end = (b+1)*g65_r
    band = M[start:end,:]
    
    # hash the signiture matrix into r*2 buckets
    score = np.dot(v,band)
    vote_max = score.max().astype(int)
    vote_min = score.min().astype(int)

    for key in range(vote_min,vote_max):
        
        # find songs hashed to same key
        temp = np.where(score==key)
        if (len(temp[0])>1 and len(temp[0])<10000):
            index = temp[0]
            candidates += 1
            cosine = np.dot(signiture[:,index].T,signiture[:,index])-np.eye(len(index))
            temp_dup = np.where(cosine>epsilon)
            for i in range(len(temp_dup[0])):
                duplicate_songs.add((index[temp_dup[0][i]],index[temp_dup[1][i]]))
            


5643 25.0


# Results:

In [15]:
print("Number of Candidates: {}\r\nNumberof Duplicates: {}".format(candidates,int(len(duplicate_songs)/2)))

Number of Candidates: 5643
Numberof Duplicates: 25


In [10]:
#print (duplicate_songs)
#print (np.inner(signiture[:,3854],signiture[:,7436]))

In [11]:
def angles(n,m):
    np.inner(signiture[:,n],signiture[:,m])/sqrt(np.sum(signiture[:,n]*signiture[:,n])*np.sum(signiture[:,m]*signiture[:,m]))
    angle = 180*acos(cos)/pi
    return angle