In [7]:
## Imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from ast import literal_eval

In [8]:
# Name of the dataset; it is spliced into the embeddings CSV file name below.
DATASET_NAME = "DI2KG"

In [25]:
class LSH:
    """Sign-of-projection locality sensitive hashing over L random matrices.

    Each of the ``L`` matrices holds ``k`` random rows of length
    ``embedding_size``. The hash of an embedding under one matrix is the
    k-bit integer whose bits record, row by row, whether the dot product of
    that row with the embedding is non-negative.
    """

    # Random limits, const (passed as low/high to np.random.randint)
    __max_rand = 1000
    __min_rand = -1000

    def __init__(self, k, L, embedding_size=150):
        """Draw and store the L random matrices.

        Args:
            k: number of hash functions (rows per matrix, bits per hash).
            L: number of attempts (independent random matrices).
            embedding_size: length of the embeddings that will be hashed.
        """
        # Number of hash functions
        self.k = k

        # Number of attempts
        self.L = L

        # Embedding length
        self.embedding_size = embedding_size

        # One random integer matrix per attempt, rows normalized to unit L1 norm.
        self.normalized_random_matrices = []
        for _ in range(self.L):
            random_matrix = np.random.randint(
                self.__min_rand, self.__max_rand, (self.k, self.embedding_size)
            )
            self.normalized_random_matrices.append(
                normalize(random_matrix, axis=1, norm='l1')
            )

    def locality_sensitive_hash(self, embedding, matrix_index):
        """Return the integer hash of ``embedding`` under one random matrix.

        Args:
            embedding: numeric vector accepted by ``np.dot`` against a matrix row.
            matrix_index: index into ``normalized_random_matrices``.

        Returns:
            An int built by shifting in one bit per matrix row, first row in
            the most significant position; the bit is 1 when the row's dot
            product with the embedding is >= 0.
        """
        out = 0
        for h in self.normalized_random_matrices[matrix_index]:
            out = (out << 1) | int(np.dot(h, embedding) >= 0)
        return out

    def divide_in_buckets(self, embeddings, titles, matrix_index, out):
        """Add every title to the bucket of its embedding's hash.

        Args:
            embeddings: iterable of embeddings; each one is either a string
                holding a Python list literal or an already numeric sequence.
            titles: iterable of titles, aligned by position with ``embeddings``.
            matrix_index: which random matrix to hash with.
            out: dict mapping hash -> list of titles; updated in place, so the
                same dict can be passed again to merge several sources.

        Returns:
            The same ``out`` dict.

        Raises:
            IndexError: if ``titles`` runs out before ``embeddings`` does.
        """
        # Walk both sequences by position. Indexing titles with an enumerate
        # counter would be a label lookup on a pandas Series and break whenever
        # its index is not 0..n-1 (e.g. after filtering rows).
        title_iter = iter(titles)
        for embedding in embeddings:
            try:
                title = next(title_iter)
            except StopIteration:
                raise IndexError("titles has fewer entries than embeddings")

            # Only string-encoded embeddings need parsing.
            if isinstance(embedding, str):
                embedding = literal_eval(embedding)

            bucket_key = self.locality_sensitive_hash(np.asarray(embedding), matrix_index)
            out.setdefault(bucket_key, []).append(title)

        return out


In [4]:
# Smoke test: build a small LSH instance and show its random matrices.
test = LSH(k=5, L=2, embedding_size=10)
print(test.normalized_random_matrices)

# Hash the vector 1..10 with the matrix at index 1.
print(test.locality_sensitive_hash(list(range(1, 11)), 1))

[array([[ 0.17893859, -0.07196443, -0.07085301,  0.00416782, -0.1036399 ,
         0.04640178,  0.09752709, -0.06751876,  0.17588219, -0.18310642],
       [ 0.01631117, -0.01463823, -0.38435801, -0.01045588,  0.29694688,
         0.05060644,  0.11417817,  0.0322041 ,  0.03973233, -0.0405688 ],
       [-0.03213844, -0.15162752, -0.10465595, -0.08776267,  0.10980635,
        -0.10486197,  0.12072518, -0.19056448,  0.0723115 ,  0.02554594],
       [-0.08582977,  0.23038516, -0.0530195 ,  0.14098906,  0.06680932,
        -0.13956253,  0.11222064, -0.06062767,  0.07941037, -0.03114598],
       [ 0.20755576,  0.05006827, -0.03573054, -0.13108785,  0.1734183 ,
        -0.1276741 , -0.13837051,  0.08101957, -0.02867547,  0.02639964]]), array([[ 0.04389722,  0.0248394 , -0.13254818,  0.14389722, -0.10321199,
         0.10492505, -0.20835118,  0.12205567,  0.10256959, -0.0137045 ],
       [-0.09917808,  0.13534247, -0.09260274, -0.04748858,  0.12365297,
        -0.11799087, -0.04420091,  0.15488

In [5]:
## almost equals
# divide_in_buckets takes (embeddings, titles, matrix_index, out): embeddings
# are passed in string form, positions stand in for titles, and a fresh dict
# is the table to fill. The printed value is the number of distinct buckets.
embeddings = [[1.2345,2,3,4,5,6,10.4,8,9,10],[1,2,3,4,5,6,7,8,9,10],[1,2,3,5,5,6,7,8,9,10]]
print(len(test.divide_in_buckets([str(e) for e in embeddings], range(len(embeddings)), 1, {})))

## not equals
embeddings = np.random.randint(-10000, 10000,(10000, 10))
print(len(test.divide_in_buckets([str(e) for e in embeddings.tolist()], range(len(embeddings)), 1, {})))


3
32


In [9]:
## TEST BLOCKING PERFORMANCE
#    For every row, take the two embeddings of the pair, compute their L
#    hashes and check that at least one of them is equal; if so, increment a
#    counter. The score is counter / number of rows checked.
def performance_test(filtered_dataset, k, L, embedding_size):
    """Fraction of rows whose two embeddings collide in at least one table.

    Args:
        filtered_dataset: DataFrame with 'left_table' and 'right_table'
            columns, each cell a string that literal_eval turns into the
            embedding of one side of the pair.
        k: number of hash functions, forwarded to LSH.
        L: number of random matrices, forwarded to LSH.
        embedding_size: embedding length, forwarded to LSH.

    Returns:
        match_found / number of rows, or 0.0 when the dataset has no rows.
    """
    total = len(filtered_dataset.index)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    lsh = LSH(k, L, embedding_size)
    match_found = 0

    # for each element in dataset
    for left_raw, right_raw in zip(filtered_dataset['left_table'],
                                   filtered_dataset['right_table']):
        x_embedding = np.array(literal_eval(left_raw))
        y_embedding = np.array(literal_eval(right_raw))

        # Compare table by table: a pair only shares a bucket when both sides
        # get the same hash under the SAME random matrix. Pooling the hashes of
        # all tables into sets would also count a hash from matrix i that
        # happens to equal a hash from a different matrix j.
        if any(lsh.locality_sensitive_hash(x_embedding, i)
               == lsh.locality_sensitive_hash(y_embedding, i)
               for i in range(L)):
            match_found += 1

    return match_found / total

In [10]:
## Open dataset
# Read the embeddings CSV whose file name is built from DATASET_NAME,
# then preview the first rows.
df = pd.read_csv('../../lsh-test-data/' + DATASET_NAME + '-embeddings.csv')
df.head()

Unnamed: 0,attributi_x,attributi_y,label,left_table,right_table,label.1
0,Samsung Galaxy EK-GC110 16.3 Megapixel Compact...,"SAMSUNG WB350F BROWN 16MP 21X 3"" SMART CAMERA ...",0,"[-1.0, 0.0, -0.0, -0.7615925073623657, 0.0, 0....","[-0.0, -0.0, -0.26511502265930176, 0.855300188...",0
1,Canon PowerShot SX510 HS Black Digital Camera ...,"NIKON COOLPIX L610 BLACK 16MP 14X 3""LCD 32408",0,"[-1.0, 0.0, -0.0, -0.7615923881530762, 8.90093...","[-0.0, -0.0, 0.24665074050426483, 0.9988988637...",0
2,Panasonic Lumix DMC-GH4 - Price comparison & r...,Canon Digital IXUS 800 IS - PowerShot and IXUS...,0,"[-0.9981581568717957, -0.0, -0.047106023877859...","[0.039003822952508926, -0.0, -0.81086432933807...",0
3,Nikon D600 / 24-85mm Kit - Price comparison & ...,Canon PowerShot G11 - PowerShot and IXUS digit...,0,"[-0.9989334344863892, -0.0, -0.070406578481197...","[0.039368510246276855, -0.0, -0.80935800075531...",0
4,Canon EOS Rebel T3i Black SLR Digital Camera K...,"FUJIFILM JX600 WHITE 14MP 5X 2.7""LCD CCD 60001...",0,"[-0.9999977350234985, 0.0, -0.0, -0.7615934610...","[-0.0, -0.0, 0.2419230341911316, 0.92247080802...",0


In [None]:
## Keep only the rows labeled 1 (drop the 0-labeled ones).
# Reset the index after filtering so that label-based and positional access
# agree again; code that walks the columns with a 0..n-1 counter would
# otherwise hit missing labels.
df = df[df.label == 1].reset_index(drop=True)

# Grid search over k in 1..30 and L in 1..10 with embeddings of length 150.
# Because of the >= comparison, ties keep the most recently tried (k, L).
precision_max = 0
k_max = 0
L_max = 0
for k in range(1, 31):
    for L in range(1, 11):
        precision = performance_test(df, k, L, 150)
        print("K: {0}, L: {1}, Precision:{2}".format(k, L, precision))
        if (precision >= precision_max):
            precision_max = precision
            k_max = k
            L_max = L

print("Max precision: {0}, k: {1}, L: {2}".format(precision_max, k_max, L_max))

In [26]:
# Blocking configuration: k hash functions per matrix, l matrices,
# embeddings of length 150.
k, l = 10, 5
lsh_primer = LSH(k, l, 150)

# Titles and embedding columns for the left and right side of each pair.
left_tuple_titles, right_tuple_titles = df['attributi_x'], df['attributi_y']
left_tuple_embeddings, right_tuple_embeddings = df['left_table'], df['right_table']

In [27]:
# hash_tables is a list of l dictionaries, one per random matrix.
hash_tables = []
for i in range(l):
    # divide_in_buckets fills the dict it is given, so the left and right
    # titles of table i end up in the same dictionary.
    hash_table = {}
    lsh_primer.divide_in_buckets(left_tuple_embeddings, left_tuple_titles, i, hash_table)
    lsh_primer.divide_in_buckets(right_tuple_embeddings, right_tuple_titles, i, hash_table)
    hash_tables.append(hash_table)

In [35]:
# Pretty-print the hash table at index 3: each key is a bucket hash and each
# value is the list of titles that fell into that bucket.
import pprint
pprint.pprint(hash_tables[3])

{130: ['Canon EOS Rebel T3i Black SLR Digital Camera Kit W/ 18-55mm Lens (18 '
       'MP, 3x Opt, SD/SDHC/SDXC Card Slot) Price Comparison at Buy.net',
       'Canon EOS 5D Mark III Black SLR Digital Camera Kit W/ 24-70 Mm Lens '
       '(22.3 MP, SD/SDHC/SDXC Card Slot) Price Comparison at Buy.net',
       'Canon PowerShot D30 Blue Digital Camera (12.1 MP, 5x Opt, SD/SDHC/SDXC '
       'Card Slot) Price Comparison at Buy.net',
       'Samsung WB2200F Black Digital Camera (16.4 MP, 60x Opt, SDHC/SDXC Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS Rebel T3i Black SLR Digital Camera Kit W/ 18-55mm Lens (18 '
       'MP, 3x Opt, SD/SDHC/SDXC Card Slot) Price Comparison at Buy.net',
       'Canon PowerShot D30 Blue Digital Camera (12.1 MP, 5x Opt, SD/SDHC/SDXC '
       'Card Slot) Price Comparison at Buy.net',
       'Canon PowerShot ELPH 340 HS Black Digital Camera (16 MP, 12x Opt, '
       'SD/SDHC/SDXC Card Slot) Price Comparison at Buy.net',
       'Sony Alpha NE

       'Canon PowerShot G16 Black Digital Camera (12.1 MP, 5x Opt, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon PowerShot G16 Black Digital Camera (12.1 MP, 5x Opt, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS 70D Black SLR Digital Camera Body Only (20.2 MP, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS 70D Black SLR Digital Camera Body Only (20.2 MP, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS 70D Black SLR Digital Camera Body Only (20.2 MP, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS 70D Black SLR Digital Camera Body Only (20.2 MP, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon EOS 70D Black SLR Digital Camera Body Only (20.2 MP, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon PowerShot G16 Black Digital Camera (12.1 MP, 5x Opt, SD Card '
       'Slot) Price Comparison at Buy.net',
       'Canon PowerShot G16 Blac

       'Canon Digital IXUS 55 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'CANON POWERSHOT ELPH 150IS BLACK 20MP 10X 2.7" 9356B001',
       'Canon PowerShot SX10 IS - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot A2100 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'NIKON COOLPIX P7800 12.2MP 7.1X WIDE ANGLE 3" VR 32015',
       'Canon PowerShot A3200 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'Canon PowerShot A3100 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'CANON POWERSHOT SX700HS BLACK 16.1MP 30X W/CASE 9338B015',
       'Canon PowerShot A510 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot A2100 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'CANON POWERSHOT SX700HS BLACK 16.1MP 30X W/CASE 9338B015',
       '

       'Canon PowerShot A580 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot A510 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot G6 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot S90 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot G11 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'Canon PowerShot A610 - PowerShot and IXUS digital compact cameras - '
       'Canon Europe',
       'CANON POWERSHOT SX700HS BLACK 16.1MP 30X W/CASE 9338B015',
       'Canon PowerShot SX160 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'Canon Digital IXUS 800 IS - PowerShot and IXUS digital compact cameras '
       '- Canon Europe',
       'FUJIFILM XQ1 BLACK 12MP 4X 3.0"LCD 600013201',
       'Canon PowerShot E1 - PowerShot and IXUS digital compact c

       'SONY DSCHX60VB BLACK 20.4MP 30X 3" DSCHX60VB',
       'OLYMPUS TG-850IHS SILVER 16MP 5X 3" CMOS V104150SU000',
       'SONY DSC-W800B BLACK 20.1MP 5X WIDE ANGLE 2.7" DSCW800B',
       'RICOH WG-4 SILVER 16MP 4X WIDE ANGLE 3" RUGGED 08572',
       'FUJI FINEPIX XP200 YELLOW 16MP 5X 3" WATERPROOF 600012725',
       'SAMSUNG WB350F BROWN 16MP 21X 3" SMART CAMERA EC-WB350FBPNCA',
       'OLYMPUS TG-850IHS SILVER 16MP 5X 3" CMOS V104150SU000',
       'SONY DSC-W800 SILVER 20.1MP 5X WIDE ANGLE 2.7" DSCW800S',
       'RICOH WG-4 SILVER 16MP 4X WIDE ANGLE 3" RUGGED 08572',
       'OLYMPUS TG-850IHS SILVER 16MP 5X 3" CMOS V104150SU000',
       'SAMSUNG WB350F RED 16MP 21X 3" SMART CAMERA EC-WB350FBPRCA',
       'RICOH WG-4 SILVER 16MP 4X WIDE ANGLE 3" RUGGED 08572',
       'SAMSUNG WB350F RED 16MP 21X 3" SMART CAMERA EC-WB350FBPRCA',
       'SONY DSC-W800B BLACK 20.1MP 5X WIDE ANGLE 2.7" DSCW800B',
       'FUJI FINEPIX XP200 YELLOW 16MP 5X 3" WATERPROOF 600012725',
       'SAMSUNG EX-2F

       'Cameras - Australia',
       'Fujifilm X-Pro 1 / 35mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Nikon 1 J1 / 10-30mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Nikon 1 J1 / 10-30mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Fujifilm X-Pro 1 / 35mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Canon EOS 6D / 24-70mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Fujifilm X-Pro 1 / 35mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Canon EOS 6D / 24-70mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Fujifilm X-Pro 1 / 35mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Canon EOS 6D / 24-70mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Nikon 1 

       'Pentax Optio WG-1 GPS - Price comparison & reviews - Digital Cameras - '
       'Australia',
       'Olympus VG-160 - Price comparison & reviews - Digital Cameras - '
       'Australia',
       'Sony Alpha A3000 - Price comparison & reviews - Digital Cameras - '
       'Australia',
       'Fujifilm FinePix S4000 - Price comparison & reviews - Digital Cameras '
       '- Australia',
       'Olympus XZ-2 iHS - Price Comparison - Buy Cheap in Australia',
       'Nikon D3200 / 18-300mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Nikon D3300 / 18-200mm Kit - Price comparison & reviews - Digital '
       'Cameras - Australia',
       'Olympus OM-D E-M1 - Price comparison & reviews - Digital Cameras - '
       'Australia',
       'Pentax Q10 / 5-15mm & 15-45mm Kit - Price comparison & reviews - '
       'Digital Cameras - Australia',
       'Olympus OM-D E-M1 - Price comparison & reviews - Digital Cameras - '
       'Australia',
       'Canon EOS

       'Image - 1920 X 1080 Video - AVCHD, Motion JPEG AVI - HDMI - '
       'PictBridge) Price Comparison at Buy.net',
       'Olympus PEN E-PM1 Purple Silver Camera Kit W/ 14-42mm Lens (12.3 '
       'Megapixel - 3" LCD - 3x Optical Zoom - Electronic IS - 4032 X 3024 '
       'Image - 1920 X 1080 Video - AVCHD, Motion JPEG AVI - HDMI - '
       'PictBridge) Price Comparison at Buy.net',
       'Sony Cyber-shot DSC-RX100 Black Digital Camera (20.2 MP, 3.6x Opt, '
       'SD/SDHC/SDXC/MS Duo/MS PRO Duo/MS PRO-HG Duo Card Slot) Price '
       'Comparison at Buy.net',
       'Sony Cyber-shot DSC-RX100 Black Digital Camera (20.2 MP, 3.6x Opt, '
       'SD/SDHC/SDXC/MS Duo/MS PRO Duo/MS PRO-HG Duo Card Slot) Price '
       'Comparison at Buy.net',
       'Olympus PEN E-PM1 Purple Silver Camera Kit W/ 14-42mm Lens (12.3 '
       'Megapixel - 3" LCD - 3x Optical Zoom - Electronic IS - 4032 X 3024 '
       'Image - 1920 X 1080 Video - AVCHD, Motion JPEG AVI - HDMI - '
       'PictBridge) Pric