In [4]:
import math
import json
import pickle
import numpy as np
import falconn
import timeit
from __future__ import print_function
import psycopg2
import pandas as pd
from configparser import ConfigParser
import requests
import pyphi
import re
import os

In [5]:
def call_embedding_ws(names):
#    url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'
    headers = {"content-type": "application/json"}
    response = requests.post(url, json=names)
    arr = pyphi.jsonify.loads(response.text)
    x = np.array(arr)
    x = x.astype(dtype=np.float32)
    return x

def reverse_name(name):
    a=name.split()
    a.reverse()
    return " ".join(a)

def cos_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)
    return dot_product

def sort_by_distance(query_vector, result_vectors):
    for v in result_vectors:
        cos_similarity(query_vector, v)
        
        
def query_lhs_table_by_distance(query, lhs_table, distance, pairs, column, vectors):
    response = lhs_table.find_near_neighbors(query, distance)    
    return process_lhs_table_response(query, response, pairs, column, vectors)
    
def query_lhs_table_by_number(query, lhs_table, number, pairs, column, vectors):
    response = lhs_table.find_k_nearest_neighbors(query, k=number)    
    return process_lhs_table_response(query, response, pairs, column, vectors)

def query_lhs_table_nearest(query, lhs_table, pairs, column, vectors):
    response = lhs_table.find_nearest_neighbor(query)
    return process_lhs_table_response(query, [response], pairs, column, vectors)

def process_lhs_table_response(query, response, pairs, column, vectors):
    df = pd.DataFrame(index=range(len(response)), columns=['id', 'name', 'cosine'])   
    i = 0
    
    for resp in response:
        name = pairs.get_value(resp, column)
        df.set_value(index=i, col='id', value=resp)
        df.set_value(index=i, col='name', value=name)
        df.set_value(index=i, col='cosine', value=cos_similarity(query, vectors[resp]))
        i = i + 1
     
    df.sort_values('cosine', ascending=False)
    df = df.reset_index(drop=True)
    return df


def is_arabic(name):
    res = re.findall(r'[\u0600-\u06FF]+',name)
    if len(res) == 0:
        return False
    else:
        return True

In [None]:
%%time

path = 'data/test-sets/eng_arb'
directory = os.fsencode(path)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".tsv") and not filename.startswith("vect"): 
        pairs = pd.read_csv(path + '/' + filename, sep='\t', names=['flag', 'left', 'right'])
        left_vector_strings = []
        right_vector_strings = []
        print(filename)
        
        i = 0
        j = 0
        ws_params = []
        print('vectors left')
        
        for index, row in pairs.iterrows():
            ws_params.append(row['left'])
            i = i + 1
            if i % 512 == 0:
                ws_result = call_embedding_ws(ws_params)   
                print("ws call #results: {r}".format(r=len(ws_result)))
                for vect in ws_result:
                    v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                    left_vector_strings.append(v_str)
                    j = j + 1
                ws_params = []
                print("result vectors added to dataframe")
                print(i)

        if len(ws_params) > 0:
            ws_result = call_embedding_ws(ws_params)
            print("ws call #results: {r}".format(r=len(ws_result)))
            for vect in ws_result:
                v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                left_vector_strings.append(v_str)
                j = j + 1
            print("result vectors added to dataframe")

        i = 0
        j = 0
        ws_params = []
        print('vectors right')
        for index, row in pairs.iterrows():
            ws_params.append(row['right'])
            i = i + 1
            if i % 512 == 0:
                ws_result = call_embedding_ws(ws_params)  
                print("ws call #results: {r}".format(r=len(ws_result)))
                for vect in ws_result:
                    v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                    right_vector_strings.append(v_str)
                    j = j + 1
                ws_params = []
                print(i)

        if len(ws_params) > 0:
            ws_result = call_embedding_ws(ws_params)
            print("ws call #results: {r}".format(r=len(ws_result)))
            for vect in ws_result:
                v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                right_vector_strings.append(v_str)
                j = j + 1
                
        se = pd.Series(left_vector_strings)
        pairs['left_vector'] = se.values
        
        se = pd.Series(right_vector_strings)
        pairs['right_vector'] = se.values
        
        print('parse vectors left')
        left_vectors = np.zeros(shape=(len(left_vector_strings),256))
        i = 0
        for v in left_vector_strings:
            x = np.fromstring(v, dtype=np.float32, sep=',')
            left_vectors[i] = x
            i = i+1
        left_vectors = left_vectors.astype(dtype=np.float32)
        left_vector_strings = None

        print('parse vectors right')
        right_vectors = np.zeros(shape=(len(right_vector_strings),256))
        i = 0
        for v in right_vector_strings:
            x = np.fromstring(v, dtype=np.float32, sep=',')
            right_vectors[i] = x
            i = i+1
        right_vectors = right_vectors.astype(dtype=np.float32)
        right_vector_strings = None

        print('calculate cosine')
        pairs['cos_dist'] = ''   
        cos_dist_loc = pairs.columns.get_loc('cos_dist')
        for index, row in pairs.iterrows():
            pairs.iloc[index, cos_dist_loc] = cos_similarity(left_vectors[index], right_vectors[index])

        pairs.to_csv(path + '/vectors_' + filename, sep='\t')

neg_eng_arb_1_1x1_1.tsv
vectors left
ws call #results: 512
result vectors added to dataframe
512
ws call #results: 512
result vectors added to dataframe
1024
ws call #results: 512
result vectors added to dataframe
1536
ws call #results: 512
result vectors added to dataframe
2048
ws call #results: 512
result vectors added to dataframe
2560
ws call #results: 512
result vectors added to dataframe
3072
ws call #results: 512
result vectors added to dataframe
3584
ws call #results: 512
result vectors added to dataframe
4096
ws call #results: 512
result vectors added to dataframe
4608
ws call #results: 512
result vectors added to dataframe
5120
ws call #results: 512
result vectors added to dataframe
5632
ws call #results: 512
result vectors added to dataframe
6144
ws call #results: 512
result vectors added to dataframe
6656
ws call #results: 512
result vectors added to dataframe
7168
ws call #results: 512
result vectors added to dataframe
7680
ws call #results: 512
result vectors added to dat

ws call #results: 512
25600
ws call #results: 512
26112
ws call #results: 512
26624
ws call #results: 512
27136
ws call #results: 512
27648
ws call #results: 512
28160
ws call #results: 512
28672
ws call #results: 512
29184
ws call #results: 512
29696
ws call #results: 512
30208
ws call #results: 512
30720
ws call #results: 512
31232
ws call #results: 512
31744
ws call #results: 512
32256
ws call #results: 512
32768
ws call #results: 512
33280
ws call #results: 512
33792
ws call #results: 512
34304
ws call #results: 512
34816
ws call #results: 512
35328
ws call #results: 512
35840
ws call #results: 512
36352
ws call #results: 512
36864
ws call #results: 512
37376
ws call #results: 512
37888
ws call #results: 512
38400
ws call #results: 512
38912
ws call #results: 512
39424
ws call #results: 512
39936
ws call #results: 512
40448
ws call #results: 512
40960
ws call #results: 512
41472
ws call #results: 512
41984
ws call #results: 512
42496
ws call #results: 512
43008
ws call #results: 51

ws call #results: 512
result vectors added to dataframe
53760
ws call #results: 512
result vectors added to dataframe
54272
ws call #results: 512
result vectors added to dataframe
54784
ws call #results: 512
result vectors added to dataframe
55296
ws call #results: 512
result vectors added to dataframe
55808
ws call #results: 512
result vectors added to dataframe
56320
ws call #results: 512
result vectors added to dataframe
56832
ws call #results: 512
result vectors added to dataframe
57344
ws call #results: 512
result vectors added to dataframe
57856
ws call #results: 512
result vectors added to dataframe
58368
ws call #results: 512
result vectors added to dataframe
58880
ws call #results: 512
result vectors added to dataframe
59392
ws call #results: 512
result vectors added to dataframe
59904
ws call #results: 512
result vectors added to dataframe
60416
ws call #results: 512
result vectors added to dataframe
60928
ws call #results: 512
result vectors added to dataframe
61440
ws call 

ws call #results: 512
result vectors added to dataframe
121344
ws call #results: 512
result vectors added to dataframe
121856
ws call #results: 512
result vectors added to dataframe
122368
ws call #results: 512
result vectors added to dataframe
122880
ws call #results: 512
result vectors added to dataframe
123392
ws call #results: 512
result vectors added to dataframe
123904
ws call #results: 512
result vectors added to dataframe
124416
ws call #results: 512
result vectors added to dataframe
124928
ws call #results: 512
result vectors added to dataframe
125440
ws call #results: 512
result vectors added to dataframe
125952
ws call #results: 512
result vectors added to dataframe
126464
ws call #results: 512
result vectors added to dataframe
126976
ws call #results: 512
result vectors added to dataframe
127488
ws call #results: 512
result vectors added to dataframe
128000
ws call #results: 512
result vectors added to dataframe
128512
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
187904
ws call #results: 512
result vectors added to dataframe
188416
ws call #results: 512
result vectors added to dataframe
188928
ws call #results: 512
result vectors added to dataframe
189440
ws call #results: 512
result vectors added to dataframe
189952
ws call #results: 512
result vectors added to dataframe
190464
ws call #results: 512
result vectors added to dataframe
190976
ws call #results: 512
result vectors added to dataframe
191488
ws call #results: 512
result vectors added to dataframe
192000
ws call #results: 314
result vectors added to dataframe
vectors right
ws call #results: 512
512
ws call #results: 512
1024
ws call #results: 512
1536
ws call #results: 512
2048
ws call #results: 512
2560
ws call #results: 512
3072
ws call #results: 512
3584
ws call #results: 512
4096
ws call #results: 512
4608
ws call #results: 512
5120
ws call #results: 512
5632
ws call #results: 512
6144
ws call #results: 512
6656
ws call #results: 512
7168
ws call 

137728
ws call #results: 512
138240
ws call #results: 512
138752
ws call #results: 512
139264
ws call #results: 512
139776
ws call #results: 512
140288
ws call #results: 512
140800
ws call #results: 512
141312
ws call #results: 512
141824
ws call #results: 512
142336
ws call #results: 512
142848
ws call #results: 512
143360
ws call #results: 512
143872
ws call #results: 512
144384
ws call #results: 512
144896
ws call #results: 512
145408
ws call #results: 512
145920
ws call #results: 512
146432
ws call #results: 512
146944
ws call #results: 512
147456
ws call #results: 512
147968
ws call #results: 512
148480
ws call #results: 512
148992
ws call #results: 512
149504
ws call #results: 512
150016
ws call #results: 512
150528
ws call #results: 512
151040
ws call #results: 512
151552
ws call #results: 512
152064
ws call #results: 512
152576
ws call #results: 512
153088
ws call #results: 512
153600
ws call #results: 512
154112
ws call #results: 512
154624
ws call #results: 512
155136
ws call

ws call #results: 512
result vectors added to dataframe
41984
ws call #results: 512
result vectors added to dataframe
42496
ws call #results: 512
result vectors added to dataframe
43008
ws call #results: 512
result vectors added to dataframe
43520
ws call #results: 512
result vectors added to dataframe
44032
ws call #results: 512
result vectors added to dataframe
44544
ws call #results: 512
result vectors added to dataframe
45056
ws call #results: 512
result vectors added to dataframe
45568
ws call #results: 512
result vectors added to dataframe
46080
ws call #results: 512
result vectors added to dataframe
46592
ws call #results: 512
result vectors added to dataframe
47104
ws call #results: 512
result vectors added to dataframe
47616
ws call #results: 512
result vectors added to dataframe
48128
ws call #results: 512
result vectors added to dataframe
48640
ws call #results: 512
result vectors added to dataframe
49152
ws call #results: 512
result vectors added to dataframe
49664
ws call 

ws call #results: 512
result vectors added to dataframe
109568
ws call #results: 512
result vectors added to dataframe
110080
ws call #results: 512
result vectors added to dataframe
110592
ws call #results: 512
result vectors added to dataframe
111104
ws call #results: 512
result vectors added to dataframe
111616
ws call #results: 512
result vectors added to dataframe
112128
ws call #results: 512
result vectors added to dataframe
112640
ws call #results: 512
result vectors added to dataframe
113152
ws call #results: 512
result vectors added to dataframe
113664
ws call #results: 512
result vectors added to dataframe
114176
ws call #results: 512
result vectors added to dataframe
114688
ws call #results: 512
result vectors added to dataframe
115200
ws call #results: 512
result vectors added to dataframe
115712
ws call #results: 512
result vectors added to dataframe
116224
ws call #results: 512
result vectors added to dataframe
116736
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
176128
ws call #results: 512
result vectors added to dataframe
176640
ws call #results: 512
result vectors added to dataframe
177152
ws call #results: 512
result vectors added to dataframe
177664
ws call #results: 512
result vectors added to dataframe
178176
ws call #results: 512
result vectors added to dataframe
178688
ws call #results: 512
result vectors added to dataframe
179200
ws call #results: 512
result vectors added to dataframe
179712
ws call #results: 512
result vectors added to dataframe
180224
ws call #results: 512
result vectors added to dataframe
180736
ws call #results: 512
result vectors added to dataframe
181248
ws call #results: 512
result vectors added to dataframe
181760
ws call #results: 512
result vectors added to dataframe
182272
ws call #results: 512
result vectors added to dataframe
182784
ws call #results: 512
result vectors added to dataframe
183296
ws call #results: 512
result vectors added to dataframe
183808
ws call #resul

ws call #results: 512
result vectors added to dataframe
243200
ws call #results: 512
result vectors added to dataframe
243712
ws call #results: 512
result vectors added to dataframe
244224
ws call #results: 512
result vectors added to dataframe
244736
ws call #results: 512
result vectors added to dataframe
245248
ws call #results: 512
result vectors added to dataframe
245760
ws call #results: 512
result vectors added to dataframe
246272
ws call #results: 512
result vectors added to dataframe
246784
ws call #results: 512
result vectors added to dataframe
247296
ws call #results: 512
result vectors added to dataframe
247808
ws call #results: 512
result vectors added to dataframe
248320
ws call #results: 512
result vectors added to dataframe
248832
ws call #results: 512
result vectors added to dataframe
249344
ws call #results: 512
result vectors added to dataframe
249856
ws call #results: 512
result vectors added to dataframe
250368
ws call #results: 512
result vectors added to dataframe

ws call #results: 512
57344
ws call #results: 512
57856
ws call #results: 512
58368
ws call #results: 512
58880
ws call #results: 512
59392
ws call #results: 512
59904
ws call #results: 512
60416
ws call #results: 512
60928
ws call #results: 512
61440
ws call #results: 512
61952
ws call #results: 512
62464
ws call #results: 512
62976
ws call #results: 512
63488
ws call #results: 512
64000
ws call #results: 512
64512
ws call #results: 512
65024
ws call #results: 512
65536
ws call #results: 512
66048
ws call #results: 512
66560
ws call #results: 512
67072
ws call #results: 512
67584
ws call #results: 512
68096
ws call #results: 512
68608
ws call #results: 512
69120
ws call #results: 512
69632
ws call #results: 512
70144
ws call #results: 512
70656
ws call #results: 512
71168
ws call #results: 512
71680
ws call #results: 512
72192
ws call #results: 512
72704
ws call #results: 512
73216
ws call #results: 512
73728
ws call #results: 512
74240
ws call #results: 512
74752
ws call #results: 51

203264
ws call #results: 512
203776
ws call #results: 512
204288
ws call #results: 512
204800
ws call #results: 512
205312
ws call #results: 512
205824
ws call #results: 512
206336
ws call #results: 512
206848
ws call #results: 512
207360
ws call #results: 512
207872
ws call #results: 512
208384
ws call #results: 512
208896
ws call #results: 512
209408
ws call #results: 512
209920
ws call #results: 512
210432
ws call #results: 512
210944
ws call #results: 512
211456
ws call #results: 512
211968
ws call #results: 512
212480
ws call #results: 512
212992
ws call #results: 512
213504
ws call #results: 512
214016
ws call #results: 512
214528
ws call #results: 512
215040
ws call #results: 512
215552
ws call #results: 512
216064
ws call #results: 512
216576
ws call #results: 512
217088
ws call #results: 512
217600
ws call #results: 512
218112
ws call #results: 512
218624
ws call #results: 512
219136
ws call #results: 512
219648
ws call #results: 512
220160
ws call #results: 512
220672
ws call

ws call #results: 512
result vectors added to dataframe
29696
ws call #results: 512
result vectors added to dataframe
30208
ws call #results: 512
result vectors added to dataframe
30720
ws call #results: 512
result vectors added to dataframe
31232
ws call #results: 512
result vectors added to dataframe
31744
ws call #results: 512
result vectors added to dataframe
32256
ws call #results: 512
result vectors added to dataframe
32768
ws call #results: 512
result vectors added to dataframe
33280
ws call #results: 512
result vectors added to dataframe
33792
ws call #results: 512
result vectors added to dataframe
34304
ws call #results: 512
result vectors added to dataframe
34816
ws call #results: 512
result vectors added to dataframe
35328
ws call #results: 512
result vectors added to dataframe
35840
ws call #results: 512
result vectors added to dataframe
36352
ws call #results: 512
result vectors added to dataframe
36864
ws call #results: 512
result vectors added to dataframe
37376
ws call 

ws call #results: 512
result vectors added to dataframe
35328
ws call #results: 512
result vectors added to dataframe
35840
ws call #results: 512
result vectors added to dataframe
36352
ws call #results: 512
result vectors added to dataframe
36864
ws call #results: 512
result vectors added to dataframe
37376
ws call #results: 512
result vectors added to dataframe
37888
ws call #results: 512
result vectors added to dataframe
38400
ws call #results: 512
result vectors added to dataframe
38912
ws call #results: 512
result vectors added to dataframe
39424
ws call #results: 512
result vectors added to dataframe
39936
ws call #results: 512
result vectors added to dataframe
40448
ws call #results: 512
result vectors added to dataframe
40960
ws call #results: 512
result vectors added to dataframe
41472
ws call #results: 512
result vectors added to dataframe
41984
ws call #results: 512
result vectors added to dataframe
42496
ws call #results: 512
result vectors added to dataframe
43008
ws call 

result vectors added to dataframe
102912
ws call #results: 512
result vectors added to dataframe
103424
ws call #results: 512
result vectors added to dataframe
103936
ws call #results: 512
result vectors added to dataframe
104448
ws call #results: 512
result vectors added to dataframe
104960
ws call #results: 512
result vectors added to dataframe
105472
ws call #results: 512
result vectors added to dataframe
105984
ws call #results: 512
result vectors added to dataframe
106496
ws call #results: 512
result vectors added to dataframe
107008
ws call #results: 512
result vectors added to dataframe
107520
ws call #results: 512
result vectors added to dataframe
108032
ws call #results: 512
result vectors added to dataframe
108544
ws call #results: 512
result vectors added to dataframe
109056
ws call #results: 512
result vectors added to dataframe
109568
ws call #results: 512
result vectors added to dataframe
110080
ws call #results: 512
result vectors added to dataframe
110592
ws call #resul

15872
ws call #results: 512
16384
ws call #results: 512
16896
ws call #results: 512
17408
ws call #results: 512
17920
ws call #results: 512
18432
ws call #results: 512
18944
ws call #results: 512
19456
ws call #results: 512
19968
ws call #results: 512
20480
ws call #results: 512
20992
ws call #results: 512
21504
ws call #results: 512
22016
ws call #results: 512
22528
ws call #results: 512
23040
ws call #results: 512
23552
ws call #results: 512
24064
ws call #results: 512
24576
ws call #results: 512
25088
ws call #results: 512
25600
ws call #results: 512
26112
ws call #results: 512
26624
ws call #results: 512
27136
ws call #results: 512
27648
ws call #results: 512
28160
ws call #results: 512
28672
ws call #results: 512
29184
ws call #results: 512
29696
ws call #results: 512
30208
ws call #results: 512
30720
ws call #results: 512
31232
ws call #results: 512
31744
ws call #results: 512
32256
ws call #results: 512
32768
ws call #results: 512
33280
ws call #results: 512
33792
ws call #resul

ws call #results: 207
result vectors added to dataframe
vectors right
ws call #results: 207
parse vectors left
parse vectors right
calculate cosine
pos_eng_arb_2x2.tsv
vectors left
ws call #results: 512
result vectors added to dataframe
512
ws call #results: 512
result vectors added to dataframe
1024
ws call #results: 512
result vectors added to dataframe
1536
ws call #results: 512
result vectors added to dataframe
2048
ws call #results: 370
result vectors added to dataframe
vectors right
ws call #results: 512
512
ws call #results: 512
1024
ws call #results: 512
1536
ws call #results: 512
2048
ws call #results: 370
parse vectors left
parse vectors right
calculate cosine
pos_eng_arb_2x3.tsv
vectors left
ws call #results: 512
result vectors added to dataframe
512
ws call #results: 512
result vectors added to dataframe
1024
ws call #results: 512
result vectors added to dataframe
1536
ws call #results: 512
result vectors added to dataframe
2048
ws call #results: 512
result vectors added to

11776
ws call #results: 512
12288
ws call #results: 512
12800
ws call #results: 512
13312
ws call #results: 512
13824
ws call #results: 512
14336
ws call #results: 512
14848
ws call #results: 512
15360
ws call #results: 512
15872
ws call #results: 512
16384
ws call #results: 247
parse vectors left
parse vectors right
calculate cosine
pos_eng_arb_4x4.tsv
vectors left
ws call #results: 512
result vectors added to dataframe
512
ws call #results: 512
result vectors added to dataframe
1024
ws call #results: 512
result vectors added to dataframe
1536
ws call #results: 512
result vectors added to dataframe
2048
ws call #results: 512
result vectors added to dataframe
2560
ws call #results: 512
result vectors added to dataframe
3072
ws call #results: 512
result vectors added to dataframe
3584
ws call #results: 512
result vectors added to dataframe
4096
ws call #results: 512
result vectors added to dataframe
4608
ws call #results: 512
result vectors added to dataframe
5120
ws call #results: 512


In [3]:
path = 'data/test-sets/eng_arb'
directory = os.fsencode(path)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".tsv") and  filename.startswith("vect"): 
        cosPd = pd.read_csv(path + '/' + filename, sep='\t')

        lowest = cosPd['cos_dist'].min()
        highest = cosPd['cos_dist'].max()
        median = cosPd['cos_dist'].median()
        ah = len([1 for i in cosPd['cos_dist'] if i > 0.5])
        ah_percentage = int(ah*100/len(cosPd))
        print("{f}: lowest={l}, highest={h}, median={med}, above_0.5={ahp}%".format(ahp = ah_percentage, l=lowest, h=highest, med=median, f=filename))
        '''for inx, row in cosPd.iterrows():
            if row['cos_dist'] == highest:
                #print("row {l : r}".format(r=row['right'], l=row['left']))
                print(row)'''
        
        

vectors_pos_eng_eng_2x3.tsv: lowest=0.156453, highest=0.944641, median=0.6605855, above_0.5=85%
vectors_pos_eng_eng_3x3.tsv: lowest=0.9593, highest=1.0, median=0.999288, above_0.5=100%
vectors_pos_eng_eng_4x4.tsv: lowest=0.974668, highest=1.0, median=0.999251, above_0.5=100%
vectors_neg_eng_eng_1_1x1_1.tsv: lowest=-0.662273, highest=1.0, median=0.40004, above_0.5=40%
vectors_neg_eng_eng_1_1x1_2.tsv: lowest=-0.574799, highest=0.9492879999999999, median=0.268497, above_0.5=23%
vectors_neg_eng_eng_1_1x1_3.tsv: lowest=-0.62973, highest=0.930817, median=0.26606199999999997, above_0.5=25%
vectors_neg_eng_eng_2_1x2_1.tsv: lowest=-0.333343, highest=1.0, median=0.568731, above_0.5=60%
vectors_neg_eng_eng_2_1x2_2.tsv: lowest=-0.5036430000000001, highest=0.9677389999999999, median=0.46027799999999996, above_0.5=45%
vectors_neg_eng_eng_3_1x3_1.tsv: lowest=-0.270273, highest=1.0, median=0.7616890000000001, above_0.5=86%
vectors_neg_eng_eng_3x3_unordered.tsv: lowest=-0.651091, highest=0.999828, medi

In [None]:
cosPd['cos_dist']

In [None]:
number_of_tables = 50
assert left_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(left_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(7, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
left_table = falconn.LSHIndex(params_cp)
left_table.setup(left_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

left_query_object = left_table.construct_query_object()

In [None]:
number_of_tables = 50
assert right_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(right_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(7, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
right_table = falconn.LSHIndex(params_cp)
right_table.setup(right_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

right_query_object = right_table.construct_query_object()

In [None]:
%%time
for index, row in pairs.iterrows():    
    result = query_lhs_table_nearest(left_vectors[index], 
                                     lhs_table=right_query_object,
                                     pairs=pairs,
                                     column='right',
                                     vectors=right_vectors)
    if not row['left'] == result['name'][0]:
        print('{l}:{r} NOT MATCH'.format(l=row['left'], r=result['name'][0]))
        
    result = query_lhs_table_nearest(right_vectors[index], 
                                     lhs_table=left_query_object,
                                     pairs=pairs,
                                     column='left',
                                     vectors=left_vectors)
    if not row['right'] == result['name'][0]:
        print('{l}:{r} NOT MATCH (q={q}, a={a})'.format(l=result['name'][0], r=row['right'], q=row['right'], a=result))
