In [1]:
import math
import json
import pickle
import numpy as np
import falconn
import timeit
from __future__ import print_function
import psycopg2
import pandas as pd
from configparser import ConfigParser
import requests
import pyphi
import re
import os

In [2]:
def call_embedding_ws(names):
#    url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'
    headers = {"content-type": "application/json"}
    response = requests.post(url, json=names)
    arr = pyphi.jsonify.loads(response.text)
    x = np.array(arr)
    x = x.astype(dtype=np.float32)
    return x

def reverse_name(name):
    a=name.split()
    a.reverse()
    return " ".join(a)

def cos_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)
    return dot_product

def sort_by_distance(query_vector, result_vectors):
    for v in result_vectors:
        cos_similarity(query_vector, v)
        
        
def query_lhs_table_by_distance(query, lhs_table, distance, pairs, column, vectors):
    response = lhs_table.find_near_neighbors(query, distance)    
    return process_lhs_table_response(query, response, pairs, column, vectors)
    
def query_lhs_table_by_number(query, lhs_table, number, pairs, column, vectors):
    response = lhs_table.find_k_nearest_neighbors(query, k=number)    
    return process_lhs_table_response(query, response, pairs, column, vectors)

def query_lhs_table_nearest(query, lhs_table, pairs, column, vectors):
    response = lhs_table.find_nearest_neighbor(query)
    return process_lhs_table_response(query, [response], pairs, column, vectors)

def process_lhs_table_response(query, response, pairs, column, vectors):
    df = pd.DataFrame(index=range(len(response)), columns=['id', 'name', 'cosine'])   
    i = 0
    
    for resp in response:
        name = pairs.get_value(resp, column)
        df.set_value(index=i, col='id', value=resp)
        df.set_value(index=i, col='name', value=name)
        df.set_value(index=i, col='cosine', value=cos_similarity(query, vectors[resp]))
        i = i + 1
     
    df.sort_values('cosine', ascending=False)
    df = df.reset_index(drop=True)
    return df


def is_arabic(name):
    res = re.findall(r'[\u0600-\u06FF]+',name)
    if len(res) == 0:
        return False
    else:
        return True

In [None]:
%%time

path = 'data/test-sets/arb_arb'
directory = os.fsencode(path)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".tsv") and not filename.startswith("vect"): 
        pairs = pd.read_csv(path + '/' + filename, sep='\t', names=['flag', 'left', 'right'])
        left_vector_strings = []
        right_vector_strings = []
        print(filename)
        
        i = 0
        j = 0
        ws_params = []
        print('vectors left')
        
        for index, row in pairs.iterrows():
            ws_params.append(row['left'])
            i = i + 1
            if i % 512 == 0:
                ws_result = call_embedding_ws(ws_params)   
                print("ws call #results: {r}".format(r=len(ws_result)))
                for vect in ws_result:
                    v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                    left_vector_strings.append(v_str)
                    j = j + 1
                ws_params = []
                print("result vectors added to dataframe")
                print(i)

        if len(ws_params) > 0:
            ws_result = call_embedding_ws(ws_params)
            print("ws call #results: {r}".format(r=len(ws_result)))
            for vect in ws_result:
                v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                left_vector_strings.append(v_str)
                j = j + 1
            print("result vectors added to dataframe")

        i = 0
        j = 0
        ws_params = []
        print('vectors right')
        for index, row in pairs.iterrows():
            ws_params.append(row['right'])
            i = i + 1
            if i % 512 == 0:
                ws_result = call_embedding_ws(ws_params)  
                print("ws call #results: {r}".format(r=len(ws_result)))
                for vect in ws_result:
                    v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                    right_vector_strings.append(v_str)
                    j = j + 1
                ws_params = []
                print(i)

        if len(ws_params) > 0:
            ws_result = call_embedding_ws(ws_params)
            print("ws call #results: {r}".format(r=len(ws_result)))
            for vect in ws_result:
                v_str = np.array2string(vect, separator=',', max_line_width=20000).strip("[]").replace(" ", "")
                right_vector_strings.append(v_str)
                j = j + 1
                
        se = pd.Series(left_vector_strings)
        pairs['left_vector'] = se.values
        
        se = pd.Series(right_vector_strings)
        pairs['right_vector'] = se.values
        
        print('parse vectors left')
        left_vectors = np.zeros(shape=(len(left_vector_strings),256))
        i = 0
        for v in left_vector_strings:
            x = np.fromstring(v, dtype=np.float32, sep=',')
            left_vectors[i] = x
            i = i+1
        left_vectors = left_vectors.astype(dtype=np.float32)
        left_vector_strings = None

        print('parse vectors right')
        right_vectors = np.zeros(shape=(len(right_vector_strings),256))
        i = 0
        for v in right_vector_strings:
            x = np.fromstring(v, dtype=np.float32, sep=',')
            right_vectors[i] = x
            i = i+1
        right_vectors = right_vectors.astype(dtype=np.float32)
        right_vector_strings = None

        print('calculate cosine')
        pairs['cos_dist'] = ''   
        cos_dist_loc = pairs.columns.get_loc('cos_dist')
        for index, row in pairs.iterrows():
            pairs.iloc[index, cos_dist_loc] = cos_similarity(left_vectors[index], right_vectors[index])

        pairs.to_csv(path + '/vectors_' + filename, sep='\t')

neg_arb_arb_1_1x1_1.tsv
vectors left
ws call #results: 512
result vectors added to dataframe
512
ws call #results: 512
result vectors added to dataframe
1024
ws call #results: 512
result vectors added to dataframe
1536
ws call #results: 512
result vectors added to dataframe
2048
ws call #results: 512
result vectors added to dataframe
2560
ws call #results: 512
result vectors added to dataframe
3072
ws call #results: 512
result vectors added to dataframe
3584
ws call #results: 512
result vectors added to dataframe
4096
ws call #results: 512
result vectors added to dataframe
4608
ws call #results: 512
result vectors added to dataframe
5120
ws call #results: 512
result vectors added to dataframe
5632
ws call #results: 512
result vectors added to dataframe
6144
ws call #results: 512
result vectors added to dataframe
6656
ws call #results: 512
result vectors added to dataframe
7168
ws call #results: 512
result vectors added to dataframe
7680
ws call #results: 512
result vectors added to dat

ws call #results: 512
result vectors added to dataframe
68096
ws call #results: 512
result vectors added to dataframe
68608
ws call #results: 512
result vectors added to dataframe
69120
ws call #results: 512
result vectors added to dataframe
69632
ws call #results: 512
result vectors added to dataframe
70144
ws call #results: 512
result vectors added to dataframe
70656
ws call #results: 512
result vectors added to dataframe
71168
ws call #results: 512
result vectors added to dataframe
71680
ws call #results: 512
result vectors added to dataframe
72192
ws call #results: 512
result vectors added to dataframe
72704
ws call #results: 512
result vectors added to dataframe
73216
ws call #results: 512
result vectors added to dataframe
73728
ws call #results: 512
result vectors added to dataframe
74240
ws call #results: 512
result vectors added to dataframe
74752
ws call #results: 512
result vectors added to dataframe
75264
ws call #results: 512
result vectors added to dataframe
75776
ws call 

ws call #results: 512
result vectors added to dataframe
24576
ws call #results: 512
result vectors added to dataframe
25088
ws call #results: 512
result vectors added to dataframe
25600
ws call #results: 512
result vectors added to dataframe
26112
ws call #results: 512
result vectors added to dataframe
26624
ws call #results: 512
result vectors added to dataframe
27136
ws call #results: 512
result vectors added to dataframe
27648
ws call #results: 512
result vectors added to dataframe
28160
ws call #results: 512
result vectors added to dataframe
28672
ws call #results: 512
result vectors added to dataframe
29184
ws call #results: 512
result vectors added to dataframe
29696
ws call #results: 512
result vectors added to dataframe
30208
ws call #results: 512
result vectors added to dataframe
30720
ws call #results: 512
result vectors added to dataframe
31232
ws call #results: 512
result vectors added to dataframe
31744
ws call #results: 512
result vectors added to dataframe
32256
ws call 

result vectors added to dataframe
92160
ws call #results: 512
result vectors added to dataframe
92672
ws call #results: 512
result vectors added to dataframe
93184
ws call #results: 512
result vectors added to dataframe
93696
ws call #results: 512
result vectors added to dataframe
94208
ws call #results: 512
result vectors added to dataframe
94720
ws call #results: 512
result vectors added to dataframe
95232
ws call #results: 512
result vectors added to dataframe
95744
ws call #results: 512
result vectors added to dataframe
96256
ws call #results: 512
result vectors added to dataframe
96768
ws call #results: 512
result vectors added to dataframe
97280
ws call #results: 512
result vectors added to dataframe
97792
ws call #results: 512
result vectors added to dataframe
98304
ws call #results: 512
result vectors added to dataframe
98816
ws call #results: 512
result vectors added to dataframe
99328
ws call #results: 512
result vectors added to dataframe
99840
ws call #results: 512
result v

ws call #results: 512
result vectors added to dataframe
159232
ws call #results: 512
result vectors added to dataframe
159744
ws call #results: 512
result vectors added to dataframe
160256
ws call #results: 512
result vectors added to dataframe
160768
ws call #results: 512
result vectors added to dataframe
161280
ws call #results: 512
result vectors added to dataframe
161792
ws call #results: 512
result vectors added to dataframe
162304
ws call #results: 512
result vectors added to dataframe
162816
ws call #results: 512
result vectors added to dataframe
163328
ws call #results: 512
result vectors added to dataframe
163840
ws call #results: 512
result vectors added to dataframe
164352
ws call #results: 512
result vectors added to dataframe
164864
ws call #results: 512
result vectors added to dataframe
165376
ws call #results: 512
result vectors added to dataframe
165888
ws call #results: 512
result vectors added to dataframe
166400
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
225792
ws call #results: 512
result vectors added to dataframe
226304
ws call #results: 512
result vectors added to dataframe
226816
ws call #results: 512
result vectors added to dataframe
227328
ws call #results: 512
result vectors added to dataframe
227840
ws call #results: 512
result vectors added to dataframe
228352
ws call #results: 512
result vectors added to dataframe
228864
ws call #results: 512
result vectors added to dataframe
229376
ws call #results: 512
result vectors added to dataframe
229888
ws call #results: 512
result vectors added to dataframe
230400
ws call #results: 512
result vectors added to dataframe
230912
ws call #results: 512
result vectors added to dataframe
231424
ws call #results: 512
result vectors added to dataframe
231936
ws call #results: 512
result vectors added to dataframe
232448
ws call #results: 512
result vectors added to dataframe
232960
ws call #results: 512
result vectors added to dataframe
233472
ws call #resul

ws call #results: 512
22016
ws call #results: 512
22528
ws call #results: 512
23040
ws call #results: 512
23552
ws call #results: 512
24064
ws call #results: 512
24576
ws call #results: 512
25088
ws call #results: 512
25600
ws call #results: 512
26112
ws call #results: 512
26624
ws call #results: 512
27136
ws call #results: 512
27648
ws call #results: 512
28160
ws call #results: 512
28672
ws call #results: 512
29184
ws call #results: 512
29696
ws call #results: 512
30208
ws call #results: 512
30720
ws call #results: 512
31232
ws call #results: 512
31744
ws call #results: 512
32256
ws call #results: 512
32768
ws call #results: 512
33280
ws call #results: 512
33792
ws call #results: 512
34304
ws call #results: 512
34816
ws call #results: 512
35328
ws call #results: 512
35840
ws call #results: 512
36352
ws call #results: 512
36864
ws call #results: 512
37376
ws call #results: 512
37888
ws call #results: 512
38400
ws call #results: 512
38912
ws call #results: 512
39424
ws call #results: 51

ws call #results: 512
result vectors added to dataframe
35328
ws call #results: 512
result vectors added to dataframe
35840
ws call #results: 512
result vectors added to dataframe
36352
ws call #results: 512
result vectors added to dataframe
36864
ws call #results: 512
result vectors added to dataframe
37376
ws call #results: 512
result vectors added to dataframe
37888
ws call #results: 512
result vectors added to dataframe
38400
ws call #results: 512
result vectors added to dataframe
38912
ws call #results: 512
result vectors added to dataframe
39424
ws call #results: 512
result vectors added to dataframe
39936
ws call #results: 512
result vectors added to dataframe
40448
ws call #results: 512
result vectors added to dataframe
40960
ws call #results: 512
result vectors added to dataframe
41472
ws call #results: 512
result vectors added to dataframe
41984
ws call #results: 512
result vectors added to dataframe
42496
ws call #results: 512
result vectors added to dataframe
43008
ws call 

result vectors added to dataframe
102912
ws call #results: 512
result vectors added to dataframe
103424
ws call #results: 512
result vectors added to dataframe
103936
ws call #results: 512
result vectors added to dataframe
104448
ws call #results: 512
result vectors added to dataframe
104960
ws call #results: 512
result vectors added to dataframe
105472
ws call #results: 512
result vectors added to dataframe
105984
ws call #results: 512
result vectors added to dataframe
106496
ws call #results: 512
result vectors added to dataframe
107008
ws call #results: 512
result vectors added to dataframe
107520
ws call #results: 512
result vectors added to dataframe
108032
ws call #results: 512
result vectors added to dataframe
108544
ws call #results: 512
result vectors added to dataframe
109056
ws call #results: 512
result vectors added to dataframe
109568
ws call #results: 512
result vectors added to dataframe
110080
ws call #results: 512
result vectors added to dataframe
110592
ws call #resul

ws call #results: 512
result vectors added to dataframe
169984
ws call #results: 512
result vectors added to dataframe
170496
ws call #results: 512
result vectors added to dataframe
171008
ws call #results: 512
result vectors added to dataframe
171520
ws call #results: 512
result vectors added to dataframe
172032
ws call #results: 512
result vectors added to dataframe
172544
ws call #results: 512
result vectors added to dataframe
173056
ws call #results: 512
result vectors added to dataframe
173568
ws call #results: 512
result vectors added to dataframe
174080
ws call #results: 512
result vectors added to dataframe
174592
ws call #results: 512
result vectors added to dataframe
175104
ws call #results: 512
result vectors added to dataframe
175616
ws call #results: 512
result vectors added to dataframe
176128
ws call #results: 512
result vectors added to dataframe
176640
ws call #results: 512
result vectors added to dataframe
177152
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
236544
ws call #results: 512
result vectors added to dataframe
237056
ws call #results: 512
result vectors added to dataframe
237568
ws call #results: 512
result vectors added to dataframe
238080
ws call #results: 512
result vectors added to dataframe
238592
ws call #results: 512
result vectors added to dataframe
239104
ws call #results: 512
result vectors added to dataframe
239616
ws call #results: 512
result vectors added to dataframe
240128
ws call #results: 512
result vectors added to dataframe
240640
ws call #results: 512
result vectors added to dataframe
241152
ws call #results: 512
result vectors added to dataframe
241664
ws call #results: 512
result vectors added to dataframe
242176
ws call #results: 512
result vectors added to dataframe
242688
ws call #results: 512
result vectors added to dataframe
243200
ws call #results: 512
result vectors added to dataframe
243712
ws call #results: 512
result vectors added to dataframe
244224
ws call #resul

ws call #results: 512
result vectors added to dataframe
303616
ws call #results: 512
result vectors added to dataframe
304128
ws call #results: 512
result vectors added to dataframe
304640
ws call #results: 512
result vectors added to dataframe
305152
ws call #results: 512
result vectors added to dataframe
305664
ws call #results: 512
result vectors added to dataframe
306176
ws call #results: 512
result vectors added to dataframe
306688
ws call #results: 512
result vectors added to dataframe
307200
ws call #results: 512
result vectors added to dataframe
307712
ws call #results: 512
result vectors added to dataframe
308224
ws call #results: 512
result vectors added to dataframe
308736
ws call #results: 512
result vectors added to dataframe
309248
ws call #results: 512
result vectors added to dataframe
309760
ws call #results: 512
result vectors added to dataframe
310272
ws call #results: 512
result vectors added to dataframe
310784
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
370176
ws call #results: 512
result vectors added to dataframe
370688
ws call #results: 512
result vectors added to dataframe
371200
ws call #results: 512
result vectors added to dataframe
371712
ws call #results: 512
result vectors added to dataframe
372224
ws call #results: 512
result vectors added to dataframe
372736
ws call #results: 512
result vectors added to dataframe
373248
ws call #results: 512
result vectors added to dataframe
373760
ws call #results: 512
result vectors added to dataframe
374272
ws call #results: 512
result vectors added to dataframe
374784
ws call #results: 512
result vectors added to dataframe
375296
ws call #results: 512
result vectors added to dataframe
375808
ws call #results: 512
result vectors added to dataframe
376320
ws call #results: 512
result vectors added to dataframe
376832
ws call #results: 512
result vectors added to dataframe
377344
ws call #results: 512
result vectors added to dataframe
377856
ws call #resul

ws call #results: 512
result vectors added to dataframe
437248
ws call #results: 512
result vectors added to dataframe
437760
ws call #results: 512
result vectors added to dataframe
438272
ws call #results: 512
result vectors added to dataframe
438784
ws call #results: 512
result vectors added to dataframe
439296
ws call #results: 512
result vectors added to dataframe
439808
ws call #results: 512
result vectors added to dataframe
440320
ws call #results: 512
result vectors added to dataframe
440832
ws call #results: 512
result vectors added to dataframe
441344
ws call #results: 512
result vectors added to dataframe
441856
ws call #results: 512
result vectors added to dataframe
442368
ws call #results: 512
result vectors added to dataframe
442880
ws call #results: 512
result vectors added to dataframe
443392
ws call #results: 512
result vectors added to dataframe
443904
ws call #results: 512
result vectors added to dataframe
444416
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
503808
ws call #results: 512
result vectors added to dataframe
504320
ws call #results: 512
result vectors added to dataframe
504832
ws call #results: 512
result vectors added to dataframe
505344
ws call #results: 512
result vectors added to dataframe
505856
ws call #results: 512
result vectors added to dataframe
506368
ws call #results: 512
result vectors added to dataframe
506880
ws call #results: 512
result vectors added to dataframe
507392
ws call #results: 512
result vectors added to dataframe
507904
ws call #results: 512
result vectors added to dataframe
508416
ws call #results: 512
result vectors added to dataframe
508928
ws call #results: 512
result vectors added to dataframe
509440
ws call #results: 512
result vectors added to dataframe
509952
ws call #results: 512
result vectors added to dataframe
510464
ws call #results: 512
result vectors added to dataframe
510976
ws call #results: 512
result vectors added to dataframe
511488
ws call #resul

ws call #results: 512
result vectors added to dataframe
570880
ws call #results: 512
result vectors added to dataframe
571392
ws call #results: 512
result vectors added to dataframe
571904
ws call #results: 512
result vectors added to dataframe
572416
ws call #results: 512
result vectors added to dataframe
572928
ws call #results: 512
result vectors added to dataframe
573440
ws call #results: 512
result vectors added to dataframe
573952
ws call #results: 512
result vectors added to dataframe
574464
ws call #results: 512
result vectors added to dataframe
574976
ws call #results: 512
result vectors added to dataframe
575488
ws call #results: 512
result vectors added to dataframe
576000
ws call #results: 512
result vectors added to dataframe
576512
ws call #results: 512
result vectors added to dataframe
577024
ws call #results: 512
result vectors added to dataframe
577536
ws call #results: 512
result vectors added to dataframe
578048
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
637440
ws call #results: 512
result vectors added to dataframe
637952
ws call #results: 512
result vectors added to dataframe
638464
ws call #results: 512
result vectors added to dataframe
638976
ws call #results: 512
result vectors added to dataframe
639488
ws call #results: 512
result vectors added to dataframe
640000
ws call #results: 512
result vectors added to dataframe
640512
ws call #results: 512
result vectors added to dataframe
641024
ws call #results: 512
result vectors added to dataframe
641536
ws call #results: 512
result vectors added to dataframe
642048
ws call #results: 512
result vectors added to dataframe
642560
ws call #results: 512
result vectors added to dataframe
643072
ws call #results: 512
result vectors added to dataframe
643584
ws call #results: 512
result vectors added to dataframe
644096
ws call #results: 512
result vectors added to dataframe
644608
ws call #results: 512
result vectors added to dataframe
645120
ws call #resul

ws call #results: 512
113664
ws call #results: 512
114176
ws call #results: 512
114688
ws call #results: 512
115200
ws call #results: 512
115712
ws call #results: 512
116224
ws call #results: 512
116736
ws call #results: 512
117248
ws call #results: 512
117760
ws call #results: 512
118272
ws call #results: 512
118784
ws call #results: 512
119296
ws call #results: 512
119808
ws call #results: 512
120320
ws call #results: 512
120832
ws call #results: 512
121344
ws call #results: 512
121856
ws call #results: 512
122368
ws call #results: 512
122880
ws call #results: 512
123392
ws call #results: 512
123904
ws call #results: 512
124416
ws call #results: 512
124928
ws call #results: 512
125440
ws call #results: 512
125952
ws call #results: 512
126464
ws call #results: 512
126976
ws call #results: 512
127488
ws call #results: 512
128000
ws call #results: 512
128512
ws call #results: 512
129024
ws call #results: 512
129536
ws call #results: 512
130048
ws call #results: 512
130560
ws call #resul

258048
ws call #results: 512
258560
ws call #results: 512
259072
ws call #results: 512
259584
ws call #results: 512
260096
ws call #results: 512
260608
ws call #results: 512
261120
ws call #results: 512
261632
ws call #results: 512
262144
ws call #results: 512
262656
ws call #results: 512
263168
ws call #results: 512
263680
ws call #results: 512
264192
ws call #results: 512
264704
ws call #results: 512
265216
ws call #results: 512
265728
ws call #results: 512
266240
ws call #results: 512
266752
ws call #results: 512
267264
ws call #results: 512
267776
ws call #results: 512
268288
ws call #results: 512
268800
ws call #results: 512
269312
ws call #results: 512
269824
ws call #results: 512
270336
ws call #results: 512
270848
ws call #results: 512
271360
ws call #results: 512
271872
ws call #results: 512
272384
ws call #results: 512
272896
ws call #results: 512
273408
ws call #results: 512
273920
ws call #results: 512
274432
ws call #results: 512
274944
ws call #results: 512
275456
ws call

402944
ws call #results: 512
403456
ws call #results: 512
403968
ws call #results: 512
404480
ws call #results: 512
404992
ws call #results: 512
405504
ws call #results: 512
406016
ws call #results: 512
406528
ws call #results: 512
407040
ws call #results: 512
407552
ws call #results: 512
408064
ws call #results: 512
408576
ws call #results: 512
409088
ws call #results: 512
409600
ws call #results: 512
410112
ws call #results: 512
410624
ws call #results: 512
411136
ws call #results: 512
411648
ws call #results: 512
412160
ws call #results: 512
412672
ws call #results: 512
413184
ws call #results: 512
413696
ws call #results: 512
414208
ws call #results: 512
414720
ws call #results: 512
415232
ws call #results: 512
415744
ws call #results: 512
416256
ws call #results: 512
416768
ws call #results: 512
417280
ws call #results: 512
417792
ws call #results: 512
418304
ws call #results: 512
418816
ws call #results: 512
419328
ws call #results: 512
419840
ws call #results: 512
420352
ws call

547840
ws call #results: 512
548352
ws call #results: 512
548864
ws call #results: 512
549376
ws call #results: 512
549888
ws call #results: 512
550400
ws call #results: 512
550912
ws call #results: 512
551424
ws call #results: 512
551936
ws call #results: 512
552448
ws call #results: 512
552960
ws call #results: 512
553472
ws call #results: 512
553984
ws call #results: 512
554496
ws call #results: 512
555008
ws call #results: 512
555520
ws call #results: 512
556032
ws call #results: 512
556544
ws call #results: 512
557056
ws call #results: 512
557568
ws call #results: 512
558080
ws call #results: 512
558592
ws call #results: 512
559104
ws call #results: 512
559616
ws call #results: 512
560128
ws call #results: 512
560640
ws call #results: 512
561152
ws call #results: 512
561664
ws call #results: 512
562176
ws call #results: 512
562688
ws call #results: 512
563200
ws call #results: 512
563712
ws call #results: 512
564224
ws call #results: 512
564736
ws call #results: 512
565248
ws call

ws call #results: 512
result vectors added to dataframe
18432
ws call #results: 512
result vectors added to dataframe
18944
ws call #results: 512
result vectors added to dataframe
19456
ws call #results: 512
result vectors added to dataframe
19968
ws call #results: 512
result vectors added to dataframe
20480
ws call #results: 512
result vectors added to dataframe
20992
ws call #results: 512
result vectors added to dataframe
21504
ws call #results: 512
result vectors added to dataframe
22016
ws call #results: 512
result vectors added to dataframe
22528
ws call #results: 512
result vectors added to dataframe
23040
ws call #results: 512
result vectors added to dataframe
23552
ws call #results: 512
result vectors added to dataframe
24064
ws call #results: 512
result vectors added to dataframe
24576
ws call #results: 512
result vectors added to dataframe
25088
ws call #results: 512
result vectors added to dataframe
25600
ws call #results: 512
result vectors added to dataframe
26112
ws call 

ws call #results: 512
result vectors added to dataframe
86528
ws call #results: 512
result vectors added to dataframe
87040
ws call #results: 512
result vectors added to dataframe
87552
ws call #results: 512
result vectors added to dataframe
88064
ws call #results: 512
result vectors added to dataframe
88576
ws call #results: 512
result vectors added to dataframe
89088
ws call #results: 512
result vectors added to dataframe
89600
ws call #results: 512
result vectors added to dataframe
90112
ws call #results: 512
result vectors added to dataframe
90624
ws call #results: 512
result vectors added to dataframe
91136
ws call #results: 512
result vectors added to dataframe
91648
ws call #results: 512
result vectors added to dataframe
92160
ws call #results: 512
result vectors added to dataframe
92672
ws call #results: 512
result vectors added to dataframe
93184
ws call #results: 512
result vectors added to dataframe
93696
ws call #results: 512
result vectors added to dataframe
94208
ws call 

result vectors added to dataframe
153600
ws call #results: 512
result vectors added to dataframe
154112
ws call #results: 512
result vectors added to dataframe
154624
ws call #results: 512
result vectors added to dataframe
155136
ws call #results: 512
result vectors added to dataframe
155648
ws call #results: 512
result vectors added to dataframe
156160
ws call #results: 512
result vectors added to dataframe
156672
ws call #results: 512
result vectors added to dataframe
157184
ws call #results: 512
result vectors added to dataframe
157696
ws call #results: 512
result vectors added to dataframe
158208
ws call #results: 512
result vectors added to dataframe
158720
ws call #results: 512
result vectors added to dataframe
159232
ws call #results: 512
result vectors added to dataframe
159744
ws call #results: 512
result vectors added to dataframe
160256
ws call #results: 512
result vectors added to dataframe
160768
ws call #results: 512
result vectors added to dataframe
161280
ws call #resul

ws call #results: 512
result vectors added to dataframe
220672
ws call #results: 512
result vectors added to dataframe
221184
ws call #results: 512
result vectors added to dataframe
221696
ws call #results: 512
result vectors added to dataframe
222208
ws call #results: 512
result vectors added to dataframe
222720
ws call #results: 512
result vectors added to dataframe
223232
ws call #results: 512
result vectors added to dataframe
223744
ws call #results: 512
result vectors added to dataframe
224256
ws call #results: 512
result vectors added to dataframe
224768
ws call #results: 512
result vectors added to dataframe
225280
ws call #results: 512
result vectors added to dataframe
225792
ws call #results: 512
result vectors added to dataframe
226304
ws call #results: 512
result vectors added to dataframe
226816
ws call #results: 512
result vectors added to dataframe
227328
ws call #results: 512
result vectors added to dataframe
227840
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
287232
ws call #results: 512
result vectors added to dataframe
287744
ws call #results: 512
result vectors added to dataframe
288256
ws call #results: 512
result vectors added to dataframe
288768
ws call #results: 512
result vectors added to dataframe
289280
ws call #results: 512
result vectors added to dataframe
289792
ws call #results: 512
result vectors added to dataframe
290304
ws call #results: 512
result vectors added to dataframe
290816
ws call #results: 512
result vectors added to dataframe
291328
ws call #results: 512
result vectors added to dataframe
291840
ws call #results: 512
result vectors added to dataframe
292352
ws call #results: 512
result vectors added to dataframe
292864
ws call #results: 512
result vectors added to dataframe
293376
ws call #results: 512
result vectors added to dataframe
293888
ws call #results: 512
result vectors added to dataframe
294400
ws call #results: 512
result vectors added to dataframe
294912
ws call #resul

ws call #results: 512
result vectors added to dataframe
354304
ws call #results: 512
result vectors added to dataframe
354816
ws call #results: 512
result vectors added to dataframe
355328
ws call #results: 512
result vectors added to dataframe
355840
ws call #results: 512
result vectors added to dataframe
356352
ws call #results: 512
result vectors added to dataframe
356864
ws call #results: 512
result vectors added to dataframe
357376
ws call #results: 512
result vectors added to dataframe
357888
ws call #results: 512
result vectors added to dataframe
358400
ws call #results: 512
result vectors added to dataframe
358912
ws call #results: 512
result vectors added to dataframe
359424
ws call #results: 512
result vectors added to dataframe
359936
ws call #results: 512
result vectors added to dataframe
360448
ws call #results: 512
result vectors added to dataframe
360960
ws call #results: 512
result vectors added to dataframe
361472
ws call #results: 512
result vectors added to dataframe

result vectors added to dataframe
420864
ws call #results: 512
result vectors added to dataframe
421376
ws call #results: 512
result vectors added to dataframe
421888
ws call #results: 512
result vectors added to dataframe
422400
ws call #results: 512
result vectors added to dataframe
422912
ws call #results: 512
result vectors added to dataframe
423424
ws call #results: 512
result vectors added to dataframe
423936
ws call #results: 512
result vectors added to dataframe
424448
ws call #results: 512
result vectors added to dataframe
424960
ws call #results: 512
result vectors added to dataframe
425472
ws call #results: 512
result vectors added to dataframe
425984
ws call #results: 512
result vectors added to dataframe
426496
ws call #results: 512
result vectors added to dataframe
427008
ws call #results: 512
result vectors added to dataframe
427520
ws call #results: 512
result vectors added to dataframe
428032
ws call #results: 512
result vectors added to dataframe
428544
ws call #resul

ws call #results: 512
result vectors added to dataframe
487936
ws call #results: 512
result vectors added to dataframe
488448
ws call #results: 512
result vectors added to dataframe
488960
ws call #results: 512
result vectors added to dataframe
489472
ws call #results: 512
result vectors added to dataframe
489984
ws call #results: 512
result vectors added to dataframe
490496
ws call #results: 512
result vectors added to dataframe
491008
ws call #results: 512
result vectors added to dataframe
491520
ws call #results: 512
result vectors added to dataframe
492032
ws call #results: 512
result vectors added to dataframe
492544
ws call #results: 512
result vectors added to dataframe
493056
ws call #results: 512
result vectors added to dataframe
493568
ws call #results: 512
result vectors added to dataframe
494080
ws call #results: 512
result vectors added to dataframe
494592
ws call #results: 512
result vectors added to dataframe
495104
ws call #results: 512
result vectors added to dataframe

112128
ws call #results: 512
112640
ws call #results: 512
113152
ws call #results: 512
113664
ws call #results: 512
114176
ws call #results: 512
114688
ws call #results: 512
115200
ws call #results: 512
115712
ws call #results: 512
116224
ws call #results: 512
116736
ws call #results: 512
117248
ws call #results: 512
117760
ws call #results: 512
118272
ws call #results: 512
118784
ws call #results: 512
119296
ws call #results: 512
119808
ws call #results: 512
120320
ws call #results: 512
120832
ws call #results: 512
121344
ws call #results: 512
121856
ws call #results: 512
122368
ws call #results: 512
122880
ws call #results: 512
123392
ws call #results: 512
123904
ws call #results: 512
124416
ws call #results: 512
124928
ws call #results: 512
125440
ws call #results: 512
125952
ws call #results: 512
126464
ws call #results: 512
126976
ws call #results: 512
127488
ws call #results: 512
128000
ws call #results: 512
128512
ws call #results: 512
129024
ws call #results: 512
129536
ws call

ws call #results: 512
257536
ws call #results: 512
258048
ws call #results: 512
258560
ws call #results: 512
259072
ws call #results: 512
259584
ws call #results: 512
260096
ws call #results: 512
260608
ws call #results: 512
261120
ws call #results: 512
261632
ws call #results: 512
262144
ws call #results: 512
262656
ws call #results: 512
263168
ws call #results: 512
263680
ws call #results: 512
264192
ws call #results: 512
264704
ws call #results: 512
265216
ws call #results: 512
265728
ws call #results: 512
266240
ws call #results: 512
266752
ws call #results: 512
267264
ws call #results: 512
267776
ws call #results: 512
268288
ws call #results: 512
268800
ws call #results: 512
269312
ws call #results: 512
269824
ws call #results: 512
270336
ws call #results: 512
270848
ws call #results: 512
271360
ws call #results: 512
271872
ws call #results: 512
272384
ws call #results: 512
272896
ws call #results: 512
273408
ws call #results: 512
273920
ws call #results: 512
274432
ws call #resul

401920
ws call #results: 512
402432
ws call #results: 512
402944
ws call #results: 512
403456
ws call #results: 512
403968
ws call #results: 512
404480
ws call #results: 512
404992
ws call #results: 512
405504
ws call #results: 512
406016
ws call #results: 512
406528
ws call #results: 512
407040
ws call #results: 512
407552
ws call #results: 512
408064
ws call #results: 512
408576
ws call #results: 512
409088
ws call #results: 512
409600
ws call #results: 512
410112
ws call #results: 512
410624
ws call #results: 512
411136
ws call #results: 512
411648
ws call #results: 512
412160
ws call #results: 512
412672
ws call #results: 512
413184
ws call #results: 512
413696
ws call #results: 512
414208
ws call #results: 512
414720
ws call #results: 512
415232
ws call #results: 512
415744
ws call #results: 512
416256
ws call #results: 512
416768
ws call #results: 512
417280
ws call #results: 512
417792
ws call #results: 512
418304
ws call #results: 512
418816
ws call #results: 512
419328
ws call

ws call #results: 512
result vectors added to dataframe
11776
ws call #results: 512
result vectors added to dataframe
12288
ws call #results: 512
result vectors added to dataframe
12800
ws call #results: 512
result vectors added to dataframe
13312
ws call #results: 512
result vectors added to dataframe
13824
ws call #results: 512
result vectors added to dataframe
14336
ws call #results: 512
result vectors added to dataframe
14848
ws call #results: 512
result vectors added to dataframe
15360
ws call #results: 512
result vectors added to dataframe
15872
ws call #results: 512
result vectors added to dataframe
16384
ws call #results: 512
result vectors added to dataframe
16896
ws call #results: 512
result vectors added to dataframe
17408
ws call #results: 512
result vectors added to dataframe
17920
ws call #results: 512
result vectors added to dataframe
18432
ws call #results: 512
result vectors added to dataframe
18944
ws call #results: 512
result vectors added to dataframe
19456
ws call 

result vectors added to dataframe
79360
ws call #results: 512
result vectors added to dataframe
79872
ws call #results: 512
result vectors added to dataframe
80384
ws call #results: 512
result vectors added to dataframe
80896
ws call #results: 512
result vectors added to dataframe
81408
ws call #results: 512
result vectors added to dataframe
81920
ws call #results: 512
result vectors added to dataframe
82432
ws call #results: 512
result vectors added to dataframe
82944
ws call #results: 512
result vectors added to dataframe
83456
ws call #results: 512
result vectors added to dataframe
83968
ws call #results: 512
result vectors added to dataframe
84480
ws call #results: 512
result vectors added to dataframe
84992
ws call #results: 512
result vectors added to dataframe
85504
ws call #results: 512
result vectors added to dataframe
86016
ws call #results: 512
result vectors added to dataframe
86528
ws call #results: 512
result vectors added to dataframe
87040
ws call #results: 512
result v

result vectors added to dataframe
146432
ws call #results: 512
result vectors added to dataframe
146944
ws call #results: 512
result vectors added to dataframe
147456
ws call #results: 512
result vectors added to dataframe
147968
ws call #results: 512
result vectors added to dataframe
148480
ws call #results: 512
result vectors added to dataframe
148992
ws call #results: 512
result vectors added to dataframe
149504
ws call #results: 512
result vectors added to dataframe
150016
ws call #results: 512
result vectors added to dataframe
150528
ws call #results: 512
result vectors added to dataframe
151040
ws call #results: 512
result vectors added to dataframe
151552
ws call #results: 512
result vectors added to dataframe
152064
ws call #results: 512
result vectors added to dataframe
152576
ws call #results: 512
result vectors added to dataframe
153088
ws call #results: 512
result vectors added to dataframe
153600
ws call #results: 512
result vectors added to dataframe
154112
ws call #resul

ws call #results: 512
result vectors added to dataframe
213504
ws call #results: 512
result vectors added to dataframe
214016
ws call #results: 512
result vectors added to dataframe
214528
ws call #results: 512
result vectors added to dataframe
215040
ws call #results: 512
result vectors added to dataframe
215552
ws call #results: 512
result vectors added to dataframe
216064
ws call #results: 512
result vectors added to dataframe
216576
ws call #results: 512
result vectors added to dataframe
217088
ws call #results: 512
result vectors added to dataframe
217600
ws call #results: 512
result vectors added to dataframe
218112
ws call #results: 512
result vectors added to dataframe
218624
ws call #results: 512
result vectors added to dataframe
219136
ws call #results: 512
result vectors added to dataframe
219648
ws call #results: 512
result vectors added to dataframe
220160
ws call #results: 512
result vectors added to dataframe
220672
ws call #results: 512
result vectors added to dataframe

ws call #results: 512
result vectors added to dataframe
280576
ws call #results: 512
result vectors added to dataframe
281088
ws call #results: 512
result vectors added to dataframe
281600
ws call #results: 512
result vectors added to dataframe
282112
ws call #results: 512
result vectors added to dataframe
282624
ws call #results: 512
result vectors added to dataframe
283136
ws call #results: 512
result vectors added to dataframe
283648
ws call #results: 512
result vectors added to dataframe
284160
ws call #results: 512
result vectors added to dataframe
284672
ws call #results: 512
result vectors added to dataframe
285184
ws call #results: 512
result vectors added to dataframe
285696
ws call #results: 512
result vectors added to dataframe
286208
ws call #results: 512
result vectors added to dataframe
286720
ws call #results: 512
result vectors added to dataframe
287232
ws call #results: 512
result vectors added to dataframe
287744
ws call #results: 512
result vectors added to dataframe

ws call #results: 512
68096
ws call #results: 512
68608
ws call #results: 512
69120
ws call #results: 512
69632
ws call #results: 512
70144
ws call #results: 512
70656
ws call #results: 512
71168
ws call #results: 512
71680
ws call #results: 512
72192
ws call #results: 512
72704
ws call #results: 512
73216
ws call #results: 512
73728
ws call #results: 512
74240
ws call #results: 512
74752
ws call #results: 512
75264
ws call #results: 512
75776
ws call #results: 512
76288
ws call #results: 512
76800
ws call #results: 512
77312
ws call #results: 512
77824
ws call #results: 512
78336
ws call #results: 512
78848
ws call #results: 512
79360
ws call #results: 512
79872
ws call #results: 512
80384
ws call #results: 512
80896
ws call #results: 512
81408
ws call #results: 512
81920
ws call #results: 512
82432
ws call #results: 512
82944
ws call #results: 512
83456
ws call #results: 512
83968
ws call #results: 512
84480
ws call #results: 512
84992
ws call #results: 512
85504
ws call #results: 51

213504
ws call #results: 512
214016
ws call #results: 512
214528
ws call #results: 512
215040
ws call #results: 512
215552
ws call #results: 512
216064
ws call #results: 512
216576
ws call #results: 512
217088
ws call #results: 512
217600
ws call #results: 512
218112
ws call #results: 512
218624
ws call #results: 512
219136
ws call #results: 512
219648
ws call #results: 512
220160
ws call #results: 512
220672
ws call #results: 512
221184
ws call #results: 512
221696
ws call #results: 512
222208
ws call #results: 512
222720
ws call #results: 512
223232
ws call #results: 512
223744
ws call #results: 512
224256
ws call #results: 512
224768
ws call #results: 512
225280
ws call #results: 512
225792
ws call #results: 512
226304
ws call #results: 512
226816
ws call #results: 512
227328
ws call #results: 512
227840
ws call #results: 512
228352
ws call #results: 512
228864
ws call #results: 512
229376
ws call #results: 512
229888
ws call #results: 512
230400
ws call #results: 512
230912
ws call

ws call #results: 512
result vectors added to dataframe
13312
ws call #results: 120
result vectors added to dataframe
vectors right
ws call #results: 512
512
ws call #results: 512
1024
ws call #results: 512
1536
ws call #results: 512
2048
ws call #results: 512
2560
ws call #results: 512
3072
ws call #results: 512
3584
ws call #results: 512
4096
ws call #results: 512
4608
ws call #results: 512
5120
ws call #results: 512
5632
ws call #results: 512
6144
ws call #results: 512
6656
ws call #results: 512
7168
ws call #results: 512
7680
ws call #results: 512
8192
ws call #results: 512
8704
ws call #results: 512
9216
ws call #results: 512
9728
ws call #results: 512
10240
ws call #results: 512
10752
ws call #results: 512
11264
ws call #results: 512
11776
ws call #results: 512
12288
ws call #results: 512
12800
ws call #results: 512
13312
ws call #results: 120
parse vectors left
parse vectors right
calculate cosine
CPU times: user 11h 19min 30s, sys: 10min 37s, total: 11h 30min 7s
Wall time: 13h 

In [4]:
path = 'data/test-sets/arb_arb'
directory = os.fsencode(path)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".tsv") and  filename.startswith("vect"): 
        cosPd = pd.read_csv(path + '/' + filename, sep='\t')

        lowest = cosPd['cos_dist'].min()
        highest = cosPd['cos_dist'].max()
        median = cosPd['cos_dist'].median()
        ah = len([1 for i in cosPd['cos_dist'] if i > 0.5])
        ah_percentage = int(ah*100/len(cosPd))
        print("{f}: lowest={l}, highest={h}, median={med}, above_0.5={ahp}%".format(ahp = ah_percentage, l=lowest, h=highest, med=median, f=filename))
        '''for inx, row in cosPd.iterrows():
            if row['cos_dist'] == highest:
                #print("row {l : r}".format(r=row['right'], l=row['left']))
                print(row)'''
        
        

vectors_neg_arb_arb_1x1.tsv: lowest=-0.8263959999999999, highest=0.9999819999999999, median=-0.0403823, above_0.5=10%


In [None]:
cosPd['cos_dist']

In [None]:
number_of_tables = 50
assert left_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(left_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(7, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
left_table = falconn.LSHIndex(params_cp)
left_table.setup(left_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

left_query_object = left_table.construct_query_object()

In [None]:
number_of_tables = 50
assert right_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(right_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(7, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
right_table = falconn.LSHIndex(params_cp)
right_table.setup(right_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

right_query_object = right_table.construct_query_object()

In [5]:
%%time
for index, row in pairs.iterrows():    
    result = query_lhs_table_nearest(left_vectors[index], 
                                     lhs_table=right_query_object,
                                     pairs=pairs,
                                     column='right',
                                     vectors=right_vectors)
    if not row['left'] == result['name'][0]:
        print('{l}:{r} NOT MATCH'.format(l=row['left'], r=result['name'][0]))
        
    result = query_lhs_table_nearest(right_vectors[index], 
                                     lhs_table=left_query_object,
                                     pairs=pairs,
                                     column='left',
                                     vectors=left_vectors)
    if not row['right'] == result['name'][0]:
        print('{l}:{r} NOT MATCH (q={q}, a={a})'.format(l=result['name'][0], r=row['right'], q=row['right'], a=result))


NameError: name 'right_query_object' is not defined