In [35]:
import pyarabic.unshape 

In [36]:
import math
import json
import pickle
import numpy as np
import falconn
import timeit
from __future__ import print_function
import psycopg2
import pandas as pd
from configparser import ConfigParser
import requests
import pyphi
import re
from scipy.spatial.distance import cosine

In [61]:
def call_embedding_ws(name):
    """Fetch the embedding of ``name`` from the local embedding web service.

    The name is lower-cased and POSTed as a one-element JSON list; the first
    row of the decoded reply is returned.

    Args:
        name: Name to embed (string).

    Returns:
        The first element of the service's reply converted to a
        ``numpy.float32`` array (presumably one 1-D vector per submitted
        name -- verify against the service).

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    #url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'
    # ``json=`` serialises the payload and sets the JSON content type itself,
    # so no hand-built headers dictionary is needed.
    response = requests.post(url, json=[name.lower()])
    # Fail loudly on an error status instead of trying to decode an error body.
    response.raise_for_status()

    arr = pyphi.jsonify.loads(response.text)
    return np.array(arr).astype(dtype=np.float32)[0]

def reverse_name(name):
    """Return ``name`` with its whitespace-separated words in reverse order.

    Words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return " ".join(reversed(name.split()))

def sort_by_distance(query_vector, result_vectors):
    """Return ``result_vectors`` ordered from most to least similar to the query.

    Similarity is measured with ``cos_similarity``. The previous version
    computed each similarity and threw it away, returning ``None``.

    Args:
        query_vector: Vector the results are compared against.
        result_vectors: Iterable of vectors to rank.

    Returns:
        A new list holding the vectors of ``result_vectors`` sorted by
        descending cosine similarity to ``query_vector``.
    """
    return sorted(
        result_vectors,
        key=lambda v: cos_similarity(query_vector, v),
        reverse=True,
    )
        
        
def query_lhs_table_by_distance(name, lhs_table, distance, names, vectors):
    """Look up every indexed name within ``distance`` of the embedding of ``name``.

    Args:
        name: Name to embed and search for.
        lhs_table: Query object exposing ``find_near_neighbors(query, distance)``.
        distance: Threshold passed straight to ``find_near_neighbors``.
        names: DataFrame with a ``name`` column, indexed by the returned keys.
        vectors: Indexable collection of the stored vectors, addressed by the same keys.

    Returns:
        DataFrame with columns ``id``, ``name`` and ``cosine`` as produced by
        ``process_lhs_table_response``.
    """
    query = call_embedding_ws(name)
    response = lhs_table.find_near_neighbors(query, distance)

    return process_lhs_table_response(query, response, names, vectors)
    
def query_lhs_table_by_number(name, lhs_table, number, names, vectors):
    """Look up the ``number`` nearest indexed names to the embedding of ``name``.

    Args:
        name: Name to embed and search for.
        lhs_table: Query object exposing ``find_k_nearest_neighbors(query, k=...)``.
        number: Value passed as ``k`` to ``find_k_nearest_neighbors``.
        names: DataFrame with a ``name`` column, indexed by the returned keys.
        vectors: Indexable collection of the stored vectors, addressed by the same keys.

    Returns:
        DataFrame with columns ``id``, ``name`` and ``cosine`` as produced by
        ``process_lhs_table_response``.
    """
    query = call_embedding_ws(name)
    response = lhs_table.find_k_nearest_neighbors(query, k=number)

    return process_lhs_table_response(query, response, names, vectors)

def query_lhs_table_nearest(name, lhs_table, names, vectors):
    """Look up the single nearest indexed name to the embedding of ``name``.

    Args:
        name: Name to embed and search for.
        lhs_table: Query object exposing ``find_nearest_neighbor(query)``.
        names: DataFrame with a ``name`` column, indexed by the returned key.
        vectors: Indexable collection of the stored vectors, addressed by the same key.

    Returns:
        DataFrame with columns ``id``, ``name`` and ``cosine`` holding at most
        one row; empty when the index reports no candidate.
    """
    query = call_embedding_ws(name)
    response = lhs_table.find_nearest_neighbor(query)
    # NOTE(review): FALCONN is understood to signal "no candidate" with None
    # (Python wrapper) or -1 (core library) -- confirm for the installed version.
    # Either value would otherwise be used as a row key below.
    hits = [] if response is None or response < 0 else [response]
    return process_lhs_table_response(query, hits, names, vectors)

def process_lhs_table_response(query, response, names, vectors):
    """Turn the keys returned by an LSH query into a ranked result table.

    Args:
        query: Query vector the candidates are compared against.
        response: Iterable of keys; each key addresses both ``names`` and ``vectors``.
        names: DataFrame with a ``name`` column, indexed by those keys.
        vectors: Indexable collection of the stored vectors.

    Returns:
        DataFrame with columns ``id`` (the key), ``name`` and ``cosine``
        (``cos_similarity`` between ``query`` and the stored vector), sorted by
        ``cosine`` in descending order with a fresh 0..n-1 index.
    """
    # Collect all rows first and build the frame once. ``DataFrame.get_value``
    # and ``set_value`` are gone from current pandas; ``.at`` is the scalar
    # accessor available in old and new versions alike.
    rows = [
        (resp, names.at[resp, 'name'], cos_similarity(query, vectors[resp]))
        for resp in response
    ]
    df = pd.DataFrame(rows, columns=['id', 'name', 'cosine'])

    df = df.sort_values(by='cosine', ascending=False)
    df = df.reset_index(drop=True)

    return df


def is_arabic(name):
    """Tell whether ``name`` contains at least one character in U+0600..U+06FF."""
    return re.search(r'[\u0600-\u06FF]+', name) is not None

def cos_similarity(a, b):
    """Cosine similarity of two vectors: ``dot(a, b) / (|a| * |b|)``.

    Args:
        a: First vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        the NaN (plus runtime warning) a division by zero would produce.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [41]:
# Both exports share one layout: pipe-separated, no header row, with the
# second and third fields read as the 'vector' and 'name' columns.
names_csv_layout = dict(sep='|', usecols=[1, 2], header=None, names=['vector','name'])
arab_names = pd.read_csv('/data/ARB_FULL_NAMES_VECTORS.csv', **names_csv_layout)
eng_names = pd.read_csv('/data/ENG_FULL_NAMES_VECTORS.csv', **names_csv_layout)
#eng_names = pd.read_csv('/data/ENG_TEST_NAME_VECTORS_230k.csv', sep='|').reset_index(drop=True)
#arab_names = pd.read_csv('/data/ARB_TEST_NAME_VECTORS_230k.csv', sep='|').reset_index(drop=True)

#merged_names = arab_names.append(eng_names).reset_index(drop=True)

In [42]:
# Parse the comma-separated 'vector' strings into one float32 matrix, one row
# per English name. ``.values`` replaces ``as_matrix()``, which no longer exists
# in current pandas, and the matrix is allocated as float32 from the start so no
# float64 copy of the whole table is created. Rows are expected to hold 256 values.
eng_vector_strings = eng_names['vector'].values
eng_vectors = np.zeros(shape=(len(eng_vector_strings), 256), dtype=np.float32)
for i, engv in enumerate(eng_vector_strings):
    eng_vectors[i] = np.fromstring(engv, dtype=np.float32, sep=',')
# Drop the reference to the raw strings so they can be garbage-collected.
eng_vector_strings = None

In [43]:
# Parse the comma-separated 'vector' strings into one float32 matrix, one row
# per Arabic name. ``.values`` replaces ``as_matrix()``, which no longer exists
# in current pandas, and the matrix is allocated as float32 from the start so no
# float64 copy of the whole table is created. Rows are expected to hold 256 values.
arab_vector_strings = arab_names['vector'].values
arab_vectors = np.zeros(shape=(len(arab_vector_strings), 256), dtype=np.float32)
for i, arabv in enumerate(arab_vector_strings):
    arab_vectors[i] = np.fromstring(arabv, dtype=np.float32, sep=',')
# Drop the reference to the raw strings so they can be garbage-collected.
arab_vector_strings = None

In [None]:
# Disabled code: the merged (Arabic + English) vector parsing below is wrapped in
# a string literal so it is not executed. It relies on `merged_names`, whose
# only assignment in this notebook is itself commented out.
'''
merged_vector_strings = merged_names['vector'].as_matrix()
merged_vectors = np.zeros(shape=(len(merged_vector_strings),256))
i = 0
for mrgv in merged_vector_strings:
    x = np.fromstring(mrgv, dtype=np.float32, sep=',')
    merged_vectors[i] = x
    i = i+1
merged_vectors = merged_vectors.astype(dtype=np.float32)
merged_vector_strings = None
'''

In [44]:
# Build the LSH index over the English name vectors and create its query object.
number_of_tables = 50
assert eng_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(eng_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# Two rotations are configured here.
# NOTE(review): the earlier comment recommended one rotation for dense data and
# two for sparse data -- confirm that 2 is intended for these embeddings.
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# Request 18-bit hashes (presumably 2^18 bins per table -- see the FALCONN docs).
# NOTE(review): the earlier comment spoke of 24 bits and of 2^bits being of the
# same order of magnitude as the number of data points -- confirm 18 suits this data set.
falconn.compute_number_of_hash_functions(18, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
eng_table = falconn.LSHIndex(params_cp)
eng_table.setup(eng_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

eng_query_object = eng_table.construct_query_object()

Constructing the LSH table
Done
Construction time: 0.17328246500255773


In [45]:
# Build the LSH index over the Arabic name vectors and create its query object.
# NOTE(review): this index uses NegativeInnerProduct while the English index is
# built with EuclideanSquared, so distance thresholds are not comparable between
# the two -- confirm this is intended.
number_of_tables = 60
assert arab_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(arab_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
params_cp.l = number_of_tables
# Two rotations are configured here.
# NOTE(review): the earlier comment recommended one rotation for dense data and
# two for sparse data -- confirm that 2 is intended for these embeddings.
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# Request 16-bit hashes (presumably 2^16 bins per table -- see the FALCONN docs).
# NOTE(review): the earlier comment spoke of 24 bits and of 2^bits being of the
# same order of magnitude as the number of data points -- confirm 16 suits this data set.
falconn.compute_number_of_hash_functions(16, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
arab_table = falconn.LSHIndex(params_cp)
arab_table.setup(arab_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

arab_query_object = arab_table.construct_query_object()

Constructing the LSH table
Done
Construction time: 0.12503248400025768


In [None]:
# Disabled code: construction of an LSH index over the merged (Arabic + English)
# vectors, wrapped in a string literal so it is not executed. It relies on
# `merged_vectors`, which is only produced by another disabled cell.
'''
number_of_tables = 50
assert merged_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(merged_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(20, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
merged_table = falconn.LSHIndex(params_cp)
merged_table.setup(merged_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

merged_query_object = merged_table.construct_query_object()
'''

In [None]:
# GET /near_neighbors

# Handler cell: returns, for the English and the Arabic index, the names within
# `distance` of the requested name, queried both as given and with its word
# order reversed. `REQUEST` is not defined in this notebook; it is presumably
# injected by the serving layer as a JSON string with an 'args' mapping of lists.

req = json.loads(REQUEST)
args = req['args']

if 'name' not in args:
    # NOTE(review): this reply has a different shape than the success reply
    # below ({"english": [...], "arabic": [...]}); kept as-is for existing clients.
    print(json.dumps({'nearNeighbors': None}))
else:
    name = args['name'][0]
    distance = float(args['distance'][0]) if 'distance' in args else 1

    payload = {}
    for label, query_object, table_names, table_vectors in (
            ('english', eng_query_object, eng_names, eng_vectors),
            ('arabic', arab_query_object, arab_names, arab_vectors)):
        # One query for the name as typed, one for the reversed word order.
        matches = pd.concat([
            query_lhs_table_by_distance(
                name=candidate,
                lhs_table=query_object,
                distance=distance,
                names=table_names,
                vectors=table_vectors)
            for candidate in (name, reverse_name(name))
        ])
        # Sort before de-duplicating so each name keeps its BEST similarity.
        # The previous `keep='last'` on the unsorted concatenation always kept
        # the reversed-query score, even when the forward query matched better.
        matches = matches.sort_values('cosine', ascending=False)
        matches = matches.drop_duplicates(['name'], keep='first').head(100)
        # to_json(orient='records') already yields a JSON array.
        payload[label] = matches.to_json(orient='records')

    print('{"english":' + payload['english'] + ', "arabic":' + payload['arabic'] + '}')

In [None]:
# GET /near_n_neighbors

# Handler cell: returns, for the English and the Arabic index, the `num` nearest
# names to the requested name, queried both as given and with its word order
# reversed. `REQUEST` is not defined in this notebook; it is presumably injected
# by the serving layer as a JSON string with an 'args' mapping of lists.

req = json.loads(REQUEST)
args = req['args']

if 'name' not in args:
    print('{\"english\":[], \"arabic\":[]}')
else:
    name = args['name'][0]
    number = int(args['num'][0]) if 'num' in args else 1

    payload = {}
    for label, query_object, table_names, table_vectors in (
            ('english', eng_query_object, eng_names, eng_vectors),
            ('arabic', arab_query_object, arab_names, arab_vectors)):
        # One query for the name as typed, one for the reversed word order.
        matches = pd.concat([
            query_lhs_table_by_number(
                name=candidate,
                lhs_table=query_object,
                number=number,
                names=table_names,
                vectors=table_vectors)
            for candidate in (name, reverse_name(name))
        ])
        # Sort before de-duplicating so each name keeps its BEST similarity.
        # The previous `keep='last'` on the unsorted concatenation always kept
        # the reversed-query score, even when the forward query matched better.
        matches = matches.sort_values('cosine', ascending=False)
        matches = matches.drop_duplicates(['name'], keep='first')
        # to_json(orient='records') already yields a JSON array.
        payload[label] = matches.to_json(orient='records')

    print('{"english":' + payload['english'] + ', "arabic":' + payload['arabic'] + '}')

In [62]:
# Manual check: the 25 nearest neighbours of one name in each index.
name = 'EMMA SAAD'

eng_df = query_lhs_table_by_number(name, eng_query_object, 25, eng_names, eng_vectors)
print(eng_df)

arab_df = query_lhs_table_by_number(name, arab_query_object, 25, arab_names, arab_vectors)
print(arab_df)

      id                         name    cosine
0    345                    EMMA SAAD         1
1    346                    EMMA SA'D  0.996044
2    104                   JAMIL SAAD  0.967138
3    102                  JAMEEL SAAD  0.963519
4    105                   JAMIL SA'D  0.959336
5    103                  JAMEEL SA'D  0.952991
6   1891    EMMA DAVID ANNABELLA SAAD  0.708966
7   1892    EMMA DAVID ANNABELLA SA'D  0.705759
8    343                   EMMA DAVID  0.667079
9    968           KHALID JOSHUA SA'D  0.615134
10   966            KALID JOSHUA SA'D  0.605473
11   967           KHALID JOSHUA SAAD  0.605203
12   965            KALID JOSHUA SAAD  0.596004
13  1096          EMMA ANNABELLA SA'D  0.572101
14  1094              EMMA DAVID SA'D  0.558956
15  1095          EMMA ANNABELLA SAAD  0.555421
16  1802   KALID JONATHON JOSHUA SAAD  0.528511
17   719           JAMEEL NEVAEH SA'D   0.52596
18   721            JAMIL NEVAEH SA'D  0.525801
19  1804  KHALID JONATHON JOSHUA SAAD  0

In [71]:
# Manual check: neighbours of one name within a distance threshold in each index.
# The thresholds differ (8 vs 0.01) because the two indexes are built with
# different distance functions.
name = 'EMMA SAAD'

eng_df = query_lhs_table_by_distance(name, eng_query_object, 8, eng_names, eng_vectors)
print(eng_df)

arab_df = query_lhs_table_by_distance(name, arab_query_object, 0.01, arab_names, arab_vectors)
print(arab_df)

      id                       name    cosine
0    345                  EMMA SAAD         1
1    346                  EMMA SA'D  0.996044
2    104                 JAMIL SAAD  0.967138
3    102                JAMEEL SAAD  0.963519
4    105                 JAMIL SA'D  0.959336
5    103                JAMEEL SA'D  0.952991
6   1891  EMMA DAVID ANNABELLA SAAD  0.708966
7   1892  EMMA DAVID ANNABELLA SA'D  0.705759
8    343                 EMMA DAVID  0.667079
9    968         KHALID JOSHUA SA'D  0.615134
10   966          KALID JOSHUA SA'D  0.605473
11   967         KHALID JOSHUA SAAD  0.605203
12   965          KALID JOSHUA SAAD  0.596004
13  1096        EMMA ANNABELLA SA'D  0.572101
14  1095        EMMA ANNABELLA SAAD  0.555421
       id                  name    cosine
0     448             ايمّه سعد  0.996363
1     450             ايمّا سعد  0.995999
2     444              ايما سعد  0.995383
3     446              إيما سعد  0.993901
4     189              جميل سعد   0.96841
5     186   

In [78]:
%%time
name='EMMA SAAD'
name_vector = call_embedding_ws(name)
distances = pd.DataFrame(index=range(len(eng_vectors)),columns=['name', 'cosine'])
distances.loc[eng_names.index,['name']] = eng_names[['name']]

for i in range(len(eng_vectors)):
    distances.set_value(col='cosine', index=i, value=cos_similarity(name_vector, eng_vectors[i]))

distances = distances.sort_values('cosine', ascending=False)
print(distances.head(25))

                             name    cosine
345                     EMMA SAAD         1
346                     EMMA SA'D  0.996044
104                    JAMIL SAAD  0.967138
102                   JAMEEL SAAD  0.963519
105                    JAMIL SA'D  0.959336
103                   JAMEEL SA'D  0.952991
1891    EMMA DAVID ANNABELLA SAAD  0.708966
1892    EMMA DAVID ANNABELLA SA'D  0.705759
343                    EMMA DAVID  0.667079
968            KHALID JOSHUA SA'D  0.615134
966             KALID JOSHUA SA'D  0.605473
967            KHALID JOSHUA SAAD  0.605203
965             KALID JOSHUA SAAD  0.596004
1096          EMMA ANNABELLA SA'D  0.572101
1094              EMMA DAVID SA'D  0.558956
1095          EMMA ANNABELLA SAAD  0.555421
1093              EMMA DAVID SAAD  0.551626
1802   KALID JONATHON JOSHUA SAAD  0.528511
719            JAMEEL NEVAEH SA'D   0.52596
721             JAMIL NEVAEH SA'D  0.525801
1804  KHALID JONATHON JOSHUA SAAD  0.523945
1803   KALID JONATHON JOSHUA SA'

In [77]:
 %%time
name='EMMA SAAD'
name_vector = call_embedding_ws(name)
distances = pd.DataFrame(index=range(len(arab_vectors)),columns=['name', 'cosine'])
distances.loc[arab_names.index,['name']] = arab_names[['name']]

for i in range(len(arab_vectors)):
    distances.set_value(col='cosine', index=i, value=cos_similarity(name_vector, arab_vectors[i]))

distances = distances.sort_values('cosine', ascending=False)
print(distances.head(25))

                  name    cosine
448          ايمّه سعد  0.996363
450          ايمّا سعد  0.995999
444           ايما سعد  0.995383
446           إيما سعد  0.993901
189           جميل سعد   0.96841
186            جميل يس  0.948129
447        ايمّه سَعْد   0.91121
443         ايما سَعْد  0.909561
449        ايمّا سَعْد  0.908646
445         إيما سَعْد  0.902556
188         جميل سَعْد  0.840796
1913       جميل يس سعد  0.829282
428       ايمّا دايفيد   0.76135
427       ايمّه دايفيد  0.760589
1912     جميل يس سَعْد   0.75626
424        إيما دايفيد  0.751316
429        ايمّا ديفيد  0.750875
430        ايمّه ديفيد  0.750228
423        ايما دايفيد  0.738229
425         إيما ديفيد  0.735267
1292  ايمّا دايفيد سعد  0.734741
1286  ايمّه دايفيد سعد   0.73038
426         ايما ديفيد  0.723312
1288   إيما دايفيد سعد  0.722332
58               ايمّه  0.707611
CPU times: user 54.1 ms, sys: 0 ns, total: 54.1 ms
Wall time: 62.7 ms


In [None]:
# Embed a second, unrelated name for comparison with `name_vector`.
name2 = 'DIERA MIRZAKHODZHAEVA'
# Fixed: this previously embedded `name` again, so `name2` was never used and
# the comparison cells below compared a name with itself.
name2_vector = call_embedding_ws(name2)

In [None]:
# Cosine similarity between the two embeddings fetched in the cells above;
# shown as the cell's output.
cos_similarity(name_vector, name2_vector)

In [None]:
# Print, component by component, whether the two embeddings are equal.
for i, component in enumerate(name_vector):
    print(component == name2_vector[i])

In [32]:
# Request the embedding of the same lower-cased name twice and compare the two
# replies with both similarity measures.
probe_name = 'MARAM ANNABELLA'.lower()
v1 = call_embedding_ws(probe_name)
v2 = call_embedding_ws(probe_name)

c1 = cos_similarity(v1, v2)  # the notebook's own cosine similarity
c2 = cosine(v1,v2)           # scipy.spatial.distance.cosine

print(c1, c2, sep='\n')

print(v1, v2, sep='\n')

1.0
2.24866886223e-08
[  1.43897173e-03   1.59284988e-04  -5.21816546e-05  -2.50471727e-04
   1.44471691e-04   1.98260270e-04   1.01991100e-02   2.70581459e-05
  -1.52154425e-02   2.83561967e-05  -1.05599865e-04  -7.98210735e-04
  -2.07505235e-03   4.61631513e-04  -7.97971152e-04  -3.13056709e-08
   5.44049180e-05   6.00523199e-04  -7.27461975e-06  -5.65245682e-05
   2.58397544e-04  -2.30849138e-04   5.65999799e-05  -3.45343469e-05
   4.71560015e-05  -2.25283508e-03  -1.22555345e-03   4.38437192e-03
  -1.87608425e-03   5.56448882e-04  -6.92250789e-04   5.03992836e-04
  -2.00928571e-05  -4.52958338e-04  -1.15051535e-05   1.79738319e-03
   3.81228374e-03  -3.68694491e-05   3.31550208e-03   3.22460401e-05
   3.54341464e-04  -1.89080325e-04   8.82057473e-03   3.23827553e-04
   4.44680845e-05  -1.53770034e-05   1.09931175e-03   5.96133352e-04
   1.19633861e-02  -1.18477416e-04   1.96502791e-04  -2.86287774e-04
  -8.81545153e-03   1.75431724e-05  -2.26106004e-05   7.16222276e-04
  -1.3719731

In [22]:
# Print, component by component, whether v1 and v2 are equal.
for idx, val in enumerate(v1):
    print(val == v2[idx])

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
Tr

In [28]:
# Print each component of v1 on its own line.
for vpos in v1:
    print(vpos)

0.00813494
-0.00213794
0.00364781
-9.93403e-05
0.00159343
-0.0470871
-0.0597995
-0.0249652
0.0223861
-0.0089862
0.0123881
-0.00111117
0.408702
-0.00447111
-0.0708167
0.00200406
-0.0046041
0.0235544
0.000846076
9.33684e-05
0.00427128
-0.00189656
-0.0362947
0.00230419
-0.0560951
-0.0340373
0.0235775
0.00113248
-0.00461807
-0.0483622
0.0519862
-0.033895
-0.00120991
-0.00393242
-0.00525278
-0.0231511
-0.104093
-0.0114212
0.128902
-0.000776816
-0.0205441
0.00939787
-0.00332069
-0.0938533
0.0111644
-0.00379884
-0.0429695
-0.00938674
-0.0868756
-0.0661556
0.00740574
0.0925865
0.0922584
0.00844218
0.0170341
0.0154128
-0.0763395
-0.0244592
-0.0391165
-0.000462669
-0.0929543
0.0984061
0.150973
0.0181238
0.0331664
0.112275
-0.0261222
-0.00419576
-0.00307963
0.0145056
-0.0105298
-0.000519382
0.053297
-0.00224331
0.022539
0.00450963
0.143143
0.0362292
0.00592352
0.0537954
0.483291
0.0135497
0.0401769
0.00110281
-0.263764
-0.00752281
0.00636723
-0.0241442
-0.0142562
-0.000869788
0.144803
0.0770645
0

In [30]:
# Spot check: ordering comparison between two components of v1 (indices 1 and 3).
print(v1[1] < v1[3])

True


In [33]:
# Element-wise comparison of the two embeddings; prints a boolean array.
print(v1 == v2)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

In [34]:
# Serialise both embeddings to comma-separated text (brackets and spaces
# removed) and check that the two strings are identical.
v1_str, v2_str = (
    np.array2string(vec, separator=',', max_line_width=10000).strip("[]").replace(" ", "")
    for vec in (v1, v2)
)

print(v1_str == v2_str)

True
