In [1]:
import pyarabic.unshape 

In [2]:
import math
import json
import pickle
import numpy as np
import falconn
import timeit
from __future__ import print_function
import psycopg2
import pandas as pd
from configparser import ConfigParser
import requests
import pyphi
import re

In [90]:
def call_embedding_ws(name):
    #url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'
    headers = {"content-type": "application/json;charset=utf-8"}
    response = requests.post(url, json=[name.upper()])
    
    arr = pyphi.jsonify.loads(response.text)
    x = np.array(arr)
    x = x.astype(dtype=np.float32)
    return x[0]

def reverse_name(name):
    a=name.split()
    a.reverse()
    return " ".join(a)

def cos_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)
    return dot_product

def sort_by_distance(query_vector, result_vectors):
    for v in result_vectors:
        cos_similarity(query_vector, v)
        
        
def query_lhs_table_by_distance(name, lhs_table, distance, names, vectors):
    query = call_embedding_ws(name)
    #query = eng_vectors_ee[0]
    t1 = timeit.default_timer()
    response = lhs_table.find_near_neighbors(query, distance)
    
    return process_lhs_table_response(query, response, names, vectors)
    
def query_lhs_table_by_number(name, lhs_table, number, names, vectors):
    query = call_embedding_ws(name)
    #query = eng_vectors_ee[0]
    t1 = timeit.default_timer()
    response = lhs_table.find_k_nearest_neighbors(query, k=number)
    
    return process_lhs_table_response(query, response, names, vectors)

def query_lhs_table_nearest(name, lhs_table, names, vectors):
    query = call_embedding_ws(name)
    #query = eng_vectors_ee[0]
    t1 = timeit.default_timer()
    response = lhs_table.find_nearest_neighbor(query)
    print(response)
    return process_lhs_table_response(query, [response], names, vectors)

def process_lhs_table_response(query, response, names, vectors):
    df = pd.DataFrame(index=range(len(response)), columns=['id', 'name', 'cosine'])   
    i = 0

    for resp in response:
        name = names.get_value(resp, 'name')
        df.set_value(index=i, col='id', value=resp)
        df.set_value(index=i, col='name', value=name)
        df.set_value(index=i, col='cosine', value=cos_similarity(query, vectors[resp]))
        i = i + 1
     
    df.sort_values('cosine', ascending=False)
    df = df.reset_index(drop=True)

    #t2 = timeit.default_timer()    
    #time = t2 - t1
    return df


def is_arabic(name):
    res = re.findall(r'[\u0600-\u06FF]+',name)
    if len(res) == 0:
        return False
    else:
        return True

In [117]:
#arab_names = pd.read_csv('/data/ARB_FULL_NAMES_VECTORS.csv', sep='|').reset_index(drop=True)
#eng_names = pd.read_csv('/data/ENG_FULL_NAMES_VECTORS.csv', sep='|').reset_index(drop=True)
eng_names = pd.read_csv('/data/ENG_TEST_NAME_VECTORS.csv', sep='|').reset_index(drop=True)

#merged_names = arab_names.append(eng_names).reset_index(drop=True)

In [118]:
eng_vector_strings = eng_names['vector'].as_matrix()
eng_vectors = np.zeros(shape=(len(eng_vector_strings),256))
i = 0
for engv in eng_vector_strings:
    x = np.fromstring(engv, dtype=np.float32, sep=',')
    eng_vectors[i] = x
    i = i+1
eng_vectors = eng_vectors.astype(dtype=np.float32)
eng_vector_strings = None

In [6]:
arab_vector_strings = arab_names['vector'].as_matrix()
arab_vectors = np.zeros(shape=(len(arab_vector_strings),256))
i = 0
for arabv in arab_vector_strings:
    x = np.fromstring(arabv, dtype=np.float32, sep=',')
    arab_vectors[i] = x
    i = i+1
arab_vectors = arab_vectors.astype(dtype=np.float32)
arab_vector_strings = None

In [7]:
'''
merged_vector_strings = merged_names['vector'].as_matrix()
merged_vectors = np.zeros(shape=(len(merged_vector_strings),256))
i = 0
for mrgv in merged_vector_strings:
    x = np.fromstring(mrgv, dtype=np.float32, sep=',')
    merged_vectors[i] = x
    i = i+1
merged_vectors = merged_vectors.astype(dtype=np.float32)
merged_vector_strings = None
'''

In [119]:
number_of_tables = 50
assert eng_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(eng_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(10, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
eng_table = falconn.LSHIndex(params_cp)
eng_table.setup(eng_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

eng_query_object = eng_table.construct_query_object()

Constructing the LSH table
Done
Construction time: 0.04594984196592122


In [9]:
number_of_tables = 60
assert arab_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(arab_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(16, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
arab_table = falconn.LSHIndex(params_cp)
arab_table.setup(arab_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

arab_query_object = arab_table.construct_query_object()

Constructing the LSH table
Done
Construction time: 19.02402669400908


In [None]:
'''
number_of_tables = 50
assert merged_vectors.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(merged_vectors[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(20, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
merged_table = falconn.LSHIndex(params_cp)
merged_table.setup(merged_vectors)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

merged_query_object = merged_table.construct_query_object()
'''

In [None]:
# GET /near_neighbors

req = json.loads(REQUEST)
args = req['args']

if 'name' not in args:
    print(json.dumps({'nearNeighbors': None}))
else:
    name = args['name'][0]
    if 'distance' not in args:
        distance = 1
    else:
        distance = float(args['distance'][0])
    
    eng_df = query_lhs_table_by_distance(
        name=name, 
        lhs_table=eng_query_object, 
        distance=distance, 
        names=eng_names, 
        vectors=eng_vectors)
    
    eng_df_reversed = query_lhs_table_by_distance(
        name=reverse_name(name), 
        lhs_table=eng_query_object, 
        distance=distance, 
        names=eng_names,  
        vectors=eng_vectors)
    
    
    arb_df = query_lhs_table_by_distance(
        name=name, 
        lhs_table=arab_query_object, 
        distance=distance, 
        names=arab_names,  
        vectors=arab_vectors)
    
    arb_df_reversed = query_lhs_table_by_distance(
        name=reverse_name(name), 
        lhs_table=arab_query_object, 
        distance=distance, 
        names=arab_names,  
        vectors=arab_vectors)
    
    eng_df_merged = eng_df.append(eng_df_reversed).drop_duplicates(['name'], keep='last')    
    eng_df_merged = eng_df_merged.sort_values('cosine', ascending=False)
    eng_json = eng_df_merged.to_json(orient='records')[1:-1]
    
    arb_df_merged = arb_df.append(arb_df_reversed).drop_duplicates(['name'], keep='last')    
    arb_df_merged = arb_df_merged.sort_values('cosine', ascending=False)
    arb_json = arb_df_merged.to_json(orient='records')[1:-1]
    
    print('{\"english\":[' + eng_json + '], \"arabic\":[' + arb_json + ']}')

In [None]:
# GET /near_n_neighbors

req = json.loads(REQUEST)
args = req['args']

if 'name' not in args:
     print('{\"english\":[], \"arabic\":[]}')
else:
    name = args['name'][0]
    if 'num' not in args:
        number = 1
    else:
        number = int(args['num'][0])
    
    eng_df = query_lhs_table_by_number(
        name=name, 
        lhs_table=eng_query_object, 
        num=number, 
        names=eng_names, 
        vectors=eng_vectors)
    
    eng_df_reversed = query_lhs_table_by_number(
        name=reverse_name(name), 
        lhs_table=eng_query_object, 
        num=number, 
        names=eng_names,  
        vectors=eng_vectors)
    
    
    arb_df = query_lhs_table_by_number(
        name=name, 
        lhs_table=arab_query_object, 
        num=number,  
        names=arab_names,  
        vectors=arab_vectors)
    
    arb_df_reversed = query_lhs_table_by_number(
        name=reverse_name(name), 
        lhs_table=arab_query_object, 
        num=number, 
        names=arab_names,  
        vectors=arab_vectors)
    
    eng_df_merged = eng_df.append(eng_df_reversed).drop_duplicates(['name'], keep='last')    
    eng_df_merged = eng_df_merged.sort_values('cosine', ascending=False)
    eng_json = eng_df_merged.to_json(orient='records')[1:-1]
    
    arb_df_merged = arb_df.append(arb_df_reversed).drop_duplicates(['name'], keep='last')    
    arb_df_merged = arb_df_merged.sort_values('cosine', ascending=False)
    arb_json = arb_df_merged.to_json(orient='records')[1:-1]
    
    print('{\"english\":[' + eng_json + '], \"arabic\":[' + arb_json + ']}')

In [53]:
def cos_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)
    return dot_product

In [126]:
name='MARAM ANNABELLA'

eng_df = query_lhs_table_by_number(
        name=name, 
        lhs_table=eng_query_object, 
        number=25, 
        names=eng_names, 
        vectors=eng_vectors)

print(eng_df)

    id             name cosine
0   25    ASEEL TRINITY      1
1   26    ASEIL TRINITY      1
2    3     MARAM ASALAH      1
3    0      MARAM NAOMI      1
4    2      MARAM ASALA      1
5    6       MURAD ELLA      1
6    7       MORAD ELLA      1
7    8     MURAD MONEER      1
8    4   MURAD SERENITY      1
9    1  MARAM ANNABELLA      1
10   5   MORAD SERENITY      1
11  28    ASALA TASNEEM      1
12  13      MURAD MONIR      1
13  14     MURAD MUNEER      1
14  15      MURAD MUNIR      1
15  16   ASEEL JONATHAN      1
16  17   ASEEL JONATHON      1
17  18   ASEIL JONATHAN      1
18  19   ASEIL JONATHON      1
19   9     MORAD MONEER      1
20  10      MORAD MONIR      1
21  22  ASEEL SEBASTIAN      1
22  23  ASEIL SEBASTIAN      1
23  11     MORAD MUNEER      1
24  12      MORAD MUNIR      1


In [127]:
%%time
#name='ABDAL RAHEEM'
name='MARAM ANNABELLA'
name_vector = call_embedding_ws(name)
distances = pd.DataFrame(index=range(len(eng_vectors)),columns=['name', 'cosine'])
distances.loc[eng_names.index,['name']] = eng_names[['name']]

for i in range(len(eng_vectors)):
    distances.set_value(col='cosine', index=i, value=cos_similarity(name_vector, eng_vectors[i]))
   

CPU times: user 50.2 ms, sys: 2.22 ms, total: 52.4 ms
Wall time: 65.7 ms


In [129]:
distances = distances.sort_values('cosine', ascending=False)
print(distances.head(150)) 

                 name    cosine
0         MARAM NAOMI         1
26      ASEIL TRINITY         1
281    RAKAN IZABELLE         1
33       ASALA MANAAR         1
32        ASALA MANAR         1
284       RAZAN RAKAN         1
285      RAZAN RAAKAN         1
278    RAKAN ISABELLA         1
29       ASALA TASNIM         1
28      ASALA TASNEEM         1
286      RAZAN NASEEM         1
46        MARIA MURAD         1
47        MARIA MORAD         1
275      RAKAN MAASON         1
539       MASON MARAM         1
274       RAKAN MASON         1
25      ASEEL TRINITY         1
37      ASALA MIKHAEL         1
287       RAZAN NASIM         1
23    ASEIL SEBASTIAN         1
22    ASEEL SEBASTIAN         1
288       RAZAN NSEEM         1
289      RAZAN JOSHUA         1
19     ASEIL JONATHON         1
18     ASEIL JONATHAN         1
55         MARIA SUAD         1
254    KALID JONATHON         1
407      ISAAK NASEEM         1
75      NASIM MUNEERA         1
201       SOAAD MARAM         1
..      

In [124]:
name2 = 'DIERA MIRZAKHODZHAEVA'
name2_vector = call_embedding_ws(name)

In [125]:
cos_similarity(name_vector, name2_vector)

1.0

In [77]:
for i in range(len(name_vector)):
    print(name_vector[i]==name2_vector[i])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [132]:
v1 = call_embedding_ws('ASALA TASNIM')
v2 = call_embedding_ws('MARIA MORAD')
cos_similarity(v1, v2)

1.0

In [133]:
for idx, val in enumerate(v1):
    print(v1[idx] == v2[idx])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
