### Generates vectors for the unique names in the source files and creates search indexes

In [1]:
import numpy as np 
import pandas as pd 
import pyphi
import requests
import os
import falconn

In [2]:
def query_falconn_index(name, lhs_table, number, names, vectors):
    """Find the nearest indexed names for a query and rank them by cosine.

    Args:
        name: Query passed unchanged to ``call_embedding_ws``; the calls in
            this notebook pass a one-element list of names. Only the first
            returned embedding is used for the index lookup.
        lhs_table: Object exposing ``find_k_nearest_neighbors(query, k=...)``
            (the notebook passes a FALCONN query object).
        number: Number of neighbours requested from the index.
        names: Table of names handed to ``process_lhs_table_response``.
        vectors: Vectors handed to ``process_lhs_table_response``.

    Returns:
        Whatever ``process_lhs_table_response`` returns for the neighbours.
    """
    embedding = call_embedding_ws(name)
    neighbour_ids = lhs_table.find_k_nearest_neighbors(embedding[0], k=number)

    # The full embedding array (not just row 0) is forwarded for scoring.
    return process_lhs_table_response(embedding, neighbour_ids, names, vectors)

def process_lhs_table_response(query, response, names, vectors):
    """Turn nearest-neighbour ids into a table ranked by cosine similarity.

    Args:
        query: Query embedding, passed unchanged to ``cos_similarity``.
        response: Iterable of neighbour ids. Each id is used both as an index
            label into ``names`` and as a row index into ``vectors``.
        names: DataFrame with a ``name`` column.
        vectors: Indexable collection holding one vector per id.

    Returns:
        DataFrame with columns ``id``, ``name`` and ``cosine``, sorted by
        ``cosine`` in descending order and re-indexed from 0. Empty (but with
        the same columns) when ``response`` is empty.
    """
    # DataFrame.get_value / set_value were removed in pandas 1.0; ``.at`` is
    # the label-based scalar accessor, and building the rows up front avoids
    # cell-by-cell writes into a pre-allocated frame.
    rows = [
        (resp, names.at[resp, 'name'], cos_similarity(query, vectors[resp]))
        for resp in response
    ]
    df = pd.DataFrame(rows, columns=['id', 'name', 'cosine'])

    df = df.sort_values(by='cosine', ascending=False)
    return df.reset_index(drop=True)

def cos_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The result is ``np.dot(a, b)`` divided by the product of
    ``np.linalg.norm(a)`` and ``np.linalg.norm(b)``, so its shape follows
    ``np.dot`` (e.g. a ``(1, n)`` query against an ``(n,)`` vector yields a
    one-element array). Zero-norm inputs are not guarded against.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def call_embedding_ws(names):
    """Fetch embedding vectors for ``names`` from the embedding web service.

    Args:
        names: List of name strings. A single string is accepted too and is
            treated as a one-element list. Names are lower-cased before being
            sent.

    Returns:
        ``np.float32`` array built from the JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(names, str):
        names = [names]
    payload = [item.lower() for item in names]

    # Local service instance; alternative host kept for reference:
    #    url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'

    # ``json=`` makes requests serialise the payload and set the JSON
    # content-type header itself, so no explicit headers are needed.
    response = requests.post(url, json=payload)
    # Fail loudly instead of trying to parse an error page as vectors.
    response.raise_for_status()

    return np.asarray(response.json(), dtype=np.float32)

# Directory scanned for the '|'-separated source *.csv files of distinct names
# (read by the generation cell, which is currently disabled).
input_path = 'data/src_distinct_names/src'
# Directory holding the 'vectors_<source file>' outputs; every *.csv found
# here is read back when the vectors are loaded.
output_path = 'data/src_distinct_names'

#### Calculate vectors for the names in the source files and write them to output files

In [None]:
# Disabled cell: the whole body below is wrapped in a string literal, so none
# of it runs. As written, it would walk every *.csv in input_path, send the
# stripped, lower-cased names to call_embedding_ws in batches, and write one
# "name|v1,v2,..." line per name to output_path/vectors_<source file>.
# NOTE(review): i starts at -1 and a batch is flushed whenever i % 512 == 0,
# so the first request carries a single name and later ones 512 names each.
# NOTE(review): the output handle reuses the name `file`, shadowing the
# directory-loop variable, and is closed manually rather than via `with`.
'''%%time
directory = os.fsencode(input_path)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"): 
        print(filename)
        names = pd.read_csv(input_path + '/' + filename, sep='|', names=['name'], header=None)
#        np.where(pd.isnull(names))
#        names.iloc[245193]
        
        i = -1
        j = -1
        ws_params = []

        file = open(output_path + '/vectors_' + filename,"w") 

        for index, row in names.iterrows():
            ws_params.append(row['name'].strip().lower())
            i = i + 1
            if i % 512 == 0:
                ws_result = call_embedding_ws(ws_params)
                for vect in ws_result:
                    j = j + 1
                   
                    v_str = np.array2string(vect, separator=',', max_line_width=10000).strip("[]").replace(" ", "")
                    
                    #print("{n}|{v}\n".format(n=names['name'][j], v=vect[:5]))
                    file.write("{n}|{v}\n".format(n=names['name'][j], v=v_str))
                ws_params = []
                print(i)

        if len(ws_params) > 0:
            ws_result = call_embedding_ws(ws_params)
            for vect in ws_result:
                j = j + 1
                v_str = np.array2string(vect, separator=',', max_line_width=10000).strip("[]").replace(" ", "")
                file.write("{n}|{v}\n".format(n=names['name'][j], v=v_str))

        file.close()'''

#### Read vector files and parse vector strings

In [3]:
# Load every vector file in output_path.
#   names_dict:   file name -> DataFrame with 'name' and 'vector_string' columns
#   vectors_dict: file name -> float32 array with one parsed vector per row

# Width of every stored embedding vector.
VECTOR_DIM = 256

names_dict = {}
vectors_dict = {}
for filename in os.listdir(output_path):
    if not filename.endswith(".csv"):
        continue
    print(filename)

    names_dict[filename] = pd.read_csv(
        os.path.join(output_path, filename),
        sep='|', names=['name', 'vector_string'], header=None)

    # Series.as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas releases.
    vector_strings = names_dict[filename]['vector_string'].values
    vectors = np.zeros(shape=(len(vector_strings), VECTOR_DIM), dtype=np.float32)
    for i, v in enumerate(vector_strings):
        try:
            vectors[i] = np.fromstring(v, dtype=np.float32, sep=',')
        except Exception:
            # Show the offending row before propagating the error.
            print("{i} {v}".format(i=i, v=v))
            raise

    vectors_dict[filename] = vectors

vectors_given_names_master_eng_distinct.csv
vectors_given_names_dan_arb_distinct.csv
vectors_given_names_dan_eng_distinct.csv
vectors_family_names_master_arb_distinct.csv
vectors_family_names_master_eng_distinct.csv
vectors_given_names_master_arb_distinct.csv


#### Merge English and Arabic names separately

In [14]:
def _merge_names_for_language(frames_by_file, language_tag, dim=256):
    """Merge the name tables whose file name contains ``language_tag``.

    Args:
        frames_by_file: Mapping of file name to a DataFrame with ``name`` and
            ``vector_string`` columns.
        language_tag: Substring selecting the files to merge ('eng' / 'arb').
        dim: Number of components expected in every vector string.

    Returns:
        Tuple ``(names, vectors)``: the merged DataFrame with duplicate names
        dropped (first occurrence kept) and a fresh 0..n-1 index, and a
        float32 array of shape ``(len(names), dim)`` whose row ``i`` is the
        parsed vector of ``names`` row ``i``.
    """
    frames = [frame for key, frame in frames_by_file.items() if language_tag in key]
    if frames:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids re-copying the accumulated frame for every file.
        merged = pd.concat(frames, ignore_index=True)
    else:
        merged = pd.DataFrame(columns=['name', 'vector_string'])
    merged = merged.drop_duplicates(['name']).reset_index(drop=True)

    # Series.as_matrix() was removed in pandas 1.0; .values is equivalent.
    vector_strings = merged['vector_string'].values
    vectors = np.zeros(shape=(len(vector_strings), dim), dtype=np.float32)
    for row, vector_string in enumerate(vector_strings):
        vectors[row] = np.fromstring(vector_string, dtype=np.float32, sep=',')

    return merged, vectors


eng_names_merged, eng_vectors_merged = _merge_names_for_language(names_dict, 'eng')
arb_names_merged, arb_vectors_merged = _merge_names_for_language(names_dict, 'arb')


#### Build Falconn indexes

In [15]:
%%time
def _make_lsh_params(dimension, num_tables):
    """Build the FALCONN construction parameters shared by both indexes.

    Args:
        dimension: Number of components in each indexed vector.
        num_tables: Number of hash tables (FALCONN's ``l`` parameter).

    Returns:
        A configured ``falconn.LSHConstructionParameters`` instance.
    """
    params = falconn.LSHConstructionParameters()
    params.dimension = dimension
    params.lsh_family = falconn.LSHFamily.CrossPolytope
    params.distance_function = falconn.DistanceFunction.EuclideanSquared
    params.l = num_tables
    # Two rotations are used.
    # NOTE(review): the comment that came with this code recommended one
    # rotation for dense data and two for sparse data - confirm 2 is intended.
    params.num_rotations = 2
    params.seed = 5721840
    # 0 is intended to mean "use all the available threads" during setup.
    params.num_setup_threads = 0
    params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # 18-bit hashes give 2^18 bins per table, which is of the same order of
    # magnitude as the number of data points (a few hundred thousand names).
    falconn.compute_number_of_hash_functions(18, params)
    return params


number_of_tables = 50
assert eng_vectors_merged.dtype == np.float32
assert arb_vectors_merged.dtype == np.float32

# English index. shape[1] is the vector width and, unlike len(x[0]), does not
# fail on an empty array.
params_cp = _make_lsh_params(eng_vectors_merged.shape[1], number_of_tables)
eng_merged_table = falconn.LSHIndex(params_cp)
eng_merged_table.setup(eng_vectors_merged)
eng_merged_query_object = eng_merged_table.construct_query_object()

# Arabic index, built with the same settings.
params_cp = _make_lsh_params(arb_vectors_merged.shape[1], number_of_tables)
arb_merged_table = falconn.LSHIndex(params_cp)
arb_merged_table.setup(arb_vectors_merged)
arb_merged_query_object = arb_merged_table.construct_query_object()

CPU times: user 53.8 s, sys: 181 ms, total: 53.9 s
Wall time: 14 s


In [23]:
# Query the English index with an Arabic-script name, requesting 20 neighbours.
query_falconn_index(['محمد'], eng_merged_query_object, 20, eng_names_merged, eng_vectors_merged)

Unnamed: 0,id,name,cosine
0,213303,MHAMDI,[0.999982]
1,225048,MUHMAD,[0.999982]
2,218466,MOHAMADI,[0.999982]
3,220233,MOHMADI,[0.999979]
4,373146,MUHEMID,[0.999979]
5,225202,MUJISUKAMTI,[0.999978]
6,219118,MOHAMEDI,[0.999977]
7,218486,MOHAMADY,[0.999976]
8,220181,MOHIMID,[0.999976]
9,220275,MOHMUD,[0.999976]


In [22]:
# Same query against the Arabic index, again requesting 20 neighbours.
query_falconn_index(['محمد'], arb_merged_query_object, 20, arb_names_merged, arb_vectors_merged)

Unnamed: 0,id,name,cosine
0,202,محمد,[1.0]
1,265806,محمد,[1.0]
2,260823,مامد,[0.999998]
3,275830,موماد,[0.999995]
4,275922,موموح,[0.999994]
5,275923,مومود,[0.999994]
6,281869,ميماد,[0.999994]
7,251813,ماجومت,[0.999994]
8,275940,موموه,[0.999994]
9,267330,محمود,[0.999993]


In [12]:
# Number of distinct English names left after merging and de-duplication.
len(eng_names_merged)

388923