In [47]:
import numpy as np 
import pandas as pd 
import os
import falconn
import pyphi
import requests
import random
import psycopg2
from configparser import ConfigParser
import pickle
import csv
import pyarabic.araby as araby

target_dir = "/home/jupyter/notebooks/PoC/data-preparation/output/understanding_data/"

def config(filename='prepare_data.ini', section='phonetic'):
    parser = ConfigParser()
    parser.read(filename)
 
    # get section, default to postgresql
    db = {}
    if parser.has_section(section):
        params = parser.items(section)
        for param in params:
            db[param[0]] = param[1]
    else:
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))
 
    return db

def db_connect():
    """ Connect to the PostgreSQL database server """
    conn = None
    params = config()
    conn = psycopg2.connect(**params)
    print('Connected to the PostgreSQL database...')
    
    return conn

def read_given_names():
    conn = db_connect()
    sql_result = pd.DataFrame()
    try:
        query = """
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS COUNT FROM (
                    SELECT ENG, ARB, COUNT FROM GIVEN_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S
            UNION ALL
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS COUNT FROM (
                    SELECT ENG, ARB, COUNT FROM FAMILY_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S
            UNION ALL
            SELECT * FROM (
                SELECT ENG, ARB, SUM(FREQ) AS COUNT FROM (
                    SELECT ENG, ARB, FREQ FROM GIVEN_NAMES_DAN
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S

            """

        sql_result = pd.read_sql(query, con=conn)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return sql_result

In [48]:
def build_top_given_names(min_count):
    global top_given_names, all_eng_given_names, all_arb_given_names, low_accuracy_names
    given_names = read_given_names()
    given_names['trimmed_eng'] = given_names['eng'].str.strip()
    given_names['trimmed_arb'] = given_names['arb'].str.strip()

    #given_names['trimmed_eng'] = given_names['eng'].map(lambda x: x.strip(' '))
    #given_names['trimmed_arb'] = given_names['arb'].map(lambda x: x.strip(' '))
    given_names['count'] = given_names.groupby(['trimmed_eng', 'trimmed_arb'])['count'].transform('sum')
    given_names = given_names.drop_duplicates(subset=['trimmed_eng', 'trimmed_arb'], keep='first')
    
    low_accuracy_threshold = 20
    if min_count < low_accuracy_threshold:
        top_given_names = given_names[given_names['count'] >= low_accuracy_threshold]
        top_given_names = top_given_names.copy()
        low_accuracy_names = given_names[(given_names['count'] >= min_count) & (given_names['count'] < low_accuracy_threshold)]
        low_accuracy_names = low_accuracy_names.copy()
    else:
        top_given_names = given_names[given_names['count'] >= min_count]
        top_given_names = top_given_names.copy()
        low_accuracy_names = pd.DataFrame(columns=['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'eng_variants', 'arb_variants'])
    
    all_eng_given_names = list(set(top_given_names['trimmed_eng'].tolist()))
    all_arb_given_names = list(set(top_given_names['trimmed_arb'].tolist()))
    #random.shuffle(all_eng_given_names)
    #random.shuffle(all_arb_given_names)
    
def format_variants_list(s):
    return [','.join(sorted(set(s.tolist())))]

def format_variants_list2(s):
    lis = s.tolist()
    flat_list = [item for sublist in lis for item in sublist.split(',')]

    return ','.join(sorted(set(flat_list)))

def get_variants_count(s):
    return len(set(s.tolist()))

def get_yamli_arabic_varinats(name):
    name = ''.join(e for e in name if e.isalnum())
    url = 'http://api.yamli.com/transliterate.ashx?tool=api&account_id=&prot=http%3A&hostname=fuzzyarabic.herokuapp.com&path=%2F&build=5515&sxhr_id=9&word=' + name 
    
    headers = {"content-type": "application/json"}
    
    response = requests.get(url)
    if len(response.text) > 66:
        arr = pyphi.jsonify.loads(response.text[62:-4])['data']
        data = pyphi.jsonify.loads(arr)['r']
        variants = data.split('|')
    else:
        variants = []
    variants = [''.join(e for e in name if e.isalnum() and not e.isdigit()) for name in variants]
    return variants

def save_obj(obj, name ):
    with open(target_dir + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(target_dir + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
i = 0
def add_to_yamli_dict(eng):
    global i
    i += 1
    if i % 1000 == 0:
        print("processed {i} out of {t}".format(i=i, t=len(all_eng_given_names)))
    eng_names_with_yamli_dict[eng] = get_yamli_arabic_varinats(eng)

In [49]:
def query_falconn_index(name, lhs_table, number, names, vectors):
    query = call_embedding_ws(name)
    response = lhs_table.find_k_nearest_neighbors(query[0], k=number)
    
    return process_lhs_table_response(query, response, names, vectors)

def process_lhs_table_response(query, response, names, vectors):
    df = pd.DataFrame(index=range(len(response)), columns=['id', 'name', 'cosine'])   
    i = 0
    
    for resp in response:
        name = names.get_value(resp, 'name')
        cos = cos_similarity(query, vectors[resp])
        df.set_value(index=i, col='id', value=resp)
        df.set_value(index=i, col='name', value=name)
        df.set_value(index=i, col='cosine', value=cos)
        i = i + 1
     
    df = df.sort_values(by='cosine', ascending=False)
    df = df.reset_index(drop=True)

    return df

def cos_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)
    return dot_product

def call_embedding_ws(names):
    names = [item.lower() for item in names]
    
#    url = 'http://54.36.53.127:8009/embedding'
    url = 'http://127.0.0.1:8009/embedding'
    headers = {"content-type": "application/json"}
    response = requests.post(url, json=names)
    
    arr = pyphi.jsonify.loads(response.text)
    x = np.array(arr)
    x = x.astype(dtype=np.float32)
    
    return x

input_path = 'data/src_distinct_names/src'
output_path = 'data/src_distinct_names'

In [None]:
%%time
directory = os.fsencode(output_path)
names_dict = {}
vectors_dict = {}
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"): 
        print(filename)
        
        names_dict[filename] = pd.read_csv(output_path + '/' + filename, sep='|', names=['name', 'vector_string'], header=None)
        
        vector_strings = names_dict[filename]['vector_string'].as_matrix()
        vectors = np.zeros(shape=(len(vector_strings),256))
        i = 0
        for v in vector_strings:
            try:
                x = np.fromstring(v, dtype=np.float32, sep=',')
                vectors[i] = x
                i = i+1
            except:
                print("{i} {v}".format(i=i, v=v))
                raise
            
        vectors_dict[filename] = vectors.astype(dtype=np.float32)

In [None]:
%%time
eng_names_merged =  pd.DataFrame(columns=['name', 'vector_string'])
arb_names_merged = pd.DataFrame(columns=['name', 'vector_string'])


for key, value in names_dict.items():
    if 'eng' in key:
        eng_names_merged = eng_names_merged.append(value).reset_index(drop=True)

eng_names_merged = eng_names_merged.drop_duplicates(['name']).reset_index(drop=True)
eng_names_merged = eng_names_merged.reset_index(drop=True)
        
vector_strings = eng_names_merged['vector_string'].as_matrix()
eng_vectors_merged = np.zeros(shape=(len(vector_strings),256))
i = 0
for v in vector_strings:
    x = np.fromstring(v, dtype=np.float32, sep=',')
    eng_vectors_merged[i] = x
    i = i+1
    
eng_vectors_merged = eng_vectors_merged.astype(dtype=np.float32)



for key, value in names_dict.items():
    if 'arb' in key:
        arb_names_merged = arb_names_merged.append(value).reset_index(drop=True)

arb_names_merged = arb_names_merged.drop_duplicates(['name']).reset_index(drop=True)
arb_names_merged = arb_names_merged.reset_index(drop=True)
        
vector_strings = arb_names_merged['vector_string'].as_matrix()
arb_vectors_merged = np.zeros(shape=(len(vector_strings),256))
i = 0
for v in vector_strings:
    x = np.fromstring(v, dtype=np.float32, sep=',')
    arb_vectors_merged[i] = x
    i = i+1
arb_vectors_merged = arb_vectors_merged.astype(dtype=np.float32) 

In [None]:
%%time
number_of_tables = 50
assert eng_vectors_merged.dtype == np.float32
assert arb_vectors_merged.dtype == np.float32

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(eng_vectors_merged[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(18, params_cp)

eng_merged_table = falconn.LSHIndex(params_cp)
eng_merged_table.setup(eng_vectors_merged)

eng_merged_query_object = eng_merged_table.construct_query_object()




params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(arb_vectors_merged[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough,
# for sparse data set it to 2
params_cp.num_rotations = 2
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# we build 24-bit hashes so that each table has
# 2^24 bins; this is a good choise since 2^24 is of the same
# order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(18, params_cp)

arb_merged_table = falconn.LSHIndex(params_cp)
arb_merged_table.setup(arb_vectors_merged)

arb_merged_query_object = arb_merged_table.construct_query_object()

In [4]:
%%time

build_top_given_names(20)
print(len(top_given_names))
print(len(set(all_eng_given_names)))
print(len(set(all_arb_given_names)))

Connected to the PostgreSQL database...
226614
115182
80855
CPU times: user 3.61 s, sys: 116 ms, total: 3.73 s
Wall time: 9.9 s


In [5]:
%%time
set_len = len(top_given_names)
eng_names_with_yamli_dict = load_obj('eng_names_with_yamli_dict')

for index, row in top_given_names.iterrows():
    name = row['trimmed_eng']
    
    yamli_variants = eng_names_with_yamli_dict[name]
    top_given_names.set_value(index, 'yamli_variants', ','.join(set(yamli_variants)))

CPU times: user 16.3 s, sys: 28.7 ms, total: 16.3 s
Wall time: 16.3 s


In [6]:
%%time
eng_name_with_variants = top_given_names.copy()
eng_name_with_variants['arb_variants'] = eng_name_with_variants.groupby('trimmed_eng')['arb'].transform(format_variants_list)
#eng_name_with_variants['arb_variants'] = eng_name_with_variants.groupby('trimmed_eng')['arb'].transform(format_variants_list)
#eng_name_with_variants['arb_variants_count'] = eng_name_with_variants.groupby('trimmed_eng')['arb'].transform(get_variants_count)
#eng_name_with_variants['eng_variants'] = eng_name_with_variants.groupby('trimmed_eng')['eng_variants'].transform(format_variants_list2)

arb_name_with_variants = top_given_names.copy()
arb_name_with_variants['eng_variants'] = arb_name_with_variants.groupby('trimmed_arb')['eng'].transform(format_variants_list)
#arb_name_with_variants['arb_variants_count'] = arb_name_with_variants.groupby('trimmed_arb')['eng'].transform(get_variants_count)



CPU times: user 41 s, sys: 210 ms, total: 41.2 s
Wall time: 41.2 s


In [7]:
%%time
#cols_to_use = eng_name_with_variants.columns.difference(arb_name_with_variants.columns)
#eee = pd.merge(eng_name_with_variants, arb_name_with_variants, left_index=True, right_index=True, how='outer')

#eee = (pd.merge(eng_name_with_variants, arb_name_with_variants, on='trimmed_eng', how='outer'))
aaa = (pd.merge(arb_name_with_variants, eng_name_with_variants, on='trimmed_arb', how='outer'))

CPU times: user 14.1 s, sys: 1.31 s, total: 15.4 s
Wall time: 15.4 s


In [None]:
%%time
eee = eee.drop_duplicates(subset=['trimmed_eng','eng_variants'], keep='first')
eee['eng_variants'] = eee.groupby('trimmed_eng')['eng_variants'].transform(format_variants_list2)
eee = eee.drop_duplicates(subset=['trimmed_eng','eng_variants'], keep='first')


In [None]:
eee

In [None]:
del eee['trimmed_eng']
del eee['arb_x']
del eee['count_x']
del eee['trimmed_arb_x']
del eee['eng_y']
del eee['arb_y']
del eee['count_y']
del eee['trimmed_arb_y']
del eee['yamli_variants_y']

In [None]:
eee.to_csv(target_dir + 'eng_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [12]:
%%time
aaa = aaa.drop_duplicates(subset=['trimmed_arb','arb_variants'], keep='first')
aaa['arb_variants'] = aaa.groupby('trimmed_arb')['arb_variants'].transform(format_variants_list2)
aaa = aaa.drop_duplicates(subset=['trimmed_arb','arb_variants'], keep='first')


CPU times: user 61.7 ms, sys: 5.65 ms, total: 67.3 ms
Wall time: 64.8 ms


In [None]:
del aaa['trimmed_arb']
del aaa['eng_x']
del aaa['count_x']
del aaa['trimmed_eng_x']
del aaa['eng_y']
del aaa['arb_y']
del aaa['count_y']
del aaa['trimmed_eng_y']
del aaa['yamli_variants_y']

In [None]:
aaa

In [14]:
aaa.to_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [None]:
eng_names_with_yamli_dict = {key: [] for key in all_eng_given_names}

In [98]:
query_falconn_index(['خالد'], eng_merged_query_object, 50, eng_names_merged, eng_vectors_merged)

NameError: name 'eng_merged_query_object' is not defined

In [None]:
%%time
from multiprocessing.dummy import Pool as ThreadPool 
pool = ThreadPool(20)
results = pool.map(add_to_yamli_dict, all_eng_given_names)

In [None]:
i = 0
for name, arr in eng_names_with_yamli_dict.items():
    if len(arr) < 1:
        i += 1
        print(name)
        add_to_yamli_dict(name)
        
save_obj(eng_names_with_yamli_dict, 'eng_names_with_yamli_dict')


In [None]:
eng_eng_model_predictions_dict = {key: [] for key in all_eng_given_names}
eng_arb_model_predictions_dict = {key: [] for key in all_eng_given_names}
arb_eng_model_predictions_dict = {key: [] for key in all_arb_given_names}
arb_arb_model_predictions_dict = {key: [] for key in all_arb_given_names}

In [None]:
%%time 
i = 0
for name in all_eng_given_names:
    i += 1
    if i % 10000 == 0:
        print("processed {i} out of {t}".format(i=i, t=len(all_eng_given_names)))
    eng_predictions = query_falconn_index([name], eng_merged_query_object, 50, eng_names_merged, eng_vectors_merged)
    eng_predictions = eng_predictions['name'].tolist()
    eng_predictions = [a.strip(' ') for a in eng_predictions]
    eng_eng_model_predictions_dict[name] = eng_predictions

    arb_predictions = query_falconn_index([name], arb_merged_query_object, 50, arb_names_merged, arb_vectors_merged)
    arb_predictions = arb_predictions['name'].tolist()
    arb_predictions = [a.strip(' ') for a in arb_predictions]
    eng_arb_model_predictions_dict[name] = arb_predictions

save_obj(eng_arb_model_predictions_dict, 'eng_arb_model_predictions_dict')
save_obj(eng_eng_model_predictions_dict, 'eng_eng_model_predictions_dict')


In [None]:
i = 0
for name in all_arb_given_names:
    i += 1
    if i % 10000 == 0:
        print("processed {i} out of {t}".format(i=i, t=len(all_eng_given_names)))
    eng_predictions = query_falconn_index([name], eng_merged_query_object, 50, eng_names_merged, eng_vectors_merged)
    eng_predictions = eng_predictions['name'].tolist()
    eng_predictions = [a.strip(' ') for a in eng_predictions]
    arb_eng_model_predictions_dict[name] = eng_predictions

    arb_predictions = query_falconn_index([name], arb_merged_query_object, 50, arb_names_merged, arb_vectors_merged)
    arb_predictions = arb_predictions['name'].tolist()
    arb_predictions = [a.strip(' ') for a in arb_predictions]
    arb_arb_model_predictions_dict[name] = arb_predictions

save_obj(arb_arb_model_predictions_dict, 'arb_arb_model_predictions_dict')
save_obj(arb_eng_model_predictions_dict, 'arb_eng_model_predictions_dict')

In [None]:
i = 0
for name, arr in eng_eng_model_predictions_dict.items():
    if len(arr) < 1:
        i += 1
        print(name)
        add_to_eng_based_model_dict(name)
        
save_obj(eng_eng_model_predictions_dict, 'eng_eng_model_predictions_dict')

for name, arr in eng_arb_model_predictions_dict.items():
    if len(arr) < 1:
        i += 1
        print(name)
        add_to_eng_based_model_dict(name)
        
save_obj(eng_arb_model_predictions_dict, 'eng_arb_model_predictions_dict')


In [52]:
%%time
eng_names_with_yamli_dict = load_obj('eng_names_with_yamli_dict')
eng_arb_model_predictions_dict = load_obj('eng_arb_model_predictions_dict')
eng_eng_model_predictions_dict = load_obj('eng_eng_model_predictions_dict')
arb_arb_model_predictions_dict = load_obj('arb_arb_model_predictions_dict')
arb_eng_model_predictions_dict = load_obj('arb_eng_model_predictions_dict')

eng_names_with_variants = pd.read_csv(target_dir + 'eng_names_with_variants.tsv',sep='\t', header=None, names=['name', 'yamli_variants', 'arb_variants', 'eng_variants', 'eng_model_predictions', 'arb_model_predictions'])
arb_names_with_variants = pd.read_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', header=None, names=['name', 'yamli_variants', 'eng_variants', 'arb_variants', 'eng_model_predictions', 'arb_model_predictions'])


CPU times: user 6.28 s, sys: 247 ms, total: 6.52 s
Wall time: 6.52 s


In [41]:
df = eng_names_with_variants
i = 0
set_len = len(df)
for index, row in df.iterrows():
    i += 1
    if i % 10000 == 0:
        print("processed {i} out of {t}".format(i=i, t=set_len))
        
    name = row['name'].strip(' ')
    eng_variants = row['eng_variants'].split(',')
    arb_variants = row['arb_variants'].split(',')
    arb_variants = list(set([araby.strip_tashkeel(arb) for arb in arb_variants]))
    yamli_variants = row['yamli_variants'].split(',')
    yamli_variants = list(set([araby.strip_tashkeel(arb) for arb in yamli_variants]))

    eng_eng_model_predictions = list(set(set(eng_eng_model_predictions_dict[name]) - set(eng_variants)))
    arb_model = eng_arb_model_predictions_dict[name]
    arb_model = list(set([araby.strip_tashkeel(arb) for arb in arb_model]))
    eng_arb_model_predictions = list(set(set(arb_model) - set(arb_variants) - set(yamli_variants)))
    yamli_variants = set(yamli_variants) - set(arb_variants)    

    df.set_value(index, 'yamli_variants', ','.join(set(yamli_variants)))
    df.set_value(index, 'eng_eng_model_predictions', ','.join(set(eng_eng_model_predictions)))
    df.set_value(index, 'eng_arb_model_predictions', ','.join(set(eng_arb_model_predictions)))
    #df.set_value(index, 'arb_eng_model_predictions', ','.join(set(arb_eng_model_predictions)))
    #df.set_value(index, 'arb_arb_model_predictions', ','.join(set(arb_arb_model_predictions)))

processed 10000 out of 115182
processed 20000 out of 115182
processed 30000 out of 115182
processed 40000 out of 115182
processed 50000 out of 115182
processed 60000 out of 115182
processed 70000 out of 115182
processed 80000 out of 115182
processed 90000 out of 115182
processed 100000 out of 115182
processed 110000 out of 115182


In [43]:
eng_names_with_variants.to_csv(target_dir + 'eng_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [44]:
df = arb_names_with_variants
i = 0
set_len = len(df)
for index, row in df.iterrows():
    i += 1
    if i % 10000 == 0:
        print("processed {i} out of {t}".format(i=i, t=set_len))
        
    name = row['name'].strip(' ')
    eng_variants = row['eng_variants'].split(',')
    arb_variants = row['arb_variants'].split(',')
    arb_variants = list(set([araby.strip_tashkeel(arb) for arb in arb_variants]))
    yamli_variants = row['yamli_variants'].split(',')
    yamli_variants = list(set([araby.strip_tashkeel(arb) for arb in yamli_variants]))

    arb_eng_model_predictions = list(set(set(arb_eng_model_predictions_dict[name]) - set(eng_variants)))
    arb_model = arb_arb_model_predictions_dict[name]
    arb_model = list(set([araby.strip_tashkeel(arb) for arb in arb_model]))
    arb_arb_model_predictions = list(set(set(arb_model) - set(arb_variants) - set(yamli_variants)))
    yamli_variants = set(yamli_variants) - set(arb_variants)    

    df.set_value(index, 'yamli_variants', ','.join(set(yamli_variants)))
    df.set_value(index, 'arb_eng_model_predictions', ','.join(set(arb_eng_model_predictions)))
    df.set_value(index, 'arb_arb_model_predictions', ','.join(set(arb_arb_model_predictions)))


processed 10000 out of 80855
processed 20000 out of 80855
processed 30000 out of 80855
processed 40000 out of 80855
processed 50000 out of 80855
processed 60000 out of 80855
processed 70000 out of 80855
processed 80000 out of 80855


In [46]:
arb_names_with_variants.to_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [None]:
del eee['arb_eng_model_predictions_x']
del eee['arb_arb_model_predictions_x']
del eee['eng_eng_model_predictions_x']
del eee['eng_arb_model_predictions_x']
del eee['arb_eng_model_predictions_y']
del eee['arb_arb_model_predictions_y']
del eee['eng_eng_model_predictions_y']
del eee['eng_arb_model_predictions_y']

In [None]:
eee

In [None]:
ggg = pd.read_csv(target_dir + 'all_names_with_yamli.tsv', sep='\t', header=None)


In [69]:
arb_names_with_variants.loc[0]['eng_model_predictions']

'MEHMUDH,MOOMMA,MOHAMMD,MOHOMMED,MHAMDA,MOHAMADU,MUJISUKAMTI,MOHIMID,MOHAMEDI,MAHOMMED,MACHMUDH,MOHMEDO,MOWAMMAD,MOHAMADY,MAHAMMED,MOHAMDU,MOHMOUD,MHAMOUD,MAKHMUDH,MOHMAD,MOHMOOD,MHAMOOD,MOHMAED,MOHMUD,MOHAMADI,MOHAMDY,MAHMUDA,MUAHMDO,MAKHMUDA,MHAMDI,MAHAMUDA,MOHMADI,MUHAMADU,MAHAMADI,MUHMAD,MAKHMADI,MUSLIMAT,MOKHMAD,MOAHMMED,MAHMUDI,MOHAMEDY'

In [91]:
eng_names_with_variants = pd.read_csv(target_dir + 'eng_names_with_variants.tsv',sep='\t', header=None, names=['name', 'yamli_variants', 'arb_variants', 'eng_variants', 'eng_model_predictions', 'arb_model_predictions'])
arb_names_with_variants = pd.read_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', header=None, names=['name', 'yamli_variants', 'eng_variants', 'arb_variants', 'eng_model_predictions', 'arb_model_predictions'])


In [88]:
def strip_tashkeel_from_arb_variants(df):
    for index, row in df.iterrows():        
        arb_variants = row['arb_variants'].split(',')
        arb_variants = list(set([araby.strip_tashkeel(arb) for arb in arb_variants]))
        df.set_value(index, 'arb_variants', ','.join(set(arb_variants)))
    return df

eng_names_with_variants = strip_tashkeel_from_arb_variants(eng_names_with_variants).copy()
arb_names_with_variants = strip_tashkeel_from_arb_variants(arb_names_with_variants).copy()

arb_names_with_variants.to_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)
eng_names_with_variants.to_csv(target_dir + 'eng_names_with_variants.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [83]:
eng_names_with_variants.loc[0]['arb_variants']

'محاماد,محمد,محمّد,مهامّد,مهمّد,موحاماد,موحامد,موحامّد,موحمد,موحمّد,موهاماد,موهامد,موهامّد,موهمّد,مُحاماد,مُهاماد,مُهامّاد,مُهامّد'

In [79]:
arb_names_with_variants = pd.read_csv(target_dir + 'arb_names_with_variants.tsv',sep='\t', header=None, names=['name', 'yamli_variants', 'eng_variants', 'arb_variants', 'eng_model_predictions', 'arb_model_predictions'])


In [97]:
arb_names_with_variants.loc[0]['yamli_variants']

'محمض,موحماد,محامد,محاميد,محميد'