In [11]:
import psycopg2
from configparser import ConfigParser
import networkx as nx

def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file as a plain dict.

    Args:
        filename: Path of the INI file to read (defaults to 'prepare_data.ini').
        section: Name of the section to extract (defaults to 'phonetic').

    Returns:
        dict mapping each option name of the section to its string value.

    Raises:
        Exception: if the section is not present after reading the file.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: nothing to return when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """Open a PostgreSQL connection using the settings returned by config().

    Returns:
        The connection object produced by psycopg2.connect. The caller is
        responsible for closing it.
    """
    connection = psycopg2.connect(**config())
    print('Connected to the PostgreSQL database...')
    return connection

def read_dataframe(query):
    """Run a SQL query and return its rows as a DataFrame indexed by 'id'.

    Args:
        query: SQL text; its result set must contain an 'id' column.

    Returns:
        The DataFrame built by pd.read_sql, or None when the read fails
        (the error is printed, not raised).
    """
    connection = db_connect()
    try:
        return pd.read_sql(query, con=connection, index_col='id')
    except (Exception, psycopg2.DatabaseError) as error:
        # Best-effort: report the problem and let the caller see None.
        print(error)
        return None
    finally:
        # Runs before either return above, so the connection never leaks.
        if connection is not None:
            connection.close()
            print('Database connection closed.')


def generate_graph(query, pickle_file):
    """Build a weighted name graph from a query result and pickle it to disk.

    Every row contributes an undirected edge between its 'arb' and 'eng'
    values; when the same pair occurs in several rows their 'cnt' values are
    summed into the edge's 'weight' attribute.

    Args:
        query: SQL text returning 'id', 'arb', 'eng' and 'cnt' columns.
        pickle_file: Destination path. The graph is written as a plain
            (uncompressed) pickle using the highest pickle protocol.

    Raises:
        RuntimeError: if read_dataframe() could not load the data (it reports
            failures by returning None).
    """
    # nx.write_gpickle was removed in NetworkX 3.0; the standard pickle module
    # produces the same kind of file on every NetworkX version.
    import pickle

    df = read_dataframe(query)
    if df is None:
        raise RuntimeError('Could not load the name data; no graph was written to {0}'.format(pickle_file))

    graph = nx.Graph()
    for arb, eng, cnt in zip(df['arb'], df['eng'], df['cnt']):
        if graph.has_edge(arb, eng):
            graph[arb][eng]['weight'] += cnt
        else:
            graph.add_edge(arb, eng, weight=cnt)

    with open(pickle_file, 'wb') as handle:
        pickle.dump(graph, handle, protocol=pickle.HIGHEST_PROTOCOL)

def generate_given_names_graph():
    """Build the graph for GIVEN_NAMES_MASTER and store it as given_names_graph.gpickle."""
    target = "/home/jupyter/notebooks/PoC/data-preparation/pickle/given_names_graph.gpickle"
    # Only rows with both an Arabic and an English spelling are usable as edges.
    sql = """
            SELECT ID, ARB, ENG, COUNT AS CNT FROM GIVEN_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != ''
            """
    generate_graph(sql, target)

def generate_family_names_graph():
    """Build the graph for FAMILY_NAMES_MASTER and store it as family_names_graph.gpickle."""
    target = "/home/jupyter/notebooks/PoC/data-preparation/pickle/family_names_graph.gpickle"
    # Only rows with both an Arabic and an English spelling are usable as edges.
    sql = """
            SELECT ID, ARB, ENG, COUNT AS CNT FROM FAMILY_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != ''
            """
    generate_graph(sql, target)

def generate_dan_names_graph():
    """Build the graph for GIVEN_NAMES_DAN and store it as dan_names_graph.gpickle."""
    target = "/home/jupyter/notebooks/PoC/data-preparation/pickle/dan_names_graph.gpickle"
    # This table exposes its frequency as FREQ, aliased to the CNT column generate_graph reads.
    sql = """
            SELECT ID, ARB, ENG, FREQ AS CNT FROM GIVEN_NAMES_DAN
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != ''
            """
    generate_graph(sql, target)

def read_given_names_graph():
    """Load the pickled given-names graph from disk.

    Uses the standard pickle module instead of nx.read_gpickle, which was
    removed in NetworkX 3.0.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this notebook.
    """
    import pickle

    with open("/home/jupyter/notebooks/PoC/data-preparation/pickle/given_names_graph.gpickle", "rb") as handle:
        return pickle.load(handle)
    
def read_family_names_graph():
    """Load the pickled family-names graph from disk.

    Uses the standard pickle module instead of nx.read_gpickle, which was
    removed in NetworkX 3.0.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this notebook.
    """
    import pickle

    with open("/home/jupyter/notebooks/PoC/data-preparation/pickle/family_names_graph.gpickle", "rb") as handle:
        return pickle.load(handle)
    
def read_dan_names_graph():
    """Load the pickled 'dan' names graph from disk.

    Uses the standard pickle module instead of nx.read_gpickle, which was
    removed in NetworkX 3.0.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this notebook.
    """
    import pickle

    with open("/home/jupyter/notebooks/PoC/data-preparation/pickle/dan_names_graph.gpickle", "rb") as handle:
        return pickle.load(handle)

In [12]:
import pandas as pd
# Substrings that mark a value as a placeholder rather than a real name
# ("unknown", "not available", honorifics, ...). get_top_frequency_names rejects
# any candidate that CONTAINS one of these entries.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python. The list must not contain the empty string, because
# '' is a substring of every name and would reject all candidates.
top_noise_alike_data = [
    'XXX', 'UNKNOWN', 'WITHOUT', 'MADAM',
    'متوفر', 'منزل',
    # Misspelled variants observed in the data.
    'عير مبين', 'عيرمعروف', 'عير معروف', 'عير معروفه', 'غبر معروف',
    'غرر معروف', 'غيتتتر مبيتتتن', 'غيتتتر مبيتتن', 'غيتتر مبيتتتن', 'غيتتر مبيتتن', 'غيتر مبتتين', 'غيتر مبين',
    'غيتر معروف', 'غير توفر', 'غيرر معروف', 'غير مبن', 'غير مبيتتتتن', 'غير مبيتتن',
    'غير مححد', 'غيرمحد',
    'غير موجو',
    'غير نعروف',
    'غير وعروف', 'غيلر معروف',
    # Regular spellings, with and without the separating space.
    'السيد',
    'لايوجد', 'لا يوجد',
    'غير معروف', 'غيرمعروف',
    'غير معرف', 'غيرمعرف',
    'غيرموجود', 'غير موجود',
    'غير متوفر', 'غيرمتوفر',
    'غير محدد', 'غيرمحدد',
    'غير مبين', 'غيرمبين',
    'غير معلوم', 'غيرمعلوم',
    'غير مذكور', 'غيرمذكور',
    'غير معترف', 'غيرمعترف',
    'عير مغروف', 'عيرمغروف',
    'غير مغروف', 'غيرمغروف',
]

# Values rejected by get_top_frequency_names only on an EXACT match (titles,
# pronouns and fragments that are not names on their own).
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python.
top_noise_data = [
    'MRS', 'MRS.', 'MRSS', 'MR', 'MR.', 'MISS', 'MIS', 'AL-SAYYDA', 'SAYEDA', 'SHE', 'N A',
    'الله', 'غير',
]

# Number of top (english, arabic) pairs to fetch per table. The values are
# interpolated into the SQL LIMIT clauses of read_top_given_names /
# read_top_family_names and into the output directory and file names.
given_limit = 10
family_limit = 10


In [13]:
import numpy as np

def build_variants(names, graphs, validate = False):
    """Collect the de-duplicated top-frequency variants of several names.

    Args:
        names: Names whose graph neighbours should be gathered.
        graphs: Keys of the graphs to search (passed through to get_variants).
        validate: When true, keep only the candidates accepted by
            validate_name_by_variants(candidate, graphs, names).

    Returns:
        list of unique variant names. The order comes from a set and is
        therefore not stable between runs.
    """
    gathered = []
    for name in list(names):
        gathered.extend(get_top_frequency_names(get_variants(name, graphs)))

    unique_variants = list(set(gathered))
    if not validate:
        return unique_variants

    # Second pass: a candidate survives only if its own variants point back
    # at the names we started from.
    return [candidate for candidate in unique_variants
            if validate_name_by_variants(candidate, graphs, names)]
                

def get_variants(name, graphs=['given']):
    """Return the neighbours of a name with their summed edge weights.

    Args:
        name: Node to look up (an English or an Arabic spelling).
        graphs: Keys into the module-level dict G naming the graphs to search.
            The default list is only read, never modified.

    Returns:
        dict mapping each neighbour of `name` to the sum of the 'weight'
        attributes of the connecting edges over all requested graphs. Empty
        when no requested graph contains the name.
    """
    result = {}
    for gr in graphs:
        graph = G[gr]
        # Skip graphs that do not contain the name. nx.edges(graph, name) would
        # otherwise treat the string as an iterable of nodes and return the
        # edges of any single-character nodes matching its letters.
        if name not in graph:
            continue
        for neighbour, attributes in graph[name].items():
            result[neighbour] = result.get(neighbour, 0) + attributes['weight']
    return result

def validate_name_by_variants(name, graphs, valid_variants):
    """Decide whether a candidate's own variants mostly fall in an accepted set.

    Args:
        name: Candidate name to check.
        graphs: Graph keys passed through to get_variants.
        valid_variants: Names regarded as correct counterparts of the candidate.

    Returns:
        True when the candidate has a total frequency of at least 3, at least
        3 of it on accepted variants, and the accepted frequency either
        exceeds the rejected frequency or exceeds 100; False otherwise.
    """
    frequencies = get_variants(name, graphs)

    # Too little evidence overall.
    if sum(frequencies.values()) < 3:
        return False

    accepted = sum(freq for variant, freq in frequencies.items() if variant in valid_variants)
    rejected = sum(freq for variant, freq in frequencies.items() if variant not in valid_variants)

    # Too little evidence on the accepted side.
    if accepted < 3:
        return False

    return bool(accepted > rejected or accepted > 100)
    

def get_top_frequency_names(list):
    """Select the variants frequent enough to be treated as real spellings.

    Args:
        list: dict mapping a variant name to its frequency (the parameter name
            shadows the builtin and is kept for compatibility with callers).

    Returns:
        A list of variant names. A variant is "matched" when it is longer
        than 2 characters, is not in top_noise_data, contains no entry of
        top_noise_alike_data, and either exceeds the percentage threshold or
        has a frequency of at least 100. Variants that contain a matched name
        and are shorter than twice its length are added as well, unless that
        would add more than 3 names, in which case only the matched names are
        returned. An empty dict or a non-positive total yields [].
    """
    lower_accepted_frequency = 100

    total = sum(list.values())
    # Nothing to rank: avoids dividing by zero below and covers names that
    # have no neighbours in the graph.
    if total <= 0:
        return []

    # The percentage threshold shrinks with the fourth root of the total.
    # 6561 == 9**4 is the point where 10 - total**0.25 reaches the floor of 1.
    if total > 6561:
        threshold = 1
    else:
        threshold = 10 - total**(1./4.)

    matched_list = [key for key, val in list.items()
                    if len(key) > 2 and
                    key not in top_noise_data and
                    not any(x in key for x in top_noise_alike_data) and
                    (val / total * 100 > threshold or val >= lower_accepted_frequency)]

    # Also accept short composites of a matched name (e.g. a prefix or suffix added).
    matched_list_with_composite = [key for key in list
                                   if any(match in key and len(key) < len(match) * 2 for match in matched_list)]

    # Too many composites suggests noise; fall back to the strict matches.
    if len(matched_list_with_composite) - len(matched_list) > 3:
        return matched_list

    return matched_list_with_composite

def process_name_pair(english_name, arabic_name, graphs):
    """Expand one (english, arabic) name pair into two lists of spelling variants.

    The graph neighbours of the English name seed the Arabic list and vice
    versa. Each list is then extended with the validated variants of the
    other list, and the original names are appended if they are missing.

    Args:
        english_name: English spelling of the name.
        arabic_name: Arabic spelling of the name.
        graphs: Graph keys passed through to build_variants.

    Returns:
        (english_variants, arabic_variants) as two lists of unique names.
    """
    arabic_forms = build_variants([english_name], graphs)
    english_forms = build_variants([arabic_name], graphs)

    # Grow the Arabic side from the English candidates found so far.
    arabic_forms = list(set(arabic_forms + build_variants(english_forms, graphs, True)))
    if arabic_name not in arabic_forms:
        arabic_forms.append(arabic_name)

    # Then grow the English side from the enlarged Arabic list.
    english_forms = list(set(english_forms + build_variants(arabic_forms, graphs, True)))
    if english_name not in english_forms:
        english_forms.append(english_name)

    return english_forms, arabic_forms

In [14]:
#generate_given_names_graph()
#generate_family_names_graph()
#generate_dan_names_graph()

# Name graphs keyed by the labels that get_variants looks up in `G`.
G = {
    'given': read_given_names_graph(),
    'family': read_family_names_graph(),
    'dan': read_dan_names_graph(),
}

In [15]:
def read_top_given_names():
    """Fetch the most frequent (english, arabic) given-name pairs.

    Counts in GIVEN_NAMES_MASTER are summed per (ENG, ARB) pair and the
    `given_limit` pairs with the highest totals are requested.

    Returns:
        list of [english, arabic] two-element lists.

    Raises:
        Any error from pandas or the database driver. The connection is
        closed first. Previously the error was printed and then masked by an
        UnboundLocalError on the missing result.
    """
    conn = db_connect()
    print("given limit: {gl}".format(gl=given_limit))
    try:
        # int() guarantees that only a number is ever interpolated into the SQL.
        query = """
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS S FROM (
                    SELECT ENG, ARB, COUNT FROM GIVEN_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY S DESC
            ) AS S LIMIT {gl}
            """.format(gl=int(given_limit))
        sql_result = pd.read_sql(query, con=conn)
    finally:
        if conn is not None:
            conn.close()
    return [[eng, arb] for eng, arb in zip(sql_result['eng'], sql_result['arb'])]

# Seed list of [english, arabic] given-name pairs; prepare_given_names iterates over it.
top_given_names = read_top_given_names()

def read_top_family_names():
    """Fetch the most frequent (english, arabic) family-name pairs.

    Counts in FAMILY_NAMES_MASTER are summed per (ENG, ARB) pair and the
    `family_limit` pairs with the highest totals are requested.

    Returns:
        list of [english, arabic] two-element lists.

    Raises:
        Any error from pandas or the database driver. The connection is
        closed first. Previously the error was printed and then masked by an
        UnboundLocalError on the missing result.
    """
    conn = db_connect()
    print("family limit: {fl}".format(fl=family_limit))
    try:
        # int() guarantees that only a number is ever interpolated into the SQL.
        query = """
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS S FROM (
                    SELECT ENG, ARB, COUNT FROM FAMILY_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY S DESC
            ) AS S2 LIMIT {fl}
            """.format(fl=int(family_limit))
        sql_result = pd.read_sql(query, con=conn)
    finally:
        if conn is not None:
            conn.close()
    return [[eng, arb] for eng, arb in zip(sql_result['eng'], sql_result['arb'])]

# The string literal below is inert: it is a bare expression that is never
# assigned or used. It preserves a hand-written list of family-name pairs;
# the live value comes from the database query on the last line.
'''top_family_names = 
    [
        ['ALI','علي'],
        ['KHAN','خان'],
        ['SALEH','صالح'],
        ['SHAH','شاه'],
        ['HUSSEIN','حسين'],
        ['ABBAS','عباس'],
        ['RAHMAN','رحمن'],
        ['HUSSAIN','حسين'],
        ['OSMAN','عصمان'],
        ['KHALIL','خليل'],
        ['THOMAS','توماس'],
        ['SAAD','سعد'],
        ['AZIZ','عزيز'],
        ['KAMAL','كمال'],
        ['SULTAN','سلطان'],
        ['HAMED','حامد'],
        ['SILVA','سيلفا'],
        ['BUTT','بت'],
        ['SMITH','سميث'],
        ['ALEXANDER','الكسندر']]'''
# Seed list of [english, arabic] family-name pairs; prepare_family_names iterates over it.
top_family_names = read_top_family_names()

Connected to the PostgreSQL database...
given limit: 10
Connected to the PostgreSQL database...
family limit: 10


In [16]:
import numpy as np
import itertools
import random
from fuzzywuzzy import fuzz
import csv
import os
import sys

def prepare_given_names(dir_prefix):
    """Expand every pair in top_given_names and write the given-name pair files.

    Args:
        dir_prefix: Path prefix handed to prepare_names for its output files.
    """
    expanded = []
    for position, pair in enumerate(top_given_names, start=1):
        # Progress report once per 10000 processed pairs.
        if position % 10000 == 0:
            print("processing the given names, batch {index}".format(index=(position / 10000)))

        english, arabic = pair[0], pair[1]
        expanded.append(process_name_pair(english, arabic, ['dan', 'given']))

    prepare_names(expanded, dir_prefix)

def prepare_family_names(dir_prefix):
    """Expand every pair in top_family_names and write the family-name pair files.

    Args:
        dir_prefix: Path prefix handed to prepare_names for its output files.
    """
    expanded = []
    for position, pair in enumerate(top_family_names, start=1):
        # Progress report once per 10000 processed pairs.
        if position % 10000 == 0:
            print("processing the family names, batch {index}".format(index=(position / 10000)))

        english, arabic = pair[0], pair[1]
        expanded.append(process_name_pair(english, arabic, ['family']))

    prepare_names(expanded, dir_prefix)


def prepare_names(all_names, file_prefix):
    """Write positive and negative name-pair files for a list of variant groups.

    Args:
        all_names: Sequence of groups. Each group is indexable as
            (english_variants, arabic_variants), i.e. the shape returned by
            process_name_pair.
        file_prefix: Prefix prepended to every output file name.

    Output files (one 'left|right' pair per line):
        eng_ara_pairs.csv           english|arabic variants of the same group
        eng_eng_pairs.csv           two english variants of the same group
        ara_ara_pairs.csv           two arabic variants of the same group
        negative_pairs.csv          english|arabic taken from different groups
        eng_eng_nagative_pairs.csv  english|english taken from different groups
        ara_ara_negative_pairs.csv  arabic|arabic taken from different groups

    Names are written with surrounding whitespace removed. A negative pair is
    kept only when fuzz.partial_ratio of the compared names is below 50. With
    a single group there is no other group to sample from, so no negative
    pairs are produced.
    """
    from contextlib import ExitStack

    fuzzy_distance = 50
    group_count = len(all_names)

    def pick_other_group(current_index):
        """Return a uniformly random group index different from current_index."""
        candidate = random.randrange(group_count - 1)
        return candidate + 1 if candidate >= current_index else candidate

    def pick_random_name(group_index, side):
        """Return a random stripped name from one side (0 english, 1 arabic) of a group."""
        candidates = all_names[group_index][side]
        return candidates[np.random.randint(low=0, high=len(candidates))].strip()

    # ExitStack closes every file even if an error interrupts the loop. The
    # files use the platform default encoding, matching how generate_full_names
    # opens them for reading.
    with ExitStack() as stack:
        def open_output(suffix):
            return stack.enter_context(open(file_prefix + suffix, 'w'))

        eng_ara_pairs = open_output("eng_ara_pairs.csv")
        eng_eng_pairs = open_output("eng_eng_pairs.csv")
        ara_ara_pairs = open_output("ara_ara_pairs.csv")
        negative_pairs = open_output("negative_pairs.csv")
        eng_eng_nagative_pairs = open_output("eng_eng_nagative_pairs.csv")
        ara_ara_negative_pairs = open_output("ara_ara_negative_pairs.csv")

        for index, gn in enumerate(all_names):
            if index % 10000 == 0:
                print("preparing name lists, batch {index}".format(index=(index / 10000)))

            # str.strip() returns a new string, so the cleaned values have to
            # be kept; calling it without using the result changes nothing.
            english_names = [name.strip() for name in gn[0]]
            arabic_names = [name.strip() for name in gn[1]]

            # Positive pairs: every combination inside the group.
            for first, second in itertools.combinations(english_names, 2):
                eng_eng_pairs.write("{a}|{b}\n".format(a=first, b=second))
            for first, second in itertools.combinations(arabic_names, 2):
                ara_ara_pairs.write("{a}|{b}\n".format(a=first, b=second))
            for eng in english_names:
                for arb in arabic_names:
                    eng_ara_pairs.write("{e}|{a}\n".format(e=eng, a=arb))

            # Negative pairs need at least one other group to draw from.
            if group_count < 2:
                continue

            # Each name gets as many negative attempts as it has positive
            # partners of the same kind inside its own group.
            english_draws = len(english_names) - 1
            arabic_draws = len(arabic_names) - 1

            for arb in arabic_names:
                for _ in range(english_draws):
                    other = pick_other_group(index)
                    random_value = pick_random_name(other, 0)
                    # Similarity is judged against the other group's first Arabic name.
                    compare_value = all_names[other][1][0].strip()
                    if fuzz.partial_ratio(compare_value, arb) < fuzzy_distance:
                        negative_pairs.write("{r}|{a}\n".format(a=arb, r=random_value))
                for _ in range(arabic_draws):
                    other = pick_other_group(index)
                    random_value = pick_random_name(other, 1)
                    if fuzz.partial_ratio(random_value, arb) < fuzzy_distance:
                        ara_ara_negative_pairs.write("{a}|{r}\n".format(a=arb, r=random_value))

            for eng in english_names:
                for _ in range(english_draws):
                    other = pick_other_group(index)
                    random_value = pick_random_name(other, 0)
                    if fuzz.partial_ratio(random_value, eng) < fuzzy_distance:
                        eng_eng_nagative_pairs.write("{e}|{r}\n".format(e=eng, r=random_value))
                for _ in range(arabic_draws):
                    other = pick_other_group(index)
                    random_value = pick_random_name(other, 1)
                    # Similarity is judged against the other group's first English name.
                    compare_value = all_names[other][0][0].strip()
                    if fuzz.partial_ratio(compare_value, eng) < fuzzy_distance:
                        negative_pairs.write("{e}|{r}\n".format(e=eng, r=random_value))


def _read_pair_file(path):
    """Read a '|'-delimited pair file into a list of rows (each row a list of fields)."""
    with open(path, 'r') as handle:
        return list(csv.reader(handle, delimiter='|'))


def _draw_pair(pairs, unused_indices):
    """Pick a random row: without replacement while unused_indices lasts, then with replacement."""
    if len(unused_indices) > 0:
        slot = np.random.randint(low=0, high=len(unused_indices))
        return pairs[unused_indices.pop(slot)]
    return pairs[np.random.randint(low=0, high=len(pairs))]


def generate_full_names(file_name):
    """Generate labelled full-name pairs from the given/family pair files.

    First regenerates the pair files for the current given_limit and
    family_limit (prepare_given_names / prepare_family_names). Then each
    iteration writes two lines to the output file:

      * a positive line (label 1) built from three english|arabic given-name
        pairs and one english|arabic family-name pair;
      * a negative line (label 0) built the same way from the negative pair
        files.

    Rows are drawn without replacement until a pool is used up and with
    replacement afterwards. The loop ends once both positive pools or both
    negative pools have been used up, and the iteration count is printed.

    Args:
        file_name: Name of the output file, created inside the
            full-names directory.

    Raises:
        ValueError: if one of the four pair files read here contains no rows.
    """
    target_dir = "/home/jupyter/notebooks/PoC/data-preparation/name-pairs/{gl}_{fl}/".format(gl=given_limit, fl=family_limit)
    os.makedirs(target_dir, exist_ok=True)

    prepare_given_names(target_dir + "given_")
    prepare_family_names(target_dir + "family_")

    # Only the cross-language pair files are consumed here; the eng/eng and
    # ara/ara files written by prepare_names stay on disk for later steps.
    given_names = _read_pair_file(target_dir + 'given_eng_ara_pairs.csv')
    negative_pairs = _read_pair_file(target_dir + 'given_negative_pairs.csv')
    family_names = _read_pair_file(target_dir + 'family_eng_ara_pairs.csv')
    family_negative_pairs = _read_pair_file(target_dir + 'family_negative_pairs.csv')

    # An empty pool cannot be sampled; fail with a readable message instead of
    # an opaque "low >= high" error from np.random.randint.
    for label, rows in (('given-name', given_names),
                        ('negative given-name', negative_pairs),
                        ('family-name', family_names),
                        ('negative family-name', family_negative_pairs)):
        if not rows:
            raise ValueError("No {label} pairs available; cannot generate full names".format(label=label))

    given_names_range = list(range(len(given_names)))
    family_names_range = list(range(len(family_names)))
    negative_given_names_range = list(range(len(negative_pairs)))
    negative_family_names_range = list(range(len(family_negative_pairs)))

    output_dir = "/home/jupyter/notebooks/PoC/data-preparation/full-names/"
    os.makedirs(output_dir, exist_ok=True)

    outer_index = 0
    with open("{d}{f}".format(d=output_dir, f=file_name), 'w') as f:
        while True:
            outer_index += 1
            if outer_index % 10000 == 0:
                print("generating full names, batch {index}".format(index=(outer_index / 10000)))

            # Positive example: three given names plus one family name.
            gn_pairs = [_draw_pair(given_names, given_names_range) for _ in range(3)]
            fn_pair = _draw_pair(family_names, family_names_range)

            f.write("{eg1} {eg2} {eg3} {ef} \t {ag1} {ag2} {ag3} {af} \t {pf}\n".format(
                eg1=gn_pairs[0][0],
                eg2=gn_pairs[1][0],
                eg3=gn_pairs[2][0],
                ef=fn_pair[0],
                ag1=gn_pairs[0][1],
                ag2=gn_pairs[1][1],
                ag3=gn_pairs[2][1],
                af=fn_pair[1],
                pf=1
            ))

            # Negative example: same layout, drawn from the mismatched pairs.
            gn_pairs = [_draw_pair(negative_pairs, negative_given_names_range) for _ in range(3)]
            fn_pair = _draw_pair(family_negative_pairs, negative_family_names_range)

            try:
                f.write("{neg1} {neg2} {neg3} {nef} \t {nag1} {nag2} {nag3} {naf} \t {pf}\n".format(
                    neg1=gn_pairs[0][0],
                    neg2=gn_pairs[1][0],
                    neg3=gn_pairs[2][0],
                    nef=fn_pair[0],
                    nag1=gn_pairs[0][1],
                    nag2=gn_pairs[1][1],
                    nag3=gn_pairs[2][1],
                    naf=fn_pair[1],
                    pf=0
                ))
            except IndexError as error:
                # A row with fewer than two fields: report it and skip the line.
                # The row is formatted before anything is written, so no
                # partial line reaches the file.
                print("gn_pairs : {gn}".format(gn=gn_pairs))
                print("fn_pair  : {fn}".format(fn=fn_pair))
                print("Error: {e}".format(e=error))

            positives_exhausted = len(given_names_range) == 0 and len(family_names_range) == 0
            negatives_exhausted = len(negative_given_names_range) == 0 and len(negative_family_names_range) == 0
            if positives_exhausted or negatives_exhausted:
                break

    print(outer_index)
    # TODO: not generated yet - arabic/arabic pairs, english/english pairs,
    # single-name values, reversed names and further negative examples.

In [None]:
%%time
# Run the whole pipeline; the output file name embeds the given/family limits.
generate_full_names("full_names_{gl}_{fl}_negative.tsv".format(gl=given_limit, fl=family_limit))
    

In [10]:
print(sorted(process_name_pair('MOHAMMED', 'محمد', ['given', 'dan'])[0]))

['BEN MOHAMED', 'BIN MD', 'BIN MOHAMED', 'BIN MOHD', "M'HAMAD", "M'HAMED", "M'HAMMAD", "M'HAMMED", "M'HEMAD", "M'HEMED", "M'HEMMED", 'MAHAMAD', 'MAHAMMAD', 'MEHMED', 'MEHMET', 'MHAMAD', 'MHAMED', 'MHAMMAD', 'MHAMMED', 'MHEMAD', 'MHEMED', 'MHEMMED', 'MIHAMAD', 'MIHAMMAD', 'MOCHAMAD', 'MOCHAMAT', 'MOCHAMED', 'MOCHAMET', 'MOCHAMMAD', 'MOCHAMMED', 'MOCHEMAD', 'MOHAMAD', 'MOHAMAT', 'MOHAMD', 'MOHAMED', 'MOHAMEED', 'MOHAMEET', 'MOHAMET', 'MOHAMID', 'MOHAMMAD', 'MOHAMMAT', 'MOHAMMD', 'MOHAMMED', 'MOHAMMEDOLA', 'MOHAMMEED', 'MOHAMMET', 'MOHAMMID', 'MOHAMMIT', 'MOHAMMOD', 'MOHAMMUD', 'MOHAMOOD', 'MOHAMOUD', 'MOHAMUD', 'MOHD', 'MOHEMAD', 'MOHEMAT', 'MOHEMED', 'MOHEMET', 'MOHEMMAD', 'MOHEMMED', 'MOHMD', 'MOHMED', 'MOHMMED', 'MOKHAMAD', 'MOKHAMED', 'MOKHAMMAD', 'MOKHAMMED', 'MOOHAMAD', 'MOOHAMED', 'MOOHAMET', 'MOOHAMID', 'MOOHAMMAD', 'MOOHAMMED', 'MOOHED', 'MOUHAMAD', 'MOUHAMAT', 'MOUHAMED', 'MOUHAMET', 'MOUHAMMAD', 'MOUHAMMED', 'MOUHEMAD', 'MOUHEMED', 'MOUKHAMED', 'MOWHAMMAD', 'MUCHAMAD', 'MUCHAM

In [None]:
process_name_pair('FATIMA', 'فاطمه', ['given', 'dan'])

In [None]:
process_name_pair('HAMZA', 'حمزه', ['given', 'dan'])

In [None]:
process_name_pair('KAMAL', 'كمال', ['given', 'dan'])

In [None]:
process_name_pair('KHALED', 'خالد', ['given', 'dan'])

In [None]:
process_name_pair('ALI', 'علي', ['given', 'dan'])

In [None]:
process_name_pair('SALEH', 'صالح', ['given', 'dan'])

In [None]:
process_name_pair('IBRAHIM', 'ابرهيم', ['given', 'dan'])

In [None]:
process_name_pair('MARIAM', 'مريم', ['given', 'dan'])

In [None]:
process_name_pair('HASSAN', 'حسن', ['given', 'dan'])

In [None]:
process_name_pair('OMAR', 'عمر', ['given', 'dan'])

In [None]:
process_name_pair('HUSSEIN', 'حسين', ['given', 'dan'])

In [None]:
process_name_pair('SALIM', 'سليم', ['given', 'dan'])

In [None]:
process_name_pair('SAEED', 'سعيد', ['given', 'dan'])

In [None]:
process_name_pair('ABBAS', 'عباس', ['given', 'dan'])

In [None]:
process_name_pair('SAAD', 'سعد', ['given', 'dan'])

In [None]:
process_name_pair('ELIAS', 'الياس', ['given', 'dan'])

In [None]:
process_name_pair('FAISAL', 'فيصل', ['given', 'dan'])

In [None]:
process_name_pair('FATIMA', 'فاطمه', ['given', 'dan'])

In [None]:
process_name_pair('NASSER', 'ناصر', ['given', 'dan'])

In [None]:
process_name_pair('ADEL', 'عادل', ['given', 'dan'])

In [None]:
process_name_pair('MAHMOUD', 'محمود', ['given', 'dan'])

In [None]:
process_name_pair('ADNAN', 'عدنان', ['given', 'dan'])

In [None]:
process_name_pair('SALAH', 'صلاح', ['given', 'dan'])

In [None]:
process_name_pair('SULTAN', 'سلطان', ['given', 'dan'])

In [None]:
process_name_pair('SALEM', 'سالم', ['given', 'dan'])

In [None]:
process_name_pair('MARIE', 'ماري', ['given', 'dan'])

In [None]:
process_name_pair('MOHAMMAD', 'موهاماد', ['given', 'dan'])

In [None]:
process_name_pair('DANIEL', 'دانيل', ['given', 'dan'])

In [None]:
process_name_pair('JEAN', 'جين', ['given', 'dan'])

In [None]:
process_name_pair('YOUSSEF', 'يوسف', ['given', 'dan'])

In [None]:
process_name_pair('NASIM', 'نسيم', ['given', 'dan'])

In [20]:
process_name_pair('ZAINAB', 'زينب', ['given', 'dan'])

(['ZEYNEP',
  'ZENAB',
  'ZEINEP',
  'ZAINABU',
  'ZINAB',
  'ZAENAB',
  'ZAINAB',
  'ZAINEB',
  'ZAINUB',
  'ZEINEB',
  'ZEYNAB',
  'ZAYNAB',
  'ZAINABA',
  'ZAINABAH',
  'ZEYNEB',
  'ZINEB',
  'ZAINABI',
  'ZEINAB'],
 ['زينبذ', 'زناب', 'زينبه', 'زينب ', 'زينب'])

In [None]:
process_name_pair('NOUR', 'نور', ['given', 'dan'])

In [None]:
process_name_pair('MALEK', 'مالك', ['given', 'dan'])

In [None]:
process_name_pair('MOSTAFA', 'مصطفى', ['given', 'dan'])

In [None]:
process_name_pair('AMMAR', 'عمار', ['given', 'dan'])

In [None]:
process_name_pair('DAVID', 'دافيد', ['given', 'dan'])

In [None]:
process_name_pair('ABDUL', 'عبدول', ['given', 'dan'])

In [None]:
process_name_pair('HAMED', 'حامد', ['given', 'dan'])

In [21]:
process_name_pair('JOSEPH', 'جوزيف', ['given', 'dan'])

(['JOSEPH', 'JOZEF', 'JOSEF', 'GOOZF', 'JOZSEF', 'JOSEPH MC'],
 ['جوزيف', 'جوزيف مك', 'جوزسف', 'جوسف', 'جوزف'])

In [None]:
process_name_pair('GEORGE', 'جورج', ['given', 'dan'])

In [None]:
process_name_pair('FOUAD', 'فؤاد', ['given', 'dan'])

In [None]:
process_name_pair('FATIMA', 'فطيمه', ['given', 'dan'])

In [None]:
process_name_pair('LAILA', 'ليلى', ['given', 'dan'])

In [23]:
process_name_pair('CHARLES', 'تشارلس', ['given', 'dan'])

(['CHARLES'], ['تشارلس'])

In [None]:
process_name_pair('CASE', 'قاسى', ['given', 'dan'])

In [27]:
process_name_pair("SAEED", 'سعيد', ['given', 'dan'])

(['SAYED',
  'SAYAD',
  'SAIDU',
  'SAIDI',
  'SAEEDA',
  'SAID EL',
  'SAYID',
  'SAEID',
  'SAEED',
  'SEIDA',
  'SAID',
  'SAIED',
  'SAIDA',
  'SAEEDAH'],
 ['سعيده', 'سايد', 'سعيد', 'سائد'])

In [None]:
process_name_pair("MICHA'EL", 'ميخائيل', [ 'dan'])

In [19]:
process_name_pair('AAESHA', 'عائشه', ['given', 'dan'])

([' AISHA',
  'AICHA',
  'AISHA',
  'AAISHA',
  'AYSHA',
  'AESHA',
  'AAESHA',
  'EESHA',
  'AAYISHA',
  'AYESHA',
  'EISHA',
  'ESHA',
  'AAYESHEH',
  'JAGADEESHA',
  'AYISHA',
  'AEESHA',
  'AIESHA',
  'AAYSHA',
  'AAYESHA',
  'ASHA'],
 ['عائيشه',
  'عائشه ',
  'عشا',
  'عيشه',
  'أيتشا',
  'عائشه',
  'عايشة',
  'عئشه',
  'عايشه',
  'عيشة'])

In [None]:
process_name_pair('JACOB', 'جاكوب', ['given', 'dan', 'family'])

In [None]:
process_name_pair('REZA', 'رضا', ['given', 'dan', 'family'])

In [None]:
process_name_pair('GABRIEL', 'جبريل', ['given', 'dan', 'family'])

In [None]:
process_name_pair('ALEXANDER', 'الكسندر', ['given', 'dan', 'family'])

In [None]:
process_name_pair('SMITH', 'سميث', ['given', 'dan', 'family'])

In [None]:
process_name_pair('RAHIM', 'رحيم', ['given', 'dan', 'family'])

In [None]:
process_name_pair('KUMAR', 'كمار', ['given', 'dan', 'family'])

In [None]:
process_name_pair('FRANCIS', 'فرانسيس', ['given', 'dan', 'family'])

In [None]:
process_name_pair('NGUYEN', 'نجاين', ['given', 'dan', 'family'])

In [None]:
process_name_pair('ROBERT', 'روبرت', ['given', 'dan', 'family'])

In [None]:
process_name_pair('SHAIKH', 'شيخ', ['given', 'dan', 'family'])

In [None]:
process_name_pair('JAMAL', 'جمال', ['given', 'dan', 'family'])

In [None]:
process_name_pair('PETERS', 'بطرس', ['given', 'dan', 'family'])

In [None]:
process_name_pair('ISLAM', 'اسلام', ['given', 'dan', 'family'])

In [None]:
process_name_pair('JAMES', 'جيمس', ['given', 'dan', 'family'])

In [None]:
process_name_pair('CHAN', 'تشان', ['given', 'dan', 'family'])

In [None]:
process_name_pair('JACOB', 'جاكوب', ['given', 'dan', 'family'])

In [None]:
process_name_pair('KABIR', 'كعبر', ['given', 'dan', 'family'])

In [26]:
process_name_pair('RICHARD', 'ريشارد', ['given', 'dan', 'family'])

(['RICHARD'], ['ريشارد'])

In [None]:
process_name_pair('IDRIS', 'ادريس', ['given', 'dan', 'family'])

In [25]:
process_name_pair('REHMAN', 'رحمن', ['given', 'dan', 'family'])

(['REHMAN', 'REHIMAN', 'REHAMAN', 'RAHMAN', 'RHAMAN', 'REHUMAN'],
 ['رحمان', 'رحمن'])

In [None]:
process_name_pair('PEREIRA', 'بريرا', ['given', 'dan', 'family'])

In [None]:
process_name_pair('ZAFAR', 'ظافر', ['given', 'dan', 'family'])

In [None]:
process_name_pair('ISHAK', 'اسحاق', ['given', 'dan', 'family'])

In [None]:
process_name_pair('CHRISTIAN', 'كريستيان', ['given', 'dan', 'family'])

In [None]:
process_name_pair('SAMUEL', 'صامويل', ['given', 'dan', 'family'])

In [None]:
process_name_pair('USMAN', 'عثمان', ['given', 'dan', 'family'])