In [90]:
# Global accumulators shared by the cells below: each "*_total_result" list collects
# the names accepted so far, each "*_top_frequency" list holds the latest per-name result.
english_total_result, english_top_frequency = [], []
arabic_total_result, arabic_top_frequency = [], []

In [91]:
import psycopg2
from configparser import ConfigParser

def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file as a plain dict.

    Args:
        filename: path of the INI file to read.
        section: name of the section whose options are returned.

    Returns:
        dict mapping each option name in ``section`` to its string value.

    Raises:
        Exception: if ``section`` is not present after reading ``filename``.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: fail early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """ Connect to the PostgreSQL database server

    Connection keyword arguments come from ``config()`` with its default
    file and section. Returns the connection created by ``psycopg2.connect``.
    """
    connection_params = config()

    print('Connecting to the PostgreSQL database...')
    return psycopg2.connect(**connection_params)

In [93]:
from fuzzywuzzy import fuzz
import pandas as pd

def get_english_variants(cursor, arabic_name):
    """Return the English spellings recorded for one Arabic name.

    Rows from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER whose ARB column equals
    ``arabic_name`` (and whose ARB/ENG are non-null and non-empty) are grouped by
    their ENG value.

    Args:
        cursor: DB-API cursor used to run the query.
        arabic_name: Arabic spelling to look up.

    Returns:
        dict mapping each English spelling to its summed COUNT.
    """
    # The name is passed as a bound parameter instead of being concatenated into
    # the SQL text, so quotes in a name cannot break or alter the statement.
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ENG AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %(name)s
        UNION ALL SELECT ENG AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %(name)s
    ) AS SUB group BY NAME;
    """

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'name': arabic_name})
    return dict(cursor.fetchall())

def get_arabic_variants(cursor, english_name):
    """Return the Arabic spellings recorded for one English name.

    Rows from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER whose ENG column equals
    ``english_name`` (and whose ARB/ENG are non-null and non-empty) are grouped by
    their ARB value.

    Args:
        cursor: DB-API cursor used to run the query.
        english_name: English spelling to look up.

    Returns:
        dict mapping each Arabic spelling to its summed COUNT.
    """
    # The name is passed as a bound parameter instead of being concatenated into
    # the SQL text, so quotes in a name cannot break or alter the statement.
    query = """
        SELECT SUB.NAME, SUM(COUNT) FROM (
            SELECT ARB AS NAME, COUNT FROM GIVEN_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %(name)s
            UNION ALL SELECT ARB AS NAME, COUNT FROM FAMILY_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %(name)s
        ) AS SUB GROUP BY NAME;
        """

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'name': english_name})
    return dict(cursor.fetchall())

def get_top_frequency_names(list, threshold = 100):
    """Select the dominant names from a name -> count mapping.

    A name is a "match" when it is longer than two characters and either its
    share of the total count (in percent) exceeds the cut-off or its count is
    at least 100. The result is every key of ``list`` that contains one of the
    matched names as a substring, in the mapping's iteration order.

    Args:
        list: mapping of name -> count.
        threshold: accepted for backward compatibility. NOTE(review): as in the
            original implementation, the value passed in is not used; the
            cut-off is always derived from the total count below.

    Returns:
        list of selected names; empty when ``list`` is empty or its counts do
        not sum to a positive number.
    """
    if not list:
        # Nothing to rank; previously this raised ValueError from max().
        return []

    total = sum(list.values())
    if total <= 0:
        # Percent shares are undefined without a positive total
        # (a zero total previously raised ZeroDivisionError).
        return []

    lower_accepted_frequency = 100

    # Percent cut-off shrinks as the sample grows: 10 - total**(1/4), floored at 1.
    # 6561 == 9**4, so the formula reaches exactly 1 at that boundary.
    if total > 6561:
        threshold = 1
    else:
        threshold = 10 - total**(1./4.)

    print("threshold: {thre}, total: {tot}".format(thre=threshold, tot=total))
    matched_list = [
        key for key, val in list.items()
        if len(key) > 2 and (val / total * 100 > threshold or val >= lower_accepted_frequency)
    ]
    print("top matched_list: {thre}".format(thre=matched_list))

    # Keep every name that contains a matched name (the matched names themselves included).
    return [key for key in list if any(match in key for match in matched_list)]

def get_top_name(list):
    """Return the key of ``list`` (name -> count) with the largest count.

    The first key reaching the highest count wins on ties. An empty string is
    returned when the mapping is empty or no count is greater than zero.
    """
    best_name, best_count = "", 0
    for candidate in list:
        count = list[candidate]
        if count > best_count:
            best_name, best_count = candidate, count
    return best_name

def get_threshold(list):
    """Return max_count / (2 * n**2) for a name -> count mapping with n entries."""
    variant_count = len(list)
    return max(list.values()) / (2 * variant_count ** 2)

def compare_names(origin = "", name = "", distance=100):
    """Tell whether ``name`` fuzzy-matches ``origin``.

    Returns True when ``fuzz.partial_ratio(origin, name)`` is strictly greater
    than ``distance``, otherwise False.
    """
    return fuzz.partial_ratio(origin, name) > distance

def fuzzy_filter_names(origin = "", names = [], distance=100):
    """Return the entries of ``names`` that fuzzy-match ``origin``.

    An entry is kept when ``fuzz.partial_ratio(origin, entry)`` is strictly
    greater than ``distance``. Input order is preserved and ``names`` is not
    modified.
    """
    return [
        candidate
        for candidate in names
        if fuzz.partial_ratio(origin, candidate) > distance
    ]

In [94]:
# Open one database connection and one cursor; the cells below pass `cur` to the query helpers.
conn = db_connect()
cur = conn.cursor()

Connecting to the PostgreSQL database...


In [95]:
# Seed the search with one English spelling and fetch every Arabic spelling recorded for it.
origin_name = 'AHMAD'
arabic_list = get_arabic_variants(cur, origin_name)
# Despite the label, this first print shows the full, unfiltered name -> count mapping.
print("top arabic results: {dct}".format(dct=arabic_list))

# Replace the global Arabic result list with the high-frequency selection.
arabic_total_result = get_top_frequency_names(arabic_list,get_threshold(arabic_list))
print("top arabic results: {dct}".format(dct=arabic_total_result))

# Most frequent Arabic spelling; later cells use it as the reference for fuzzy_filter_names.
top_arabic_name = get_top_name(arabic_list)
#arabic_total_result = fuzzy_filter_names(top_arabic_name, arabic_total_result, 30)
#print("top arabic results filtered: {dct}".format(dct=arabic_total_result))

top arabic results: {'احماد': 2, 'أحمد': 222623, 'احمد': 446, 'احمد زروق': 2, 'الزوج': 2, 'تاحمد': 1, 'حمد': 2, 'دادابايفا': 2, 'م': 2}
threshold: 1, total: 223082
top matched_list: ['أحمد', 'احمد']
top arabic results: ['أحمد', 'احمد', 'احمد زروق', 'تاحمد']


In [96]:
# For every Arabic spelling selected so far, collect its high-frequency English spellings.
# english_total_result is extended (not reset), so it also keeps whatever earlier cells put in it.
for arabic_value in arabic_total_result:
    english_list = get_english_variants(cur, arabic_value)
    threshold = get_threshold(english_list)
    english_top_frequency = get_top_frequency_names(english_list, threshold)
    english_total_result += english_top_frequency
# De-duplicate; the round trip through set() does not keep the original order.
english_total_result = list(set(english_total_result))

print("top english results: {dct}".format(dct=english_total_result))
#english_total_result = fuzzy_filter_names(origin_name, english_total_result, 50)
#print("top english results filtered: {dct}".format(dct=english_total_result))

threshold: 1, total: 692016
top matched_list: ['AHAMAD', 'AHAMADI', 'AHAMD', 'AHAMED', 'AHAMMAD', 'AHAMMED', 'AHEMAD', 'AHEMED', 'AHMAD', 'AHMADI', 'AHMADU', 'AHMED', 'AHMEDI', 'AHMMAD', 'AHMMED']
threshold: 1, total: 176664
top matched_list: ['AHMAD', 'AHMED', 'AHMET']
threshold: 8.810792884997278, total: 2
top matched_list: ['AHMAD']
threshold: 9.0, total: 1
top matched_list: ['AHMAD']
top english results: ['AHMED SAED', 'AHMED ELSAYED', 'AHAMADI', 'AHMADB', 'AHMED BERAIR', 'TOMA AHMED', 'EL IMAM AHMED', 'AHMED SAID MOHA', 'AHMED MOKHTAR A', 'AHMED ALI', 'AHMED ABDELMEGU', 'AHMED HUSSIEN', 'AHEMAD', 'AHMED ASEM READ', 'AHMED HASHIM', 'AHMED KHMIS', 'AHMAD EL', 'AHMAD WIKTOR', 'AHMED MOHAMED', 'AHMED NASIF', 'AHMADI', 'AHMED REFAAT', 'AHMED ELTEGANI', 'AHMED HAMED', 'AHMED RIRACHE ', 'AHMAD', 'AAHMED', 'AHMADU', 'AHMED BASSIOUNY', 'SARA AHMAD', 'AHMED FOUAD KAM', 'AHAMED', 'AHAMMED', 'AHAMMAD', 'ADIL AHMED', 'AHMEDC', 'AHMED FARGHALY', 'AHEMED', 'AHAMAD', 'AHMED RAMADAN', 'AHAMD', 'AH

In [97]:
# Reverse direction: for every English spelling collected so far, add its
# high-frequency Arabic spellings to arabic_total_result (extended in place).
for english_value in english_total_result:
    arabic_list = get_arabic_variants(cur, english_value)
    arabic_top_frequency = get_top_frequency_names(arabic_list, get_threshold(arabic_list))
    arabic_total_result += arabic_top_frequency
# De-duplicate; order is not preserved by set().
arabic_total_result = list(set(arabic_total_result))
print("top arabic results: {dct}".format(dct=arabic_total_result))
#arabic_total_result = fuzzy_filter_names(top_arabic_name, arabic_total_result, 50)
#print("top arabic results filtered: {dct}".format(dct=arabic_total_result))

threshold: 8.810792884997278, total: 2
top matched_list: ['احمد']
threshold: 8.138790281795801, total: 12
top matched_list: ['احمد', 'احمد السيد']
threshold: 1.2867335869715752, total: 5764
top matched_list: ['أحمد']
threshold: 9.0, total: 1
top matched_list: ['احمد']
threshold: 8.810792884997278, total: 2
top matched_list: ['احمد']
threshold: 8.810792884997278, total: 2
top matched_list: ['احمد']
threshold: 8.810792884997278, total: 2
top matched_list: ['احمد']
threshold: 8.434915419926712, total: 6
top matched_list: ['احمد', 'احمد سعيد محمد', 'احمد سعيد محمد ']
threshold: 8.585786437626904, total: 4
top matched_list: ['احمد', 'احمد مختار']
threshold: 7.533674285440339, total: 37
top matched_list: ['احمد علي', 'احمد على']
threshold: 8.585786437626904, total: 4
top matched_list: ['احمد', 'احمد عبدالمجيد ']
threshold: 8.810792884997278, total: 2
top matched_list: ['احمد']
threshold: 2.873720767302302, total: 2579
top matched_list: ['أحمد']
threshold: 8.810792884997278, total: 2
top matc

In [82]:
# Second Arabic -> English expansion pass over the (now larger) Arabic result list.
# Same steps as the earlier Arabic -> English cell; results accumulate in english_total_result.
for arabic_value in arabic_total_result:
    english_list = get_english_variants(cur, arabic_value)
    #print(english_list)
    english_top_frequency = get_top_frequency_names(english_list, get_threshold(english_list))
    english_total_result += english_top_frequency
# De-duplicate; order is not preserved by set().
english_total_result = list(set(english_total_result))
print("top english results: {dct}".format(dct=english_total_result))
#english_total_result = fuzzy_filter_names(origin_name, english_total_result, 50)
#print("top english results filtered: {dct}".format(dct=english_total_result))

{'AHMED MOHAMED': 3, 'AHMED MOHAMED A': 13}
{'AHMED MOHAMED': 2}
{'AHMED MOHAMED': 3, 'AHMED MOHAMED R': 2}
{'AHMED ABDELMEGU': 2}
{'AHMAD': 2}
{'AHMED': 2}
{'AHMEDULLAH': 2}
{'AHMED SAID MOHA': 2}
{'AHMED': 2}
{'AHMED SAID MOHA': 2}
{'AHMAD': 1}
{'AHMAD ALSAYED': 2, 'AHMED ELSAYED': 10, 'AHMED ELSAYED A': 2}
{'A': 104, 'AAHMED': 2, 'ADIL AHMED': 2, 'AFMED': 2, 'AH': 2, 'AHAMAD': 5, ' AHAMAD': 2, 'AHAMADUL': 2, 'AHAMD': 4, 'AHAMED': 41, 'AHAMEDSHA': 2, 'AHAMEDUR': 2, 'AHAMMED': 3, 'AHED': 2, 'AHEMD': 6, 'AHMAAD': 2, 'AHMAD': 446, 'AHMADB': 1, 'AHMAD EL': 2, 'AHMAD WIKTOR': 2, 'AHMAED': 9, 'AHMAMED': 2, 'AHMAMMAD': 1, 'AHMD': 7, 'AHME': 2, 'AHMEAD': 2, 'AHMED': 175595, 'AHMED ABDELMEGU': 2, 'AHMED ALI': 2, 'AHMED ASEM READ': 2, 'AHMED BASSIOUNY': 2, 'AHMED BERAIR': 2, 'AHMEDC': 2, 'AHMED ELMALIK': 2, 'AHMED ELSAYED': 2, 'AHMED ELTEGANI': 2, 'AHMED FARGHALY': 2, 'AHMED FOUAD KAM': 2, 'AHMED HAMED': 2, 'AHMED HASHIM': 2, 'AHMED HUSSIEN': 2, 'AHMED KHMIS': 2, 'AHMED MAHMOUD': 3, 'AHMED MOH

In [83]:
#checkpoint 1
# Split the Arabic candidates into accepted / ignored by checking how many of
# their English spellings are already in english_total_result.
arabic_ignored_names = []
arabic_accepted_names = []
for arabic_value in arabic_total_result:
    english_list = get_english_variants(cur, arabic_value)
    # Number of distinct English spellings recorded for this Arabic name.
    all_variations_count = len(english_list.keys())
    # How many of those spellings are already part of the English result set.
    common_variations_count = len(set(english_total_result).intersection(english_list.keys()))
    # Ignore the name when the unknown spellings outnumber the known ones.
    if (all_variations_count - common_variations_count) > common_variations_count:
        print(english_list)
        print(all_variations_count)
        print(common_variations_count)
        arabic_ignored_names.append(arabic_value)
    else:
        arabic_accepted_names.append(arabic_value)
print(arabic_ignored_names)
print(arabic_accepted_names)

{'A': 104, 'AAHMED': 2, 'ADIL AHMED': 2, 'AFMED': 2, 'AH': 2, 'AHAMAD': 5, ' AHAMAD': 2, 'AHAMADUL': 2, 'AHAMD': 4, 'AHAMED': 41, 'AHAMEDSHA': 2, 'AHAMEDUR': 2, 'AHAMMED': 3, 'AHED': 2, 'AHEMD': 6, 'AHMAAD': 2, 'AHMAD': 446, 'AHMADB': 1, 'AHMAD EL': 2, 'AHMAD WIKTOR': 2, 'AHMAED': 9, 'AHMAMED': 2, 'AHMAMMAD': 1, 'AHMD': 7, 'AHME': 2, 'AHMEAD': 2, 'AHMED': 175595, 'AHMED ABDELMEGU': 2, 'AHMED ALI': 2, 'AHMED ASEM READ': 2, 'AHMED BASSIOUNY': 2, 'AHMED BERAIR': 2, 'AHMEDC': 2, 'AHMED ELMALIK': 2, 'AHMED ELSAYED': 2, 'AHMED ELTEGANI': 2, 'AHMED FARGHALY': 2, 'AHMED FOUAD KAM': 2, 'AHMED HAMED': 2, 'AHMED HASHIM': 2, 'AHMED HUSSIEN': 2, 'AHMED KHMIS': 2, 'AHMED MAHMOUD': 3, 'AHMED MOHAMED': 7, 'AHMED MOKHTAR A': 2, 'AHMED MUSTAFA H': 2, 'AHMED NASIF': 2, 'AHMED RAMADAN': 3, 'AHMED REFAAT': 2, 'AHMED RIRACHE ': 2, 'AHMED SAED': 2, 'AHMED SAID MOHA': 2, 'AHMED SALIM': 2, 'AHMEDULLAH': 2, 'AHMEED': 6, 'AHMET': 184, 'AHMID': 2, 'AHMIED': 2, 'AHMMAD': 2, 'AHMMED': 2, 'AHNMED': 1, 'AKHMAD': 12, 

In [45]:
#checkpoint 2
# Mirror of checkpoint 1 for the English candidates: compare each name's Arabic
# spellings with arabic_total_result.
english_ignored_names = []
english_accepted_names = []
for english_value in english_total_result:
    arabic_list = get_arabic_variants(cur, english_value)
    # Number of distinct Arabic spellings recorded for this English name.
    all_variations_count = len(arabic_list.keys())
    # How many of those spellings are already part of the Arabic result set.
    common_variations_count = len(set(arabic_total_result).intersection(arabic_list.keys()))
    # Ignore the name when the unknown spellings outnumber the known ones.
    if (all_variations_count - common_variations_count) > common_variations_count:
        print(arabic_list)
        print(all_variations_count)
        print(common_variations_count)
        english_ignored_names.append(english_value)
    else:
        english_accepted_names.append(english_value)
print(english_ignored_names)
print(english_accepted_names)

{'الهادي': 27, 'الهادى': 23, 'الهدى': 1, 'محمد الهادى': 2}
4
1
{'محمد ابوالحق': 2, 'محمد عبدالجبار': 2, 'محمد عبدالقادر': 2}
3
1
{'ايلى': 2, 'علي': 849, 'على': 3}
3
1
{'عبدالرحمن محمد': 2, 'محمد': 3, 'محمد عبدالرازق': 3, 'محمد عبد الرازق': 4, 'محمد عبدالراضي': 2, 'محمد عبدالرافع': 2, 'محمد عبدالرحمن': 9, 'محمد عبدالرحمن ': 6, 'محمد عبدالرحيم': 3, 'محمد عبدالرزاق': 3, 'محمد عبدالرؤف': 2, 'محمد عبدالرؤف ف': 2, 'محمد عبدالرؤوف ': 2}
13
4
{'الى': 2, 'عاليه': 3, 'على': 3}
3
1
{'حسن محمد ابراهي': 2, 'حسن محمد احمد': 4, 'حسن محمد العربى': 3, 'حسن محمد حسن': 3, 'حسن محمد ربيع': 2, 'حسن محمد عبدالج': 3, 'حسن محمد عبدالم': 3, 'حسن محمد على': 3, 'حسن محمد محمد': 5, 'حسن محمد وجدى': 2, 'حسن محمد يوسف': 2, 'محمد': 2}
12
1
{'محمد': 2, 'محمد حسن ابراهي': 2, 'محمد حسن السيد': 3, 'محمد حسن جميل': 4, 'محمد حسن سليم': 2, 'محمد حسن عبدالف': 2, 'محمد حسن محمد': 3, 'محمد حسن محمد ح': 2}
8
2
{'محمد عبدالجلال ': 2, 'محمد عبدالجواد': 2, 'محمد عبدالقادر': 3}
3
1
{'احمد محمد محمد': 8, 'احمد محمد محمود': 4, 'احمد

In [17]:
# English -> Arabic expansion followed by a fuzzy filter against the top Arabic name.
for english_value in english_total_result:
    arabic_list = get_arabic_variants(cur, english_value)
    arabic_top_frequency = get_top_frequency_names(arabic_list, get_threshold(arabic_list))
    arabic_total_result += arabic_top_frequency
# De-duplicate; order is not preserved by set().
arabic_total_result = list(set(arabic_total_result))
print("top arabic results: {dct}".format(dct=arabic_total_result))
# Keep only names whose partial ratio against top_arabic_name is above 50.
arabic_total_result = fuzzy_filter_names(top_arabic_name, arabic_total_result, 50)
print("top arabic results filtered: {dct}".format(dct=arabic_total_result))

['حمزة', 'حمزه', 'حمزه محمود محمد', 'همسه']
['حمزه', 'هامزه']
['همسه']
['حمزه', 'همسا', 'همسة', 'همسه']
['همسه']
['هامزه']
['حمزة']
['حمزه']
['همسه']
top arabic results: ['همسه', 'حمزة', 'حمزه محمود محمد', 'هامزه', 'همسا', 'حمزه', 'همسة']
top arabic results filtered: ['حمزة', 'حمزه محمود محمد', 'هامزه', 'حمزه']


In [18]:
# Arabic -> English expansion followed by a fuzzy filter against the original English name.
for arabic_value in arabic_total_result:
    english_list = get_english_variants(cur, arabic_value)
    english_top_frequency = get_top_frequency_names(english_list, get_threshold(english_list))
    english_total_result += english_top_frequency
# De-duplicate; order is not preserved by set().
english_total_result = list(set(english_total_result))
print("top english results: {dct}".format(dct=english_total_result))
# Keep only names whose partial ratio against origin_name is above 50.
english_total_result = fuzzy_filter_names(origin_name, english_total_result, 50)
print("top english results filtered: {dct}".format(dct=english_total_result))

['HAMZA', 'HUMZA']
['HAMZA']
['HAMZAH', 'HAMZEH']
['HAMZA', 'HAMZAH', 'KHAMZA']
top english results: ['HAMZA', 'HAMZAH', 'HAMSA', 'HMSA', 'HAMZEH', 'KHAMZA', 'HUMZA', 'HIMSA', 'HAMSAH']
top english results filtered: ['HAMZA', 'HAMZAH', 'HAMSA', 'HAMZEH', 'KHAMZA', 'HUMZA', 'HIMSA', 'HAMSAH']


In [17]:
# English -> Arabic expansion followed by a fuzzy filter against the top Arabic name.
for english_value in english_total_result:
    arabic_list = get_arabic_variants(cur, english_value)
    # The threshold is computed from the Arabic list being filtered. It used to read
    # `english_list`, a variable left over from another cell, which is unrelated to
    # this iteration and undefined on a fresh kernel.
    arabic_top_frequency = get_top_frequency_names(arabic_list, get_threshold(arabic_list))
    arabic_total_result += arabic_top_frequency
# De-duplicate; order is not preserved by set().
arabic_total_result = list(set(arabic_total_result))
print("top arabic results: {dct}".format(dct=arabic_total_result))
# Keep only names whose partial ratio against top_arabic_name is above 50.
arabic_total_result = fuzzy_filter_names(top_arabic_name, arabic_total_result, 50)
print("top arabic results filtered: {dct}".format(dct=arabic_total_result))

top arabic results: ['همسه', 'حمزه', 'هامزه', 'همسة', 'حمزه محمود محمد', 'حمزة', 'همسا']


In [98]:
def get_english_variants_by_set(cursor, arabic_names=set()):
    """Return the English spellings recorded for any of several Arabic names.

    Rows from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER whose ARB column is in
    ``arabic_names`` (and whose ARB/ENG are non-null and non-empty) are grouped
    by their ENG value.

    Args:
        cursor: DB-API cursor used to run the query.
        arabic_names: iterable of Arabic spellings to look up.

    Returns:
        dict mapping each English spelling to its summed COUNT; empty when
        ``arabic_names`` is empty.
    """
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ENG AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB IN %(names)s
        UNION ALL SELECT ENG AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB IN %(names)s
    ) AS SUB group BY NAME;
    """

    names = tuple(arabic_names)
    if not names:
        # An empty tuple would render as "IN ()", which is not valid SQL.
        return {}

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'names': names})
    return dict(cursor.fetchall())

def get_arabic_variants_by_set(cursor, english_names=set()):
    """Return the Arabic spellings recorded for any of several English names.

    Rows from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER whose ENG column is in
    ``english_names`` (and whose ARB/ENG are non-null and non-empty) are grouped
    by their ARB value.

    Args:
        cursor: DB-API cursor used to run the query.
        english_names: iterable of English spellings to look up.

    Returns:
        dict mapping each Arabic spelling to its summed COUNT; empty when
        ``english_names`` is empty.
    """
    query = """
        SELECT SUB.NAME, SUM(COUNT) FROM (
            SELECT ARB AS NAME, COUNT FROM GIVEN_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG IN %(names)s
            UNION ALL SELECT ARB AS NAME, COUNT FROM FAMILY_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG IN %(names)s
        ) AS SUB GROUP BY NAME;
        """

    names = tuple(english_names)
    if not names:
        # An empty tuple would render as "IN ()", which is not valid SQL.
        return {}

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'names': names})
    return dict(cursor.fetchall())

def get_counts_for_english_names(cursor, english_names=set()):
    """Return the total count recorded for each of the given English names.

    Sums COUNT over GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER rows whose ENG
    column is in ``english_names``, grouped by ENG.

    Args:
        cursor: DB-API cursor used to run the query.
        english_names: iterable of English spellings to count.

    Returns:
        dict mapping each English name found to its summed COUNT; empty when
        ``english_names`` is empty.
    """
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ENG AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ENG IN %(names)s
        UNION ALL SELECT ENG AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ENG IN %(names)s
    ) AS SUB group BY NAME;
    """

    names = tuple(english_names)
    if not names:
        # An empty tuple would render as "IN ()", which is not valid SQL.
        return {}

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'names': names})
    return dict(cursor.fetchall())

def get_counts_for_arabic_names(cursor, arabic_names=set()):
    """Return the total count recorded for each of the given Arabic names.

    Sums COUNT over GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER rows whose ARB
    column is in ``arabic_names``, grouped by ARB.

    Args:
        cursor: DB-API cursor used to run the query.
        arabic_names: iterable of Arabic spellings to count.

    Returns:
        dict mapping each Arabic name found to its summed COUNT; empty when
        ``arabic_names`` is empty.
    """
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ARB AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ARB IN %(names)s
        UNION ALL SELECT ARB AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ARB IN %(names)s
    ) AS SUB group BY NAME;
    """

    names = tuple(arabic_names)
    if not names:
        # An empty tuple would render as "IN ()", which is not valid SQL.
        return {}

    # Use the cursor that was passed in rather than a notebook-level global.
    cursor.execute(query, {'names': names})
    return dict(cursor.fetchall())

def group_unique_names(dictionary = None, delta = None):
    """
    Adds new names from delta to dictionary and sums up the counts
    Returns set of new names added to dictionary.

    Args:
        dictionary: name -> count mapping, updated in place. When omitted a
            fresh dict is used for this call only.
        delta: name -> count mapping to merge in. When omitted nothing is merged.
    """
    # Fresh containers per call: the former `{}` defaults were shared between
    # calls, so names merged in one call leaked into the next.
    if dictionary is None:
        dictionary = {}
    if delta is None:
        delta = {}

    names_delta = set()
    for name in delta:
        if name in dictionary:
            dictionary[name] += delta[name]
        else:
            dictionary[name] = delta[name]
            names_delta.add(name)
    return names_delta

In [99]:
# Fetch the total count of every collected English / Arabic name as name -> count dicts.
# These two dicts are the reference sets the validation cells below check against.
english_dict = get_counts_for_english_names(cur, english_total_result)
arabic_dict = get_counts_for_arabic_names(cur, arabic_total_result)
print(english_dict)
print(arabic_dict)

{'AAHMED': 2, 'ADIL AHMED': 6, 'AHAMAD': 24302, 'AHAMADI': 5764, 'AHAMD': 16265, 'AHAMED': 47477, 'AHAMMAD': 25422, 'AHAMMED': 24280, 'AHEMAD': 2581, 'AHEMED': 814, 'AHMAD': 223964, 'AHMADB': 1, 'AHMADE': 13, 'AHMAD EL': 4, 'AHMADI': 26265, 'AHMADU': 5177, 'AHMAD WIKTOR': 2, 'AHMED': 435643, 'AHMED ABDELMEGU': 4, 'AHMED ALI': 40, 'AHMED ASEM READ': 2, 'AHMED BASSIOUNY': 2, 'AHMED BERAIR': 2, 'AHMEDC': 2, 'AHMED ELMALIK': 2, 'AHMED ELSAYED': 12, 'AHMED ELTEGANI': 2, 'AHMED FARGHALY': 2, 'AHMED FOUAD KAM': 2, 'AHMED HAMED': 6, 'AHMED HASHIM': 2, 'AHMED HUSSIEN': 2, 'AHMEDI': 6038, 'AHMED KHMIS': 2, 'AHMED MAHMOUD': 27, 'AHMED MOHAMED': 134, 'AHMED MOKHTAR A': 4, 'AHMED MUSTAFA H': 2, 'AHMED NASIF': 2, 'AHMED RAMADAN': 8, 'AHMED REFAAT': 2, 'AHMED RIRACHE ': 2, 'AHMED SAED': 2, 'AHMED SAID MOHA': 6, 'AHMED SALIM': 4, 'AHMEDULLAH': 6, 'AHMET': 33964, 'AHMMAD': 4100, 'AHMMED': 24142, 'EL IMAM AHMED': 2, 'SARA AHMAD': 2, 'TOMA AHMED': 2}
{'ااحمد': 6, 'احماد': 1267, 'أحمد': 692019, 'احمد': 20

In [100]:
#get arabic versions from english origin
arabic_variants = get_arabic_variants_by_set(cur, english_dict.keys())

#validate arabic names
arabic_validation_variants = {}  
accepted_arabic_dict = {}
rejected_arabic_dict = {}

for key in arabic_variants.keys():
    arabic_validation_variants.clear()
    arabic_validation_variants = get_english_variants(cur, key)
    invalid_total = 0
    valid_total = 0
        
    for var in arabic_validation_variants.keys():
        if var in english_dict :
            valid_total += arabic_validation_variants[var]
        else :
            invalid_total += arabic_validation_variants[var]
            
    if invalid_total == 0 or valid_total/invalid_total > 5:
        print("ACCEPT*key: {k}, valid total: {v}, invalid total: {i}".format(k=key, v=valid_total, i=invalid_total))
        accepted_arabic_dict[key] = arabic_variants[key]
    else:
        print("REJECT key: {k}, valid total: {v}, invalid total: {i}".format(k=key, v=valid_total, i=invalid_total))
        rejected_arabic_dict[key] = arabic_variants[key]
        
    
print("accepted {a}".format(a=accepted_arabic_dict.keys()))
print("rejected {r}".format(r=rejected_arabic_dict.keys()))


ACCEPT*key: ااحمد, valid total: 3, invalid total: 0
REJECT key: ابراهيم, valid total: 2, invalid total: 322118
REJECT key: اجمد, valid total: 2, invalid total: 17568
REJECT key: احم, valid total: 3, invalid total: 1338
ACCEPT*key: احماد, valid total: 1253, invalid total: 4
ACCEPT*key: احمت, valid total: 2, invalid total: 0
ACCEPT*key: أحمد, valid total: 691965, invalid total: 51
ACCEPT*key: احمد, valid total: 176358, invalid total: 306
REJECT key: احمد , valid total: 2, invalid total: 10
REJECT key: احمد ابراهيم, valid total: 2, invalid total: 28
REJECT key: احمد ابراهيم عب, valid total: 2, invalid total: 3
ACCEPT*key: احمد ال, valid total: 2, invalid total: 0
REJECT key: احمد السيد, valid total: 10, invalid total: 4
REJECT key: احمد الصديق, valid total: 2, invalid total: 5
ACCEPT*key: احمد الفاضل, valid total: 2, invalid total: 0
ACCEPT*key: احمدالله, valid total: 2, invalid total: 0
ACCEPT*key: احمد بن الكيلان, valid total: 2, invalid total: 0
ACCEPT*key: احمد بوح, valid total: 2, in

In [57]:
#get english versions from arabic origin
# Look up the English spellings recorded for the collected Arabic names.
english_variants = get_english_variants_by_set(cur, arabic_dict.keys())

#validate english names
# Each English variant is checked by looking up its Arabic spellings and weighing
# the counts of spellings inside arabic_dict (valid) against those outside (invalid).
english_validation_variants = {}  
accepted_english_dict = {}
rejected_english_dict = {}

for key in english_variants.keys():
    # The clear() only empties the previous iteration's dict; the name is rebound right after.
    english_validation_variants.clear()
    english_validation_variants = get_arabic_variants(cur, key)
    invalid_total = 0
    valid_total = 0
        
    for var in english_validation_variants.keys():
        if var in arabic_dict :
            valid_total += english_validation_variants[var]
        else :
            invalid_total += english_validation_variants[var]
            
    # Accept when nothing falls outside the reference set, or valid counts
    # outweigh invalid ones by more than 2 to 1 (the Arabic validation cell uses 5).
    if invalid_total == 0 or valid_total/invalid_total > 2:
        print("ACCEPT*key: {k}, valid total: {v}, invalid total: {i}".format(k=key, v=valid_total, i=invalid_total))
        accepted_english_dict[key] = english_variants[key]
    else:
        print("REJECT key: {k}, valid total: {v}, invalid total: {i}".format(k=key, v=valid_total, i=invalid_total))
        rejected_english_dict[key] = english_variants[key]
        
    
print("accepted {a}".format(a=accepted_english_dict.keys()))
print("rejected {r}".format(r=rejected_english_dict.keys()))


ACCEPT*key: GAMZA, valid total: 2, invalid total: 0
REJECT key: GAMZAT, valid total: 2, invalid total: 1237
REJECT key: H, valid total: 3, invalid total: 3810
REJECT key: HAMJA, valid total: 2, invalid total: 3919
REJECT key: HAMSA, valid total: 2, invalid total: 11646
ACCEPT*key: HAMUZA, valid total: 2, invalid total: 0
ACCEPT*key: HAMZA, valid total: 259061, invalid total: 2
ACCEPT*key: HAMZAH, valid total: 50570, invalid total: 0
ACCEPT*key: HAMZEH, valid total: 18477, invalid total: 0
ACCEPT*key: HAMZH, valid total: 2, invalid total: 0
ACCEPT*key: HEMZA, valid total: 2, invalid total: 0
ACCEPT*key: HUMZA, valid total: 30885, invalid total: 0
ACCEPT*key: KHAMZA, valid total: 5, invalid total: 0
REJECT key: REHAB, valid total: 2, invalid total: 98942
REJECT key: ZAINAB, valid total: 2, invalid total: 225741
accepted dict_keys(['GAMZA', 'HAMUZA', 'HAMZA', 'HAMZAH', 'HAMZEH', 'HAMZH', 'HEMZA', 'HUMZA', 'KHAMZA'])
rejected dict_keys(['GAMZAT', 'H', 'HAMJA', 'HAMSA', 'REHAB', 'ZAINAB'])


In [22]:
# reprocess rejected values
# Give rejected English names a second chance via fuzzy matching.
# All accepted names are joined into one comma-separated string; it is built once,
# so names promoted inside the loop are not added to it.
accepted_str = ', '.join(accepted_english_dict.keys())
# Snapshot the keys into a set so entries can be popped from the dict while looping.
rejected_keys = set(rejected_english_dict.keys())

for key in rejected_keys:
    # Promote the name when compare_names reports a match above 75 against accepted_str.
    compare = compare_names(accepted_str, key, 75)
    if compare:
        accepted_english_dict[key] = rejected_english_dict[key]
        rejected_english_dict.pop(key)
    print("key: {k} compare:{c}".format(k=key, c=compare))
    
print("accepted {a}".format(a=accepted_english_dict.keys()))
print("rejected {r}".format(r=rejected_english_dict.keys()))

key: ZAINAB compare:False
key: H compare:True
key: REHAB compare:False
key: HAMJA compare:True
key: GAMZAT compare:True
accepted dict_keys(['GAMZA', 'HAMSA', 'HAMSAH', 'HAMUZA', 'HAMZA', 'HAMZAH', 'HAMZEH', 'HAMZH', 'HEMZA', 'HIMSA', 'HMSA', 'HUMZA', 'KHAMZA', 'H', 'HAMJA', 'GAMZAT'])
rejected dict_keys(['REHAB', 'ZAINAB'])


In [23]:
# reprocess rejected arabic values
# Give rejected Arabic names a second chance via fuzzy matching.
# All accepted names are joined into one comma-separated string; it is built once,
# so names promoted inside the loop are not added to it.
accepted_str = ', '.join(accepted_arabic_dict.keys())
# Snapshot the keys into a set so entries can be popped from the dict while looping.
rejected_keys = set(rejected_arabic_dict.keys())

for key in rejected_keys:
    # Promote the name when compare_names reports a match above 40 against accepted_str
    # (the English reprocessing cell uses 75).
    compare = compare_names(accepted_str, key, 40)
    if compare:
        accepted_arabic_dict[key] = rejected_arabic_dict[key]
        rejected_arabic_dict.pop(key)
    print("key: {k} compare:{c}".format(k=key, c=compare))
    
print("accepted {a}".format(a=accepted_arabic_dict.keys()))
print("rejected {r}".format(r=rejected_arabic_dict.keys()))

key: همسه compare:True
accepted dict_keys(['حمزة', 'حمزه', 'حمزه محمود محمد', 'هامزه', 'همسا', 'همسة', 'همسه'])
rejected dict_keys([])
