In [23]:
import psycopg2
from configparser import ConfigParser

def config(filename='prepare_data.ini', section='phonetic'):
    """Load one section of an INI file into a plain dict.

    Args:
        filename: Path of the INI file to read.
        section: Name of the section whose options are returned
            (defaults to 'phonetic').

    Returns:
        dict mapping every option name found in ``section`` to its string value.

    Raises:
        Exception: If ``section`` is not available after reading ``filename``.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: bail out early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    # items() yields (option, value) pairs, which dict() consumes directly.
    return dict(ini.items(section))

def db_connect():
    """Open a PostgreSQL connection using the settings returned by ``config()``.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    # Every option of the configured INI section becomes a keyword argument.
    connection_kwargs = config()

    print('Connecting to the PostgreSQL database...')
    return psycopg2.connect(**connection_kwargs)

In [24]:
def get_english_variants(cursor, arabic_name):
    """Return the English spellings recorded for an Arabic name, with summed counts.

    Rows are read from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER where ARB
    equals ``arabic_name`` and both ARB and ENG are non-null and non-empty.
    Counts for the same ENG spelling are added together across both tables.

    Args:
        cursor: Database cursor the query runs on. It must accept
            ``execute(sql, params)`` with ``%s`` placeholders, as psycopg2 does.
        arabic_name: Arabic spelling to look up.

    Returns:
        dict mapping each English spelling to its summed COUNT.
    """
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ENG AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %s
        UNION ALL SELECT ENG AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %s
    ) AS SUB group BY NAME;
    """

    # The name is bound as a parameter rather than spliced into the SQL text,
    # so quotes inside it can neither break nor alter the statement.
    # The cursor argument is used instead of the notebook-global `cur`.
    cursor.execute(query, (arabic_name, arabic_name))

    # fetchall() yields (name, total) rows, which map straight onto a dict.
    return dict(cursor.fetchall())

def get_arabic_variants(cursor, english_name):
    """Return the Arabic spellings recorded for an English name, with summed counts.

    Rows are read from GIVEN_NAMES_MASTER and FAMILY_NAMES_MASTER where ENG
    equals ``english_name`` and both ARB and ENG are non-null and non-empty.
    Counts for the same ARB spelling are added together across both tables.

    Args:
        cursor: Database cursor the query runs on. It must accept
            ``execute(sql, params)`` with ``%s`` placeholders, as psycopg2 does.
        english_name: English spelling to look up.

    Returns:
        dict mapping each Arabic spelling to its summed COUNT.
    """
    query = """
        SELECT SUB.NAME, SUM(COUNT) FROM (
            SELECT ARB AS NAME, COUNT FROM GIVEN_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %s
            UNION ALL SELECT ARB AS NAME, COUNT FROM FAMILY_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %s
        ) AS SUB GROUP BY NAME;
        """

    # The name is bound as a parameter rather than spliced into the SQL text,
    # so names containing a quote (e.g. O'BRIEN) no longer break the query.
    # The cursor argument is used instead of the notebook-global `cur`.
    cursor.execute(query, (english_name, english_name))

    # fetchall() yields (name, total) rows, which map straight onto a dict.
    return dict(cursor.fetchall())

def validate_arabic_name_by_english_variations(cursor, arabic_name, valid_english_variations):
    """Decide whether an Arabic name is backed by the accepted English spellings.

    The English variants of ``arabic_name`` are fetched with
    ``get_english_variants`` and their counts are split into "valid" (spelling
    is in ``valid_english_variations``) and "invalid" (everything else).

    Args:
        cursor: Cursor forwarded to ``get_english_variants``.
        arabic_name: Arabic spelling under test.
        valid_english_variations: Container of accepted English spellings.

    Returns:
        True when the valid count is at least 3 and either exceeds the invalid
        count or exceeds 100; False otherwise. A rejection that reaches the
        final comparison also prints the full variant table.
    """
    english_variations = get_english_variants(cursor, arabic_name)

    # Too little evidence overall to judge the name.
    if sum(english_variations.values()) < 3:
        return False

    # Single pass: bucket every count by whether its spelling is accepted.
    tally = {True: 0, False: 0}
    for spelling, occurrences in english_variations.items():
        tally[spelling in valid_english_variations] += occurrences
    accepted, rejected = tally[True], tally[False]

    if accepted < 3:
        return False

    if accepted > rejected or accepted > 100:
        return True

    print("english variations for {name} are: {dic}".format(name=arabic_name, dic=english_variations))
    return False
    
def validate_english_name_by_arabic_variations(cursor, english_name, valid_arabic_variations):
    """Decide whether an English name is backed by the accepted Arabic spellings.

    The Arabic variants of ``english_name`` are fetched with
    ``get_arabic_variants`` and their counts are split into "valid" (spelling
    is in ``valid_arabic_variations``) and "invalid" (everything else).

    Args:
        cursor: Cursor forwarded to ``get_arabic_variants``.
        english_name: English spelling under test.
        valid_arabic_variations: Container of accepted Arabic spellings.

    Returns:
        True when the valid count is at least 3 and either exceeds the invalid
        count or exceeds 100; False otherwise. A rejection that reaches the
        final comparison also prints the full variant table.
    """
    arabic_variations = get_arabic_variants(cursor, english_name)

    # Too little evidence overall to judge the name.
    if sum(arabic_variations.values()) < 3:
        return False

    # Single pass: bucket every count by whether its spelling is accepted.
    tally = {True: 0, False: 0}
    for spelling, occurrences in arabic_variations.items():
        tally[spelling in valid_arabic_variations] += occurrences
    accepted, rejected = tally[True], tally[False]

    if accepted < 3:
        return False

    if accepted > rejected or accepted > 100:
        return True

    print("arabic variations for {name} are: {dic}".format(name=english_name, dic=arabic_variations))
    return False

def get_top_frequency_names(list, noise_names=None):
    """Pick the dominant spellings out of a ``{name: count}`` table.

    A name is "matched" when it is longer than two characters, is not a noise
    token, and either its share of the total (in percent) is above an adaptive
    threshold or its count is at least 100. Names that contain a matched name
    as a substring ("composites") are returned too, unless that would add more
    than three extra names; in that case only the matched names are returned.

    Args:
        list: Mapping of spelling to count. The parameter name shadows the
            builtin and is kept only for backward compatibility.
        noise_names: Optional container of tokens that are never matched.
            When omitted, the module-level ``top_noise_data`` is used, as before.

    Returns:
        A Python list of selected spellings, in the mapping's iteration order.
        An empty list when the mapping is empty or its counts sum to zero or less.
    """
    total = sum(list.values())
    # Nothing to rank. This also avoids the ValueError/ZeroDivisionError the
    # unguarded max() and division raised on empty or all-zero input.
    if total <= 0:
        return []

    if noise_names is None:
        noise_names = top_noise_data

    lower_accepted_frequency = 100
    # The threshold shrinks from just under 10% towards 1% as evidence grows.
    # 6561 == 9 ** 4, so 10 - total ** 0.25 reaches exactly 1 at that total
    # and is clamped to 1 beyond it.
    if total > 6561:
        threshold = 1
    else:
        threshold = 10 - total ** (1. / 4.)

    print("threshold: {thre}, total: {tot}".format(thre=threshold, tot=total))
    matched_list = [
        key for key, val in list.items()
        if len(key) > 2
        and key not in noise_names
        and (val / total * 100 > threshold or val >= lower_accepted_frequency)
    ]
    print("top matched_list: {thre}".format(thre=matched_list))

    # Composite names contain a matched name as a substring. Every matched
    # name is trivially included, since it contains itself.
    matched_list_with_composite = [
        key for key in list
        if any(match in key for match in matched_list)
    ]
    # Too many composites suggests the substring match is pulling in noise.
    if len(matched_list_with_composite) - len(matched_list) > 3:
        return matched_list

    return matched_list_with_composite

In [25]:
# Shared connection and cursor used by every query cell below.
conn = db_connect()
cur = conn.cursor()

# Working state for one name-expansion run; reset_global_variables() refills it.
english_total_result, english_top_frequency = [], []
arabic_total_result, arabic_top_frequency = [], []
top_noise_data = []
origin_name = ''
def reset_global_variables(name):
    """Reset the notebook-wide working state before analysing ``name``.

    Opens a new cursor on the module-level ``conn``, empties the four result
    lists, reloads the noise-token list and stores ``name`` as ``origin_name``.

    Args:
        name: The seed name for the next run.
    """
    global cur
    global english_total_result, english_top_frequency
    global arabic_total_result, arabic_top_frequency
    global top_noise_data, origin_name

    # The cursor is replaced first, so a connection failure leaves the rest untouched.
    cur = conn.cursor()

    # Each list gets its own fresh object; they must never alias one another.
    english_total_result, english_top_frequency = [], []
    arabic_total_result, arabic_top_frequency = [], []

    top_noise_data = ['MRS', 'MRS.', 'MRSS', 'MR', 'MR.', 'MISS']
    origin_name = name

Connecting to the PostgreSQL database...


In [32]:
# Start a fresh run for the seed name; this also reloads top_noise_data.
reset_global_variables('نينا')
# English spellings stored against the seed name, with their summed counts.
english_list = get_english_variants(cur, origin_name)
print("english variations: {dct}".format(dct=english_list))

# Keep only the spellings that pass the frequency filter.
english_total_result = get_top_frequency_names(english_list)
print("top english results: {dct}".format(dct=english_total_result))

english variations: {'ALEXANDRA': 2, 'ALLA': 2, 'AM': 2, 'ANA': 2, 'ANAHIT': 2, 'ANDRIANAMBININA': 4, 'ANNA': 2, 'ASTANINA': 2, 'CABANLIT': 2, 'CASTILLO': 2, 'CORAZON': 2, 'DVORYANINOVA': 2, 'EKATERINA': 4, 'ELENA': 2, 'ELENITA': 2, 'ELIZAVETA': 2, 'ELVIRA': 3, 'EMMA': 3, 'EUFROSINA': 2, 'EVDOKIYA': 2, 'EVGENYA': 2, 'FANISA': 2, 'FATMA': 2, 'FE': 2, 'FLOR': 2, 'GALINA': 7, 'GALYAMDAN': 2, 'GEGHETSIK': 2, 'GILMANOVA': 2, 'GLSHAT': 2, 'GRYNEVYCH': 2, 'G TUGADI': 2, 'GULSIRA': 2, 'I': 2, 'IGNATENKO': 2, 'IONOV': 2, 'IRINA': 3, 'IVONOVA': 2, 'JULIETA': 2, 'KALENITINA': 2, 'KARAKAY': 2, 'KHAKIMA': 2, 'KHANIFA': 2, 'KHOVANSKAYA': 2, 'KLAVDIYA': 2, 'KODATSKA': 2, 'KOZLOV': 2, 'LARISA': 4, 'LIDIA': 2, 'LIDIIA': 2, 'LIDIYA': 2, 'LIUDMILA': 2, 'LIVITSKYI': 2, 'LIZA': 2, 'LUSINE': 2, 'LYUBOV': 2, 'M': 6, 'MADUNA': 2, 'MALOU': 2, 'MANIA': 2, 'MARGARITA': 2, 'MARIA': 2, 'MARIYA': 5, 'MARSHANI': 2, 'MORGACHEVA': 2, 'MRS': 411, 'MRSS': 2, 'MS': 21, 'N': 2, 'NA': 2, 'NADEZHDA': 3, 'NAIENA': 2, 'NAINA'

In [33]:
# Expand every accepted English spelling into its top Arabic spellings.
iterable_list = list(english_total_result)
for english_name in iterable_list:
    arabic_list = get_arabic_variants(cur, english_name)
    arabic_top_frequency = get_top_frequency_names(arabic_list)
    arabic_total_result.extend(arabic_top_frequency)
# Collapse duplicates gathered from the different English spellings.
arabic_total_result = list(set(arabic_total_result))
print("top arabic results: {dct}".format(dct=arabic_total_result))

threshold: 1, total: 22568
top matched_list: ['نعينع', 'نينا']
threshold: 1.155925991484203, total: 6118
top matched_list: ['نينا']
threshold: 1, total: 15129
top matched_list: ['نينا']
threshold: 3.2802315799201427, total: 2039
top matched_list: ['نعناع', 'ننا', 'نينا']
threshold: 4.5658812931196655, total: 872
top matched_list: ['نينا']
threshold: 1, total: 95276
top matched_list: ['نينا']
top arabic results: ['ننا', 'نينا', 'نعينع', 'نعناع', 'نننا', 'نيناه']


In [34]:
# Expand every accepted Arabic spelling into its top English spellings.
iterable_list = list(arabic_total_result)
for arabic_name in iterable_list:
    english_list = get_english_variants(cur, arabic_name)
    english_top_frequency = get_top_frequency_names(english_list)
    english_total_result.extend(english_top_frequency)
# Collapse duplicates gathered from the different Arabic spellings.
english_total_result = list(set(english_total_result))

print("top english results: {dct}".format(dct=english_total_result))

threshold: 2.134198502820448, total: 3828
top matched_list: ['NENA', 'NNENNA']
threshold: 1, total: 140248
top matched_list: ['NAINA', 'NAYNA', 'NEENA', 'NENA', 'NIINA', 'NINA']
threshold: 5.376412829151812, total: 457
top matched_list: ['NAINA']
threshold: 6.019675953172298, total: 251
top matched_list: ['NENA']
threshold: 8.810792884997278, total: 2
top matched_list: ['NENA']
threshold: 7.834263229332006, total: 22
top matched_list: ['NAINA', 'NENAH', 'NINA', 'NINAH']
top english results: ['NENAH', 'NAYNA', 'NENA', 'NAINA', 'NEENA', 'NINA', 'NINAH', 'NNENNA', 'NIINA']


In [35]:
# Drop English spellings that the accepted Arabic spellings do not support.
# Iterate over a snapshot so that removing from the live list is safe.
iterable_list = list(english_total_result)
for english_name in iterable_list:
    if validate_english_name_by_arabic_variations(cur, english_name, arabic_total_result):
        continue
    print("removing name: {name}".format(name=english_name))
    english_total_result.remove(english_name)

english_total_result.sort()
print("top english results: {dct}".format(dct=english_total_result))
print("top english results count: {c}".format(c=len(english_total_result)))

arabic variations for NENAH are: {'ننه': 449, 'نيناح': 4, 'نيناه': 10, 'نينه': 6}
removing name: NENAH
top english results: ['NAINA', 'NAYNA', 'NEENA', 'NENA', 'NIINA', 'NINA', 'NINAH', 'NNENNA']
top english results count: 8


In [36]:
# Second pass: re-expand the pruned English spellings into Arabic spellings.
iterable_list = list(english_total_result)
for english_name in iterable_list:
    arabic_list = get_arabic_variants(cur, english_name)
    arabic_top_frequency = get_top_frequency_names(arabic_list)
    arabic_total_result.extend(arabic_top_frequency)
# Collapse duplicates gathered from the different English spellings.
arabic_total_result = list(set(arabic_total_result))
print("top arabic results: {dct}".format(dct=arabic_total_result))

threshold: 1, total: 22568
top matched_list: ['نعينع', 'نينا']
threshold: 1.155925991484203, total: 6118
top matched_list: ['نينا']
threshold: 1, total: 15129
top matched_list: ['نينا']
threshold: 3.2802315799201427, total: 2039
top matched_list: ['نعناع', 'ننا', 'نينا']
threshold: 4.5658812931196655, total: 872
top matched_list: ['نينا']
threshold: 1, total: 95276
top matched_list: ['نينا']
threshold: 8.434915419926712, total: 6
top matched_list: ['نيناه']
threshold: 3.1458976819582434, total: 2207
top matched_list: ['ننا']
top arabic results: ['ننا', 'نينا', 'نعينع', 'نعناع', 'نننا', 'نيناه']


In [37]:
# Drop Arabic spellings that the accepted English spellings do not support.
# Iterate over a snapshot so that removing from the live list is safe.
iterable_list = list(arabic_total_result)
for arabic_name in iterable_list:
    if validate_arabic_name_by_english_variations(cur, arabic_name, english_total_result):
        continue
    print("removing name: {name}".format(name=arabic_name))
    arabic_total_result.remove(arabic_name)

arabic_total_result.sort()
print("top arabic results: {dct}".format(dct=arabic_total_result))
print("top arabic results count: {c}".format(c=len(arabic_total_result)))

removing name: نننا
english variations for نيناه are: {'MRS': 2, 'NAINA': 2, 'NENAH': 10, 'NINA': 2, 'NINAH': 6}
removing name: نيناه
top arabic results: ['نعناع', 'نعينع', 'ننا', 'نينا']
top arabic results count: 4
