In [1]:
import psycopg2
from configparser import ConfigParser

def config(filename='prepare_data.ini', section='phonetic'):
    """Read one section of an INI file and return it as a plain dict.

    Args:
        filename: Path of the INI file to parse (default ``prepare_data.ini``).
        section: Name of the section to extract (default ``phonetic``).

    Returns:
        dict mapping every option name in ``section`` to its string value.

    Raises:
        Exception: if the parsed file has no section named ``section``.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Guard clause: bail out early when the requested section is absent.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return dict(ini.items(section))

def db_connect():
    """Open and return a new PostgreSQL connection.

    The connection keyword arguments are whatever ``config()`` returns when
    called with its defaults; they are forwarded unchanged to
    ``psycopg2.connect``.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    # Load the settings first so a configuration error surfaces before the
    # progress message is printed.
    settings = config()

    print('Connecting to the PostgreSQL database...')
    return psycopg2.connect(**settings)

In [2]:
def get_english_variants(cursor, arabic_name):
    """Return the English spellings recorded for an Arabic name, with counts.

    Looks up ``arabic_name`` in the ARB column of both GIVEN_NAMES_MASTER and
    FAMILY_NAMES_MASTER (rows with NULL/empty ARB or ENG are ignored) and sums
    COUNT per distinct ENG value.

    Args:
        cursor: Database cursor exposing ``execute(query, params)`` and
            ``fetchall()``. The query is run on this cursor, not on a global.
        arabic_name: Arabic spelling to look up.

    Returns:
        dict mapping each English spelling to its summed count.
    """
    # The name is passed as a bound parameter instead of being concatenated
    # into the SQL text, so quotes in the name cannot break or alter the query.
    query = """
    SELECT NAME, SUM(COUNT) FROM (
        SELECT ENG AS NAME, COUNT FROM GIVEN_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %(name)s
        UNION ALL SELECT ENG AS NAME, COUNT FROM FAMILY_NAMES_MASTER
        WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ARB = %(name)s
    ) AS SUB GROUP BY NAME;
    """

    cursor.execute(query, {'name': arabic_name})
    # Each row is a (name, summed_count) pair, which dict() consumes directly.
    return dict(cursor.fetchall())

def get_arabic_variants(cursor, english_name):
    """Return the Arabic spellings recorded for an English name, with counts.

    Looks up ``english_name`` in the ENG column of both GIVEN_NAMES_MASTER and
    FAMILY_NAMES_MASTER (rows with NULL/empty ARB or ENG are ignored) and sums
    COUNT per distinct ARB value.

    Args:
        cursor: Database cursor exposing ``execute(query, params)`` and
            ``fetchall()``. The query is run on this cursor, not on a global.
        english_name: English spelling to look up.

    Returns:
        dict mapping each Arabic spelling to its summed count.
    """
    # The name is passed as a bound parameter instead of being concatenated
    # into the SQL text, so names such as O'BRIEN cannot break or alter the
    # query.
    query = """
        SELECT SUB.NAME, SUM(COUNT) FROM (
            SELECT ARB AS NAME, COUNT FROM GIVEN_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %(name)s
            UNION ALL SELECT ARB AS NAME, COUNT FROM FAMILY_NAMES_MASTER
            WHERE ARB IS NOT NULL AND ENG IS NOT NULL AND ARB != '' AND ENG != '' AND ENG = %(name)s
        ) AS SUB GROUP BY NAME;
        """

    cursor.execute(query, {'name': english_name})
    # Each row is a (name, summed_count) pair, which dict() consumes directly.
    return dict(cursor.fetchall())

def validate_arabic_name_by_english_variations(cursor, arabic_name, valid_english_variations):
    """Decide whether an Arabic name is backed by the accepted English spellings.

    The English variants of ``arabic_name`` are fetched with
    ``get_english_variants`` and their counts are split into two buckets:
    spellings contained in ``valid_english_variations`` and all others.

    Args:
        cursor: Passed through to ``get_english_variants``.
        arabic_name: Arabic spelling being validated.
        valid_english_variations: Container of accepted English spellings.

    Returns:
        True when the accepted bucket outweighs the rejected one or exceeds
        100; otherwise prints the fetched variants and returns False.
    """
    variants = get_english_variants(cursor, arabic_name)

    # Single pass: index the tally by the membership test result.
    tally = {True: 0, False: 0}
    for spelling, count in variants.items():
        tally[spelling in valid_english_variations] += count
    accepted, rejected = tally[True], tally[False]

    if not (accepted > rejected or accepted > 100):
        print("english variations for {name} are: {dic}".format(name=arabic_name, dic=variants))
        return False

    return True
    
def validate_english_name_by_arabic_variations(cursor, english_name, valid_arabic_variations):
    """Decide whether an English name is backed by the accepted Arabic spellings.

    The Arabic variants of ``english_name`` are fetched with
    ``get_arabic_variants``. Names whose variants add up to fewer than 3
    occurrences are rejected immediately, without printing. Otherwise the
    counts are split into spellings contained in ``valid_arabic_variations``
    and all others.

    Args:
        cursor: Passed through to ``get_arabic_variants``.
        english_name: English spelling being validated.
        valid_arabic_variations: Container of accepted Arabic spellings.

    Returns:
        True when the accepted bucket outweighs the rejected one or exceeds
        100; False otherwise (the fetched variants are printed in that case,
        except for the fewer-than-3 early exit).
    """
    variants = get_arabic_variants(cursor, english_name)

    # Too little evidence to judge either way.
    if sum(variants.values()) < 3:
        return False

    # Single pass: index the tally by the membership test result.
    tally = {True: 0, False: 0}
    for spelling, count in variants.items():
        tally[spelling in valid_arabic_variations] += count
    accepted, rejected = tally[True], tally[False]

    if not (accepted > rejected or accepted > 100):
        print("arabic variations for {name} are: {dic}".format(name=english_name, dic=variants))
        return False

    return True

def get_top_frequency_names(list):
    """Pick the dominant names out of a ``{name: count}`` mapping.

    A name is selected when it is longer than two characters, is not listed in
    the module-level ``top_noise_data`` and either its share of the total
    (in percent) exceeds the threshold or its count is at least 100. The
    threshold is ``10 - total ** 0.25`` percent, replaced by a flat 1 percent
    once the total exceeds 6561 (9 ** 4, the point where the formula reaches 1).

    Every key that contains a selected name as a substring is then added as a
    "composite". If that adds more than 3 extra entries, only the directly
    selected names are returned.

    Args:
        list: Mapping of name to occurrence count. The parameter name shadows
            the builtin but is kept so existing callers are unaffected.

    Returns:
        List of selected names, in the mapping's iteration order. An empty
        list when the mapping is empty or its counts sum to zero.
    """
    name_counts = list  # local alias so the body does not read like the builtin
    total = sum(name_counts.values())

    # Nothing to rank: avoids dividing by zero in the share computation below.
    if not total:
        return []

    lower_accepted_frequency = 100
    if total > 6561:
        threshold = 1
    else:
        threshold = 10 - total ** (1. / 4.)

    print("threshold: {thre}, total: {tot}".format(thre=threshold, tot=total))
    matched_list = [key for key, val in name_counts.items()
                    if len(key) > 2 and
                    key not in top_noise_data and
                    (val / total * 100 > threshold or val >= lower_accepted_frequency)]
    print("top matched_list: {thre}".format(thre=matched_list))

    # Composite entries: any key containing one of the selected names.
    matched_list_with_composite = [key for key in name_counts
                                   if any(match in key for match in matched_list)]
    if len(matched_list_with_composite) - len(matched_list) > 3:
        return matched_list

    return matched_list_with_composite

In [4]:
def reset_global_variables(name):
    """Re-initialise the notebook's module-level working state for ``name``.

    Opens a database connection via ``db_connect()``, creates a cursor on it,
    empties the four result lists, restores the noise-word list and records
    ``name`` as ``origin_name``.

    Args:
        name: The name about to be processed; stored in ``origin_name``.

    Returns:
        None. All results are published as module-level globals.
    """
    # Without these declarations every assignment below would create a local
    # that vanishes on return, leaving the globals untouched.
    global conn, cur
    global english_total_result, english_top_frequency
    global arabic_total_result, arabic_top_frequency
    global top_noise_data, origin_name

    # NOTE(review): a connection held in `conn` from an earlier call is not
    # closed here — confirm whether callers close it themselves.
    conn = db_connect()
    cur = conn.cursor()
    english_total_result = []
    english_top_frequency = []
    arabic_total_result = []
    arabic_top_frequency = []
    top_noise_data = ['MRS', 'MRS.', 'MRSS', 'MR', 'MR.', 'MISS']
    origin_name = name