In [12]:
import psycopg2
from configparser import ConfigParser
import pandas as pd

def config(filename='prepare_data.ini', section='phonetic'):
    """Read one section of an INI file into a plain dict.

    Args:
        filename: Path of the INI file to parse.
        section: Name of the section whose options are returned.

    Returns:
        A dict mapping every option name found in ``section`` to its string
        value.

    Raises:
        Exception: If the parsed file does not contain ``section``.
    """
    ini = ConfigParser()
    ini.read(filename)

    # Fail early when the requested section is absent from what was parsed.
    if not ini.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini.items(section)}

def db_connect():
    """Open a PostgreSQL connection from the settings returned by ``config()``.

    Returns:
        The connection object created by ``psycopg2.connect``.
    """
    connection = psycopg2.connect(**config())
    print('Connected to the PostgreSQL database...')

    return connection

def read_given_names():
    """Load the aggregated (ENG, ARB, COUNT) name pairs from the database.

    The query stacks three sources with UNION ALL: GIVEN_NAMES_MASTER,
    FAMILY_NAMES_MASTER and GIVEN_NAMES_DAN (whose FREQ column is summed and
    exposed as COUNT). Rows with a NULL or empty ENG or ARB are excluded and
    each source is grouped by (ENG, ARB).

    Returns:
        The query result as a DataFrame. If reading fails, the error is
        printed and an empty DataFrame is returned instead. The connection is
        closed in both cases.
    """
    query = """
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS COUNT FROM (
                    SELECT ENG, ARB, COUNT FROM GIVEN_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S
            UNION ALL
            SELECT * FROM (
                SELECT ENG, ARB, SUM(COUNT) AS COUNT FROM (
                    SELECT ENG, ARB, COUNT FROM FAMILY_NAMES_MASTER
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S
            UNION ALL
            SELECT * FROM (
                SELECT ENG, ARB, SUM(FREQ) AS COUNT FROM (
                    SELECT ENG, ARB, FREQ FROM GIVEN_NAMES_DAN
                    WHERE ENG IS NOT NULL AND ENG != '' AND ARB IS NOT NULL AND ARB != ''
                ) AS SUB GROUP BY ENG, ARB
                ORDER BY COUNT DESC
            ) AS S

            """

    connection = db_connect()
    try:
        return pd.read_sql(query, con=connection)
    except (Exception, psycopg2.DatabaseError) as exc:
        # Best effort: report the problem and hand back an empty frame.
        print(exc)
        return pd.DataFrame()
    finally:
        if connection is not None:
            connection.close()

In [13]:
import random
import numpy as np
from itertools import permutations, repeat, combinations, chain, product, cycle
import math
import os
import sys
import csv
import pickle
import requests
import pyphi

# Directory prefix (ends with a slash) that the helpers below concatenate with file names
# when reading or writing the TSV / pickle outputs.
target_dir = "/home/jupyter/notebooks/PoC/data-preparation/output/understanding_data/"
# Filled by build_top_given_names() and read by create_negative().
all_eng_given_names = []
all_arb_given_names = []
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
all_pairs = []

def get_yamli_arabic_varinats(name, timeout=30):
    """Ask the Yamli transliteration endpoint for Arabic spellings of ``name``.

    Args:
        name: The name to transliterate. Every non-alphanumeric character is
            removed before it is appended to the request URL.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Without a timeout a stalled connection
            would block the calling loop forever.

    Returns:
        A list of variant strings with non-alphanumeric characters and digits
        removed. The list is empty when the response text is 66 characters or
        shorter.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    name = ''.join(e for e in name if e.isalnum())
    url = 'http://api.yamli.com/transliterate.ashx?tool=api&account_id=&prot=http%3A&hostname=fuzzyarabic.herokuapp.com&path=%2F&build=5447&sxhr_id=51&word=' + name

    response = requests.get(url, timeout=timeout)
    if len(response.text) > 66:
        # NOTE(review): the fixed [62:-4] slice presumably strips a wrapper around the
        # JSON payload; it depends on the exact response layout — confirm against the API.
        arr = pyphi.jsonify.loads(response.text[62:-4])['data']
        # The 'data' field is itself a JSON document; its 'r' entry is a '|'-separated list.
        data = pyphi.jsonify.loads(arr)['r']
        variants = data.split('|')
    else:
        variants = []

    # Keep only letters in each variant (alphanumeric and not a digit).
    return [
        ''.join(ch for ch in variant if ch.isalnum() and not ch.isdigit())
        for variant in variants
    ]

def format_variants_list(s):
    """Collapse the values of ``s`` into one comma-joined string.

    Args:
        s: An object with ``tolist()`` (e.g. a pandas Series) holding strings.

    Returns:
        A one-element list whose item is the distinct values of ``s``, sorted
        and joined with commas.
    """
    distinct_sorted = sorted(set(s.tolist()))
    joined = ','.join(distinct_sorted)
    return [joined]

def format_variants_list2(s):
    """Merge several comma-joined variant strings into one.

    Args:
        s: An object with ``tolist()`` (e.g. a pandas Series) whose values are
            comma-separated strings.

    Returns:
        A single string containing every distinct comma-separated piece found
        in ``s``, sorted and joined with commas.
    """
    distinct_pieces = {
        piece
        for joined in s.tolist()
        for piece in joined.split(',')
    }
    return ','.join(sorted(distinct_pieces))

def remove_long_variants(s):
    """Drop variants whose length is far from the reference spelling.

    For both languages, a variant in ``<lang>_variants`` is kept when it
    equals ``trimmed_<lang>``, or when its length is greater than 0.3x and at
    most 2x the reference length and it is not a substring of the reference.

    Args:
        s: A mapping-like row with the keys ``eng_variants``, ``trimmed_eng``,
            ``arb_variants`` and ``trimmed_arb``. It is modified in place.

    Returns:
        The same ``s``, with both ``*_variants`` entries rewritten as sorted,
        comma-joined strings of the surviving variants.
    """
    for language in ('eng', 'arb'):
        variants_key = language + '_variants'
        reference = s['trimmed_' + language]
        reference_length = len(reference)

        survivors = set()
        for candidate in set(s[variants_key].split(',')):
            if candidate == reference:
                survivors.add(candidate)
            elif (reference_length * 0.3 < len(candidate) <= reference_length * 2
                    and candidate not in reference):
                survivors.add(candidate)

        s[variants_key] = ','.join(sorted(survivors))
    return s

def _draw_negatives(pool, good_variants, bigger_variants_count, desired_count, needed_count):
    """Pick names from ``pool`` that are not in ``good_variants``.

    When the pool is large enough, ``needed_count`` names are sampled, the good
    variants are filtered out and ``desired_count`` of the rest are sampled.
    Otherwise every pool entry that is not a good variant is returned.
    """
    if (len(pool) - bigger_variants_count) > (needed_count):
        candidates = random.sample(pool, needed_count)
        candidates = list(np.setdiff1d(candidates, good_variants, assume_unique=True))
        return random.sample(candidates, desired_count)
    return list(np.setdiff1d(pool, good_variants, assume_unique=True))


def create_negative(row):
    """Attach randomly chosen non-matching names to ``row``.

    The number of negatives aimed for is ``ceil(n * (n - 1) * 0.75)``, where
    ``n`` is the larger of the row's distinct English and Arabic variant
    counts. Candidates come from the module-level ``all_eng_given_names`` and
    ``all_arb_given_names`` lists.

    Args:
        row: A mapping-like row with comma-joined ``eng_variants`` and
            ``arb_variants`` strings. It is modified in place.

    Returns:
        The same ``row`` with ``negative_eng_variants`` and
        ``negative_arb_variants`` set to comma-joined strings.
    """
    global all_eng_given_names, all_arb_given_names

    good_eng_variants = list(set(row['eng_variants'].split(',')))
    good_arb_variants = list(set(row['arb_variants'].split(',')))

    bigger_variants_count = max(len(good_eng_variants), len(good_arb_variants))
    desired_negative_length_per_set = math.ceil(bigger_variants_count * (bigger_variants_count - 1) * 0.75)
    # Over-sample by the variant count so enough names remain after the good ones are removed.
    needed_negative_length = desired_negative_length_per_set + bigger_variants_count

    # English is drawn before Arabic so the random calls happen in a fixed order.
    random_eng_negative = _draw_negatives(
        all_eng_given_names, good_eng_variants, bigger_variants_count,
        desired_negative_length_per_set, needed_negative_length)
    random_arb_negative = _draw_negatives(
        all_arb_given_names, good_arb_variants, bigger_variants_count,
        desired_negative_length_per_set, needed_negative_length)

    row['negative_eng_variants'] = ','.join(random_eng_negative)
    row['negative_arb_variants'] = ','.join(random_arb_negative)

    return row

def build_top_given_names(min_count):
    """Populate the global name tables from the database.

    Names are read with ``read_given_names()``, their spaces are stripped into
    ``trimmed_eng`` / ``trimmed_arb``, counts are summed per trimmed pair and
    one row per pair is kept. The result is split by ``count``:

    * ``top_given_names``: rows with count >= max(min_count, 50);
    * ``low_accuracy_names``: rows with min_count <= count < 50 when
      ``min_count`` is below 50, otherwise an empty frame.

    ``all_eng_given_names`` / ``all_arb_given_names`` receive the shuffled
    distinct ``eng`` / ``arb`` values of ``top_given_names``.

    Args:
        min_count: Minimum summed count a pair needs to be kept at all.
    """
    global top_given_names, all_eng_given_names, all_arb_given_names, low_accuracy_names

    names = read_given_names()

    for raw_column, trimmed_column in (('eng', 'trimmed_eng'), ('arb', 'trimmed_arb')):
        names[trimmed_column] = names[raw_column].map(lambda value: value.strip(' '))

    pair_key = ['trimmed_eng', 'trimmed_arb']
    names['count'] = names.groupby(pair_key)['count'].transform('sum')
    names = names.drop_duplicates(subset=pair_key, keep='first')

    low_accuracy_threshold = 50
    counts = names['count']
    if min_count < low_accuracy_threshold:
        top_given_names = names[counts >= low_accuracy_threshold].copy()
        in_low_band = (counts >= min_count) & (counts < low_accuracy_threshold)
        low_accuracy_names = names[in_low_band].copy()
    else:
        top_given_names = names[counts >= min_count].copy()
        low_accuracy_names = pd.DataFrame(columns=['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'eng_variants', 'arb_variants'])

    all_eng_given_names = list(set(top_given_names['eng'].tolist()))
    all_arb_given_names = list(set(top_given_names['arb'].tolist()))
    random.shuffle(all_eng_given_names)
    random.shuffle(all_arb_given_names)

def group_names_both():
    """Fill ``eng_variants`` / ``arb_variants`` on the global name table.

    Each row first receives the English spellings that share its Arabic
    spelling and the Arabic spellings that share its English spelling. The two
    columns are then merged repeatedly across rows with the same
    ``trimmed_eng`` or ``trimmed_arb`` so that variants spread between rows.
    If ``low_accuracy_names`` is not empty, it gets its own seed columns and is
    appended to ``top_given_names`` before the final merge by ``trimmed_arb``.
    """
    global top_given_names

    # Seed columns: direct neighbours through the other language.
    top_given_names['eng_variants'] = top_given_names.groupby('trimmed_arb')['eng'].transform(format_variants_list)
    top_given_names['arb_variants'] = top_given_names.groupby('trimmed_eng')['arb'].transform(format_variants_list)

    # Spread the variant sets: by English, by Arabic, then by English again.
    for group_key in ('trimmed_eng', 'trimmed_arb', 'trimmed_eng'):
        for variant_column in ('arb_variants', 'eng_variants'):
            top_given_names[variant_column] = (
                top_given_names.groupby(group_key)[variant_column].transform(format_variants_list2)
            )

    if not low_accuracy_names.empty:
        low_accuracy_names['eng_variants'] = low_accuracy_names.groupby('trimmed_arb')['eng'].transform(format_variants_list)
        low_accuracy_names['arb_variants'] = low_accuracy_names.groupby('trimmed_eng')['arb'].transform(format_variants_list)
        top_given_names = pd.concat([top_given_names, low_accuracy_names])

    # One last merge by Arabic spelling, now including any low-accuracy rows.
    for variant_column in ('arb_variants', 'eng_variants'):
        top_given_names[variant_column] = (
            top_given_names.groupby('trimmed_arb')[variant_column].transform(format_variants_list2)
        )

def clean_up_output_files():
    """Create ``target_dir`` if needed and truncate the six output files.

    The three pair TSVs and the three pickle files are opened for writing and
    closed again, which leaves each of them empty.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    output_files = (
        'eng_eng_pairs.tsv',
        'eng_arb_pairs.tsv',
        'arb_arb_pairs.tsv',
        'names_dict.pkl',
        'all_eng_names.pkl',
        'all_arb_names.pkl',
    )
    for output_file in output_files:
        # Opening in 'w' mode is what empties the file; nothing is written.
        with open(target_dir + output_file, 'w'):
            pass

def load_up_data_from_files():
    """Load the pair TSVs and pickled lookups from ``target_dir`` into globals.

    Sets ``eng_eng_df``, ``eng_arb_df`` and ``arb_arb_df`` from the header-less
    tab-separated pair files, and ``names_dict``, ``all_eng_names`` and
    ``all_arb_names`` through ``load_obj``. The three negative-pair frames are
    declared global but are not loaded here.
    """
    global eng_eng_df, eng_arb_df, arb_arb_df, eng_arb_negative_df, eng_eng_nagative_df, arb_arb_negative_df, names_dict, all_eng_names, all_arb_names

    def _read_pairs(stem):
        # Every pair file is a header-less TSV stored directly under target_dir.
        return pd.read_csv(target_dir + stem + '.tsv', sep='\t', header=None)

    eng_eng_df = _read_pairs('eng_eng_pairs')
    eng_arb_df = _read_pairs('eng_arb_pairs')
    arb_arb_df = _read_pairs('arb_arb_pairs')
    # Negative pair files (neg_eng_arb_pairs, neg_eng_eng_pairs, neg_arb_arb_pairs)
    # are currently not read.

    names_dict = load_obj('names_dict')
    all_eng_names = load_obj('all_eng_names')
    all_arb_names = load_obj('all_arb_names')
    
def append_list_to_tsv(file_name, list_to_be_added, folder = ''):
    """Append rows to ``<target_dir><folder>/<file_name>.tsv``.

    Args:
        file_name: File name without the ``.tsv`` extension.
        list_to_be_added: Rows to append; passed as-is to ``pd.DataFrame``.
        folder: Optional sub-folder of ``target_dir``; created when missing.

    The rows are written tab-separated, unquoted and without a header. The
    DataFrame's row index is written as the first column.
    """
    output_dir = "{0}{1}/".format(target_dir, folder)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = "{0}{1}.tsv".format(output_dir, file_name)
    rows = pd.DataFrame(list_to_be_added)
    rows.to_csv(output_path, sep='\t', quoting=csv.QUOTE_NONE, mode='a', header=False)
    
def write_data_to_output_files():
    """Write positive name pairs and lookup pickles for ``top_given_names``.

    Rows are numbered from 1. For every row the function appends to three TSV
    files through ``append_list_to_tsv``:

    * ``eng_eng_pairs``: all ordered pairs of the row's English variants, or,
      when the row has more than 50 English variants, each variant paired with
      a random sample of the variants plus itself;
    * ``arb_arb_pairs``: all ordered pairs of the row's Arabic variants;
    * ``eng_arb_pairs``: the distinct (English, Arabic) combinations.

    Each pair carries the row number as a string in a third field. Afterwards
    ``names_dict``, ``all_eng_names`` and ``all_arb_names`` are pickled with
    ``save_obj`` and the pair totals are printed. Negative-pair generation is
    not performed.
    """
    names_dict = {}
    all_eng_names = []
    all_arb_names = []
    set_len = len(top_given_names)
    eng_eng_count = 0
    arb_arb_count = 0
    eng_arb_count = 0
    maximum_eng_variants_threshold = 50

    for i, (_, row) in enumerate(top_given_names.iterrows(), start=1):
        if i % 10000 == 0:
            print("processed {i} out of {t}".format(i=i, t=set_len))

        group_id = str(i)
        eng_variants = row['eng_variants'].split(',')
        arb_variants = row['arb_variants'].split(',')

        all_eng_names.extend([variant, i] for variant in eng_variants)
        all_arb_names.extend([variant, i] for variant in arb_variants)
        names_dict[i] = {'eng': eng_variants, 'arb': arb_variants}

        eng_variants_length = len(eng_variants)
        if eng_variants_length > maximum_eng_variants_threshold:
            # Too many variants for a full cross product: pair each one with a
            # random subset (plus itself) sized from the threshold.
            percent = maximum_eng_variants_threshold**2 / eng_variants_length**2
            randomly_selected_count = math.ceil(eng_variants_length * percent)
            eng_eng_pairs = []
            for eng in eng_variants:
                partners = random.sample(eng_variants, randomly_selected_count)
                partners.append(eng)
                eng_eng_pairs.extend([eng, partner, group_id] for partner in partners)
        else:
            eng_eng_pairs = [
                [left, right, group_id]
                for left, right in product(eng_variants, repeat=2)
            ]
        eng_eng_count += len(eng_eng_pairs)
        append_list_to_tsv('eng_eng_pairs', eng_eng_pairs)

        arb_arb_pairs = [
            [left, right, group_id]
            for left, right in product(arb_variants, repeat=2)
        ]
        arb_arb_count += len(arb_arb_pairs)
        append_list_to_tsv('arb_arb_pairs', arb_arb_pairs)

        # The set removes repeated (eng, arb) combinations.
        eng_arb_pairs = [
            [eng, arb, group_id]
            for eng, arb in set(product(eng_variants, arb_variants))
        ]
        eng_arb_count += len(eng_arb_pairs)
        append_list_to_tsv('eng_arb_pairs', eng_arb_pairs)

    save_obj(names_dict, 'names_dict')
    save_obj(all_eng_names, 'all_eng_names')
    save_obj(all_arb_names, 'all_arb_names')
    print('eng_eng count = {e_e},arb_arb count = {a_a}, eng_arb count = {e_a}'.format(e_e=eng_eng_count, a_a=arb_arb_count, e_a=eng_arb_count))
    
def save_obj(obj, name ):
    """Pickle ``obj`` to ``<target_dir><name>.pkl`` with the highest protocol."""
    pickle_path = target_dir + name + '.pkl'
    with open(pickle_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object unpickled from ``<target_dir><name>.pkl``.

    NOTE: pickle executes code while loading; only use this on files this
    notebook wrote itself.
    """
    pickle_path = target_dir + name + '.pkl'
    with open(pickle_path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded


In [49]:
%%time

# Build the global name tables with a minimum count of 20, then report the number of
# rows and of distinct English / Arabic spellings.
build_top_given_names(20)
print(len(top_given_names))
print(len(set(all_eng_given_names)))
print(len(set(all_arb_given_names)))

Connected to the PostgreSQL database...
159005
94389
70375
CPU times: user 3.64 s, sys: 94.9 ms, total: 3.73 s
Wall time: 9.89 s


In [None]:
%%time
# Fill in missing Yamli transliterations and checkpoint the table to disk as we go.
i = 0
yamli_tsv_path = target_dir + 'all_names_with_yamli.tsv'
top_given_names = pd.read_csv(yamli_tsv_path, sep='\t', header=None)
top_given_names.columns = ['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'yamli_arb_variants']

# Only names from rows with a missing value in any column are queried.
rows_with_gaps = top_given_names[top_given_names.isnull().any(axis=1)]
names = list(set(rows_with_gaps['trimmed_eng'].tolist()))
set_len = len(names)
print(set_len)

for name in names:
    arb_variants = get_yamli_arabic_varinats(name)
    same_english_name = top_given_names['trimmed_eng'] == name
    top_given_names.loc[same_english_name, 'yamli_arb_variants'] = ','.join(arb_variants)
    i += 1
    if i % 3000 == 0:
        # Periodic checkpoint: overwrite the TSV with everything fetched so far.
        print("processed {i} out of {t}".format(i=i, t=set_len))
        top_given_names.to_csv(yamli_tsv_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False, index=False)

top_given_names.to_csv(yamli_tsv_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False, index=False)



In [None]:
%%time
# Give every row the sorted, comma-joined Arabic spellings that share its trimmed English name,
# then keep the first row of each (arb_variants, yamli_arb_variants, eng) combination.
top_given_names['arb_variants'] = top_given_names.groupby('trimmed_eng')['arb'].transform(format_variants_list)
top_given_names = top_given_names.drop_duplicates(subset=['arb_variants', 'yamli_arb_variants', 'eng'], keep='first').copy()

In [None]:
%%time
# For every trimmed Arabic name, pool the English, Arabic and Yamli variants of all
# rows carrying that name and write the pooled strings back to those rows.
i = 0
yamli_tsv_path = target_dir + 'all_names_with_yamli.tsv'
top_given_names = pd.read_csv(yamli_tsv_path, sep='\t', header=None)
top_given_names.columns = ['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'yamli_arb_variants', 'arb_variants', 'eng_variants']
names = list(set(top_given_names['trimmed_arb'].tolist()))
set_len = len(names)
print(set_len)

for name in names:
    # trimmed_arb is not modified in this loop, so one mask serves all three writes.
    same_arabic_name = top_given_names['trimmed_arb'] == name
    matched_rows = top_given_names[same_arabic_name]

    eng_variants = matched_rows['eng'].tolist()
    arb_variants = list({
        piece
        for joined in matched_rows['arb_variants'].tolist()
        for piece in joined.split(',')
    })
    yamli_variants = list({
        piece
        for joined in matched_rows['yamli_arb_variants'].tolist()
        for piece in joined.split(',')
    })

    top_given_names.loc[same_arabic_name, 'eng_variants'] = ','.join(eng_variants)
    top_given_names.loc[same_arabic_name, 'arb_variants'] = ','.join(arb_variants)
    top_given_names.loc[same_arabic_name, 'yamli_arb_variants'] = ','.join(yamli_variants)

    i += 1
    if i % 3000 == 0:
        # Periodic checkpoint of the whole table.
        print("processed {i} out of {t}".format(i=i, t=set_len))
        top_given_names.to_csv(yamli_tsv_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False, index=False)

top_given_names.to_csv(yamli_tsv_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False, index=False)



In [None]:
%%time
# Keep the first row of every (arb_variants, yamli_arb_variants, eng_variants) combination
# and overwrite the TSV without header or index.
top_given_names = top_given_names.drop_duplicates(subset=['arb_variants', 'yamli_arb_variants', 'eng_variants'], keep='first').copy()
top_given_names.to_csv(target_dir + 'all_names_with_yamli.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


In [None]:
# Display the current name table.
top_given_names

In [None]:
%%time
# Load the table and build, per variant column, a list of (row label, [variants]) tuples.
top_given_names = pd.read_csv(target_dir + 'all_names_with_yamli.tsv', sep='\t', header=None)
top_given_names.columns = ['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'yamli_arb_variants', 'arb_variants', 'eng_variants']

# reset_index() exposes the row labels as an ordinary 'index' column.
temp_df = top_given_names.reset_index().copy()
indexes = temp_df['index'].tolist()

all_eng_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'eng_variants']].values
]
all_arb_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'arb_variants']].values
]
all_yamli_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'yamli_arb_variants']].values
]


In [19]:
%%time
# Same as the previous cell, but on the merged file and only for row labels above 10000.
# NOTE(review): the 10000 cut-off presumably resumes an interrupted merge run — confirm.
top_given_names = pd.read_csv(target_dir + 'all_names_with_yamli_merged.tsv', sep='\t', header=None)
top_given_names.columns = ['eng', 'arb', 'count', 'trimmed_eng', 'trimmed_arb', 'yamli_arb_variants', 'arb_variants', 'eng_variants']

temp_df = top_given_names.reset_index().copy()
indexes = [label for label in temp_df['index'].tolist() if label > 10000]

all_eng_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'eng_variants']].values
    if label > 10000
]
all_arb_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'arb_variants']].values
    if label > 10000
]
all_yamli_variants = [
    (label, joined.split(','))
    for label, joined in temp_df[['index', 'yamli_arb_variants']].values
    if label > 10000
]

CPU times: user 813 ms, sys: 36.4 ms, total: 850 ms
Wall time: 849 ms


In [66]:
# Take the row labelled 0, then look up key 2 in that row.
# NOTE(review): with the named columns set above, 2 is presumably resolved positionally (the 'count' column) — confirm.
top_given_names.loc[0][2]

5485960.0

In [21]:
%%time
# Greedy merge of rows that describe the same name.
#
# Repeatedly take the first remaining row and find every remaining row that shares at
# least one variant with it in at least two of the three variant columns (English,
# Arabic, Yamli). The pooled variants of that group are stored on the current row and
# the other rows of the group are dropped from the table.
i = 10001
while indexes:
    current_index = indexes[0]
    row = top_given_names.loc[current_index]
    eng_variants = row['eng_variants'].split(',')
    arb_variants = row['arb_variants'].split(',')
    yamli_variants = row['yamli_arb_variants'].split(',')

    # Sets turn "shares at least one variant" into hash lookups instead of nested list scans.
    current_eng = set(eng_variants)
    current_arb = set(arb_variants)
    current_yamli = set(yamli_variants)
    matched_eng = [(idx, eng) for idx, eng in all_eng_variants if not current_eng.isdisjoint(eng)]
    matched_arb = [(idx, arb) for idx, arb in all_arb_variants if not current_arb.isdisjoint(arb)]
    matched_yamli = [(idx, yamli) for idx, yamli in all_yamli_variants if not current_yamli.isdisjoint(yamli)]

    matched_eng_indexes = [idx for idx, _ in matched_eng]
    matched_arb_indexes = [idx for idx, _ in matched_arb]
    matched_yamli_indexes = [idx for idx, _ in matched_yamli]

    # A row belongs to the group when it matched in at least two of the three columns.
    matched1 = set(matched_eng_indexes).intersection(matched_arb_indexes)
    matched2 = set(matched_eng_indexes).intersection(matched_yamli_indexes)
    matched3 = set(matched_arb_indexes).intersection(matched_yamli_indexes)
    matched = matched1.union(matched2).union(matched3)

    if current_index not in matched:
        # Unexpected: the current row should always match itself. Print diagnostics and
        # force it into the group, otherwise indexes[0] is never removed and the loop
        # cannot advance.
        print(eng_variants)
        print(matched_eng)
        print(current_index)
        print(matched)
        print(top_given_names.loc[current_index])
        # (The former iloc[current_index] print was removed: iloc is positional and no
        # longer lines up with the labels once rows have been dropped.)
        matched.add(current_index)

    all_eng_variants = [(idx, variants) for idx, variants in all_eng_variants if idx not in matched]
    all_arb_variants = [(idx, variants) for idx, variants in all_arb_variants if idx not in matched]
    all_yamli_variants = [(idx, variants) for idx, variants in all_yamli_variants if idx not in matched]

    # .loc needs a list here: recent pandas versions reject a set as an indexer.
    matched_rows = top_given_names.loc[list(matched), :]
    matched_eng_variants = matched_rows['eng_variants'].tolist()
    matched_eng_variants = list(set([eng for sublist in matched_eng_variants for eng in sublist.split(',')]))
    matched_arb_variants = matched_rows['arb_variants'].tolist()
    matched_arb_variants = list(set([arb for sublist in matched_arb_variants for arb in sublist.split(',')]))
    matched_yamli_variants = matched_rows['yamli_arb_variants'].tolist()
    matched_yamli_variants = list(set([name for sublist in matched_yamli_variants for name in sublist.split(',')]))

    top_given_names.loc[current_index, 'eng_variants'] = ','.join(matched_eng_variants)
    top_given_names.loc[current_index, 'arb_variants'] = ','.join(matched_arb_variants)
    top_given_names.loc[current_index, 'yamli_arb_variants'] = ','.join(matched_yamli_variants)

    # The whole group is done; only the current row stays in the table.
    for e in matched:
        indexes.remove(e)
        if e != current_index:
            top_given_names.drop(e, inplace=True)

    i += 1
    if i % 1000 == 0:
        print("processed {i}, remaining:{o}, eng_variants_length:{l}".format(i=i, o=len(indexes),l=len(all_eng_variants)))
        #top_given_names.to_csv(target_dir + 'all_names_with_yamli_merged.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)

top_given_names.to_csv(target_dir + 'all_names_with_yamli_merged.tsv',sep='\t', quoting=csv.QUOTE_NONE, mode = 'w', header=False, index=False)


processed 11000, remaining:55326, eng_variants_length:55326
processed 12000, remaining:54325, eng_variants_length:54325
processed 13000, remaining:53325, eng_variants_length:53325
processed 14000, remaining:52325, eng_variants_length:52325
processed 15000, remaining:51324, eng_variants_length:51324
processed 16000, remaining:50324, eng_variants_length:50324
processed 17000, remaining:49323, eng_variants_length:49323
processed 18000, remaining:48323, eng_variants_length:48323
processed 19000, remaining:47323, eng_variants_length:47323
processed 20000, remaining:46323, eng_variants_length:46323
processed 21000, remaining:45323, eng_variants_length:45323
processed 22000, remaining:44323, eng_variants_length:44323
processed 23000, remaining:43323, eng_variants_length:43323
processed 24000, remaining:42323, eng_variants_length:42323
processed 25000, remaining:41323, eng_variants_length:41323
processed 26000, remaining:40323, eng_variants_length:40323
processed 27000, remaining:39323, eng_va

In [63]:
%%time
# Count the rows whose entry at key 6 holds more than one comma-separated item;
# the total is left in the notebook-level variable `i`.
i = 0
for index, row in top_given_names.iterrows():
    lll = row[6].split(',')
    # NOTE(review): hhhh is computed but never read in this cell.
    hhhh  = row[7].split(',')
    if len(lll) > 1:
        i+=1
        #print( lll)

CPU times: user 4.82 s, sys: 12.1 ms, total: 4.84 s
Wall time: 4.85 s


In [14]:
# Reload the merged name table from disk; no header row is read and no column names are assigned here.
top_given_names = pd.read_csv(target_dir + 'all_names_with_yamli_merged.tsv', sep='\t', header=None)

In [42]:
# Display the notebook-level counter `i` left by an earlier cell.
i

856

In [18]:
# Take the row labelled 0, then the entry at key 5 of that row.
# NOTE(review): per the column order used elsewhere in this notebook, 5 is presumably yamli_arb_variants — confirm.
top_given_names.loc[0][5]

'محاميت,محاميد,مشامض,موحد,مشامت,موحمض,مخامات,موخمت,موهاميد,مهموض,مإهمد,محمود,محماد,مهامض,موهامض,مهميت,معحامض,مخامت,محض,موخمات,مهميط,ماهمت,معحماد,محامود,موهامود,مخيميط,معهامد,موهيض,موخماد,مهميد,معحماض,موخامد,مهمض,ماحمد,مووهامد,مخامض,مخميد,مخامط,موهميت,مقحمد,مهامات,مووهمد,موحامد,محيميد,محمة,موهمود,مشامة,مخيمة,مواهمد,موشماض,مهماد,مهمات,محميد,موحميت,مشامط,مخمة,مهمة,موهامط,معهامض,محامية,مهمت,مشمض,مواهامد,موشامت,موكهمد,محاماة,موهمت,موحيد,مووحمد,مهمد,موشمض,معحمض,محمية,مشماض,معهماد,موشامد,موشمات,معحامد,معهمض,مهمود,محمط,مهميض,موخامت,معهمد,مشيمد,مخيمات,مهاميد,موهاميت,موخمض,مشمة,موخامض,موهاد,موهميد,محيمد,مئهمد,ماهمد,محموض,مشامد,ميحمد,موحامت,موخمة,محماة,موشمط,موهمط,موهماد,محماض,محموود,موخامات,معهيمد,موحاد,مهض,مخاماد,مخماد,موهد,مشماد,موهمض,مخامة,محد,موهامد,مخمد,موشامات,محامة,معهماض,مكهمد,مهامة,موحامض,موحماد,مخمض,موهمد,ميهمد,مشاماد,موحمط,موخمد,مشمط,موخمط,موشماد,مشميد,مشمت,موحض,موهميض,مشمات,موهاماد,محميض,موشامض,معحمد,موشمت,موحمة,مهماض,مهامد,مخيمت,محميت,محمد,مشمد,موهض,موشميد,موشمد,مهاماد,موحمود,مخمت,م

In [None]:
# Print the three variant strings of every row referenced by `matched`.
# NOTE(review): `matched` holds row labels (taken from the 'index' column produced by
# reset_index()), while iloc is positional; once rows have been dropped the two presumably
# no longer line up — confirm whether .loc was intended.
for index in matched:
    row = top_given_names.iloc[index]
    print(row['eng_variants'])
    print(row['arb_variants'])
    print(row['yamli_arb_variants'])

In [None]:
# Display the Yamli match labels left over from the last iteration of the merge loop.
matched_yamli_indexes

In [None]:
# Display the pooled English variants left over from the last iteration of the merge loop.
matched_eng_variants

In [None]:
# NOTE(review): `top_gi` is not defined anywhere in the visible notebook — it looks like a
# truncated `top_given_names`; confirm before running this cell.
top_gi    

In [None]:
# NOTE(review): `top_given_names_without_yamli` is not assigned anywhere in the visible
# notebook; presumably it came from a cell that no longer exists — confirm.
top_given_names_without_yamli

In [None]:
# Display the current name table.
top_given_names


In [None]:
# Build the variant columns, then spread them across rows sharing an English or Arabic
# spelling, removing rows with identical variant strings before and after.
variant_columns = ['eng_variants', 'arb_variants']

top_given_names['eng_variants'] = top_given_names.groupby('trimmed_arb')['eng'].transform(format_variants_list)
top_given_names['arb_variants'] = top_given_names.groupby('trimmed_eng')['arb'].transform(format_variants_list)
top_given_names = top_given_names.drop_duplicates(subset=variant_columns, keep='first').copy()

# Merge by English spelling first, then by Arabic spelling; Arabic column before English each time.
for group_key in ('trimmed_eng', 'trimmed_arb'):
    for variant_column in ('arb_variants', 'eng_variants'):
        top_given_names[variant_column] = (
            top_given_names.groupby(group_key)[variant_column].transform(format_variants_list2)
        )

top_given_names = top_given_names.drop_duplicates(subset=variant_columns, keep='first').copy()




In [None]:
%%time

## round 1

# Build the eng_variants / arb_variants columns on the global name table.
group_names_both()

# The shape is printed before and after each de-duplication step.
print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)
# Rows with the same arb_variants string get the merged eng_variants of all of them.
top_given_names['eng_variants'] = top_given_names.groupby('arb_variants')['eng_variants'].transform(format_variants_list2)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)
# Rows with the same eng_variants string get the merged arb_variants of all of them.
top_given_names['arb_variants'] = top_given_names.groupby('eng_variants')['arb_variants'].transform(format_variants_list2)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

## round 2
# Disabled: variant-length filtering and negative sampling are not applied.
#top_given_names = top_given_names.apply(remove_long_variants, axis=1)
#top_given_names = top_given_names.apply(create_negative, axis=1)

In [None]:
top_given_names.iloc[5]['eng_variants']

In [None]:
# Drop rows with a repeated (eng_variants, arb_variants) combination and print
# the shape before and after to show how many rows were removed.
print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

In [None]:
%%time
clean_up_output_files()
write_data_to_output_files()

In [10]:
top_given_names = pd.read_csv(target_dir + 'all_names_with_yamli.tsv', sep='\t', header=None)



In [None]:
%%time
load_up_data_from_files()

In [11]:
top_given_names[top_given_names[0] == 'SOHAIL']

Unnamed: 0,0,1,2,3,4,5,6,7
82,SOHAIL,سهيل,195943.0,SOHAIL,سهيل,"صهيل,سحيل,سهيل,سهايل,ثوهيل,سوحيل,سحايل,صهايل,ث...",سهيل,"SOHAIL,SUHAIL"


In [None]:
def prepare_df(df):
    """Return a de-duplicated, randomly ordered copy of a name-pair frame.

    A frame with exactly four columns first has its leading column removed
    in place, so the caller's object is modified as well. Rows are then
    de-duplicated on the columns labelled 1 and 2 (first occurrence kept),
    shuffled, and renumbered from 0.
    """
    if len(df.columns) == 4:
        leading_column = df.columns[0]
        df.drop(columns=leading_column, inplace=True)
    unique_rows = df.drop_duplicates(subset=[1, 2], keep='first').copy()
    shuffled_rows = unique_rows.sample(frac=1)
    return shuffled_rows.reset_index(drop=True)

def prepare_duplicate_df(df, desired_length):
    """Return a shuffled frame of exactly ``desired_length`` rows built from ``df``.

    A frame with exactly four columns first has its leading column removed
    in place (the caller's object is modified). Rows are de-duplicated on the
    columns labelled 1 and 2, then the unique rows are repeated as many whole
    times as fit into ``desired_length`` and topped up with a random sample
    (without replacement) of the unique rows. The row counts before and
    after are printed.

    Args:
        df: frame whose columns labelled 1 and 2 identify a pair.
        desired_length: number of rows the result must have; may be smaller
            than the number of unique rows, in which case a random subset is
            returned.

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    print(len(df))
    if len(df.columns) == 4:
        df.drop(df.columns[0], axis=1, inplace=True)
    unique_df = df.drop_duplicates(subset=[1, 2], keep='first').copy()

    quotient, modulo = divmod(desired_length, len(unique_df))

    # Whole copies first; the list is empty when fewer rows than one full copy
    # are requested, which pd.concat would reject with a ValueError.
    parts = [unique_df] * quotient
    if modulo > 0:
        # modulo < len(unique_df) always holds, so an exact-count sample without
        # replacement is always possible and avoids float rounding of a fraction.
        parts.append(unique_df.sample(n=modulo))

    if parts:
        result = pd.concat(parts, ignore_index=True).sample(frac=1)
    else:
        # desired_length == 0: nothing to replicate, keep only the column layout.
        result = unique_df.iloc[0:0]

    result = result.reset_index(drop=True)
    print(len(result))
    return result

def concatenate_ordered_names(row):
    """Join the values of ``row`` with single spaces, keeping their order."""
    names = row.tolist()
    return ' '.join(names)

def concatenate_reversed_names(row):
    """Join the values of ``row`` with single spaces, last value first."""
    return ' '.join(reversed(row.tolist()))

def concatenate_out_of_order_names(row):
    """Join the values of ``row`` in a random order that is neither the
    original nor the exactly reversed order.

    Returns an empty string when ``row`` holds fewer than three distinct
    values.
    """
    names = row.tolist()
    if len(set(names)) < 3:
        return ''
    # The two orderings that must not be produced.
    rejected = (names, names[::-1])
    candidate = list(names)
    while True:
        random.shuffle(candidate)
        if candidate not in rejected:
            break
    return ' '.join(candidate)

def concatenate_with_initial(row):
    """Join ``row`` after inserting a random name or its initial at a random slot.

    The counterpart of whatever is inserted, together with the chosen slot,
    is pushed onto the global queue ``selected_rnd_names_for_initial`` so
    that ``concatenate_with_initial_name`` can build the matching side.

    * ``eng_eng`` / ``arb_arb``: the initial of a random name is inserted and
      the full name is queued.
    * ``eng_arb``: ``row_index_for_eng_arb`` is advanced and its value modulo 4
      decides from which language list the name is drawn (0, 1: English;
      2, 3: Arabic) and which side is shortened (0, 2: initial inserted here,
      full Arabic name queued; 1, 3: full English name inserted here, Arabic
      initial queued). The other-language name comes from the same
      ``names_dict`` group.

    An initial is followed by one of '', ' ', '.' or '. '.
    """
    global row_index_for_eng_arb
    separators = ['', ' ', '.', '. ']
    tokens = row.tolist()
    position = random.randrange(len(tokens) + 1)
    inserted = ''
    if current_languages_pairs in ('eng_eng', 'arb_arb'):
        pool = all_eng_names if current_languages_pairs == 'eng_eng' else all_arb_names
        full_name = random.choice(pool)[0]
        selected_rnd_names_for_initial.put((full_name, position))
        inserted = full_name[0] + random.choice(separators)
    elif current_languages_pairs == 'eng_arb':
        row_index_for_eng_arb += 1
        phase = row_index_for_eng_arb % 4
        if phase in (0, 1):
            picked = random.choice(all_eng_names)
            eng_side = picked[0]
            arb_side = random.choice(names_dict[picked[1]]['arb'])
        else:
            picked = random.choice(all_arb_names)
            arb_side = picked[0]
            eng_side = random.choice(names_dict[picked[1]]['eng'])

        if phase in (0, 2):
            selected_rnd_names_for_initial.put((arb_side, position))
            inserted = eng_side[0] + random.choice(separators)
        else:
            selected_rnd_names_for_initial.put((arb_side[0] + random.choice(separators), position))
            inserted = eng_side

    tokens.insert(position, inserted)
    return ' '.join(tokens)

def concatenate_with_initial_name(row):
    """Join ``row`` after inserting the next queued name at its queued slot.

    The (name, slot) tuple is taken from the global queue
    ``selected_rnd_names_for_initial``; the call blocks while that queue is
    empty. After one random draw from 1..4, a drawn 4 reverses the whole
    token order before joining.
    """
    tokens = row.tolist()
    queued_name, position = selected_rnd_names_for_initial.get()
    tokens.insert(position, queued_name)

    reverse_order = random.choice(range(1, 5)) % 4 == 0
    return ' '.join(tokens[::-1] if reverse_order else tokens)

def concatenate_append_random_name(row):
    """Join ``row`` after inserting one random name at a random slot.

    For a single-value row the slot is before or after that value; otherwise
    it is any slot after the first value. The name pool follows the global
    ``current_languages_pairs``: English for 'eng_eng', Arabic for 'arb_arb'.
    In 'eng_arb' mode ``row_index_for_eng_arb`` is advanced and a name is
    added only when the side being processed (``current_processed_pair``)
    matches the counter's parity (odd: English, even: Arabic) or when
    ``ignore_row_index_for_eng_arb`` is set.

    When no name is selected the row is joined unchanged. Previously an empty
    string was inserted instead, which left a doubled or trailing space in
    the output, unlike the two-name helpers which insert nothing.
    """
    global current_processed_pair, row_index_for_eng_arb, ignore_row_index_for_eng_arb
    lst = row.tolist()
    # The slot is drawn before the name so the order of random draws is unchanged.
    if len(lst) == 1:
        lct = random.randrange(0, 2)
    else:
        lct = random.randrange(1, len(lst)+1)
    rnd_name = ''
    if current_languages_pairs == 'eng_eng':
        rnd_name = random.choice(all_eng_names)[0]
    elif current_languages_pairs == 'arb_arb':
        rnd_name = random.choice(all_arb_names)[0]
    elif current_languages_pairs == 'eng_arb':
        row_index_for_eng_arb += 1
        if current_processed_pair == 'eng':
            if (row_index_for_eng_arb % 2 == 1) or ignore_row_index_for_eng_arb:
                rnd_name = random.choice(all_eng_names)[0]
        elif current_processed_pair == 'arb':
            if (row_index_for_eng_arb % 2 == 0) or ignore_row_index_for_eng_arb:
                rnd_name = random.choice(all_arb_names)[0]

    # Only insert a real name; an empty token would add stray whitespace.
    if rnd_name:
        lst.insert(lct, rnd_name)
    return ' '.join(lst)

def concatenate_append_random_name_in_middle(row):
    """Join ``row`` after inserting one random name strictly between its values.

    The slot is drawn from 1..len(row)-1, so the first and last values keep
    their places. The name pool follows the global ``current_languages_pairs``
    exactly as in ``concatenate_append_random_name``, including the parity
    gate on ``row_index_for_eng_arb`` in 'eng_arb' mode.

    When no name is selected the row is joined unchanged. Previously an empty
    string was inserted instead, which left a doubled space in the output.

    Raises:
        ValueError: if ``row`` has fewer than two values (no inner slot exists).
    """
    global current_processed_pair, row_index_for_eng_arb, ignore_row_index_for_eng_arb
    lst = row.tolist()
    lct = random.randrange(1, len(lst))
    rnd_name = ''
    if current_languages_pairs == 'eng_eng':
        rnd_name = random.choice(all_eng_names)[0]
    elif current_languages_pairs == 'arb_arb':
        rnd_name = random.choice(all_arb_names)[0]
    elif current_languages_pairs == 'eng_arb':
        row_index_for_eng_arb += 1
        if current_processed_pair == 'eng':
            if (row_index_for_eng_arb % 2 == 1) or ignore_row_index_for_eng_arb:
                rnd_name = random.choice(all_eng_names)[0]
        elif current_processed_pair == 'arb':
            if (row_index_for_eng_arb % 2 == 0) or ignore_row_index_for_eng_arb:
                rnd_name = random.choice(all_arb_names)[0]

    # Only insert a real name; an empty token would add stray whitespace.
    if rnd_name:
        lst.insert(lct, rnd_name)
    return ' '.join(lst)


def concatenate_append_random_two_names(row):
    """Join ``row`` after inserting two random names.

    ``row_index_for_eng_arb`` is advanced on every call. For a single-value
    row both names go to the same slot (before or after the value);
    otherwise each name gets its own slot drawn from the positions after the
    first value. English names are used for 'eng_eng', Arabic names for
    'arb_arb'; in 'eng_arb' mode names are added only when the processed side
    matches the counter's parity (odd: English, even: Arabic) or
    ``ignore_row_index_for_eng_arb`` is set. Otherwise the row is joined
    unchanged.
    """
    global row_index_for_eng_arb
    tokens = row.tolist()
    row_index_for_eng_arb += 1
    lone_name = len(tokens) == 1
    if lone_name:
        position = random.randrange(2)
    else:
        position = random.randrange(1, len(tokens) + 1)

    if current_languages_pairs == 'eng_eng' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'eng' and (row_index_for_eng_arb % 2 == 1 or ignore_row_index_for_eng_arb)):
        pool = all_eng_names
    elif current_languages_pairs == 'arb_arb' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'arb' and (row_index_for_eng_arb % 2 == 0 or ignore_row_index_for_eng_arb)):
        pool = all_arb_names
    else:
        return ' '.join(tokens)

    tokens.insert(position, random.choice(pool)[0])
    if not lone_name:
        # The second slot is drawn from the list that already holds the first name.
        position = random.randrange(1, len(tokens) + 1)
    tokens.insert(position, random.choice(pool)[0])
    return ' '.join(tokens)

def concatenate_append_random_two_names_in_middle(row):
    """Join ``row`` after inserting two random names strictly between its values.

    ``row_index_for_eng_arb`` is advanced on every call. Both slots are drawn
    from 1..len-1 of the list at that moment, so the first and last values
    keep their places. The language pool and the 'eng_arb' parity gate are
    the same as in ``concatenate_append_random_two_names``; when the gate
    rejects the call the row is joined unchanged.
    """
    global row_index_for_eng_arb
    tokens = row.tolist()
    row_index_for_eng_arb += 1
    position = random.randrange(1, len(tokens))

    if current_languages_pairs == 'eng_eng' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'eng' and (row_index_for_eng_arb % 2 == 1 or ignore_row_index_for_eng_arb)):
        pool = all_eng_names
    elif current_languages_pairs == 'arb_arb' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'arb' and (row_index_for_eng_arb % 2 == 0 or ignore_row_index_for_eng_arb)):
        pool = all_arb_names
    else:
        return ' '.join(tokens)

    tokens.insert(position, random.choice(pool)[0])
    # The second slot is drawn from the list that already holds the first name.
    position = random.randrange(1, len(tokens))
    tokens.insert(position, random.choice(pool)[0])
    return ' '.join(tokens)


def concatenate_append_random_three_names(row):
    """Join ``row`` after inserting three random names at one slot.

    The slot is index 0 or 1, chosen at random, and all three names are
    inserted there. English names are used for 'eng_eng', Arabic names for
    'arb_arb'. In 'eng_arb' mode ``row_index_for_eng_arb`` is advanced and
    names are added only when the processed side matches the counter's
    parity (odd: English, even: Arabic) or ``ignore_row_index_for_eng_arb``
    is set; otherwise the row is joined unchanged.

    The counter used to be read but never advanced here, so with the parity
    gate active the English side never received names and the Arabic side
    always did, unlike the one- and two-name helpers which alternate.
    """
    global current_processed_pair, row_index_for_eng_arb, ignore_row_index_for_eng_arb
    lst = row.tolist()
    lct = random.randrange(2)
    if current_languages_pairs == 'eng_arb':
        # Advance only in cross-language mode, the only mode that reads the counter.
        row_index_for_eng_arb += 1
    if current_languages_pairs == 'eng_eng' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'eng' and ((row_index_for_eng_arb % 2 == 1 or ignore_row_index_for_eng_arb))):
        lst.insert(lct, random.choice(all_eng_names)[0])
        lst.insert(lct, random.choice(all_eng_names)[0])
        lst.insert(lct, random.choice(all_eng_names)[0])
    elif current_languages_pairs == 'arb_arb' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'arb' and ((row_index_for_eng_arb % 2 == 0 or ignore_row_index_for_eng_arb))):
        lst.insert(lct, random.choice(all_arb_names)[0])
        lst.insert(lct, random.choice(all_arb_names)[0])
        lst.insert(lct, random.choice(all_arb_names)[0])

    return ' '.join(lst)

def concatenate_append_random_four_names(row):
    """Return four random names joined by spaces; the content of ``row`` is not used.

    English names are drawn for 'eng_eng', Arabic names for 'arb_arb'. In
    'eng_arb' mode names are drawn only when the processed side matches the
    parity of ``row_index_for_eng_arb`` (odd: English, even: Arabic) or
    ``ignore_row_index_for_eng_arb`` is set; otherwise an empty string is
    returned.

    NOTE(review): the counter is read but not advanced here, unlike the one-
    and two-name helpers — confirm this is intended.
    """
    if current_languages_pairs == 'eng_eng' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'eng' and (row_index_for_eng_arb % 2 == 1 or ignore_row_index_for_eng_arb)):
        pool = all_eng_names
    elif current_languages_pairs == 'arb_arb' or (current_languages_pairs == 'eng_arb' and current_processed_pair == 'arb' and (row_index_for_eng_arb % 2 == 0 or ignore_row_index_for_eng_arb)):
        pool = all_arb_names
    else:
        return ''

    return ' '.join([random.choice(pool)[0] for _ in range(4)])

def process_and_save_pairs(df, number_of_parts, name_of_output_file, part1_apply_function, part2_apply_function):
    """Combine single-name pairs into multi-name pairs and write them to a TSV file.

    Every row of ``df`` is emitted ``number_of_parts`` times and each copy is
    assigned to a random group so that every group receives exactly
    ``number_of_parts`` rows. The first names of a group are then combined by
    ``part1_apply_function`` and the second names by ``part2_apply_function``.

    Args:
        df: frame with exactly three columns (first name, second name, and a
            third value that is unpacked but not used here).
        number_of_parts: how many single-name pairs make up one output pair.
        name_of_output_file: file name without the '.tsv' extension.
        part1_apply_function: receives the Series of a group's first names and
            returns the combined string.
        part2_apply_function: same, for the second names.

    Reads the globals ``target_dir``, ``folder_name`` and
    ``current_languages_pairs``; in 'eng_arb' mode it also sets
    ``current_processed_pair`` and resets ``row_index_for_eng_arb`` before
    each side is processed.
    """
    global row_index_for_eng_arb, current_processed_pair
    folder = target_dir + folder_name + "/"
    if not os.path.exists(folder):
        os.makedirs(folder)

    number_of_pairs = len(df)
    # Every group id 0..number_of_pairs-1 appears number_of_parts times, in random order.
    pairs_indexes = number_of_parts * list(range(0,number_of_pairs))
    random.shuffle(pairs_indexes)
    name_pairs = []
    for row in df.itertuples():
        # itertuples yields (index, col1, col2, col3); `group` here is the
        # input's third column and is not used below.
        index,part1,part2,group = row
        if index % 100000 == 0:
            print("batch {index}".format(index=(index / 100000)))
        for i in range(0, number_of_parts):
            # Each copy of the pair takes the next random group id.
            name_pairs.append([part1, part2, pairs_indexes.pop()])
    
    df = pd.DataFrame(name_pairs)
    df.columns = ['pair1', 'pair2', 'group']
    grp = df.groupby('group')
    if current_languages_pairs == 'eng_arb':
        current_processed_pair = 'eng'
        row_index_for_eng_arb = 0
    # The apply result has one value per group id. Assigning it back presumably
    # aligns on the row index, so only rows 0..number_of_pairs-1 receive a value
    # and the remaining rows become NaN — verify against pandas alignment rules.
    df['pair1'] = grp['pair1'].apply(part1_apply_function)
    if current_languages_pairs == 'eng_arb':
        current_processed_pair = 'arb'
        row_index_for_eng_arb = 0
    df['pair2'] = grp['pair2'].apply(part2_apply_function)
    print(len(df))
    # Removes the rows that did not receive a combined value above.
    df = df.dropna()
    print(len(df))
    # header and index are not disabled, so the file carries a header row, an
    # index column and the 'group' column next to the two pair columns.
    df.to_csv(folder + name_of_output_file + '.tsv',sep='\t', quoting=csv.QUOTE_NONE)
    


In [None]:
random.choice(all_eng_names)

In [None]:
%%time
from queue import Queue
# Fresh queue shared between concatenate_with_initial and concatenate_with_initial_name.
selected_rnd_names_for_initial = Queue()

# Globals read by process_and_save_pairs and the concatenate_* helpers.
# NOTE(review): process_and_save_pairs appends another "/" to folder_name, so the
# resulting path contains a doubled slash.
folder_name = 'eng_eng/'
df = prepare_df(eng_eng_df)
current_languages_pairs = 'eng_eng'

# One-off run: four-name pairs with both sides kept in their original order.
process_and_save_pairs(df, 4, 'pos_4x4_ordered_pairs', concatenate_ordered_names, concatenate_ordered_names)

In [None]:
from queue import Queue

def process_all_language_pairs(dataset, language_pair, desired_length = 0):
    """Prepare ``dataset`` and regenerate only the 'neg_3x3_unordered_pairs' file.

    Every other ``process_and_save_pairs`` call in this version sits inside a
    bare string literal and is therefore not executed. A later cell defines
    ``process_all_language_pairs`` again, which replaces this definition once
    that cell has run.

    Args:
        dataset: frame of single-name pairs passed to ``prepare_df`` or
            ``prepare_duplicate_df``.
        language_pair: stored in the global ``current_languages_pairs`` and
            used as the output sub-folder name.
        desired_length: when non-zero, the prepared frame is replicated to
            this many rows via ``prepare_duplicate_df``.
    """
    global current_languages_pairs, folder_name, row_index_for_eng_arb, selected_rnd_names_for_initial, ignore_row_index_for_eng_arb
    
    # Reset the shared state read by the concatenate_* helpers.
    row_index_for_eng_arb = 0
    ignore_row_index_for_eng_arb = True
    selected_rnd_names_for_initial = Queue()
    current_languages_pairs = language_pair
    
    if desired_length == 0:
        df = prepare_df(dataset)
    else:
        df = prepare_duplicate_df(dataset, desired_length)
    
    folder_name = current_languages_pairs + '/'
    # Disabled calls (string literal, never executed).
    '''
    #4x4
    process_and_save_pairs(df, 4, 'pos_4x4_pairs', concatenate_ordered_names, concatenate_ordered_names)
    process_and_save_pairs(df, 4, 'pos_4x4_reversed_pairs', concatenate_ordered_names, concatenate_reversed_names)
    process_and_save_pairs(df, 4, 'neg_4x4_unordered_pairs', concatenate_ordered_names, concatenate_out_of_order_names)
    process_and_save_pairs(df, 3, 'pos_initial_3x4_pairs', concatenate_with_initial, concatenate_with_initial_name)
    process_and_save_pairs(df, 3, 'neg_3_1x3_1_pairs', concatenate_append_random_name, concatenate_append_random_name)
    process_and_save_pairs(df, 2, 'neg_2_2x2_2_pairs', concatenate_append_random_two_names, concatenate_append_random_two_names)
    process_and_save_pairs(df, 1, 'neg_1_3x1_3_pairs', concatenate_append_random_three_names, concatenate_append_random_three_names)

    #3x4
    process_and_save_pairs(df, 2, 'neg_2_1x2_2_pairs', concatenate_append_random_name, concatenate_append_random_two_names)
    process_and_save_pairs(df, 1, 'neg_1_2x1_3_pairs', concatenate_append_random_two_names, concatenate_append_random_three_names)
    process_and_save_pairs(df, 3, 'pos_3x4_reversed_pairs', concatenate_reversed_names, concatenate_append_random_name_in_middle)
    
    #2x4
    process_and_save_pairs(df, 2, 'pos_2x4_reversed_pairs', concatenate_reversed_names, concatenate_append_random_two_names_in_middle)
    process_and_save_pairs(df, 1, 'neg_1_1x1_3_pairs', concatenate_append_random_name, concatenate_append_random_three_names)
    
    #1x4
    process_and_save_pairs(df, 1, 'neg_1x4_pairs', concatenate_ordered_names, concatenate_append_random_four_names)
    
    #3x3
    process_and_save_pairs(df, 3, 'pos_3x3_pairs', concatenate_ordered_names, concatenate_ordered_names)
    process_and_save_pairs(df, 3, 'pos_3x3_reversed_pairs', concatenate_ordered_names, concatenate_reversed_names)
    '''
    # The only call that actually runs in this version.
    process_and_save_pairs(df, 3, 'neg_3x3_unordered_pairs', concatenate_ordered_names,  concatenate_out_of_order_names)
    # Disabled calls (string literal, never executed).
    '''
    process_and_save_pairs(df, 2, 'neg_2_1x2_1_pairs', concatenate_append_random_name, concatenate_append_random_name)
    process_and_save_pairs(df, 1, 'neg_1_2x1_2_pairs', concatenate_append_random_two_names, concatenate_append_random_two_names)
    process_and_save_pairs(df, 2, 'pos_initial_2x3_pairs', concatenate_with_initial, concatenate_with_initial_name)

    #2x3
    process_and_save_pairs(df, 1, 'neg_1_1x1_2_pairs', concatenate_append_random_name, concatenate_append_random_two_names)
    
    #2x2
    process_and_save_pairs(df, 2, 'pos_2x2_pairs', concatenate_ordered_names, concatenate_ordered_names)
    process_and_save_pairs(df, 2, 'pos_2x2_reversed_pairs', concatenate_ordered_names, concatenate_reversed_names)
    process_and_save_pairs(df, 1, 'neg_1_1x1_1_pairs', concatenate_append_random_name, concatenate_append_random_name)
    process_and_save_pairs(df, 1, 'pos_1x2_initails_pairs', concatenate_with_initial, concatenate_with_initial_name)
    
    #1x1
    process_and_save_pairs(df, 1, 'pos_1x1_pairs', concatenate_ordered_names, concatenate_ordered_names)

    ignore_row_index_for_eng_arb = False
    if current_languages_pairs == 'eng_arb':        
        #3x4
        process_and_save_pairs(df, 3, 'pos_3x4_pairs', concatenate_append_random_name, concatenate_append_random_name)
        
        #2x4
        process_and_save_pairs(df, 2, 'pos_2x4_pairs', concatenate_append_random_two_names, concatenate_append_random_two_names)
        
        #1x4
        process_and_save_pairs(df, 1, 'pos_1x4_pairs', concatenate_append_random_three_names, concatenate_append_random_three_names)
        
        #2x3
        process_and_save_pairs(df, 2, 'pos_2x3_pairs', concatenate_append_random_name, concatenate_append_random_name)
        
        #1x3
        process_and_save_pairs(df, 1, 'pos_1x3_pairs', concatenate_append_random_two_names, concatenate_append_random_two_names)
        
        #1x2
        process_and_save_pairs(df, 1, 'pos_1x2_pairs', concatenate_append_random_name, concatenate_append_random_name)
    else:
        #3x4
        process_and_save_pairs(df, 3, 'pos_3x4_pairs', concatenate_ordered_names, concatenate_append_random_name)
        
        #2x4
        process_and_save_pairs(df, 2, 'pos_2x4_pairs', concatenate_ordered_names, concatenate_append_random_two_names)
        
        #1x4
        process_and_save_pairs(df, 1, 'pos_1x4_pairs', concatenate_ordered_names, concatenate_append_random_three_names)
        
        #2x3
        process_and_save_pairs(df, 2, 'pos_2x3_pairs', concatenate_ordered_names, concatenate_append_random_name)
        
        #1x3
        process_and_save_pairs(df, 1, 'pos_1x3_pairs', concatenate_ordered_names, concatenate_append_random_two_names)
        
        #1x2
        process_and_save_pairs(df, 1, 'pos_1x2_pairs', concatenate_ordered_names, concatenate_append_random_name)

    '''
        

In [None]:
from queue import Queue

def process_all_language_pairs(dataset, language_pair, desired_length = 0):
    """Generate every positive and negative pair file for one language pair.

    Resets the shared generator state, prepares ``dataset`` (replicated to
    ``desired_length`` rows when that is non-zero) and runs
    ``process_and_save_pairs`` once per output file. The last three files are
    produced with ``ignore_row_index_for_eng_arb`` switched off; for
    'eng_arb' both sides of those files use the name-appending helper, for
    the other language pairs the first side is kept as is.

    Args:
        dataset: frame of single-name pairs.
        language_pair: 'eng_eng', 'arb_arb' or 'eng_arb'; also the name of
            the output sub-folder.
        desired_length: target row count of the prepared frame, 0 to keep the
            de-duplicated size.
    """
    global current_languages_pairs, folder_name, row_index_for_eng_arb, selected_rnd_names_for_initial, ignore_row_index_for_eng_arb

    row_index_for_eng_arb = 0
    ignore_row_index_for_eng_arb = True
    selected_rnd_names_for_initial = Queue()
    current_languages_pairs = language_pair

    df = prepare_df(dataset) if desired_length == 0 else prepare_duplicate_df(dataset, desired_length)

    folder_name = current_languages_pairs + '/'

    # (number of parts, output file, first-side function, second-side function)
    unconditional_jobs = (
        # 4x4
        (4, 'pos_4x4_pairs', concatenate_ordered_names, concatenate_ordered_names),
        (4, 'neg_4x4_unordered_pairs', concatenate_ordered_names, concatenate_out_of_order_names),
        (3, 'neg_3_1x3_1_pairs', concatenate_append_random_name, concatenate_append_random_name),
        # 3x4
        (2, 'neg_2_1x2_2_pairs', concatenate_append_random_name, concatenate_append_random_two_names),
        # 2x4
        (1, 'neg_1_1x1_3_pairs', concatenate_append_random_name, concatenate_append_random_three_names),
        # 3x3
        (3, 'pos_3x3_pairs', concatenate_ordered_names, concatenate_ordered_names),
        (3, 'neg_3x3_unordered_pairs', concatenate_ordered_names, concatenate_out_of_order_names),
        (2, 'neg_2_1x2_1_pairs', concatenate_append_random_name, concatenate_append_random_name),
        # 2x3
        (1, 'neg_1_1x1_2_pairs', concatenate_append_random_name, concatenate_append_random_two_names),
        # 2x2
        (2, 'pos_2x2_pairs', concatenate_ordered_names, concatenate_ordered_names),
        (1, 'neg_1_1x1_1_pairs', concatenate_append_random_name, concatenate_append_random_name),
        # 1x1
        (1, 'pos_1x1_pairs', concatenate_ordered_names, concatenate_ordered_names),
    )
    for parts, file_name, first_side, second_side in unconditional_jobs:
        process_and_save_pairs(df, parts, file_name, first_side, second_side)

    ignore_row_index_for_eng_arb = False
    # (number of parts, output file, name-appending function): 3x4, 2x4, 2x3
    uneven_jobs = (
        (3, 'pos_3x4_pairs', concatenate_append_random_name),
        (2, 'pos_2x4_pairs', concatenate_append_random_two_names),
        (2, 'pos_2x3_pairs', concatenate_append_random_name),
    )
    for parts, file_name, extender in uneven_jobs:
        if current_languages_pairs == 'eng_arb':
            first_side = extender
        else:
            first_side = concatenate_ordered_names
        process_and_save_pairs(df, parts, file_name, first_side, extender)

In [None]:
%%time
#process_all_language_pairs(eng_eng_df, 'eng_eng')
# The Arabic/Arabic data is replicated to the row count of the de-duplicated
# English/English pairs, which is passed as desired_length.
eng_eng_no_dup_df = eng_eng_df.drop_duplicates(subset=[1, 2], keep='first')
process_all_language_pairs(arb_arb_df, 'arb_arb', len(eng_eng_no_dup_df))

In [None]:
%%time
process_all_language_pairs(eng_eng_df, 'eng_eng')

In [None]:
%%time
# The English/Arabic data is replicated to the row count of the de-duplicated
# English/English pairs, which is passed as desired_length.
eng_eng_no_dup_df = eng_eng_df.drop_duplicates(subset=[1, 2], keep='first')
process_all_language_pairs(eng_arb_df, 'eng_arb', len(eng_eng_no_dup_df))

In [None]:
folder = target_dir + "eng_eng/"
pos_3x4_initails_df = pd.read_csv(folder + 'neg_1x1_pairs.tsv',sep='\t', header=None)



In [None]:
pos_3x4_initails_df[pos_3x4_initails_df[1] == 'MAHMOUD']

In [None]:
def unify_save_files(file_name, flag):
    """Rewrite ``file_name + '.tsv'`` in place as (flag, pair1, pair2) rows.

    The file is read without a header. A four-column file loses its first
    and last columns and gets ``flag`` as a new leading column; a
    three-column file has its first column overwritten with ``flag``. In both
    cases rows whose second column equals 'pair1' (a header line read as
    data) are removed and the result is written back without header or
    index. Files with any other column count are left untouched.
    """
    tsv_path = file_name + '.tsv'
    frame = pd.read_csv(tsv_path, sep='\t', header=None)
    column_count = len(frame.columns.tolist())
    if column_count not in (3, 4):
        return

    if column_count == 4:
        outer_columns = [frame.columns[3], frame.columns[0]]
        frame = frame.drop(columns=outer_columns)
        frame.insert(0, 'similarity', flag)
    else:
        frame[0] = flag

    data_rows = frame[frame[1] != 'pair1']
    data_rows.to_csv(tsv_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False, index=False)
    

In [None]:
def unify_files(language_pair):
    """Stamp every generated pair file of one language pair with its label.

    Each file listed below is rewritten in place by ``unify_save_files``:
    positive files get the label 1, negative files the label 0. Files are
    looked up in ``target_dir + language_pair + '/'``.

    The earlier, longer file lists used to be disabled with a triple-quoted
    string that started at column 0; that ended the function body early and
    made the indented lists below a syntax error. They are kept as comments.
    """
    path = target_dir + language_pair + '/'

    # Earlier full file lists, kept for reference only:
    # positive_files = [
    #     'pos_4x4_pairs', 'pos_4x4_reversed_pairs', 'pos_initial_3x4_pairs',
    #     'pos_3x4_pairs', 'pos_3x4_reversed_pairs',
    #     'pos_2x4_pairs', 'pos_2x4_reversed_pairs',
    #     'pos_1x4_pairs',
    #     'pos_3x3_pairs', 'pos_3x3_reversed_pairs', 'pos_initial_2x3_pairs',
    #     'pos_2x3_pairs',
    #     'pos_1x3_pairs',
    #     'pos_2x2_pairs', 'pos_2x2_reversed_pairs', 'pos_1x2_initails_pairs',
    #     'pos_1x2_pairs',
    #     'pos_1x1_pairs'
    # ]
    #
    # negative_files = [
    #     'neg_4x4_pairs', 'neg_4x4_unordered_pairs', 'neg_3_1x3_1_pairs', 'neg_2_2x2_2_pairs', 'neg_1_3x1_3_pairs',
    #     'neg_3x4_pairs', 'neg_2_1x2_2_pairs', 'neg_1_2x1_3_pairs',
    #     'neg_2x4_pairs', 'neg_1_1x1_3_pairs',
    #     'neg_1x4_pairs',
    #     'neg_3x3_pairs', 'neg_3x3_unordered_pairs', 'neg_2_1x2_1_pairs', 'neg_1_2x1_2_pairs',
    #     'neg_2x3_pairs', 'neg_1_1x1_2_pairs',
    #     'neg_1x3_pairs',
    #     'neg_2x2_pairs', 'neg_1_1x1_1_pairs',
    #     'neg_1x2_pairs',
    #     'neg_1x1_pairs'
    # ]

    positive_files = [
        'pos_4x4_pairs', 'pos_3x4_pairs', 'pos_2x4_pairs', 
        'pos_3x3_pairs', 'pos_2x3_pairs',
        'pos_2x2_pairs', 
        'pos_1x1_pairs'
    ]
    
    negative_files = [
        'neg_4x4_unordered_pairs', 'neg_3_1x3_1_pairs',
        'neg_2_1x2_2_pairs',
        'neg_1_1x1_3_pairs',
        'neg_3x3_unordered_pairs', 'neg_2_1x2_1_pairs',
        'neg_1_1x1_2_pairs',
        'neg_1_1x1_1_pairs',
        'neg_1x1_pairs'
    ]
    for positive_file in positive_files:
        unify_save_files(path + positive_file, 1)
    
    for negative_file in negative_files:
        unify_save_files(path + negative_file, 0)
        

In [None]:
%%time
unify_files('eng_eng')
unify_files('eng_arb')
unify_files('arb_arb')

In [None]:
keys = list(names_dict.keys())

def _build_negative_pair(anchor, fillers, others):
    """Return [pair1, pair2]: the anchor mixed with ``fillers``, against ``others``."""
    first = [anchor]
    for filler in fillers:
        first.append(filler)
        # Shuffled after every append, so the anchor can end up at any position.
        random.shuffle(first)
    return [' '.join(first), ' '.join(others)]


def create_negative_datasets(iterable_list, pair1_lang, pair2_lang):
    """Write negative (non-matching) name pairs of nine shapes as TSV files.

    For every ``(name, group)`` item of ``iterable_list`` random groups other
    than the item's own are sampled from the global ``keys``, and one name
    per sampled group is taken from the global ``names_dict``. The first side
    of a pair is the item's name mixed with names in ``pair1_lang``; the
    second side consists only of names in ``pair2_lang``. Each shape is
    produced ``multiplier`` times per item, where ``multiplier`` is the
    integer ratio of ``len(eng_eng_no_dup_df)`` to ``len(iterable_list)``.

    Output goes to ``target_dir + pair1_lang + '_' + pair2_lang + '/'``, one
    ``neg_<A>x<B>_pairs.tsv`` per shape, with an index column and no header.
    The folder is created if it does not exist; previously a missing folder
    made the first write fail.

    Args:
        iterable_list: sequence of (name, group key) items.
        pair1_lang: key into ``names_dict[group]`` for the first side.
        pair2_lang: key into ``names_dict[group]`` for the second side.

    Raises:
        ValueError: if ``keys`` holds fewer entries than one item needs.
        ZeroDivisionError: if ``iterable_list`` is empty.
    """
    import os

    folder = target_dir + pair1_lang + '_' + pair2_lang + "/"
    os.makedirs(folder, exist_ok=True)

    # (output file stem, random names added to the first side, names on the second side)
    shapes = (
        ('neg_4x4_pairs', 3, 4),
        ('neg_3x4_pairs', 2, 4),
        ('neg_2x4_pairs', 1, 4),
        ('neg_3x3_pairs', 2, 3),
        ('neg_2x3_pairs', 1, 3),
        ('neg_1x3_pairs', 0, 3),
        ('neg_2x2_pairs', 1, 2),
        ('neg_1x2_pairs', 0, 2),
        ('neg_1x1_pairs', 0, 1),
    )
    first_side_total = sum(extra for _, extra, _ in shapes)    # 10
    second_side_total = sum(count for _, _, count in shapes)   # 26

    multiplier = int(len(eng_eng_no_dup_df) / len(iterable_list))
    # One spare key covers the case where the item's own group is sampled and dropped.
    number_of_random_needed = multiplier * (first_side_total + second_side_total) + 1

    collected = {stem: [] for stem, _, _ in shapes}

    for name in iterable_list:
        sel_keys = set(random.sample(keys, number_of_random_needed))
        sel_keys = [key for key in sel_keys if key != name[1]]
        split_at = multiplier * first_side_total
        pair1_src = [random.sample(names_dict[item][pair1_lang], 1)[0] for item in sel_keys[:split_at]]
        pair2_src = [random.sample(names_dict[item][pair2_lang], 1)[0] for item in sel_keys[split_at:]]

        pair1_index = 0
        pair2_index = 0
        for stem, extra, count in shapes:
            for _ in range(multiplier):
                fillers = pair1_src[pair1_index:pair1_index + extra]
                others = pair2_src[pair2_index:pair2_index + count]
                pair1_index += extra
                pair2_index += count
                collected[stem].append(_build_negative_pair(name[0], fillers, others))

    for stem, _, _ in shapes:
        df = pd.DataFrame(collected[stem])
        df.to_csv(folder + stem + '.tsv', sep='\t', quoting=csv.QUOTE_NONE, mode='w', header=False)


In [None]:
%%time
# create_single_negative_datasets reads the global eng_eng_no_dup_df for its size.
eng_eng_no_dup_df = eng_eng_df.drop_duplicates(subset=[1, 2], keep='first')
create_single_negative_datasets()
# NOTE(review): the disabled calls below pass three arguments, which
# create_single_negative_datasets() does not accept; the argument list matches
# create_negative_datasets(iterable_list, pair1_lang, pair2_lang) — confirm which was meant.
#create_single_negative_datasets(all_eng_names, 'eng', 'arb')
#create_single_negative_datasets(all_arb_names, 'arb', 'arb')

In [None]:
from fuzzywuzzy import fuzz
# All keys of names_dict; create_single_negative_datasets iterates over this list.
keys = list(names_dict.keys())
# Module-level accumulators, one per language pairing.
# NOTE(review): where they are filled is not visible in this part of the cell — confirm.
neg_eng_eng_pairs = []
neg_eng_arb_pairs = []
neg_arb_arb_pairs = []
print(len(keys))

def _collect_fuzzy_negatives(anchor, pool, needed, batch_size, excluded_keys,
                             lang, name, key, max_attempts=150):
    """Collect names from ``pool`` that resemble ``anchor`` but belong to other keys.

    Random batches of ``batch_size`` entries are drawn from ``pool`` until at
    least ``needed`` names have passed the filter or ``max_attempts`` batches
    have been tried. Each pool entry is indexed as ``entry[0]`` (the name) and
    ``entry[1]`` (its key). A candidate passes when its key is not in
    ``excluded_keys`` and ``fuzz.partial_ratio(anchor, candidate_name)`` lies
    strictly between 49 and 100.

    If the quota is still not met, a message is printed and one more random
    draw, filtered on the key only, tops the list up.

    Args:
        anchor: Name the candidates are compared against.
        pool: Sequence of ``(name, key)``-like entries to sample from.
        needed: Number of negatives wanted.
        batch_size: Entries drawn per round (clamped to ``len(pool)``).
        excluded_keys: Keys whose names must never be returned.
        lang: Label used in the shortfall message ("eng" or "arb").
        name: Name reported in the shortfall message.
        key: Key reported in the shortfall message.
        max_attempts: Maximum number of sampling rounds.

    Returns:
        list: The collected names. It can hold more than ``needed`` items
        (whole batches are kept), fewer (the top-up draw is filtered too), and
        repeats (rounds are drawn independently).
    """
    # random.sample raises ValueError when asked for more items than the pool holds.
    batch_size = min(batch_size, len(pool))

    negatives = []
    attempts = 0
    while len(negatives) < needed and attempts < max_attempts:
        attempts += 1
        negatives.extend(
            candidate[0]
            for candidate in random.sample(pool, batch_size)
            if candidate[1] not in excluded_keys
            and 100 > fuzz.partial_ratio(anchor, candidate[0]) > 49
        )

    if len(negatives) < needed:
        print("no {lang} negative added for name: {name}, key: {key}, added: {count}".format(
            lang=lang, name=name, key=key, count=len(negatives)))
        shortfall = min(needed - len(negatives), len(pool))
        negatives.extend(
            candidate[0]
            for candidate in random.sample(pool, shortfall)
            if candidate[1] not in excluded_keys
        )
    return negatives


def create_single_negative_datasets():
    """Build fuzzy-similar negative name pairs for every key and write them as TSV.

    Reads the notebook globals ``target_dir``, ``keys``, ``names_dict``,
    ``all_eng_names``, ``all_arb_names`` and ``eng_eng_no_dup_df``. For each key:

    * every English name is paired with English negatives similar to itself,
    * every English name is paired with Arabic negatives similar to the key's
      first Arabic name,
    * every Arabic name is paired with Arabic negatives similar to itself.

    The pairs are stored in the module-level lists ``neg_eng_eng_pairs``,
    ``neg_eng_arb_pairs`` and ``neg_arb_arb_pairs`` and written to
    ``<target_dir>fuzzy/neg_*_pairs.tsv`` (tab separated, no header, index kept).

    Raises:
        IndexError: If a key has English names but an empty Arabic name list.
    """
    folder = target_dir + "fuzzy/"
    os.makedirs(folder, exist_ok=True)

    # The output files are overwritten on every call, so start from empty
    # accumulators as well; otherwise a second call would append a full extra
    # copy of the pairs. Cleared in place so the module-level names stay bound
    # to the same list objects.
    for pairs in (neg_eng_eng_pairs, neg_eng_arb_pairs, neg_arb_arb_pairs):
        pairs.clear()

    # Negatives wanted per name: size of the de-duplicated pair set divided by
    # the number of available names, rounded up.
    eng_multiplier = math.ceil(len(eng_eng_no_dup_df) / len(all_eng_names))
    arb_multiplier = math.ceil(len(eng_eng_no_dup_df) / len(all_arb_names))

    for i, key in enumerate(keys):
        if i % 1000 == 0:
            print("batch {index}".format(index=(i / 1000)))

        eng_names = names_dict[key]['eng']
        arb_names = names_dict[key]['arb']
        # Names filed under the current key are positives and must be excluded.
        similar_keys = [key]

        for eng in eng_names:
            src = _collect_fuzzy_negatives(
                eng, all_eng_names, eng_multiplier, eng_multiplier * 2,
                similar_keys, 'eng', eng, key)
            neg_eng_eng_pairs.extend(product([eng], src))

            # Cross-script negatives: Arabic names that resemble the key's first
            # Arabic name are paired with the English name.
            src = _collect_fuzzy_negatives(
                arb_names[0], all_arb_names, eng_multiplier, eng_multiplier * 2,
                similar_keys, 'arb', eng, key)
            neg_eng_arb_pairs.extend(product([eng], src))

        for arb in arb_names:
            src = _collect_fuzzy_negatives(
                arb, all_arb_names, arb_multiplier, arb_multiplier * 3,
                similar_keys, 'arb', arb, key)
            neg_arb_arb_pairs.extend(product([arb], src))

    outputs = (
        (neg_eng_eng_pairs, 'neg_eng_eng_pairs.tsv'),
        (neg_eng_arb_pairs, 'neg_eng_arb_pairs.tsv'),
        (neg_arb_arb_pairs, 'neg_arb_arb_pairs.tsv'),
    )
    for pairs, file_name in outputs:
        pd.DataFrame(pairs).to_csv(folder + file_name, sep='\t', quoting=csv.QUOTE_NONE,
                                   mode='w', header=False)

In [None]:
# Load the 2x4 English/English pair file (tab separated, no header row).
folder = target_dir + "eng_eng/"
file_name = 'pos_2x4_pairs.tsv'
df = pd.read_csv(folder + file_name, sep='\t', header=None)
# One sample(frac=1) already yields a uniformly random row order; shuffling
# three times in a row only repeated the work.
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Display the DataFrame currently bound to `df`.
df

In [None]:
%%time
# Scratch run of the negative-pair sampling for one key, so the resulting
# lists can be inspected interactively. Nothing is written to disk here.
neg_eng_eng_pairs = []
neg_eng_arb_pairs = []
neg_arb_arb_pairs = []

# Negatives wanted per name (pair count divided by name count, rounded up).
eng_multiplier = math.ceil(len(eng_eng_no_dup_df) / len(all_eng_names))
arb_multiplier = math.ceil(len(eng_eng_no_dup_df) / len(all_arb_names))

key = 1
# Names filed under the inspected key are positives and must be excluded.
similar_keys = [key]
eng_names = names_dict[key]['eng']
arb_names = names_dict[key]['arb']

# Same bound on sampling rounds as create_single_negative_datasets; without it
# a name with no candidate in the 49..100 similarity band loops forever.
max_attempts = 150

index = 0  # not read by this cell; kept so the notebook-level name stays defined
for eng in eng_names:
    src = []
    attempts = 0
    while len(src) < eng_multiplier and attempts < max_attempts:
        attempts += 1
        # Clamp the batch: random.sample raises if it exceeds the population.
        rnd = random.sample(all_eng_names, min(eng_multiplier * 2, len(all_eng_names)))
        # Compare against the current variant `eng`; this used to be
        # eng_names[0], so every variant got negatives chosen for the first one.
        eng_eng_src_names = [name[0] for name in rnd
                             if name[1] not in similar_keys
                             and 100 > fuzz.partial_ratio(eng, name[0]) > 49]
        src.extend(eng_eng_src_names)
    if len(src) < eng_multiplier:
        print("no eng negative added for name: {name}, key: {key}, added: {count}".format(
            name=eng, key=key, count=len(src)))
    neg_eng_eng_pairs.extend(product([eng], src))

    src = []
    attempts = 0
    while len(src) < eng_multiplier and attempts < max_attempts:
        attempts += 1
        rnd = random.sample(all_arb_names, min(eng_multiplier, len(all_arb_names)))
        # Cross-script negatives are anchored on the key's first Arabic name,
        # as in create_single_negative_datasets.
        eng_arb_src_names = [name[0] for name in rnd
                             if name[1] not in similar_keys
                             and 100 > fuzz.partial_ratio(arb_names[0], name[0]) > 49]
        src.extend(eng_arb_src_names)
    if len(src) < eng_multiplier:
        print("no arb negative added for name: {name}, key: {key}, added: {count}".format(
            name=eng, key=key, count=len(src)))
    neg_eng_arb_pairs.extend(product([eng], src))

for arb in arb_names:
    src = []
    attempts = 0
    while len(src) < arb_multiplier and attempts < max_attempts:
        attempts += 1
        rnd = random.sample(all_arb_names, min(arb_multiplier * 100, len(all_arb_names)))
        # Compare against the current variant `arb` (previously arb_names[0]).
        arb_arb_src_names = [name[0] for name in rnd
                             if name[1] not in similar_keys
                             and 100 > fuzz.partial_ratio(arb, name[0]) > 49]
        src.extend(arb_arb_src_names)
    if len(src) < arb_multiplier:
        print("no arb negative added for name: {name}, key: {key}, added: {count}".format(
            name=arb, key=key, count=len(src)))
    neg_arb_arb_pairs.extend(product([arb], src))

In [None]:
# Display the Arabic/Arabic negative pairs collected so far.
neg_arb_arb_pairs

In [None]:
# For every element of `t`, iterate over its first entry and collect the pieces
# into one flat list.
# NOTE(review): `t` is not assigned in the cells visible here — confirm where it comes from.
[item for sublist in t for item in sublist[0]]

In [None]:
%%time
def _prepare_pairs_df(df):
    """Return a de-duplicated, shuffled copy of a pair frame with named columns.

    When the frame has four columns the first one is dropped in place (so the
    caller's object is modified too, as before). The frame is then expected to
    hold the integer-labelled columns 1 and 2 plus one further column. Rows are
    de-duplicated on columns 1 and 2, shuffled, renumbered, and the columns are
    renamed to ``index``, ``part1``, ``part2`` and ``group_id``.
    """
    if len(df.columns) == 4:
        df.drop(df.columns[0], axis=1, inplace=True)
    df = df.drop_duplicates(subset=[1, 2], keep='first')
    df = df.sample(frac=1).reset_index(drop=True)
    # Second reset_index materialises the fresh row numbers as the 'index' column.
    df = df.reset_index()
    df.columns = ['index', 'part1', 'part2', 'group_id']
    return df


eng_eng_df = _prepare_pairs_df(eng_eng_df)
eng_arb_df = _prepare_pairs_df(eng_arb_df)
arb_arb_df = _prepare_pairs_df(arb_arb_df)
eng_arb_negative_df = _prepare_pairs_df(eng_arb_negative_df)
# This frame was the only one left unshuffled by the copy-pasted stanzas; it
# now goes through the same steps as the other five.
eng_eng_nagative_df = _prepare_pairs_df(eng_eng_nagative_df)
arb_arb_negative_df = _prepare_pairs_df(arb_arb_negative_df)


In [None]:
%%time

## four part names
# Drop the leading column when the frame still carries four columns.
if eng_eng_df.shape[1] == 4:
    eng_eng_df.drop(eng_eng_df.columns[0], axis=1, inplace=True)
# De-duplicate on columns 1 and 2, shuffle, then expose fresh row numbers as
# an 'index' column.
eng_eng_df = (
    eng_eng_df.drop_duplicates(subset=[1, 2])
    .sample(frac=1)
    .reset_index(drop=True)
    .reset_index()
)
eng_eng_df.columns = ['index', 'part1', 'part2', 'group_id']
# Rows 0-3 get set_type 0, rows 4-7 get 4, and so on: every run of four
# consecutive rows shares one value.
eng_eng_df['set_type'] = (eng_eng_df['index'] // 4) * 4
del eng_eng_df['index']

In [None]:
# Display the current state of eng_eng_df.
eng_eng_df

In [None]:
%%time
# Collapse all rows that share a 'set_type' into a single row: the name parts
# are joined with spaces and the group ids with ', '.
grp = eng_eng_df.groupby('set_type', sort=False)
eng_eng_df = grp.agg({
    'part1': ' '.join,
    'part2': ' '.join,
    # The previous ', '.join(str(x)) joined the characters of the Series repr;
    # convert each id to text and join those instead.
    'group_id': lambda ids: ', '.join(map(str, ids)),
}).reset_index(drop=True)
# Aggregating directly also avoids the earlier column assignments, which only
# lined up because each set_type value equals the row label of its first row.
eng_eng_df = eng_eng_df.sample(frac=1).reset_index(drop=True)
# Label every merged pair as a match and move the label to the first column.
eng_eng_df['similarity'] = 1
cols = eng_eng_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
eng_eng_df = eng_eng_df[cols]
#eng_eng_df.columns = ['similarity', 'part1', 'part2', 'group_id']

In [None]:
# Number of rows that are unique on the columns labelled 1 and 2.
# NOTE(review): other cells rename these columns to 'part1'/'part2'; this
# lookup only works while the frame still has its integer column labels.
len(eng_eng_df.drop_duplicates(subset=[1, 2], keep='first'))



In [None]:
# Row counts of the three pair frames before de-duplication.
print(eng_eng_df.shape[0])
print(eng_arb_df.shape[0])
print(arb_arb_df.shape[0])


# Keep the first occurrence of every (column 1, column 2) combination.
eng_eng_no_dup_df = eng_eng_df.drop_duplicates(subset=[1, 2])
eng_arb_no_dup_df = eng_arb_df.drop_duplicates(subset=[1, 2])
arb_arb_no_dup_df = arb_arb_df.drop_duplicates(subset=[1, 2])

# Row counts after de-duplication.
print(eng_eng_no_dup_df.shape[0])
print(eng_arb_no_dup_df.shape[0])
print(arb_arb_no_dup_df.shape[0])

In [None]:
# Distinct column-2 values of the rows whose column 3 equals 2.
names = set(eng_arb_df.loc[eng_arb_df[3] == 2, 2].tolist())
for name in names:
    # Each name is followed, on the next line, by its lower-cased form mapped
    # through buck2uni (reverse=0).
    print(name, transliterateString(name.lower(), 0), sep='\n')

In [None]:
# Display the rows of eng_arb_df whose column 3 equals 3.
eng_arb_df[eng_arb_df[3] == 3]

In [None]:
# Comma-joined distinct column-1 values of the rows whose column 3 equals 6
# (set order is arbitrary).
','.join(set(eng_arb_df[eng_arb_df[3] == 6][1].tolist()))

In [None]:
# Buckwalter ASCII symbol -> Arabic Unicode character.
#
# The symbols are case-sensitive ("s" is siin, "S" is Saad; "a" is fatHa, "A"
# is bare 'alif). The previous lower-cased keys collided: a dict literal keeps
# only the last value for a repeated key, so nine letters (siin, daal, taa',
# zaay, Haa', faa', nuun, kaaf, bare 'alif) were silently lost.
buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1.

    Characters without an entry in buck2uni (spaces, digits, ...) are left
    unchanged. The mapping is read from buck2uni on every call.'''

    if reverse:
        # Buckwalter symbol -> Arabic character.
        table = str.maketrans(buck2uni)
    else:
        # Arabic character -> Buckwalter symbol.
        table = {ord(uni): buck for buck, uni in buck2uni.items()}

    # A single translate pass replaces the chain of str.replace calls.
    return string.translate(table)

# Smoke test: Arabic text -> Buckwalter (default direction) ...
print(transString(u'موتشاماد'))
# ... and a Buckwalter string -> Arabic text (reverse=1).
print(transString('mrHbA', 1))

In [None]:
import sys, getopt, codecs, os, re

# Declare a dictionary with Buckwalter's ASCII symbols as the keys, and
# their unicode equivalents as values.
#
# The symbols are case-sensitive ("s" is siin, "S" is Saad; "a" is fatHa, "A"
# is bare 'alif). The previous lower-cased keys collided: a dict literal keeps
# only the last value for a repeated key, so nine letters were silently lost.

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
}

# For a reverse transliteration (Unicode -> Buckwalter), a dictionary
# which is the reverse of the above buck2uni is essential. Built with a
# comprehension so no loop variables are left behind at notebook level.

uni2buck = {uni: buck for buck, uni in buck2uni.items()}

def transliterateString(inString, reverse=1):
    """Transliterate ``inString`` one character at a time.

    With ``reverse`` false the characters are looked up in ``buck2uni``
    (Buckwalter -> Unicode); otherwise, which is the default, in ``uni2buck``
    (Unicode -> Buckwalter). Characters without an entry, e.g. spaces, are
    copied through unchanged.
    """
    table = uni2buck if reverse else buck2uni
    # join avoids rebuilding the output string for every character.
    return "".join(table.get(char, char) for char in inString)

# Smoke test: Arabic text -> Buckwalter (reverse defaults to 1) ...
print(transliterateString(u'موتشاماد'))
# ... and a Buckwalter string -> Arabic text (reverse=0).
print(transliterateString('mrHbA', 0))

In [None]:
%%time
#eng_eng_df = eng_eng_df.sample(frac=1).reset_index(drop=True)
#eng_eng_df = eng_eng_df.sample(frac=1).reset_index()


# Independent copy of the first 500 rows of eng_eng_df.
eng_eng_all_pos_4x4_ordered = eng_eng_df[0:500].copy()
'''
eng_eng_all_pos_4x4_unordered = 
eng_eng_all_pos_3x4_initails = 
eng_eng_all_pos_3x4_ordered = 
eng_eng_all_pos_3x4_unordered = 
eng_eng_all_pos_2x4_ordered = 
eng_eng_all_pos_2x4_unordered = 
eng_eng_all_pos_1x4_any_order = 
eng_eng_all_pos_3x3_ordered = 
eng_eng_all_pos_3x3_unordered = 
eng_eng_all_pos_2x3_initails = 
eng_eng_all_pos_2x3_any_order = 
eng_eng_all_pos_1x3_any_order = 
eng_eng_all_pos_2x2_ordered = 
eng_eng_all_pos_2x2_unordered = 
eng_eng_all_pos_1x2_initails = 
eng_eng_all_pos_1x2_any_order = 
eng_eng_all_pos_1x1 = 


eng_eng_4neg_x_4neg = 
eng_eng_3neg_1pos_x_3neg_1pos = 
eng_eng_2neg_2pos_x_2neg_2pos = 
eng_eng_2neg_x_2neg =
eng_eng_1neg_1pos_x_1neg_1pos = 
eng_eng_1neg_x_1neg = 
'''
'''
index = 0
group_id = 1
eng_eng_df['group'] = 1


eng_eng_df.loc[eng_eng_df['index'] % 52 == 0 | , 'set_type'] = eng_eng_df['index']
eng_eng_df.loc[eng_eng_df['index'] % 10 == 1, 'set_type'] = eng_eng_df['index'] - 1
eng_eng_df.loc[eng_eng_df['index'] % 10 == 2, 'set_type'] = eng_eng_df['index'] - 2
eng_eng_df.loc[eng_eng_df['index'] % 10 == 3, 'set_type'] = eng_eng_df['index'] - 3
eng_eng_df.loc[eng_eng_df['index'] % 10 == 4, 'set_type'] = eng_eng_df['index']
eng_eng_df.loc[eng_eng_df['index'] % 10 == 5, 'set_type'] = eng_eng_df['index'] - 1
eng_eng_df.loc[eng_eng_df['index'] % 10 == 6, 'set_type'] = eng_eng_df['index'] - 2
eng_eng_df.loc[eng_eng_df['index'] % 10 == 7, 'set_type'] = eng_eng_df['index']
eng_eng_df.loc[eng_eng_df['index'] % 10 == 8, 'set_type'] = eng_eng_df['index'] - 1
eng_eng_df.loc[eng_eng_df['index'] % 10 == 9, 'set_type'] = eng_eng_df['index']
eng_eng_df.sort_values('group')
'''
'''
for i, row in eng_eng_df.iterrows():
    index += 1
    if index % 10 == 4 or index % 10 == 7 or index % 10 == 8 or index % 10 == 0:
        group_id += 1
    eng_eng_df.loc[i,'group'] = group_id
    
'''

In [None]:
# Display the 500-row sample frame.
eng_eng_all_pos_4x4_ordered

In [None]:
%%time
# Stack the six pair frames row-wise into one DataFrame.
# NOTE(review): eng_ara_df, ara_ara_df and ara_ara_negative_df are not assigned
# in the cells visible here, which use eng_arb_df, arb_arb_df and
# arb_arb_negative_df — confirm these names still exist.
frames = [eng_ara_df, eng_eng_df, ara_ara_df, eng_arb_negative_df, eng_eng_nagative_df, ara_ara_negative_df]
all_pairs_df = pd.concat(frames)

In [None]:
# Print the 'eng_variants' value of the row labelled 0.
print(top_given_names['eng_variants'].loc[0])

In [None]:
# Largest len() of any 'eng_variants' value.
print(top_given_names['eng_variants'].apply(len).max())

# Number of comma-separated variants per non-empty 'eng_variants' value ...
lengths = [
    len(variants.split(','))
    for variants in top_given_names.loc[
        top_given_names['eng_variants'].str.len() >= 1, 'eng_variants'
    ].tolist()
]
# ... and the largest of those counts.
print(max(lengths))

In [None]:
%%time
import csv
# Write all_pairs_df as a tab-separated file with quoting disabled; the output
# path is hard-coded.
all_pairs_df.to_csv('/home/jupyter/notebooks/PoC/data-preparation/output/understanding_data/all_pairs.tsv',sep='\t', quoting=csv.QUOTE_NONE)



In [None]:
# Shape before and after keeping the first row for every distinct
# ('good_eng_variants', 'good_arb_variants') combination.
print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['good_eng_variants', 'good_arb_variants'], keep='first').copy()
print(top_given_names.shape)

In [None]:
# Independent copy of the rows of given_names whose 'count' is below 10.
bottom_given_names = given_names.loc[given_names['count'] < 10].copy()
bottom_given_names.shape

In [None]:
# 'eng_variants' of the first row whose 'eng' is "MOHAMMED"; raises IndexError
# when no row matches.
(top_given_names[top_given_names['eng'] == "MOHAMMED"]['eng_variants'].tolist()[0])

In [None]:
%%time

'''
bottom_given_names['eng_variants'] = bottom_given_names.groupby(['trimmed_arb'])['trimmed_eng'].transform(format_variants_list)
bottom_given_names['arb_variants'] = bottom_given_names.groupby(['trimmed_eng'])['trimmed_arb'].transform(format_variants_list)
'''

# Shape before and after keeping the first row for every distinct
# ('trimmed_eng', 'trimmed_arb') combination.
print(bottom_given_names.shape)
bottom_given_names = bottom_given_names.drop_duplicates(subset=['trimmed_eng', 'trimmed_arb'], keep='first').copy()
print(bottom_given_names.shape)

In [None]:
%%time
# Stack the six pair frames row-wise into one DataFrame.
# NOTE(review): eng_ara_df, ara_ara_df and ara_ara_negative_df are not assigned
# in the cells visible here, which use eng_arb_df, arb_arb_df and
# arb_arb_negative_df — confirm these names still exist.
frames = [eng_ara_df, eng_eng_df, ara_ara_df, eng_arb_negative_df, eng_eng_nagative_df, ara_ara_negative_df]
all_pairs_df = pd.concat(frames)


In [None]:
# Disabled draft of create_negative_both, parked in a string literal.
# The two inner disabled sections now use """ delimiters: with ''' they closed
# the outer string early and the cell failed with an IndentationError.
'''
def create_negative_both(row):
    global top_given_names, all_eng_given_names, all_arb_given_names
    
    eng_variants = list(set(row['eng_variants'].split(',')))
    arb_variants = list(set(row['arb_variants'].split(',')))
    
    #print(eng_variants)
    #print(arb_variants)
    
    matched_variants = top_given_names[top_given_names['trimmed_eng'].isin(eng_variants)
                                       | top_given_names['trimmed_arb'].isin(arb_variants)]
    
    eng_matches = list(set(matched_variants['eng_variants'].tolist()))
    arb_matches = list(set(matched_variants['arb_variants'].tolist()))
    
    
    flat_eng_matches = list(set([item for sublist in eng_matches for item in sublist.split(',')]))
    flat_arb_matches = list(set([item for sublist in arb_matches for item in sublist.split(',')]))
    
    #print(flat_eng_matches)
    #print(flat_arb_matches)
    
    matched_variants = top_given_names[top_given_names['trimmed_eng'].isin(flat_eng_matches)
                                       | top_given_names['trimmed_arb'].isin(flat_arb_matches)]
    
    eng_matches = list(set(matched_variants['eng_variants'].tolist()))
    arb_matches = list(set(matched_variants['arb_variants'].tolist()))
    
    #print(eng_matches)
    #print(arb_matches)
    
    eng_variants = sorted(set([item for sublist in eng_matches for item in sublist.split(',')])) 
    arb_variants = sorted(set([item for sublist in arb_matches for item in sublist.split(',')]))
    #print(eng_variants)
    #print(arb_variants)
    
    eng_word_length = len(row['trimmed_eng'])
    good_eng_variants = [item for item in eng_variants if (eng_word_length * 0.3 < len(item) <= eng_word_length * 2 and item not in row['trimmed_eng']) or item == row['trimmed_eng']]
    arb_word_length = len(row['trimmed_arb'])
    good_arb_variants = [item for item in arb_variants if (arb_word_length * 0.3 < len(item) <= arb_word_length * 2 and item not in row['trimmed_arb']) or item == row['trimmed_arb']]

    eng_variants_len = len(good_eng_variants)
    arb_variants_len = len(good_arb_variants)
    
    ### making good arabic and english variants of same length
    if eng_variants_len > arb_variants_len:
        quotient, modulo = divmod(eng_variants_len, arb_variants_len)
        extension = random.sample(good_arb_variants, modulo)
        good_arb_variants = [repeated for value in good_arb_variants for repeated in repeat(value, quotient)]
        good_arb_variants.extend(extension)
    elif arb_variants_len > eng_variants_len:
        quotient, modulo = divmod(arb_variants_len, eng_variants_len)
        extension = random.sample(good_eng_variants, modulo)
        good_eng_variants = [repeated for value in good_eng_variants for repeated in repeat(value, quotient)]
        good_eng_variants.extend(extension)
    
    eng_variants_len = len(good_eng_variants)
    arb_variants_len = len(good_arb_variants)
    
    random_eng_negative = []
    random_arb_negative = []
    
    if (len(all_eng_given_names) - eng_variants_len) > (eng_variants_len * eng_variants_len):
        random_eng_negative = random.sample(all_eng_given_names, (eng_variants_len * eng_variants_len))
        random_eng_negative = list(np.setdiff1d(random_eng_negative, good_eng_variants, assume_unique=True))
    
    else:
        random_eng_negative = list(np.setdiff1d(all_eng_given_names, good_eng_variants, assume_unique=True))
        """
        quotient, modulo = divmod(eng_variants_len * (eng_variants_len - 1), len(random_eng_negative))
        extension = random.sample(random_eng_negative, modulo)
        random_eng_negative = [repeated for value in random_eng_negative for repeated in repeat(value, quotient)]
        random_eng_negative.extend(extension)
        """
    if (len(all_arb_given_names) - arb_variants_len) > (arb_variants_len * arb_variants_len):
        random_arb_negative = random.sample(all_arb_given_names, (arb_variants_len * arb_variants_len))
        random_arb_negative = list(np.setdiff1d(random_arb_negative, good_arb_variants, assume_unique=True))
    
    else:
        random_arb_negative = list(np.setdiff1d(all_arb_given_names, good_arb_variants, assume_unique=True))
        """
        quotient, modulo = divmod(arb_variants_len * (arb_variants_len - 1), len(random_arb_negative))
        extension = random.sample(random_arb_negative, modulo)
        random_arb_negative = [repeated for value in random_arb_negative for repeated in repeat(value, quotient)]
        random_arb_negative.extend(extension)
        """
    
    row['eng_variants'] = ','.join(eng_variants)
    row['arb_variants'] = ','.join(arb_variants)
    row['good_eng_variants'] = ','.join(good_eng_variants)
    row['good_arb_variants'] = ','.join(good_arb_variants)
    row['negative_eng_variants'] = ','.join(random_eng_negative)
    row['negative_arb_variants'] = ','.join(random_arb_negative)
    
    return row
'''

In [None]:
'''

def format_list_both(row):
    global top_given_names
    
    eng_variants = list(set(row['eng_variants'].split(',')))
    arb_variants = list(set(row['arb_variants'].split(',')))
    
    matched_variants = top_given_names[top_given_names['trimmed_eng'].isin(eng_variants)
                                       | top_given_names['trimmed_arb'].isin(arb_variants)]
    
    eng_matches = list(set(matched_variants['eng_variants'].tolist()))
    arb_matches = list(set(matched_variants['arb_variants'].tolist()))
    
    
    flat_eng_matches = list(set([item for sublist in eng_matches for item in sublist.split(',')]))
    flat_arb_matches = list(set([item for sublist in arb_matches for item in sublist.split(',')]))
        
    matched_variants = top_given_names[top_given_names['trimmed_eng'].isin(flat_eng_matches)
                                       | top_given_names['trimmed_arb'].isin(flat_arb_matches)]
    
    eng_matches = list(set(matched_variants['eng_variants'].tolist()))
    arb_matches = list(set(matched_variants['arb_variants'].tolist()))
    
    eng_variants = sorted(set([item for sublist in eng_matches for item in sublist.split(',')])) 
    arb_variants = sorted(set([item for sublist in arb_matches for item in sublist.split(',')]))
    
    row['eng_variants'] = ','.join(eng_variants)
    row['arb_variants'] = ','.join(arb_variants)
    
    return row

def format_list_both2(row):
    global top_given_names
    
    eng_variants = list(set(row['eng_variants'].split(',')))
    arb_variants = list(set(row['arb_variants'].split(',')))
    
    matched_variants = top_given_names[top_given_names['trimmed_eng'].isin(eng_variants)
                                       | top_given_names['trimmed_arb'].isin(arb_variants)]
    
    eng_matches = list(set(matched_variants['eng_variants'].tolist()))
    arb_matches = list(set(matched_variants['arb_variants'].tolist()))
    
    eng_variants = list(set([item for sublist in eng_matches for item in sublist.split(',')]))
    arb_variants = list(set([item for sublist in arb_matches for item in sublist.split(',')]))
            
    row['eng_variants'] = ','.join(eng_variants)
    row['arb_variants'] = ','.join(arb_variants)
    
    return row
    
%%time

## round 1
print(top_given_names.shape)
top_given_names['eng_variants'] = top_given_names.groupby(['trimmed_arb'])['trimmed_eng'].transform(format_variants_list)
top_given_names['arb_variants'] = top_given_names.groupby(['trimmed_eng'])['trimmed_arb'].transform(format_variants_list)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

## round 2

top_given_names = top_given_names.apply(format_list_both2, axis=1)

print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

top_given_names = top_given_names.apply(format_list_both2, axis=1)

print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

top_given_names = top_given_names.apply(remove_long_variants_both, axis=1)
top_given_names = top_given_names.apply(create_negative, axis=1)

print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)
'''

In [None]:
'''
def format_variants_list2(s):
    lis = s.tolist()
    flat_list = [item for sublist in lis for item in sublist.split(',')]

    return ','.join(sorted(set(flat_list)))


def format_list(s, df, colum_name, second_column_name):
    """ Expand a comma-separated variant string through a lookup in df.

    s is split on ','; every row of df whose ``colum_name`` value is one of
    those pieces is selected, the comma-separated strings found in
    ``second_column_name`` of the selected rows are split and pooled, and the
    distinct results are returned sorted and joined with ','. Returns '' when
    no row matches.
    """
    wanted = sorted(set(s.split(',')))
    row_mask = df[colum_name].isin(wanted)

    pooled = set()
    for joined in df.loc[row_mask, second_column_name].tolist():
        pooled.update(joined.split(','))

    return ','.join(sorted(pooled))

'''

In [None]:

'''

%%time
## round 3
# Merge variant lists across rows that share the other language's list:
# rows with an identical arb_variants string get the union of their
# eng_variants (format_variants_list2 splits on ',', de-duplicates, sorts and
# re-joins). The second line then groups by the eng_variants values that were
# just written, so the order of these two statements matters.
top_given_names['eng_variants'] = top_given_names.groupby(['arb_variants'])['eng_variants'].transform(format_variants_list2)
top_given_names['arb_variants'] = top_given_names.groupby(['eng_variants'])['arb_variants'].transform(format_variants_list2)

# Keep the first row per distinct (eng_variants, arb_variants) pair; shapes are
# printed before/after to show how many rows were removed.
print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)


## round 4 optional
# Expand each list through a lookup: for every variant in a row's list,
# format_list finds the rows whose trimmed_eng (resp. trimmed_arb) equals that
# variant and unions their eng_variants (resp. arb_variants). The column is
# only replaced once apply() has produced all values, so every lookup reads
# the pre-round-4 contents of that column.
top_given_names['eng_variants'] = top_given_names['eng_variants'].apply(lambda x: format_list(x, top_given_names, 'trimmed_eng', 'eng_variants'))
top_given_names['arb_variants'] = top_given_names['arb_variants'].apply(lambda x: format_list(x, top_given_names, 'trimmed_arb', 'arb_variants'))

print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)


## round 5 optional
# Same group-and-union step as round 3, run again on the expanded lists.
top_given_names['eng_variants'] = top_given_names.groupby(['arb_variants'])['eng_variants'].transform(format_variants_list2)
top_given_names['arb_variants'] = top_given_names.groupby(['eng_variants'])['arb_variants'].transform(format_variants_list2)

print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)

# Row-wise clean-up of both variant columns. remove_long_variants is defined
# elsewhere in the notebook and receives the whole row plus the names of the
# base column and the variants column.
# NOTE(review): presumably drops variants that are too long relative to the
# base name -- confirm against its definition.
top_given_names['eng_variants'] = top_given_names.apply(lambda x: remove_long_variants(x, 'trimmed_eng', 'eng_variants'), axis=1)
top_given_names['arb_variants'] = top_given_names.apply(lambda x: remove_long_variants(x, 'trimmed_arb', 'arb_variants'), axis=1)

'''
'''

# Expand each variant list through a lookup: for every variant in a row's
# list, format_list selects the rows whose trimmed_eng (resp. trimmed_arb)
# equals that variant and unions their eng_variants (resp. arb_variants).
top_given_names['eng_variants'] = top_given_names['eng_variants'].apply(lambda x: format_list(x, top_given_names, 'trimmed_eng', 'eng_variants'))
top_given_names['arb_variants'] = top_given_names['arb_variants'].apply(lambda x: format_list(x, top_given_names, 'trimmed_arb', 'arb_variants'))

# Keep the first row per distinct (eng_variants, arb_variants) pair; shapes are
# printed before/after to show how many rows were removed.
print(top_given_names.shape)
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first').copy()
print(top_given_names.shape)


# Rebuild arb_variants from scratch, overwriting the value computed above:
# every row gets the comma-joined trimmed_arb values of its trimmed_eng group.
top_given_names['arb_variants'] = top_given_names.groupby(['trimmed_eng'])['trimmed_arb'].transform(lambda x: ','.join(x))
# NOTE(review): here format_variants_list is applied element-wise, so it
# receives one comma-joined string per call, whereas round 1 passes it to
# groupby().transform where it receives a Series -- confirm it supports both.
top_given_names['arb_variants'] = top_given_names['arb_variants'].apply(format_variants_list)
# NOTE(review): unlike the other de-duplication steps this one does not call
# .copy() on the result.
top_given_names = top_given_names.drop_duplicates(subset=['eng_variants', 'arb_variants'], keep='first')

'''