In [1]:
#Import libraries
import os
import random
import warnings

import pandas as pd
from faker import Faker

from synthetic import SyntheticHelper
from match_type import MatchType
from defines import *
warnings.filterwarnings('ignore')

In [2]:
#Create the synthetic-data helper and a Faker instance.
#Seed the random sources visible in this notebook first, so that a
#Restart & Run All can regenerate the same profiles.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)   #drives the random.choice(...) calls used when building name profiles
Faker.seed(RANDOM_SEED)    #seeds the random generator shared by Faker instances
#NOTE(review): SyntheticHelper may draw randomness from a source not seeded here - confirm
helper = SyntheticHelper()
fake = Faker()

In [3]:
#Data required for ER Dataset
#Number of raw name profiles generated per gender (before duplicates are dropped)
NAMES_PER_GENDER = 10000

#Generate NAMES_PER_GENDER random female name profiles (SuffixName is always empty for them)
female_names = [{'FirstName': fake.first_name_female(), 'SuffixName': '', 'Gender': random.choice(GENDER_FEMALE), 'LastName': fake.last_name(),
                 'Prefix': random.choice(PREFIX_FEMALE), 'MiddleName': random.choice(MIDDLE_NAME)} for _ in range(NAMES_PER_GENDER)]
#FirstName+LastName concatenation for every female profile, in the same order as female_names
female_names_values = [x['FirstName']+x['LastName'] for x in female_names]

#Generate NAMES_PER_GENDER random male name profiles (suffix drawn from MALE_SUFFIXES)
male_names = [{'FirstName': fake.first_name_male(), 'SuffixName': random.choice(MALE_SUFFIXES), 'Gender': random.choice(GENDER_MALE),
                'LastName': fake.last_name(), 'Prefix': random.choice(PREFIX_MALE), 'MiddleName': random.choice(MIDDLE_NAME)} for _ in range(NAMES_PER_GENDER)]
#FirstName+LastName concatenation for every male profile, in the same order as male_names
male_names_values = [x['FirstName']+x['LastName'] for x in male_names]

#Female profiles first, then male profiles; both lists stay index-aligned
all_names = female_names + male_names
all_names_values = female_names_values + male_names_values

#Drop every profile whose FirstName+LastName string was already seen, keeping
#the first occurrence. A set backs the membership test so this is a single
#linear pass instead of repeatedly scanning growing lists.
seen_full_names = set()
new_names = []            #unique FirstName+LastName strings, in first-seen order
indexes_to_remove = []    #positions in all_names that repeat an earlier full name
all_names_filter = []     #profiles from all_names with the repeats removed
for i, (full_name, profile) in enumerate(zip(all_names_values, all_names)):
    if full_name in seen_full_names:
        indexes_to_remove.append(i)
        continue
    seen_full_names.add(full_name)
    new_names.append(full_name)
    all_names_filter.append(profile)

#First-name pools are taken from the unfiltered lists, so they may contain repeats
female_name_list = [name['FirstName'] for name in female_names]
male_name_list = [name['FirstName'] for name in male_names]

#Generate one address per de-duplicated name profile
addresses = [helper.generate_address(CITIES) for _ in range(len(all_names_filter))]

#Generate one phone number per de-duplicated name profile (Type is left empty)
numbers = [{'Number': fake.phone_number(), 'Type':""} for _ in range(len(all_names_filter))]

#Generate one email per de-duplicated name profile (Type is left empty).
#The email is built from the profile in all_names_filter, not from the
#unfiltered all_names list: once a duplicate has been dropped the two lists
#no longer share positions, and indexing all_names would attach another
#person's email to the profile.
all_emails = [{'Email': helper.generate_email(profile['FirstName'], profile['LastName']), 'Type':""} for profile in all_names_filter]

#Create list of master_profiles: [names, addresses, numbers, emails], index-aligned
master_profiles = [all_names_filter, addresses, numbers, all_emails]
assert all(len(part) == len(all_names_filter) for part in master_profiles), "master profile attribute lists are not aligned"

In [4]:
#Build the MatchType generator from the master profiles and the value pools passed to it
match_creator = MatchType(
    master_profiles,
    female_name_list,
    male_name_list,
    PREFIX_FEMALE,
    PREFIX_MALE,
    MALE_SUFFIXES,
    MIDDLE_NAME,
    CITIES,
)

In [5]:
#Create easy and difficult matches and non-matches
#Each call produces one candidate set from match_creator; the recorded output
#below shows a name/address/phone/email progress bar for every call.
#The calls are kept as separate statements so that results already computed
#stay bound if a later call fails.
#NOTE(review): in the recorded run the "Difficult address non-matches" step took
#about 38 minutes (7.64 it/s over 17712 profiles); every other step finished within seconds.
easy_candidates_match = match_creator.create_easy_match()
difficult_candidates_match = match_creator.create_difficult_match()
easy_candidates_non_match = match_creator.create_easy_non_match()
difficult_candidates_non_match = match_creator.create_difficult_non_match()

100%|██████████| 17712/17712 [00:01<00:00, 15312.90it/s]


Easy Name matches created


100%|██████████| 17712/17712 [00:00<00:00, 1050473.88it/s]


Easy address matches created


100%|██████████| 17712/17712 [00:00<00:00, 2553606.23it/s]


Easy phone matches created


100%|██████████| 17712/17712 [00:00<00:00, 2600991.26it/s]


Easy email matches created


100%|██████████| 17712/17712 [00:01<00:00, 15139.11it/s]


Difficult name matches created


100%|██████████| 17712/17712 [00:00<00:00, 221521.69it/s]


Difficult address matches created


100%|██████████| 17712/17712 [00:00<00:00, 2103088.90it/s]


Difficult phone matches created


100%|██████████| 17712/17712 [00:06<00:00, 2697.92it/s]


Difficult email matches created


100%|██████████| 17712/17712 [00:02<00:00, 6925.16it/s]


Easy name non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 24409.30it/s]


Easy address non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 135692.08it/s]


Easy phone non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 1868965.57it/s]


Easy email non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 23837.79it/s]


Difficult name non-matches created


100%|██████████| 17712/17712 [38:39<00:00,  7.64it/s]


Difficult address non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 96178.24it/s]


Difficult number non-matches created


100%|██████████| 17712/17712 [00:00<00:00, 1812912.11it/s]

Difficult email non-matches created





In [6]:
#Build one master/candidate pair DataFrame per candidate set
easy_match_df = helper.lists_to_dataframe(master_profiles, easy_candidates_match)
difficult_match_df = helper.lists_to_dataframe(master_profiles, difficult_candidates_match)
easy_non_match_df = helper.lists_to_dataframe(master_profiles, easy_candidates_non_match)
difficult_non_match_df = helper.lists_to_dataframe(master_profiles, difficult_candidates_non_match)

#Pair every frame with its class label (stored as a string)
#NOTE(review): matches are labelled '0' and non-matches '1' - confirm this is the convention the consumer expects
labelled_frames = (
    (easy_match_df, '0'),
    (difficult_match_df, '0'),
    (easy_non_match_df, '1'),
    (difficult_non_match_df, '1'),
)
for pair_df, pair_label in labelled_frames:
    pair_df['label'] = pair_label

#Give every row its own identifier from helper.generate_unique_id(), frame by frame
for pair_df, _pair_label in labelled_frames:
    pair_df['group_id'] = pair_df.apply(lambda _row: helper.generate_unique_id(), axis=1)

difficult_non_match_df.head()

Unnamed: 0,master_name,master_address,master_number,master_email,candidate_name,candidate_address,candidate_email,candidate_number,label,group_id
0,"{'FirstName': 'Teresa', 'SuffixName': '', 'Gen...","{'AddressLine1': '52755 Hinton Course', 'Addre...","{'Number': '(782)608-3632x023', 'Type': ''}","{'Email': 'teresa.hughes@yahoo.com', 'Type': ''}","{'FirstName': 'Teresa', 'LastName': 'Matthews'...","{'StateProvince': 'IL', 'City': 'Aurora', 'Cou...","{'Email': 'teresa.matthews@yahoo.com', 'Type':...","{'Number': '621-362-8964', 'Type': ''}",1,c409fd27-645d-48b5-aad6-7fa693e57457
1,"{'FirstName': 'Amanda', 'SuffixName': '', 'Gen...","{'AddressLine1': '4809 Colin Plaza Apt. 412', ...","{'Number': '563-485-7861x34379', 'Type': ''}","{'Email': 'amanda.davis@gmail.com', 'Type': ''}","{'FirstName': 'Amanda', 'LastName': 'Stewart',...","{'StateProvince': 'TX', 'City': 'San Antonio',...","{'Email': 'amanda.stewart@yahoo.com', 'Type': ''}","{'Number': '563-485-8851x37079', 'Type': ''}",1,669842ab-7508-4d09-8e5f-56d91e1ed969
2,"{'FirstName': 'Kimberly', 'SuffixName': '', 'G...","{'AddressLine1': '303 Butler Vista', 'AddressL...","{'Number': '001-204-563-8861x726', 'Type': ''}","{'Email': 'kimberly.anderson@gmail.com', 'Type...","{'FirstName': 'Julie', 'LastName': 'Anderson',...","{'StateProvince': 'FL', 'City': 'Orlando', 'Co...","{'Email': 'julie.anderson@gmail.com', 'Type': ''}","{'Number': '0000204-563-88619746', 'Type': ''}",1,c05ab3be-a04b-4903-a6e5-88b2084a0782
3,"{'FirstName': 'Mary', 'SuffixName': '', 'Gende...","{'AddressLine1': '478 Aaron Valley', 'AddressL...","{'Number': '456.287.0469', 'Type': ''}","{'Email': 'mary.blair@yahoo.com', 'Type': ''}","{'FirstName': 'Valerie', 'LastName': 'Blair', ...","{'StateProvince': 'NY', 'City': 'Syracuse', 'C...","{'Email': 'valerie.blair@icloud.com', 'Type': ''}","{'Number': '6599392340', 'Type': ''}",1,2b3ce813-9a05-4888-a1ea-d7b04e36e705
4,"{'FirstName': 'Kristina', 'SuffixName': '', 'G...","{'AddressLine1': '4092 Steven Villages', 'Addr...","{'Number': '+1-939-589-6803x014', 'Type': ''}","{'Email': 'kristina.baker@yahoo.com', 'Type': ''}","{'FirstName': 'Ashley', 'LastName': 'Baker', '...","{'StateProvince': 'NY', 'City': 'New York City...","{'Email': 'ashley.baker@icloud.com', 'Type': ''}","{'Number': '+14939-51956803x814', 'Type': ''}",1,777eefa9-294b-418c-aa7a-585d4869510a


In [7]:
#Stack the four pair DataFrames (easy/difficult matches, then easy/difficult non-matches).
#ignore_index=True renumbers the rows 0..n-1; without it each of the four
#frames keeps its own index, so every label would appear four times in final_df.
final_df = pd.concat(
    [easy_match_df, difficult_match_df, easy_non_match_df, difficult_non_match_df],
    ignore_index=True,
)
final_df.head()

Unnamed: 0,master_name,master_address,master_number,master_email,candidate_name,candidate_address,candidate_email,candidate_number,label,group_id
0,"{'FirstName': 'Teresa', 'SuffixName': '', 'Gen...","{'AddressLine1': '52755 Hinton Course', 'Addre...","{'Number': '(782)608-3632x023', 'Type': ''}","{'Email': 'teresa.hughes@yahoo.com', 'Type': ''}","{'FirstName': 'Teresa', 'LastName': 'Hughes', ...","{'StateProvince': 'IL', 'Country': '', 'Addres...","{'Email': '', 'Type': ''}","{'Number': '(782)608-3632x023', 'Type': ''}",0,e879c944-5b01-498e-b065-b9d73dbe5aab
1,"{'FirstName': 'Amanda', 'SuffixName': '', 'Gen...","{'AddressLine1': '4809 Colin Plaza Apt. 412', ...","{'Number': '563-485-7861x34379', 'Type': ''}","{'Email': 'amanda.davis@gmail.com', 'Type': ''}","{'FirstName': 'Amanda', 'LastName': 'Davis', '...","{'StateProvince': 'TX', 'Country': 'USA', 'Add...","{'Email': 'amanda.davis@gmail.com', 'Type': ''}","{'Number': '563-485-7861x34379', 'Type': ''}",0,8bafe723-7d62-4a9f-9de7-ebe8b9099222
2,"{'FirstName': 'Kimberly', 'SuffixName': '', 'G...","{'AddressLine1': '303 Butler Vista', 'AddressL...","{'Number': '001-204-563-8861x726', 'Type': ''}","{'Email': 'kimberly.anderson@gmail.com', 'Type...","{'FirstName': 'Kimberly', 'LastName': 'Anderso...","{'StateProvince': 'FL', 'Country': 'United Sta...","{'Email': 'kimberly.anderson@gmail.com', 'Type...","{'Number': '001-204-563-8861x726', 'Type': ''}",0,ada9466c-8abe-4c0f-b273-01b50c8d3cf5
3,"{'FirstName': 'Mary', 'SuffixName': '', 'Gende...","{'AddressLine1': '478 Aaron Valley', 'AddressL...","{'Number': '456.287.0469', 'Type': ''}","{'Email': 'mary.blair@yahoo.com', 'Type': ''}","{'FirstName': 'Mary', 'LastName': 'Blair', 'Ge...","{'StateProvince': 'NY', 'Country': 'USA', 'Add...","{'Email': '', 'Type': ''}","{'Number': '456.287.0469', 'Type': ''}",0,bc2da630-76ac-4430-b546-953f5730dd21
4,"{'FirstName': 'Kristina', 'SuffixName': '', 'G...","{'AddressLine1': '4092 Steven Villages', 'Addr...","{'Number': '+1-939-589-6803x014', 'Type': ''}","{'Email': 'kristina.baker@yahoo.com', 'Type': ''}","{'FirstName': 'Kristina', 'LastName': 'Baker',...","{'StateProvince': 'NY', 'Country': 'United Sta...","{'Email': '', 'Type': ''}","{'Number': '', 'Type': ''}",0,215c1291-49ab-40c8-8834-891369075597


In [8]:
#Flatten every master and candidate profile into a single text string:
#"<name parts>, <address parts>, <email>, <number>"
def _name_text(name):
    """Join prefix, first, middle, last and suffix name with single spaces."""
    return " ".join((name['Prefix'], name['FirstName'], name['MiddleName'], name['LastName'], name['SuffixName']))


def _address_text(address):
    """Join address line 1, city, state/province and country with single spaces."""
    return " ".join((address['AddressLine1'], address['City'], address['StateProvince'], address['Country']))


for side in ('master', 'candidate'):
    entity_text = (
        final_df[side + '_name'].apply(_name_text) + ", "
        + final_df[side + '_address'].apply(_address_text) + ", "
        + final_df[side + '_email'].apply(lambda email: email['Email']) + ", "
        + final_df[side + '_number'].apply(lambda number: number['Number'])
    )
    #Trim surrounding whitespace first, then pass the text through helper.clean_commas
    final_df[side + '_entity'] = entity_text.apply(str.strip).apply(helper.clean_commas)
final_df.head()

Unnamed: 0,master_name,master_address,master_number,master_email,candidate_name,candidate_address,candidate_email,candidate_number,label,group_id,master_entity,candidate_entity
0,"{'FirstName': 'Teresa', 'SuffixName': '', 'Gen...","{'AddressLine1': '52755 Hinton Course', 'Addre...","{'Number': '(782)608-3632x023', 'Type': ''}","{'Email': 'teresa.hughes@yahoo.com', 'Type': ''}","{'FirstName': 'Teresa', 'LastName': 'Hughes', ...","{'StateProvince': 'IL', 'Country': '', 'Addres...","{'Email': '', 'Type': ''}","{'Number': '(782)608-3632x023', 'Type': ''}",0,e879c944-5b01-498e-b065-b9d73dbe5aab,Dr. Teresa Y Hughes 52755 Hinton Course Aurora...,Teresa Yezk Hughes 52755 Hinton Course Aurora ...
1,"{'FirstName': 'Amanda', 'SuffixName': '', 'Gen...","{'AddressLine1': '4809 Colin Plaza Apt. 412', ...","{'Number': '563-485-7861x34379', 'Type': ''}","{'Email': 'amanda.davis@gmail.com', 'Type': ''}","{'FirstName': 'Amanda', 'LastName': 'Davis', '...","{'StateProvince': 'TX', 'Country': 'USA', 'Add...","{'Email': 'amanda.davis@gmail.com', 'Type': ''}","{'Number': '563-485-7861x34379', 'Type': ''}",0,8bafe723-7d62-4a9f-9de7-ebe8b9099222,Ms. Amanda C Davis 4809 Colin Plaza Apt. 412 S...,Amanda C Davis 4809 Colin Plaza Apt. 412 San A...
2,"{'FirstName': 'Kimberly', 'SuffixName': '', 'G...","{'AddressLine1': '303 Butler Vista', 'AddressL...","{'Number': '001-204-563-8861x726', 'Type': ''}","{'Email': 'kimberly.anderson@gmail.com', 'Type...","{'FirstName': 'Kimberly', 'LastName': 'Anderso...","{'StateProvince': 'FL', 'Country': 'United Sta...","{'Email': 'kimberly.anderson@gmail.com', 'Type...","{'Number': '001-204-563-8861x726', 'Type': ''}",0,ada9466c-8abe-4c0f-b273-01b50c8d3cf5,Dr. Kimberly V Anderson 303 Butler Vista Orlan...,Miss. Kimberly V Anderson 303 Butler Vista Orl...
3,"{'FirstName': 'Mary', 'SuffixName': '', 'Gende...","{'AddressLine1': '478 Aaron Valley', 'AddressL...","{'Number': '456.287.0469', 'Type': ''}","{'Email': 'mary.blair@yahoo.com', 'Type': ''}","{'FirstName': 'Mary', 'LastName': 'Blair', 'Ge...","{'StateProvince': 'NY', 'Country': 'USA', 'Add...","{'Email': '', 'Type': ''}","{'Number': '456.287.0469', 'Type': ''}",0,bc2da630-76ac-4430-b546-953f5730dd21,Dr. Mary H Blair 478 Aaron Valley Syracuse NY ...,Mary Hxto Blair 478 Aaron Valley Syracuse NY U...
4,"{'FirstName': 'Kristina', 'SuffixName': '', 'G...","{'AddressLine1': '4092 Steven Villages', 'Addr...","{'Number': '+1-939-589-6803x014', 'Type': ''}","{'Email': 'kristina.baker@yahoo.com', 'Type': ''}","{'FirstName': 'Kristina', 'LastName': 'Baker',...","{'StateProvince': 'NY', 'Country': 'United Sta...","{'Email': '', 'Type': ''}","{'Number': '', 'Type': ''}",0,215c1291-49ab-40c8-8834-891369075597,Kristina Y Baker 4092 Steven Villages New York...,Mrs. Kristina Y Baker 4092 Steven Villages NY...


In [9]:
#Pull the two entity text columns out as single-column frames sharing the
#column name 'entity', then stack master rows on top of candidate rows
master_entities = final_df[['master_entity']].rename(columns={'master_entity': 'entity'})
candidate_entities = final_df[['candidate_entity']].rename(columns={'candidate_entity': 'entity'})
all_entities = pd.concat([master_entities, candidate_entities])
all_entities.head()

Unnamed: 0,entity
0,Dr. Teresa Y Hughes 52755 Hinton Course Aurora...
1,Ms. Amanda C Davis 4809 Colin Plaza Apt. 412 S...
2,Dr. Kimberly V Anderson 303 Butler Vista Orlan...
3,Dr. Mary H Blair 478 Aaron Valley Syracuse NY ...
4,Kristina Y Baker 4092 Steven Villages New York...


In [10]:
#Save the entity data (single 'entity' column, index not written).
#Create the output directory first so the write cannot fail on a missing
#folder after the long generation run above.
entity_output_path = 'datasets/entity_pairs.csv'
os.makedirs(os.path.dirname(entity_output_path), exist_ok=True)
all_entities.to_csv(entity_output_path, index = False)