In [1]:
#Import libraries
import pandas as pd
import warnings
import numpy as np
from faker import Faker
import random
import string
import re
from copy import deepcopy
from nltk.corpus import words
import json
import ast
import uuid
from synthetic import SyntheticHelper
from match_type import MatchType
from defines import *
warnings.filterwarnings('ignore')

In [2]:
#Seed the random sources this notebook draws from so the generated profiles are
#repeatable across runs (change SEED to draw a different dataset)
SEED = 42
random.seed(SEED)   # stdlib `random` drives the random.choice(...) calls in the data cell
Faker.seed(SEED)    # seeds the generator shared by Faker instances; done before any fake.* call
#NOTE(review): SyntheticHelper/MatchType may use their own random sources - confirm they are covered by the seeds above

#Create the project helper and a Faker instance
helper = SyntheticHelper()
fake = Faker()

In [3]:
#Data required for ER Dataset
#Number of raw name records drawn per gender (duplicates are dropped afterwards)
NAMES_PER_GENDER = 10000

#Generate NAMES_PER_GENDER random name records for each of female and male
female_names = [{'FirstName': fake.first_name_female(), 'SuffixName': '', 'Gender': random.choice(GENDER_FEMALE), 'LastName': fake.last_name(),
                 'Prefix': random.choice(PREFIX_FEMALE), 'MiddleName': random.choice(MIDDLE_NAME)} for _ in range(NAMES_PER_GENDER)]
female_names_values = [x['FirstName']+x['LastName'] for x in female_names]
male_names = [{'FirstName': fake.first_name_male(), 'SuffixName': random.choice(MALE_SUFFIXES), 'Gender': random.choice(GENDER_MALE),
                'LastName': fake.last_name(), 'Prefix': random.choice(PREFIX_MALE), 'MiddleName': random.choice(MIDDLE_NAME)} for _ in range(NAMES_PER_GENDER)]
male_names_values = [x['FirstName']+x['LastName'] for x in male_names]

all_names = female_names + male_names
all_names_values = female_names_values + male_names_values

#Drop records whose FirstName+LastName key was already seen, keeping the first occurrence.
#A set gives constant-time membership checks; the lists keep the original ordering.
new_names = []            # unique FirstName+LastName keys, in first-seen order
indexes_to_remove = []    # positions in all_names that are duplicates
all_names_filter = []     # de-duplicated name records, aligned with new_names
seen_names = set()
for i, (name, name_value) in enumerate(zip(all_names, all_names_values)):
    if name_value in seen_names:
        indexes_to_remove.append(i)
    else:
        seen_names.add(name_value)
        new_names.append(name_value)
        all_names_filter.append(name)

#First-name pools are built from the raw (pre-deduplication) records
female_name_list = [name['FirstName'] for name in female_names]
male_name_list = [name['FirstName'] for name in male_names]

#Generate one address per de-duplicated name record
addresses = [helper.generate_address(CITIES) for _ in range(len(all_names_filter))]

#Generate one phone number per de-duplicated name record
numbers = [{'Number': fake.phone_number(), 'Type':""} for _ in range(len(all_names_filter))]

#Generate one email per de-duplicated name record.
#The email must be built from all_names_filter (not all_names): once a duplicate has been
#dropped the two lists are no longer index-aligned, and the email would belong to another person.
all_emails = [{'Email': helper.generate_email(name['FirstName'], name['LastName']), 'Type':""} for name in all_names_filter]

#Every profile component must line up position by position
assert len(all_names_filter) == len(addresses) == len(numbers) == len(all_emails)

#Create list of master_profiles
master_profiles = [all_names_filter, addresses, numbers, all_emails]

In [4]:
#Instantiate the match type creator.
#It is given the master profiles (names, addresses, numbers, emails), the female and male
#first-name pools, and the prefix / suffix / middle-name / city constants.
#NOTE(review): presumably these pools are used to alter fields when building candidates - confirm in match_type.py
match_creator = MatchType(master_profiles, female_name_list, male_name_list, PREFIX_FEMALE, PREFIX_MALE, MALE_SUFFIXES, MIDDLE_NAME, CITIES)

In [5]:
#Create easy and difficult matches and non-matches.
#Each call produces the candidate data for one scenario; the next cell pairs every result
#with master_profiles through helper.lists_to_dataframe.
#NOTE(review): these calls presumably consume random state, so their order may affect the output - keep it stable
easy_candidates_match = match_creator.create_easy_match()
difficult_candidates_match = match_creator.create_difficult_match()
easy_candidates_non_match = match_creator.create_easy_non_match()
difficult_candidates_non_match = match_creator.create_difficult_non_match()

In [6]:
#Build one master/candidate DataFrame per scenario (same order as the candidates were created)
easy_match_df, difficult_match_df, easy_non_match_df, difficult_non_match_df = (
    helper.lists_to_dataframe(master_profiles, candidates)
    for candidates in (easy_candidates_match, difficult_candidates_match,
                       easy_candidates_non_match, difficult_candidates_non_match)
)

#Tag every row with its class: matches carry the string '0', non-matches the string '1'
for frame, label_value in ((easy_match_df, '0'), (difficult_match_df, '0'),
                           (easy_non_match_df, '1'), (difficult_non_match_df, '1')):
    frame['label'] = label_value

#Give every row its own identifier obtained from helper.generate_unique_id()
for frame in (easy_match_df, difficult_match_df, easy_non_match_df, difficult_non_match_df):
    frame['group_id'] = frame.apply(lambda _: helper.generate_unique_id(), axis=1)

difficult_non_match_df.head()

Unnamed: 0,master_name,master_address,master_number,master_email,candidate_name,candidate_address,candidate_email,candidate_number,label,group_id
0,"{'FirstName': 'Monica', 'SuffixName': '', 'Gen...","{'AddressLine1': '1377 Allen Greens', 'Address...","{'Number': '001-663-397-6541x73893', 'Type': ''}","{'Email': 'monica.morris@icloud.com', 'Type': ''}","{'FirstName': 'Chelsey', 'LastName': 'Morris',...","{'StateProvince': 'CA', 'City': 'Sacramento', ...","{'Email': 'chelsey.morris@icloud.com', 'Type':...","{'Number': '001-663-317-1441x73813', 'Type': ''}",1,d1d546fb-fcb0-4fba-81da-25e4540a5fe4
1,"{'FirstName': 'Amanda', 'SuffixName': '', 'Gen...","{'AddressLine1': '816 Alexander Run', 'Address...","{'Number': '765-349-9142x711', 'Type': ''}","{'Email': 'amanda.edwards@icloud.com', 'Type':...","{'FirstName': 'Suzanne', 'LastName': 'Edwards'...","{'StateProvince': 'CA', 'City': 'San Francisco...","{'Email': 'suzanne.edwards@gmail.com', 'Type':...","{'Number': '001-384-356-1569', 'Type': ''}",1,351c4100-7d43-4660-a365-22e030c7869a
2,"{'FirstName': 'Sarah', 'SuffixName': '', 'Gend...","{'AddressLine1': '2063 Chang View Suite 872', ...","{'Number': '(986)488-1230x91051', 'Type': ''}","{'Email': 'sarah.townsend@yahoo.com', 'Type': ''}","{'FirstName': 'Michelle', 'LastName': 'Townsen...","{'StateProvince': 'TX', 'City': 'Fort Worth', ...","{'Email': 'michelle.townsend@yahoo.com', 'Type...","{'Number': '(786)584-2674x4245', 'Type': ''}",1,35329f2b-bf80-40d2-a0e7-2f66ebee70a7
3,"{'FirstName': 'Jasmine', 'SuffixName': '', 'Ge...","{'AddressLine1': '630 Bean Ford Apt. 762', 'Ad...","{'Number': '001-297-768-7656', 'Type': ''}","{'Email': 'jasmine.sandoval@yahoo.com', 'Type'...","{'FirstName': 'Jasmine', 'LastName': 'Sawyer',...","{'StateProvince': 'NY', 'City': 'Rochester', '...","{'Email': 'jasmine.sawyer@gmail.com', 'Type': ''}","{'Number': '964.903.2461x723', 'Type': ''}",1,d66eb13e-097e-4afd-ada5-1f57272392a1
4,"{'FirstName': 'Meghan', 'SuffixName': '', 'Gen...","{'AddressLine1': '170 Sheena Crest Apt. 658', ...","{'Number': '727-381-0702x1967', 'Type': ''}","{'Email': 'meghan.jones@gmail.com', 'Type': ''}","{'FirstName': 'Jennifer', 'LastName': 'Jones',...","{'StateProvince': 'NY', 'City': 'Rochester', '...","{'Email': 'jennifer.jones@yahoo.com', 'Type': ''}","{'Number': '(609)316-2332', 'Type': ''}",1,894bf4c9-28c7-45a3-940a-d0aeff6d1e13


In [7]:
#Stack the four scenario frames into a single frame (row indexes are kept as-is)
scenario_frames = [easy_match_df, difficult_match_df, easy_non_match_df, difficult_non_match_df]
final_df = pd.concat(scenario_frames)

#Serialise every cell to a JSON string; plain strings such as label and group_id become quoted JSON strings
for column in final_df.columns:
    final_df[column] = final_df[column].map(json.dumps)
final_df.head()

Unnamed: 0,master_name,master_address,master_number,master_email,candidate_name,candidate_address,candidate_email,candidate_number,label,group_id
0,"{""FirstName"": ""Monica"", ""SuffixName"": """", ""Gen...","{""AddressLine1"": ""1377 Allen Greens"", ""Address...","{""Number"": ""001-663-397-6541x73893"", ""Type"": """"}","{""Email"": ""monica.morris@icloud.com"", ""Type"": """"}","{""FirstName"": ""Monica"", ""LastName"": ""Morris"", ...","{""StateProvince"": ""CA"", ""Country"": ""United Sta...","{""Email"": """", ""Type"": """"}","{""Number"": ""001-663-397-6541x73893"", ""Type"": """"}","""0""","""aeaaf33c-08a7-48f7-986c-08ea3bde0a7a"""
1,"{""FirstName"": ""Amanda"", ""SuffixName"": """", ""Gen...","{""AddressLine1"": ""816 Alexander Run"", ""Address...","{""Number"": ""765-349-9142x711"", ""Type"": """"}","{""Email"": ""amanda.edwards@icloud.com"", ""Type"":...","{""FirstName"": ""Amanda"", ""LastName"": ""Edwards"",...","{""StateProvince"": ""CA"", ""Country"": ""United Sta...","{""Email"": """", ""Type"": """"}","{""Number"": ""765-349-9142x711"", ""Type"": """"}","""0""","""a8f9042b-9f36-414c-b780-30cbe7299edd"""
2,"{""FirstName"": ""Sarah"", ""SuffixName"": """", ""Gend...","{""AddressLine1"": ""2063 Chang View Suite 872"", ...","{""Number"": ""(986)488-1230x91051"", ""Type"": """"}","{""Email"": ""sarah.townsend@yahoo.com"", ""Type"": """"}","{""FirstName"": ""Sarah"", ""LastName"": ""Townsend"",...","{""StateProvince"": ""TX"", ""Country"": """", ""Addres...","{""Email"": """", ""Type"": """"}","{""Number"": """", ""Type"": """"}","""0""","""31575c74-2bb7-42ce-8b45-e8537484e861"""
3,"{""FirstName"": ""Jasmine"", ""SuffixName"": """", ""Ge...","{""AddressLine1"": ""630 Bean Ford Apt. 762"", ""Ad...","{""Number"": ""001-297-768-7656"", ""Type"": """"}","{""Email"": ""jasmine.sandoval@yahoo.com"", ""Type""...","{""FirstName"": ""Jasmine"", ""LastName"": ""Sandoval...","{""StateProvince"": ""NY"", ""Country"": ""USA"", ""Add...","{""Email"": """", ""Type"": """"}","{""Number"": ""001-297-768-7656"", ""Type"": """"}","""0""","""a40356a3-baa4-4616-a16c-ce3d0aa5f29a"""
4,"{""FirstName"": ""Meghan"", ""SuffixName"": """", ""Gen...","{""AddressLine1"": ""170 Sheena Crest Apt. 658"", ...","{""Number"": ""727-381-0702x1967"", ""Type"": """"}","{""Email"": ""meghan.jones@gmail.com"", ""Type"": """"}","{""FirstName"": ""Meghan"", ""LastName"": ""Jones"", ""...","{""StateProvince"": ""NY"", ""Country"": ""United Sta...","{""Email"": ""meghan.jones@gmail.com"", ""Type"": """"}","{""Number"": """", ""Type"": """"}","""0""","""c4face23-0162-4aac-b0aa-7377d310f91c"""
