### Simulate entities for matching

Simulate 2 datasets: 

* a "baseline" dataset, for matching a dataset with itself. We are using the trivial dataset for end-to-end smoke tests.
* a regular dataset that simulates the kinds of mismatches in entity data that occur across datasets

In [20]:
from faker import Faker
import random
import json
import pandas as pd
import numpy as np
import re
import datetime as dt 

Fields to simulate:
* person internal_id
* service date
* person first name(s), last name(s), middle name(s)
* person dob
* person ssn
* person address st, city, postal code, state (always CA)
* person phone number

------------------------------------------------------------------------------------------------------

#### Trivial simulation

In [2]:
def create_persons(n_people):
    """Simulate the "trivial" list of people used for self-matching.

    Args:
        n_people: Number of person records to generate.

    Returns:
        A list of ``n_people`` dicts with the keys ``person_id``,
        ``service_date``, ``first_name``, ``middle_name``, ``last_name``,
        ``dob``, ``ssn``, ``street_address``, ``city``, ``zip``, ``state``
        and ``phone``. ``person_id`` starts at 1000 and increases by one
        per record; ``state`` is always ``"CA"``.
    """
    faker = Faker()
    today = dt.date.today()
    persons = []
    for p in range(n_people):
        # Service happened between 30 and 500 days ago.
        service_date = today - dt.timedelta(days=random.randint(30, 500))
        persons.append({
            "person_id": p + 1000,
            "service_date": service_date,
            "first_name": faker.first_name(),
            "middle_name": faker.first_name(),
            "last_name": faker.last_name(),
            # Person is 18 to 60 "years" (counted as 366 days each) old at
            # the service date.
            "dob": service_date - dt.timedelta(days=random.randint(366*18, 366*60)),
            "ssn": faker.ssn(),
            # street_address() yields only the street line; address() would
            # also embed a city, state and postcode that disagree with the
            # separate fields below.
            "street_address": faker.street_address(),
            "city": faker.city(),
            "zip": faker.postcode(),
            "state": "CA",
            "phone": faker.phone_number(),
        })

    return persons
    

In [4]:
#create_persons(10)

#### Non-trivial simulation

Here, we create the same person twice, with mismatching information. The output will be two "people lists": ListX and ListY

Possible mismatches:
* two last names in ListY, only one last name in ListX
* middle name as first name
* address changed
* phone number changed
* typo in names
* mismatching special characters in names (e.g. Smith-Lewis vs Smith Lewis)
* shortened names or nicknames instead of full names (e.g. Kat vs Katherine)

Thanks to [this project](https://github.com/carltonnorthern/nickname-and-diminutive-names-lookup) we have a csv of names and similar names to draw from for the simulation

In [None]:
# TODO: be smarter about this function
def insert_typo(name):
    """Return ``name`` with a simple, deterministic typo applied.

    Every lowercase "i" becomes "y"; if the name has no "i", every
    lowercase "o" becomes "p" instead. Names containing neither letter
    are returned unchanged.
    """
    substitutions = (("i", "y"), ("o", "p"))
    for letter, typo in substitutions:
        # Only the first matching rule is applied.
        if letter in name:
            return name.replace(letter, typo)
    return name

In [16]:
# Load the nickname lookup. Column 0 holds one comma-separated entry per row
# ("name,nickname1,nickname2,..."); it is split below into the canonical
# first name and the list of its nicknames.
names = pd.read_csv('names_lookup.csv', header=None, delimiter=";")
names["first"] = [entry.split(",")[0] for entry in names[0]]
names["nicknames"] = [entry.split(",")[1:] for entry in names[0]]
names.head(5)

Unnamed: 0,0,first,nicknames
0,"aaron,erin,ronnie,ron",aaron,"[erin, ronnie, ron]"
1,"abbigail,nabby,abby,gail,abbi,abbey",abbigail,"[nabby, abby, gail, abbi, abbey]"
2,"abednego,bedney",abednego,[bedney]
3,"abel,ebbie,ab,abe,eb",abel,"[ebbie, ab, abe, eb]"
4,"abiel,ab",abiel,[ab]


In [18]:
# Row count of the nickname lookup table; create_persons_datasets reads this
# module-level value when sampling a random lookup row.
count_names_lookup = len(names)

In [63]:
def create_persons_datasets(n_people, *, perc_first_name_mismatch=0.5,
                            perc_double_last_name_mismatch=0.2):
    """Simulate the same people twice, with mismatching name information.

    Relies on the module-level ``names`` lookup table (columns ``first`` and
    ``nicknames``) and on ``count_names_lookup``, its row count.

    Args:
        n_people: Number of people to simulate.
        perc_first_name_mismatch: Probability that the first name is drawn
            from the nickname lookup, so that the first list holds the
            lookup name and the second list one of its nicknames.
        perc_double_last_name_mismatch: Probability that the record in the
            second list gets an additional, second last name.

    Returns:
        A tuple ``(persons_x, persons_y)`` of two equally long lists of
        dicts. Records at the same position describe the same person and
        share ``person_id``, ``service_date``, ``middle_name``, ``dob`` and
        ``ssn``; ``first_name`` and ``last_name`` may differ.
    """
    # TODO: middle-name-as-first-name (planned rate 0.1) and typos in names
    # (planned rate 0.3) are not simulated yet.
    # TODO: simulate street_address, city, zip, state ("CA") and phone,
    # including changed addresses and phone numbers.
    persons_x = []
    persons_y = []
    faker = Faker()
    today = dt.date.today()
    for p in range(n_people):
        # Service happened between 30 and 500 days ago.
        service_date = today - dt.timedelta(days=random.randint(30, 500))

        if random.random() < perc_first_name_mismatch:
            # Sample a lookup row, then one of that row's nicknames.
            # randrange excludes its upper bound, so the position is always
            # a valid .iloc index.
            row_pos = random.randrange(count_names_lookup)
            first_name_x = names["first"].iloc[row_pos]
            nicknames = names["nicknames"].iloc[row_pos]
            # A lookup row without nicknames cannot produce a mismatch.
            first_name_y = random.choice(nicknames) if nicknames else first_name_x
        else:
            first_name_x = faker.first_name()
            first_name_y = first_name_x

        last_name_x = faker.last_name()
        last_name_y = last_name_x
        if random.random() < perc_double_last_name_mismatch:
            last_name_y = last_name_x + " " + faker.last_name()

        person_x = {
            "person_id": p + 1000,
            "service_date": service_date,
            "first_name": first_name_x,
            "middle_name": faker.first_name(),
            "last_name": last_name_x,
            # Person is 18 to 60 "years" (counted as 366 days each) old at
            # the service date.
            "dob": service_date - dt.timedelta(days=random.randint(366*18, 366*60)),
            "ssn": faker.ssn(),
        }
        # Same person, same key order; only the name fields may differ.
        person_y = dict(person_x, first_name=first_name_y, last_name=last_name_y)

        persons_x.append(person_x)
        persons_y.append(person_y)

    return persons_x, persons_y





### Example dataset creation:

In [64]:
# Generate two paired lists of 6 simulated people each.
people1, people2 = create_persons_datasets(6)

In [65]:
# Turn the first list of person dicts into a table (one row per person)
# and display it.
people1 = pd.DataFrame(people1)
people1

Unnamed: 0,person_id,service_date,first_name,middle_name,last_name,dob,ssn
0,1000,2020-09-02,christiano,Allen,Marquez,1967-05-29,119-77-5956
1,1001,2020-11-20,bert,William,Wheeler,2001-11-30,222-19-9745
2,1002,2021-10-01,dominic,Douglas,Gonzalez,1986-07-05,032-60-2115
3,1003,2020-09-24,leonore,Grant,Fuller,1965-08-24,793-29-7108
4,1004,2021-02-19,Jeffrey,Eileen,Kramer,1996-02-21,097-96-0399
5,1005,2020-11-16,Patricia,Amber,Taylor,2000-11-24,791-92-0488


In [66]:
# Turn the second list of person dicts into a table (one row per person)
# and display it.
people2 = pd.DataFrame(people2)
people2

Unnamed: 0,person_id,service_date,first_name,middle_name,last_name,dob,ssn
0,1000,2020-09-02,chris,Allen,Marquez,1967-05-29,119-77-5956
1,1001,2020-11-20,bob,William,Wheeler Odonnell,2001-11-30,222-19-9745
2,1002,2021-10-01,dom,Douglas,Gonzalez Reed,1986-07-05,032-60-2115
3,1003,2020-09-24,nora,Grant,Fuller,1965-08-24,793-29-7108
4,1004,2021-02-19,Jeffrey,Eileen,Kramer,1996-02-21,097-96-0399
5,1005,2020-11-16,Patricia,Amber,Taylor Anderson,2000-11-24,791-92-0488


In [68]:
# Write the first simulated dataset to CSV, leaving out the DataFrame index.
people1.to_csv('../simulated_data/dataset_1.csv', index = False)

In [69]:
# Write the second simulated dataset to CSV, leaving out the DataFrame index.
people2.to_csv('../simulated_data/dataset_2.csv', index = False)