# Match People

In [1]:
import pandas as pd
from person_matching_functions import *
from person_matcher import PersonMatcher

In [2]:
# Input files: the 1821 padron (census) and the baptism records, both CSV.
census_csv = 'data/padron_1821.csv'
baptisms_csv = 'data/Baptisms.csv'

census = pd.read_csv(census_csv)
baptisms = pd.read_csv(baptisms_csv)

In [3]:
# Discard rows in which every column is missing; rows holding at least one
# value are kept. The drop happens in place, so the surviving rows keep their
# original index labels (the index is not reset).
for frame in (census, baptisms):
    frame.dropna(how='all', inplace=True)

In [4]:
# Number the remaining rows of each table 1..N; these two columns are the id
# columns named in the matcher config ('ecpp_id_col' / 'records_id_col').
# NOTE(review): if Baptisms.csv already contains a '#ID' column, this
# assignment overwrites it with sequential numbers - confirm that is intended.
census['ecpp_id'] = range(1, census.shape[0] + 1)
baptisms['#ID'] = range(1, baptisms.shape[0] + 1)

In [5]:
# Padrones: combine the paternal and maternal surnames into a single
# 'Ego_Last Name' column. Missing parts are treated as empty strings, and
# strip() removes the leftover separator space when either part is absent.
paternal_surname = census['Ego_Paternal Last Name'].fillna('')
maternal_surname = census['Ego_ Maternal Last Name'].fillna('')
census['Ego_Last Name'] = (paternal_surname + ' ' + maternal_surname).str.strip()

In [6]:
# List the census column names. Several headers carry quirks from the source
# file (e.g. the space in 'Ego_ Maternal Last Name' and the spelling
# 'Huband_Paternal Last name'), so copy names from this output verbatim.
census.columns

Index(['Ego_First Name', 'Ego_Paternal Last Name', 'Ego_ Maternal Last Name',
       '(Color) Race', 'Sex', 'Birth Year (Est.)', 'Father_First Name',
       'Father_Paternal Last Name', 'Mother_First Name',
       'Mother_Paternal Last Name', 'Husband_First Name',
       'Huband_Paternal Last name', 'Wife_First Name',
       'Wife_ Paternal Last Name', 'Unnamed: 14', 'Status', 'Unnamed: 16',
       'ecpp_id', 'Ego_Last Name'],
      dtype='object')

In [7]:
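# Column mapping for the 1790 census (currently disabled; this run uses the padrones config defined in the next cell)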
# config = {
#     'ecpp_id_col': 'ecpp_id',
#     'records_id_col': '#ID',
#     'census': {
#         'First Name': 'First',
#         'Last Name': 'Last',
#         'Gender': 'Gender',
#         'Age': 'Age',
#     },
#     'baptisms': {
#         'First Name': 'SpanishName',
#         'Last Name': 'Surname',
#         'Mother First Name': 'MSpanishName',
#         'Mother Last Name': 'MSurname',
#         'Father First Name': 'FSpanishName',
#         'Father Last Name': 'FSurname',
#         'Gender': 'Sex',
#         'Age': 'Age',
#     }
# }

In [8]:
# Padrones (1821 census): column mapping handed to PersonMatcher.
# NOTE(review): presumably each left-hand key is a field name the matcher
# understands and each value is the matching column in the DataFrame -
# confirm against PersonMatcher.

# Census columns: 'Ego_Last Name' is the combined surname column built earlier.
census_columns = {
    'First Name': 'Ego_First Name',
    'Last Name': 'Ego_Last Name',
    'Gender': 'Sex',
    # 'Age': 'Age',
}

# Baptism columns, including the parents' names.
baptism_columns = {
    'First Name': 'SpanishName',
    'Last Name': 'Surname',
    'Mother First Name': 'MSpanishName',
    'Mother Last Name': 'MSurname',
    'Father First Name': 'FSpanishName',
    'Father Last Name': 'FSurname',
    'Gender': 'Sex',
    # 'Age': 'Age',
}

config = {
    'ecpp_id_col': 'ecpp_id',
    'records_id_col': '#ID',
    'census': census_columns,
    'baptisms': baptism_columns,
}

In [9]:
# Build the matcher from the two cleaned tables and the column mapping.
matcher = PersonMatcher(
    census=census,
    baptisms=baptisms,
    config=config,
)

In [10]:
%%time
# Build the table of candidate census/baptism pairs that will be scored.
# NOTE(review): the preview in the next cell shows row labels running up to
# ~35.4 million, which looks like every census row paired with every baptism
# row - confirm, and keep memory use in mind.
matched_results = matcher.create_matched_records()

CPU times: total: 15.5 s
Wall time: 16.9 s


In [11]:
# Preview the candidate pairs; pandas abbreviates the display of a frame this
# large to its first and last rows.
matched_results

Unnamed: 0,ecpp_id,#ID,Census_Ego_First Name,Census_Ego_Last Name,Census_Sex,Baptisms_SpanishName,Baptisms_Surname,Baptisms_MSpanishName,Baptisms_MSurname,Baptisms_FSpanishName,Baptisms_FSurname,Baptisms_Sex
0,1,1,Anastacio,Carrillo Lugo,M,Juan Joseph,Salazar,Maria del Loreto,Espinosa,José Loreto Salazar,,M
1,1,2,Anastacio,Carrillo Lugo,M,Joseph Ignacio Matheo,Velasco y Lara,Maria Antonia Euvalda,Campos,Joseph Fernando,Velasco y Lara,M
2,1,3,Anastacio,Carrillo Lugo,M,Maria Estefana Valvanera,Villa,Maria Paula,Martines,Juan Joseph,Villa,F
3,1,4,Anastacio,Carrillo Lugo,M,Maria Antonia,Quixada,Ju[a]na Maria,Armenta,Vicente,Quixada,F
4,1,5,Anastacio,Carrillo Lugo,M,Juan Pedro Jacinto,Ruiz,Maria Isabel,Armenta,Fructuoso Maria,Ruiz,M
...,...,...,...,...,...,...,...,...,...,...,...,...
35388895,340,104081,Mariano,Olivera Reyes,M,José Agustin,Ruis,Francisca,Ruis,[Unstated],,M
35388896,340,104082,Mariano,Olivera Reyes,M,José Antonio,Higuera,Maria Loreta,,Pantaleon,Higuera,M
35388897,340,104083,Mariano,Olivera Reyes,M,Maria Santa,Soto,Maria Rita,Arrieta,Juan Ysidoro,Soto,F
35388898,340,104084,Mariano,Olivera Reyes,M,Felipe,,Gregoria,,[Unstated],,M


In [12]:
%%time
# Run the matching step. In the run recorded here it took about 2 h 21 min of
# wall time, so avoid re-running this cell unnecessarily.
# NOTE(review): this appears to be what adds the *_Match_Score columns visible
# in matcher.matched_records further down - confirm in PersonMatcher.
matcher.match()
# TODO: plot the distribution of match scores

CPU times: total: 2h 17min 54s
Wall time: 2h 21min 19s


In [13]:
# Output path for the scored matches of the 1821 census; it is passed to
# matcher.save_matched_records() below.
filename = "matches/1821_census_matches.pkl"

In [14]:
# Inspect the scored pairs: the candidate columns plus the per-field
# *_Match_Score columns and the Direct/Mother/Father total scores.
matcher.matched_records

Unnamed: 0,ecpp_id,#ID,Census_Ego_First Name,Census_Ego_Last Name,Census_Sex,Baptisms_SpanishName,Baptisms_Surname,Baptisms_MSpanishName,Baptisms_MSurname,Baptisms_FSpanishName,...,First_Name_Match_Score,Last_Name_Match_Score,Mother_First_Name_Match_Score,Mother_Last_Name_Match_Score,Father_First_Name_Match_Score,Father_Last_Name_Match_Score,Gender_Match_Score,Direct_Total_Match_Score,Mother_Total_Match_Score,Father_Total_Match_Score
0,1,1,Anastacio,Carrillo Lugo,M,Juan Joseph,Salazar,Maria del Loreto,Espinosa,José Loreto Salazar,...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1
1,1,2,Anastacio,Carrillo Lugo,M,Joseph Ignacio Matheo,Velasco y Lara,Maria Antonia Euvalda,Campos,Joseph Fernando,...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1
2,1,3,Anastacio,Carrillo Lugo,M,Maria Estefana Valvanera,Villa,Maria Paula,Martines,Juan Joseph,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0,0.000000,0.000000,0.0
3,1,4,Anastacio,Carrillo Lugo,M,Maria Antonia,Quixada,Ju[a]na Maria,Armenta,Vicente,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0,0.000000,0.000000,0.0
4,1,5,Anastacio,Carrillo Lugo,M,Juan Pedro Jacinto,Ruiz,Maria Isabel,Armenta,Fructuoso Maria,...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35388895,340,104081,Mariano,Olivera Reyes,M,José Agustin,Ruis,Francisca,Ruis,[Unstated],...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1
35388896,340,104082,Mariano,Olivera Reyes,M,José Antonio,Higuera,Maria Loreta,,Pantaleon,...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1
35388897,340,104083,Mariano,Olivera Reyes,M,Maria Santa,Soto,Maria Rita,Arrieta,Juan Ysidoro,...,0.101603,0.0,0.071397,0.0,0.0,0.0,0,0.045721,0.032129,0.0
35388898,340,104084,Mariano,Olivera Reyes,M,Felipe,,Gregoria,,[Unstated],...,0.000000,0.0,0.000000,0.0,0.0,0.0,1,0.100000,0.100000,0.1


In [15]:
# Persist the scored matches to `filename`.
# NOTE(review): presumably written as a pickle (given the .pkl extension), and
# the target directory presumably has to exist already - confirm in
# PersonMatcher.save_matched_records.
matcher.save_matched_records(filename)

In [None]:
# Optional follow-up (this cell has not been executed): filter the matches by score.
# threshold_value = 0.75
# matcher.filter_records_by_score(threshold_value)