In [1]:
import numpy as np
import pandas as pd
from person_matching_functions import *

## Preprocessing

In [2]:
def _feature_weights(name_prefix=''):
    """Build a feature -> weight mapping for one matcher.

    `name_prefix` is prepended to the two name-score features (e.g.
    'Mother ' or 'Father '); the gender and age features are shared by
    every matcher. The four weights nominally sum to 1.0.
    """
    return {
        f'{name_prefix}First Name Match Score': 0.20,
        f'{name_prefix}Last Name Match Score': 0.35,
        'Gender Match Score': 0.30,
        'Age Match Score': 0.15,
    }


# One weighting scheme per matcher: the person themself, their mother, their father.
direct_feature_weights = _feature_weights()

mother_feature_weights = _feature_weights('Mother ')

father_feature_weights = _feature_weights('Father ')

In [3]:
# Alternative weighting scheme for data without an age feature (currently disabled):
# direct_feature_weights = {
#     'First Name Match Score': 0.25,
#     'Last Name Match Score': 0.40,
#     'Gender Match Score': 0.35
#     # 'Age Match Score': 0.15
# }

# mother_feature_weights = {
#     'Mother First Name Match Score': 0.25,
#     'Mother Last Name Match Score': 0.35,
#     'Gender Match Score': 0.4
#     # 'Age Match Score': 0.15,
# }

# father_feature_weights = {
#     'Father First Name Match Score': 0.25,
#     'Father Last Name Match Score': 0.35,
#     'Gender Match Score': 0.4
#     # 'Age Match Score': 0.15,
# }

In [4]:
# Load the pickled records frame that every later cell works on.
# (The file name suggests pre-matched candidate pairs for the 1781 padrón — confirm.)
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (ideally an earlier stage of this same pipeline).
df = pd.read_pickle('pickles/matched_records_padron_1781.pkl')

In [19]:
# Show the column labels of the working frame.
df.columns

In [6]:
# Largest classifier value still counted as a name match, and largest
# 'Age Match Range' value still counted as an age match.
NAME_CLASSIFIER_MAX = 2
AGE_RANGE_MAX = 3


def _name_match_score(classifier, *required_names):
    """Return a 0/1 integer array flagging rows whose name classifier passes.

    Parameters
    ----------
    classifier : pandas.Series
        Name-match classifier column; values <= NAME_CLASSIFIER_MAX pass.
    *required_names : pandas.Series
        Name columns that must all be non-null for the row to pass.

    Returns
    -------
    numpy.ndarray
        1 where the classifier passes and every required name is present,
        otherwise 0. A missing (NaN) classifier never passes, because NaN
        fails the `<=` comparison.
    """
    passes = classifier <= NAME_CLASSIFIER_MAX
    for names in required_names:
        passes = passes & names.notnull()
    return np.where(passes, 1, 0)


# Turn each classifier column into a binary feature score.
# Every name score uses the same `<= NAME_CLASSIFIER_MAX` test. The last-name
# scores used to be written as the complement of `classifier > 2`, which scored
# a NaN classifier as a match (NaN fails `> 2` too), whereas the first-name
# scores gave NaN a 0. Rows with a non-missing classifier are unaffected.
df['First Name Match Score'] = _name_match_score(df['First Name Match Classifier'])
df['Last Name Match Score'] = _name_match_score(
    df['Last Name Match Classifier'], df['Census Last Name'], df['Baptisms Last Name'])

# NOTE(review): the parent last-name scores only require the baptism-side name
# to be present, while the direct score also requires 'Census Last Name' —
# confirm whether that asymmetry is intended.
df['Mother First Name Match Score'] = _name_match_score(df['Mother First Name Match Classifier'])
df['Mother Last Name Match Score'] = _name_match_score(
    df['Mother Last Name Match Classifier'], df['Mother Baptisms Last Name'])
df['Father First Name Match Score'] = _name_match_score(df['Father First Name Match Classifier'])
df['Father Last Name Match Score'] = _name_match_score(
    df['Father Last Name Match Classifier'], df['Father Baptisms Last Name'])

# NOTE(review): only an upper bound is checked; if 'Age Match Range' can be
# negative, large negative values also score 1 — confirm it is non-negative.
df['Age Match Score'] = np.where(df['Age Match Range'] <= AGE_RANGE_MAX, 1, 0)

# Missing genders never compare equal, so they score 0.
df['Gender Match Score'] = (df['Census Gender'] == df['Baptisms Gender']).astype(int)

# The raw classifier columns are no longer needed once the scores exist.
# This cell consumes its own inputs: reload `df` before running it again.
df = df.drop(columns=[
    'First Name Match Classifier', 'Last Name Match Classifier',
    'Mother First Name Match Classifier', 'Mother Last Name Match Classifier',
    'Father First Name Match Classifier', 'Father Last Name Match Classifier',
    'Age Match Range',
])

In [7]:
# Partition the records by the census-side gender code. Each part is an
# independent copy, so columns added to it later do not touch `df`.
male_df, female_df = (
    df.loc[df['Census Gender'].eq(gender_code)].copy()
    for gender_code in ('m', 'f')
)

In [8]:
# Add one '<feature> Weighted' column per direct feature: score times its weight.
df = df.assign(**{
    f'{feature} Weighted': df[feature] * feature_weight
    for feature, feature_weight in direct_feature_weights.items()
})

In [9]:
# Combine the weighted direct features into one score per row.
direct_weighted_columns = [f'{score} Weighted' for score in direct_feature_weights]
# Round away binary floating-point noise: 0.35 + 0.30 + 0.15 evaluates to
# 0.7999999999999999, which would fail an inclusive `>= 0.8` threshold even
# though the nominal score is exactly 0.80.
df['Direct Match Score'] = df[direct_weighted_columns].sum(axis=1).round(6)

In [10]:
# Add one '<feature> Weighted' column per mother feature to the female subset.
female_df = female_df.assign(**{
    f'{feature} Weighted': female_df[feature] * feature_weight
    for feature, feature_weight in mother_feature_weights.items()
})

In [11]:
# Combine the weighted mother features into one parent score per row.
# (Own variable name: these are the mother columns, not the direct ones.)
mother_weighted_columns = [f'{score} Weighted' for score in mother_feature_weights]
# Round away binary floating-point noise: 0.35 + 0.30 + 0.15 evaluates to
# 0.7999999999999999, which would fail an inclusive `>= 0.8` threshold even
# though the nominal score is exactly 0.80.
# NOTE(review): 'Gender Match Score' is Census Gender == Baptisms Gender; if
# 'Baptisms Gender' describes the baptised child rather than the parent, it may
# not belong in a parent score — confirm.
female_df['Parent Match Score'] = female_df[mother_weighted_columns].sum(axis=1).round(6)

In [12]:
# Add one '<feature> Weighted' column per father feature to the male subset.
male_df = male_df.assign(**{
    f'{feature} Weighted': male_df[feature] * feature_weight
    for feature, feature_weight in father_feature_weights.items()
})

In [13]:
# Combine the weighted father features into one parent score per row.
# (Own variable name: these are the father columns, not the direct ones.)
father_weighted_columns = [f'{score} Weighted' for score in father_feature_weights]
# Round away binary floating-point noise: 0.35 + 0.30 + 0.15 evaluates to
# 0.7999999999999999, which would fail an inclusive `>= 0.8` threshold even
# though the nominal score is exactly 0.80.
# NOTE(review): 'Gender Match Score' is Census Gender == Baptisms Gender; if
# 'Baptisms Gender' describes the baptised child rather than the parent, it may
# not belong in a parent score — confirm.
male_df['Parent Match Score'] = male_df[father_weighted_columns].sum(axis=1).round(6)

In [14]:
# Preview the scored frame; only the first rows are shown to keep the saved output small.
df.head()

In [15]:
# Minimum score (inclusive) for a row to count as a match.
# The direct and parent matchers currently share the same cut-off.
direct_threshold = parent_threshold = 0.8

In [16]:
# Flag rows whose score reaches the relevant threshold (inclusive).
df['Is Direct Match'] = df['Direct Match Score'].ge(direct_threshold)
for parent_frame in (female_df, male_df):
    parent_frame['Is Parent Match'] = parent_frame['Parent Match Score'].ge(parent_threshold)

In [17]:
# Boolean mask: True where the direct score reaches the threshold (inclusive).
is_direct_match = df['Direct Match Score'].ge(direct_threshold)

In [18]:
# Keep only the rows flagged as direct matches. The mask is already boolean,
# so it indexes the frame directly; comparing it to True is redundant.
direct_df = df[is_direct_match]

In [18]:
# Preview the direct matches; only the first rows are shown to keep the saved output small.
direct_df.head()

In [19]:
# Show the column labels available on the direct-match rows.
direct_df.columns

In [20]:
# Columns exported for direct matches: both record ids, the compared
# name and age fields from each source, then the score and its flag.
direct_output_columns = [
    'ecpp_id', '#ID',
    'Census First Name', 'Baptisms First Name',
    'Census Last Name', 'Baptisms Last Name',
    'Census Age', 'Baptisms Age',
    'Direct Match Score', 'Is Direct Match',
]
filtered_direct_df = direct_df[direct_output_columns]

In [21]:
# Write the direct matches to CSV without the index column.
# The 'matches/' directory must already exist; to_csv does not create it.
filtered_direct_df.to_csv('matches/direct_matches_1781.csv', index=False)

In [22]:
# Boolean masks: True where the parent score reaches the threshold (inclusive).
is_father_match = male_df['Parent Match Score'].ge(parent_threshold)
is_mother_match = female_df['Parent Match Score'].ge(parent_threshold)

In [23]:
# Keep only the rows flagged as parent matches. The masks are already boolean,
# so they index the frames directly; comparing them to True is redundant.
father_df = male_df[is_father_match]
mother_df = female_df[is_mother_match]

In [24]:
# filtered_mother_df = parent_df[['ecpp_id', '#ID', 'Census First Name', 'Mother Baptisms First Name', 'Census Last Name', 'Mother Baptisms Last Name', 'Is Parent Match']]

In [25]:
def _parent_output_columns(parent):
    """Return the export columns for one parent matcher ('Father' or 'Mother')."""
    return [
        'ecpp_id', '#ID',
        'Census First Name', f'{parent} Baptisms First Name',
        'Census Last Name', f'{parent} Baptisms Last Name',
        'Census Age', 'Baptisms Age',
        'Is Parent Match',
    ]


# Both parent exports share one layout; only the baptism-side name columns differ.
filtered_father_df = father_df[_parent_output_columns('Father')]
filtered_mother_df = mother_df[_parent_output_columns('Mother')]

In [26]:
# Write each parent-match table to its own CSV, without the index column.
for parent_matches, csv_path in (
    (filtered_father_df, 'matches/father_matches_1781.csv'),
    (filtered_mother_df, 'matches/mother_matches_1781.csv'),
):
    parent_matches.to_csv(csv_path, index=False)

# STOP HERE

In [27]:
# census_df = pd.read_csv('data/1790 Census Data Complete.csv')
# baptisms = pd.read_csv('data/Baptisms.csv')
# census_df['ecpp_id'] = range(1, len(census_df) + 1) 

In [28]:
# merged_direct_df = census_df.merge(filtered_direct_df[['ecpp_id']], on='ecpp_id', how='inner')
# merged_father_df = census_df.merge(filtered_father_df[['ecpp_id']], on='ecpp_id', how='inner')
# merged_mother_df = census_df.merge(filtered_mother_df[['ecpp_id']], on='ecpp_id', how='inner')

In [29]:
# merged_direct_df = merged_direct_df.drop_duplicates(subset='ecpp_id', keep='first')
# parent_father_df = merged_father_df.drop_duplicates(subset='ecpp_id', keep='first')
# parent_mother_df = merged_mother_df.drop_duplicates(subset='ecpp_id', keep='first')

In [30]:
# merged_direct_df.to_csv('matches/census_merged_direct_matches.csv', index=False)
# parent_father_df.to_csv('matches/census_merged_father_matches.csv', index=False)
# parent_mother_df.to_csv('matches/census_merged_mother_matches.csv', index=False)

In [31]:
# baptisms_merged_direct_df = baptisms.merge(filtered_direct_df[['#ID']], on='#ID', how='inner')
# baptisms_merged_father_df = baptisms.merge(filtered_father_df[['#ID']], on='#ID', how='inner')
# baptisms_merged_mother_df = baptisms.merge(filtered_mother_df[['#ID']], on='#ID', how='inner')

In [32]:
# baptisms_merged_direct_df = baptisms_merged_direct_df.drop_duplicates(subset='#ID', keep='first')
# baptisms_merged_father_df = baptisms_merged_father_df.drop_duplicates(subset='#ID', keep='first')
# baptisms_merged_mother_df = baptisms_merged_mother_df.drop_duplicates(subset='#ID', keep='first')

In [33]:
# baptisms_merged_direct_df.to_csv('matches/baptisms_merged_direct_matches.csv', index=False)
# baptisms_merged_father_df.to_csv('matches/baptisms_merged_father_matches.csv', index=False)
# baptisms_merged_mother_df.to_csv('matches/baptisms_merged_mother_matches.csv', index=False)

## Examine Matches and Graph

In [34]:
# direct_matches = merged_direct_df.shape[0]
# father_matches = parent_father_df.shape[0]
# mother_matches = parent_mother_df.shape[0]

# census_original = census_df.shape[0]

In [35]:
# direct_match_percent = (direct_matches / census_original) * 100
# remaining_percent = 100 - direct_match_percent

# father_match_percent = (father_matches / census_original) * 100
# father_remaining_percent = 100 - father_match_percent

# mother_match_percent = (mother_matches / census_original) * 100
# mother_remaining_percent = 100 - mother_match_percent

In [36]:
# mother_match_percent

In [37]:
# slices = [direct_match_percent, remaining_percent]
# labels = ['Matched People', 'Not Matched'] 

In [38]:
# colors = sns.color_palette("flare")

In [39]:
# plt.pie(slices, labels=labels, autopct="%1.1f%%", colors=colors)
# plt.title("Pie Chart of Direct Match Percentage")
# plt.show()

In [40]:
# slices = [father_match_percent, father_remaining_percent]
# labels = ['Matched Father', 'Not Matched'] 

In [41]:
# plt.pie(slices, labels=labels, autopct="%1.1f%%", colors=colors)
# plt.title("Pie Chart of Father Match Percentage")
# plt.show()

In [42]:
# slices = [mother_match_percent, mother_remaining_percent]
# labels = ['Matched Mother', 'Not Matched'] 

In [43]:
# plt.pie(slices, labels=labels, autopct="%1.1f%%", colors=colors)
# plt.title("Pie Chart of Mother Match Percentage")
# plt.show()

In [44]:
# baptisms_merged_direct_df.columns

In [45]:
# parent_father_df.columns

In [46]:
# sns.histplot(data=baptisms_merged_father_df, x="Ethnicity", color='grey')

In [47]:
# sns.histplot(data=baptisms_merged_direct_df, x="Ethnicity", color='grey')

In [48]:
# parent_father_df['Race'] = parent_father_df['Race'].str.strip().str.replace(',', '').replace(' ', '').str.title()

In [49]:
# sns.histplot(data=baptisms_merged_mother_df, x="Ethnicity", color='grey')

In [50]:
# sns.histplot(data=parent_father_df, x="Race", color='grey')