<h3> Data Cleaning </h3>

In [107]:
import numpy as np
import pandas as pd

In [108]:
delegates = pd.read_excel("Data/constitutional_convention_1787.xlsx", header = 2)

In [109]:
print("dimensions:", delegates.shape)
delegates.head()

dimensions: (55, 4)


Unnamed: 0,first name,last name,state,sign?
0,William Samuel,Johnson,Connecticut,yes
1,Roger,Sherman,Connecticut,yes
2,Oliver,Ellsworth (Elsworth),Connecticut,no
3,George,Read,Delaware,yes
4,Gunning,Bedford Jr.,Delaware,yes


In [110]:
#quick exploratory analysis
delegates = delegates.rename(columns={"sign?":"sign", 
                                      "first name":"first",
                                      "last name":"last"})

states = delegates.state.unique()
print("list of states:",states)
print("number of states:",len(states),"\n")

state_counts = delegates.state.value_counts()
print(state_counts, "\n")

responses = delegates.sign.unique()
print("response types:", responses)
print("num response types:",len(responses), "\n")

firstnamelen = pd.Series([len(x.split(" ")) for x in delegates['first']])
print("first name length distribution")
print(firstnamelen.value_counts())

lastnamelen = pd.Series([len(x.split(" ")) for x in delegates['last']])
print("last name length distribution")
print(lastnamelen.value_counts())

list of states: [' Connecticut' ' Delaware' ' Georgia' ' Maryland' ' Massachusetts'
 ' New Hampshire' ' New Jersey' ' New York' ' North Carolina'
 ' Pennsylvania' ' South Carolina' ' Virginia']
number of states: 12 

 Pennsylvania      8
 Virginia          7
 New Jersey        5
 Maryland          5
 North Carolina    5
 Delaware          5
 Massachusetts     4
 South Carolina    4
 Georgia           4
 New York          3
 Connecticut       3
 New Hampshire     2
Name: state, dtype: int64 

response types: [' yes' 'no' ' no']
num response types: 3 

first name length distribution
1    47
2     7
4     1
dtype: int64
last name length distribution
2    47
3     7
4     1
dtype: int64


In [111]:
#might want to separate names with parentheses into last name 1/2
sumaltfirst = sum(["(" in x for x in delegates['first']])
print("number of alt first names:", sumaltfirst)

sumaltlast = sum(["(" in x for x in delegates['last']])
print("number of alt last names:", sumaltlast)

number of alt first names: 0
number of alt last names: 4


In [112]:
#cleaning strings
delegates['state'] = [x.strip() for x in delegates.state]
delegates['sign'] = [x.strip() for x in delegates.sign]
delegates['first'] = [x.strip() for x in delegates['first']]
delegates['last'] = [x.strip() for x in delegates['last']]

In [113]:
delegates.head()

Unnamed: 0,first,last,state,sign
0,William Samuel,Johnson,Connecticut,yes
1,Roger,Sherman,Connecticut,yes
2,Oliver,Ellsworth (Elsworth),Connecticut,no
3,George,Read,Delaware,yes
4,Gunning,Bedford Jr.,Delaware,yes


In [114]:
responses = delegates.sign.unique()
print("response types:", responses)
print("num response types:",len(responses), "\n")

response types: ['yes' 'no']
num response types: 2 



In [115]:
loans = pd.read_csv("loan_office_certificates_9_states_cleaned.csv").drop('Unnamed: 0', axis = 1)

In [116]:
loans.columns

Index(['State', 'Year', 'Month', 'Day', 'Title 1', 'First Name 1 ',
       'Last Name 1 ', 'Title 2', 'First Name 2', 'Last Name 2', 'Title 3',
       'First Name 3', 'Last Name 3', 'Face Value', 'Specie Value ', 'notes',
       'original text'],
      dtype='object')

In [117]:
loans = loans.rename(columns = {"First Name 1 ":"First Name 1",
                                "Last Name 1 ":"Last Name 1"})
loans.head()

Unnamed: 0,State,Year,Month,Day,Title 1,First Name 1,Last Name 1,Title 2,First Name 2,Last Name 2,Title 3,First Name 3,Last Name 3,Face Value,Specie Value,notes,original text
0,1,1778,3,13.0,Col,Joshua,Wentworth,,,,,,,200,108.2778,,
1,1,1777,9,2.0,,Charles,Treadwell,,,,,,,200,199.3778,,
2,1,1777,9,10.0,,Stephen,Cleverly,,,,,,,200,194.5111,,
3,1,1777,9,13.0,,David,Griffith,,,,,,,200,192.7111,,
4,1,1777,9,15.0,,John,Mansfield,,,,,,,200,191.5222,,


<h3> Testing FuzzyWuzzy </h3>

In [118]:
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [119]:
#construct full names for loan files
loans['full name 1'] = loans['First Name 1'] + " " + loans['Last Name 1']
loans['full name 2'] = loans['First Name 2'] + " " + loans['Last Name 2']
loans['full name 3'] = loans['First Name 3'] + " " + loans['Last Name 3']

In [120]:
#handle special case
delegates.loc[40, 'last'] = 'Fitzsimons (Fitzsimmons)'

In [121]:
#construct full names for delegate files
delegates['last 2'] = [np.nan if "(" not in x else  x[x.find("(")+1:].replace(")","")  for x in delegates['last']]
delegates['last'] = delegates['last'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())
delegates['full name 1'] = delegates['first'] + " " + delegates['last']
delegates['full name 2'] = delegates['first'] + " " + delegates['last 2']

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [122]:
def fuzzy_merge(lst1, lst2, threshold=80, limit = 100):
    """
    :param df_1: the left list to join
    :param df_2: the right list to join
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    
    delegates = pd.Series([x for x in lst1.unique() if not pd.isnull(x)])
    possible =  lst2.unique().tolist()
    
    #get matches
    #process.extract uses a combination of all four fuzzywuzzy scores
    matches = delegates.apply(lambda x: process.extract(x, possible, limit=limit))
    
    match_df = pd.DataFrame(columns = ['Delegates', 'Loan Matches'])
    
    for delegate, matchset in zip(delegates, matches):
        matchset_thres = [name for name in matchset if name[1] >= threshold]
        if len(matchset_thres) == 0:
            add_df = pd.DataFrame(data = {'Delegates': [delegate], 'Loan Matches': [""], 'Scores': [0]})
            match_df = pd.concat([match_df, add_df])
        else:
            delegate_lst = [delegate] * len(matchset_thres)
            add_df = pd.DataFrame(data = {'Delegates': delegate_lst, 
                                          'Loan Matches': [x[0] for x in matchset_thres],
                                          'Scores': [x[1] for x in matchset_thres]})
            match_df = pd.concat([match_df, add_df])

    return match_df

## Algorithm for Name Matching

In [123]:
# Demonstration on Loan Certificates Dataset

## Step 1: Run fuzzy match with score threshold of 80
## Step 2: Get unique set of names for each pair, get rid of ???
### If one word for either, remove match
### else run word comparison - if there are min(n, m) matches (above 90) then keep match else discard
## Step 3: Go through manually

In [124]:
#run matching on all combinations of full name columns from loan and delegate datasets
#might want to consider lowering the threshold and then running further tests among the matched data
fuzzy_name_results_11 = fuzzy_merge(delegates['full name 1'], loans['full name 1'])
fuzzy_name_results_21 = fuzzy_merge(delegates['full name 2'], loans['full name 1'])
fuzzy_name_results_12 = fuzzy_merge(delegates['full name 1'], loans['full name 2'])
fuzzy_name_results_22 = fuzzy_merge(delegates['full name 2'], loans['full name 2'])
fuzzy_name_results_13 = fuzzy_merge(delegates['full name 1'], loans['full name 3'])
fuzzy_name_results_23 = fuzzy_merge(delegates['full name 2'], loans['full name 3'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [125]:
#combine two lists
fuzzy_name_results_1 = pd.concat([fuzzy_name_results_11, fuzzy_name_results_12, fuzzy_name_results_13]).drop_duplicates().reset_index(drop = True)
fuzzy_name_results_2 = pd.concat([fuzzy_name_results_21, fuzzy_name_results_22, fuzzy_name_results_23]).drop_duplicates().reset_index(drop = True)

In [126]:
#drop results where there were no matchings
fuzzy_name_results_1 = fuzzy_name_results_1[fuzzy_name_results_1['Scores'].apply(lambda x: x != 0)]
fuzzy_name_results_2 = fuzzy_name_results_2[fuzzy_name_results_2['Scores'].apply(lambda x: x != 0)]

In [127]:
fuzzy_match_results = pd.concat([fuzzy_name_results_1, fuzzy_name_results_2]).reset_index(drop = True)

In [128]:
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #display(fuzzy_match_results)

In [129]:
score_lst = []

In [256]:
def matchFunction(lst1, lst2, score = 90):
    lst1 = list(set(lst1))
    lst2 = list(set(lst2))
    threshold = min(len(lst1), len(lst2))
    scores = []
    matches = 0
    for wd1 in lst1:
        for wd2 in lst2:
            if process.extract(wd1, [wd2])[0][1] > score:
                matches+=1
                scores.append(fuzz.partial_ratio(wd1, wd2))
    score_lst.append(scores)
    return matches >= threshold

In [164]:
fuzzy_step2_1 = fuzzy_match_results[fuzzy_match_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]

In [165]:
fuzzy_step2_1_update = fuzzy_step2_1[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(fuzzy_step2_1['Delegates'], fuzzy_step2_1['Loan Matches'])]]

In [166]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(fuzzy_step2_1_update)

Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,William Johnson,95.0
2,William Samuel Johnson,Samuel Johnson,90.0
100,Roger Sherman,Roger Sherman,100.0
101,Oliver Ellsworth,Oliver Ellsworth,100.0
103,George Read,George Read,100.0
154,Gunning Bedford Jr.,Gunning Bedford,95.0
155,John Dickinson,John Dickinson,100.0
257,Jacob Broom,Jacob Broom,100.0
414,William L. Pierce,William Pierce,95.0
556,Luther Martin,Luther Martin,100.0


# Pierce Certificates

In [281]:
#Read in file
pierce = pd.read_excel("Data/Pierce_Certs_cleaned_2019.xlsx")

#clean first and last names
pierce = pierce[pierce['First'].apply(lambda x: not pd.isnull(x))]
pierce_ind = pierce[pierce['Last'].apply(lambda x: type(x) != str)].index
#fix issue with types for names
pierce.loc[pierce_ind, 'Last'] = ['True'] * len(pierce_ind)
#create full name
pierce['full name'] = pierce['First'] + " " + pierce['Last']

#run merghe
pierce_name_results_1 = fuzzy_merge(delegates['full name 1'], pierce['full name'])
pierce_name_results_2 = fuzzy_merge(delegates['full name 2'], pierce['full name'])

#combine two dataframes
pierce_name_results = pd.concat([pierce_name_results_1, pierce_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
pierce_name_results_1 = pierce_name_results[pierce_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
pierce_name_results_1_update = pierce_name_results_1[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(pierce_name_results_1['Delegates'], pierce_name_results_1['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(pierce_name_results_1_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,William Johnson,95.0
2,William Samuel Johnson,Samuel Johnson,90.0
103,George Read,George Read,100.0
272,Richard Bassett,Richard Bassett,100.0
387,Abraham Baldwin,Abraham Baldwine,97.0
431,William L. Pierce,William Pierce,95.0
589,Luther Martin,Martin Luther,95.0
596,John F. Mercer,John Mercer,95.0
792,Nicholas Gilman,Nicholas Gilman,100.0
907,William Paterson,William Paterson,100.0


# Marine Certificates

In [220]:
#Read in file
marine = pd.read_excel("Data/Marine_Liquidated_Debt_Certificates.xlsx", header = 11)
cols = marine.columns = ["Page","JPEG number","Number","Letter","Date of the Certificate: Month",
                         "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
                         "Title","Time when the debt became due: Month","Time when the debt became due: Day",
                         "Time when the debt became due: Year","Dollars","90th","Total Dollars_1","Total Dollars_2",
                         "Line Strike Thorugh: Yes?","Line Strike Thorugh: Note","Notes"]
marine.columns = cols

marine = marine[marine['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
marine['full name'] = marine['First name'] + " " + marine['Last name']

#run merge
marine_name_results_1 = fuzzy_merge(delegates['full name 1'], marine['full name'])
marine_name_results_2 = fuzzy_merge(delegates['full name 2'], marine['full name'])

#combine two dataframes
marine_name_results = pd.concat([marine_name_results_1, marine_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
marine_name_results_1 = marine_name_results[marine_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
marine_name_results_1_update = marine_name_results_1[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(marine_name_results_1['Delegates'], marine_name_results_1['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(marine_name_results_1_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,Samuel Johnson,90.0


# Connecticut

In [259]:
#Read in file
CT = pd.read_excel("Data/liquidated_debt_certificates_CT.xlsx", header = 11)
cols = CT.columns = ["Register Page","JPEG number","Number","Letter","Date of the Certificate: Month",
                     "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
                     "Title","First name 2","Last name 2","Title 2","Time when the debt became due: Month",
                     "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
                     "Line Strike Thorugh: Yes?","Line Strike Thorugh: Note","Notes"]
CT.columns = cols

CT = CT[CT['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
CT['full name 1'] = CT['First name'] + " " + CT['Last name']
CT['full name 2'] = CT['First name 2'] + " " + CT['Last name 2']


#run merge
CT_name_results_1 = fuzzy_merge(delegates['full name 1'], CT['full name 1'])
CT_name_results_2 = fuzzy_merge(delegates['full name 2'], CT['full name 1'])
CT_name_results_3 = fuzzy_merge(delegates['full name 1'], CT['full name 2'])
CT_name_results_4 = fuzzy_merge(delegates['full name 2'], CT['full name 2'])

#combine two dataframes
CT_name_results = pd.concat([CT_name_results_1, CT_name_results_2, CT_name_results_3, CT_name_results_4]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
CT_name_results = CT_name_results[CT_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
CT_name_results_update = CT_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(CT_name_results['Delegates'], CT_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(CT_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores


# Delaware

In [263]:
#Read in file
DE = pd.read_excel("Data/liquidated_debt_certificates_DE.xlsx", header = 11)
cols = DE.columns = ["JPEG number","Number","Letter","Date of the Certificate: Month",
                     "Date of the Certificate: Day","Date of the Certificate: Year","Title","First name","Last name",
                     "Time when the debt became due: Month",
                     "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
                     "Line Strike Through","Note"]
DE.columns = cols

DE = DE[DE['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
DE['full name 1'] = DE['First name'] + " " + DE['Last name']


#run merge
DE_name_results_1 = fuzzy_merge(delegates['full name 1'], DE['full name 1'])
DE_name_results_2 = fuzzy_merge(delegates['full name 2'], DE['full name 1'])

#combine two dataframes
DE_name_results = pd.concat([DE_name_results_1, DE_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
DE_name_results = DE_name_results[DE_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
DE_name_results_update = DE_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(DE_name_results['Delegates'], DE_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(DE_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
35,George Read,George Read,100.0
53,Jacob Broom,Jacob Broom,100.0
123,William Paterson,William Paterson,100.0
124,William Paterson,William Patterson,97.0
206,William Patterson,William Patterson,100.0
207,William Patterson,William Paterson,97.0


# Massachusetts

In [265]:
#Read in file
MA = pd.read_excel("Data/liquidated_debt_certificates_MA.xlsx", header = 11)
cols = ["JPEG number","Page", "Number","Letter","Date of the Certificate: Month",
                     "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name","Title",
                     "First name 2","Last name 2", "Title 2","Time when the debt became due: Month",
                     "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
                     "Line Strike Through","Note"]
MA = MA[MA.columns[0:20]]
MA.columns = cols

MA = MA[MA['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
MA['full name 1'] = MA['First name'] + " " + MA['Last name']
MA['full name 2'] = MA['First name 2'] + " " + MA['Last name 2']


#run merge
MA_name_results_1 = fuzzy_merge(delegates['full name 1'], MA['full name 1'])
MA_name_results_2 = fuzzy_merge(delegates['full name 2'], MA['full name 1'])
MA_name_results_3 = fuzzy_merge(delegates['full name 1'], MA['full name 2'])
MA_name_results_4 = fuzzy_merge(delegates['full name 2'], MA['full name 2'])

#combine two dataframes
MA_name_results = pd.concat([MA_name_results_1, MA_name_results_2, MA_name_results_3, MA_name_results_4]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
MA_name_results = MA_name_results[MA_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
MA_name_results_update = MA_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(MA_name_results['Delegates'], MA_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(MA_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
296,Elbridge Gerry,Eldbridge Gerry,97.0
299,John Langdon,John Langdon,100.0


# New Hampshire

In [267]:
#Read in file
NH = pd.read_excel("Data/liquidated_debt_certificates_NH.xlsx", header = 11)
cols = ["Page","JPEG number","Number","Letter","Date of the Certificate: Month",
                         "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
                         "Title","Time when the debt became due: Month","Time when the debt became due: Day",
                         "Time when the debt became due: Year","Dollars","90th",
                         "Line Strike Thorugh: Yes?","Line Strike Thorugh: Note","Notes"]
NH.columns = cols

NH = NH[NH['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
NH['full name 1'] = NH['First name'] + " " + NH['Last name']

#run merge
NH_name_results_1 = fuzzy_merge(delegates['full name 1'], NH['full name 1'])
NH_name_results_2 = fuzzy_merge(delegates['full name 2'], NH['full name 1'])

#combine two dataframes
NH_name_results = pd.concat([NH_name_results_1, NH_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
NH_name_results = NH_name_results[NH_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
NH_name_results_update = NH_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(NH_name_results['Delegates'], NH_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(NH_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
54,John Langdon,John Langdon,100.0


# New Jersey

In [278]:
#Read in file
NJ = pd.read_excel("Data/liquidated_debt_certificates_NJ.xlsx", header = 10)
cols = ["JPEG number","Number","Letter","Date of the Certificate: Month",
        "Date of the Certificate: Day","Date of the Certificate: Year","Title","First name","Last name",
        "Title 2","First name 2","Last name 2","Time when the debt became due: Month",
        "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
        "Strike Through Number","Note"]
NJ = NJ[NJ.columns[0:19]]
NJ.columns = cols

NJ = NJ[NJ['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
NJ['full name 1'] = NJ['First name'] + " " + NJ['Last name']
NJ['full name 2'] = NJ['First name 2'] + " " + NJ['Last name 2']

#run merge
NJ_name_results_1 = fuzzy_merge(delegates['full name 1'], NJ['full name 1'])
NJ_name_results_2 = fuzzy_merge(delegates['full name 2'], NJ['full name 1'])
NJ_name_results_3 = fuzzy_merge(delegates['full name 1'], NJ['full name 2'])
NJ_name_results_4 = fuzzy_merge(delegates['full name 2'], NJ['full name 2'])

#combine two dataframes
NJ_name_results = pd.concat([NJ_name_results_1, NJ_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
NJ_name_results = NJ_name_results[NJ_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
NJ_name_results_update = NJ_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(NJ_name_results['Delegates'], NJ_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(NJ_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
107,John Dickinson,John Dickinson,100.0
407,David Brearly,Honorable David Brearley,86.0
422,Jonathan Dayton,Jonathan Dayton,100.0
423,Jonathan Dayton,Jonathan I Dayton,95.0
586,Robert Morris,Robert Morris,100.0
602,James Wilson,James Willson,96.0
725,David Brearley,Honorable David Brearley,90.0


# New York

In [282]:
#Read in file
NY = pd.read_excel("Data/liquidated_debt_certificates_NY.xlsx", header = 10)
cols = ["Page", "JPEG number","Number","Letter","Date of the Certificate: Month",
        "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
        "Title","First name 2","Last name 2","Title 2","Time when the debt became due: Month",
        "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
        "Strike Through Yes?","Note", "Notes"]
NY = NY[NY.columns[0:21]]
NY.columns = cols

NY = NY[NY['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
NY['full name 1'] = NY['First name'] + " " + NY['Last name']
NY['full name 2'] = NY['First name 2'] + " " + NY['Last name 2']

#run merge
NY_name_results_1 = fuzzy_merge(delegates['full name 1'], NY['full name 1'])
NY_name_results_2 = fuzzy_merge(delegates['full name 2'], NY['full name 1'])
NY_name_results_3 = fuzzy_merge(delegates['full name 1'], NY['full name 2'])
NY_name_results_4 = fuzzy_merge(delegates['full name 2'], NY['full name 2'])

#combine two dataframes
NY_name_results = pd.concat([NY_name_results_1, NY_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
NY_name_results = NY_name_results[NY_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
NY_name_results_update = NY_name_results[[matchFunction(x.split(" "), y.split(" "), 90) for x, y in zip(NY_name_results['Delegates'], NY_name_results['Loan Matches'])]]

print("one issue - william few,william mcwilliams")

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(NY_name_results_update)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




one issue - william few,william mcwilliams


Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,William Johnson,95.0
218,William Few,William Mc Williams,86.0
275,William L. Pierce,William Pierce,95.0
467,John Langdon,John Langdon,100.0
501,William Livingston,William Livingston,100.0
593,John Lansing Jr.,John Lansing,95.0
744,Alexander Martin,Alexander Martin,100.0
769,James Wilson,James Willson,96.0
823,John Blair,John Blair,100.0


# Pennsylvania P1

In [286]:
PA

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Year,Month,Date,Title,First name,Last name,...,Month.1,Date.1,Dollars,90th,10th,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Notes
0,1.0,434.0,1.0,H,1785.0,6.0,28.0,,Joseph,Forester,...,3.0,5.0,,,,,15.0,,545,
1,1.0,434.0,2.0,I,1785.0,6.0,28.0,,Isaac,Justice,...,11.0,21.0,928.0,,,74,12.0,48.0,1850,
2,1.0,434.0,3.0,L,1785.0,6.0,28.0,,John,Trombo,...,7.0,27.0,,,,,12.0,,1048,
3,1.0,434.0,4.0,A,1785.0,6.0,28.0,,Samuel,Frye,...,7.0,21.0,,,,,16.0,,SP,
4,1.0,434.0,5.0,B,1785.0,6.0,29.0,,Vincent,Calvin,...,7.0,22.0,,,,,53.0,30.0,1850,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,28.0,449.0,954.0,G,1786.0,8.0,29.0,,Ebenezer,Myers,...,11.0,1.0,,,,,32.0,17.0,1850,
954,28.0,449.0,955.0,D,1786.0,8.0,29.0,,Fredrick,Dimbole,...,11.0,9.0,,,,,6.0,60.0,,
955,,,,,,,,,,,...,,,349180.0,87.0,,,130846.0,5.0,,
956,,,,,,,,,Omitted in Page 16 Wrong charged 7/90th whish ...,,...,,,,,,,,,,


In [294]:
#Read in file
PA = pd.read_excel("Data/liquidated_debt_certificates_PA_story.xlsx", header = 11)
cols = ["JPEG number","Number","Letter","Date of the Certificate: Year",
        "Date of the Certificate: Month","Date of the Certificate: Date","Title","First name","Last name","Time when the debt became due: Year",
        "Time when the debt became due: Month","Time when the debt became due: Day",
        "Dollars","90th","10th", "Exchange","Amount in Specie_1", "Amount in Specie_2","Line Strike Thorugh?","Notes"]
PA = PA[PA.columns[1:21]]
PA.columns = cols

PA = PA[PA['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
PA['full name 1'] = PA['First name'] + " " + PA['Last name']

#run merge
PA_name_results_1 = fuzzy_merge(delegates['full name 1'], PA['full name 1'])
PA_name_results_2 = fuzzy_merge(delegates['full name 2'], PA['full name 1'])

#combine two dataframes
PA_name_results = pd.concat([PA_name_results_1, PA_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
PA_name_results = PA_name_results[PA_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
PA_name_results_update = PA_name_results[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(PA_name_results['Delegates'], PA_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(PA_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
76,John Dickinson,John Dickinson,100.0
106,William Houstoun,William Houston,97.0
197,William C. Houston,William Houston,95.0
254,Alexander Martin,Alexander Martin,100.0
268,James Wilson,James Wilson,100.0


# Pennsylvania P2

In [306]:
#Read in file
PA2 = pd.read_excel("Data/liquidated_debt_certificates_PA_stelle.xlsx", header = 11)
cols = ["JPEG number","Number","Letter","Date of the Certificate: Year",
        "Date of the Certificate: Month","Date of the Certificate: Date","Title","First name","Last name","Title 2","First name 2","Last name 2",
        "Time when the debt became due: Year", "Time when the debt became due: Month","Time when the debt became due: Day",
        "Dollars","90th","Line Strike Thorugh?","Notes"]
PA2 = PA2[PA2.columns[1:21]]
PA2.columns = cols

PA2 = PA2[PA2['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
PA2['full name 1'] = PA2['First name'] + " " + PA2['Last name']
PA2['full name 2'] = PA2['First name 2'] + " " + PA2['Last name 2']

#run merge
PA2_name_results_1 = fuzzy_merge(delegates['full name 1'], PA2['full name 1'])
PA2_name_results_2 = fuzzy_merge(delegates['full name 2'], PA2['full name 1'])
PA2_name_results_3 = fuzzy_merge(delegates['full name 1'], PA2['full name 2'])
PA2_name_results_4 = fuzzy_merge(delegates['full name 2'], PA2['full name 2'])


#combine two dataframes
PA2_name_results = pd.concat([PA2_name_results_1, PA2_name_results_2]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
PA2_name_results = PA2_name_results[PA2_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
PA2_name_results_update = PA2_name_results[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(PA2_name_results['Delegates'], PA2_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(PA2_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,William Johnson,95.0
134,Gunning Bedford Jr.,Gunning Bedford,95.0
633,William C. Houston,William Huston,88.0
691,Alexander Hamilton,Alexander Hamilton,100.0
842,Alexander Martin,Alexander Martin,100.0
850,Robert Morris,Robert Morris,100.0
883,James Wilson,James Wilson,100.0
884,James Wilson,James Willson,96.0
886,James Wilson,Jn and James Willson,86.0
969,John Blair,John Blair,100.0


# Rhode Island

In [305]:
#Read in file
RI = pd.read_excel("Data/liquidated_debt_certificates_RI.xlsx", header = 12)
cols =  ["Page", "JPEG number","Number","Letter","Date of the Certificate: Month",
        "Date of the Certificate: Day","Date of the Certificate: Year","First name","Last name",
        "Title","First name 2","Last name 2","Title 2","Time when the debt became due: Month",
        "Time when the debt became due: Day", "Time when the debt became due: Year","Dollars","90th",
        "8th"]
RI = RI[RI.columns[0:19]]
RI.columns = cols

RI = RI[RI['First name'].apply(lambda x: not pd.isnull(x))]

#create full name
RI['full name 1'] = RI['First name'] + " " + RI['Last name']
RI['full name 2'] = RI['First name 2'] + " " + RI['Last name 2']

#run merge
RI_name_results_1 = fuzzy_merge(delegates['full name 1'], RI['full name 1'])
RI_name_results_2 = fuzzy_merge(delegates['full name 2'], RI['full name 1'])
RI_name_results_3 = fuzzy_merge(delegates['full name 1'], RI['full name 2'])
RI_name_results_4 = fuzzy_merge(delegates['full name 2'], RI['full name 2'])

#combine two dataframes
RI_name_results = pd.concat([RI_name_results_1, RI_name_results_2, RI_name_results_3, RI_name_results_4]).drop_duplicates().reset_index(drop = True)

#remove names with only one unique name
RI_name_results = RI_name_results[RI_name_results['Loan Matches'].apply(lambda x: len(list(set(x.replace("??", "").strip().split(" "))))>=2)]
#run through step 2 cleaning function
RI_name_results_update = RI_name_results[[matchFunction(x.split(" "), y.split(" ")) for x, y in zip(RI_name_results['Delegates'], RI_name_results['Loan Matches'])]]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(RI_name_results_update)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Delegates,Loan Matches,Scores
0,William Samuel Johnson,Samuel Johnson,90.0
147,Luther Martin,Luther Martin,100.0
