In [34]:
# python 3.7.6
import re # version 2.2.1
import itertools
import os
import pandas as pd # version 1.0.1
import csv # version 1.0

In [35]:
# navigate to ocr_split folder
os.chdir(r"C:\Users\byron\Dropbox\WIW\1950_WIW\ocr_split")

# different cleaning for specific files
segment=""
for file in os.listdir():
    file_string = open(r"{}".format(file), "r", encoding="utf8").read()
    
    # remove redundant strings and certain non-ASCII chars
    for apos in ["‘", "’"]:
        file_string = file_string.replace(apos, "\'")
    
    for apos in ["“", "”"]:
        file_string = file_string.replace(apos, "\"")
        
    file_string = file_string.replace("..", ".")
    file_string = file_string.replace("  ", " ")
    file_string = file_string.replace("—", "-")
    
    for string in ["Digitized by", "UNIVERSITY OF MICHIGAN", "Original from", "-\n", "_\n", "«", "»", "•", "□", "°", "£", "\t"]:
        file_string = file_string.replace(string, "")
    
    # catch and remove possible photo header formats \nNAME\n
    photo_names_1 = re.findall(r"\n[A-Z]{4,} [A-Z]\.?(?:[ ]?[A-Z]\.?)? [A-Z]{4,}\n", file_string) #catches JOHN C[.][ K.] DOEL
    photo_names_2 = re.findall(r"\n[A-Z]{4,}\.?(?:[ ]?[A-Z]\.?)? [A-Z]{4,}\n", file_string) #catches JOHN[.][ K.] DOEL
    photo_names_3 = re.findall(r"\n[A-Z]{4,}\.?(?:[ ]?[A-Z]\.?)? [A-Z]{4,} [A-Z]{4,}\n", file_string) #catches JOHN[.][ K.] DOEL DOEL
    photo_names_4 = re.findall(r"\n[A-Z]\.?(?:[ ]?[A-Z]\.?)? [A-Z]{4,} [A-Z]{4,}\n", file_string) #catches J[.][ K.] DOEL DOEL
    for photo_name in list(photo_names_1 + photo_names_2 + photo_names_3 + photo_names_4):
        file_string = file_string.replace(photo_name, "\n")
    
    file_string = file_string.replace("\n\n", "\n")
    
    if file == '1950_AB.txt':
        # catch and remove photo header formats NAME\n not starting with A, B
        photo_names_5 = re.findall(r"[A-Z]{4,}[A-Z\., ]+\n", segment)
        
        for name in photo_names_5:
            if name[0] not in "AB":
                file_string = file_string.replace(name, "")
    
    # for remaining files classified by letter
    elif (file !='1950_Z_LATE.txt' and file !='suppl_who_is_who_1950.txt'):
        # catch and remove photo header formats NAME\n not starting with letters in filename + preceding letter
        photo_names_5 = re.findall(r"[A-Z]{4,}[A-Z\., ]+\n", segment)
        
        # retrieve valid first letters of names from filename
        valid_first_letters = re.split(r"_|\.",file)[1]
        #include preceding letter
        previous_letter = chr(ord(valid_first_letters[0]) - 1)
        valid_first_letters = previous_letter + valid_first_letters
        
        # if name does not start with valid letter, remove it
        for name in photo_names_5:
            if name[0] not in valid_first_letters:
                file_string = file_string.replace(name, "")
    
    # no additional processing for LATE and SUPPL files   
    segment += file_string

In [36]:
# final check for non-ascii chars
rem_ascii = re.findall("[^\x00-\x7F]", file_string)
print(set(rem_ascii))

{'\ufeff'}


In [37]:
# find all names+occupations
name_occ_list = re.findall(r"[A-Z(?:Mc)']{4,}[\., ][ ]?[A-Z'\(\)].+?[ ;:j]b[\.,]", segment)

print(len(name_occ_list))
print(pd.DataFrame(name_occ_list))

# split base string into a list of strings, each starting with person name

# format names into name|name1|name2|...
name_template = "({})".format("|".join(re.escape(s) for s in name_occ_list))

# split base string based on names
segment_split = re.split(name_template, segment) 

# join back the names to the front of each string
bio_split = ["".join(x) for x in itertools.zip_longest([""] + segment_split[1::2], segment_split[::2], fillvalue='')]

bio_split[1489]

2789
                                                      0
0                 ABERNATHY, JOSEPH J., unlv. prof.; b.
1     ABRAMS, SAMUEL CHRISTOPHER, clergyman, sch. su...
2     ADAMS, ALGER LEROY, clergyman, journalist, soc...
3     ADAMS, ALTON AUGUSTUS, naval bandmaster (ret.)...
4                 ADAMS, BERNICE WILSON, sales rep.; b.
...                                                 ...
2784  WILLIAMS, ERNEST YOUNG, physician, prof, med. ...
2785  WILLIAMS, IKE, world's lightweight boxing cham...
2786  WILLIAMS, NATHANIEL GREENE, coll.prof., musici...
2787             WILLIAMSON, CHARLES WESLEY, lawyer; b.
2788                 WILSON, EDDIE BYARD, clergyman; b.

[2789 rows x 1 columns]


'MCDONALD, EDWIN KENNETH, physician; b. Birmingham, Ala., May 7, 1892; s. John H. and Ella (Johnson) McDonald; m. Joanna C. McAdams, 1920; four children: Natalie, Edwin, Jr., William\nS., John S. A.B., Fisk U., Nashville, Tenn., 1917; M.D., Northwestern U., 1923; further study, pediatrics, under U. of Chicago fellowship, Childrens Memorial Hosp., Chicago, 111. Began practice of med., 1923; has served on staff of Provident Hosp., Chicago; med. examiner, Am. Woodmen, 1930-31; past pres., Cook Co. Physicians Assn.; mem. Elks, Alpha Phi Alpha. Methodist. Address: 5642 S. State St., Chicago, 111.\n'

In [38]:
# extract required data from each person's string into a list of dictionaries
bio_data = []
for i, person in enumerate(bio_split[1:]): #0th element unnecessary
    
    bio_dict = {}

    # occupation and name
    occ = re.search(r",[ ]?[^A-Z]+[;:j ](?=b[\.,])", name_occ_list[i]) 
    if occ is not None:
        # strip irrelevant chars from sides, then replace any remaining \n within string
        bio_dict["occ"] = occ.group().strip("-–—\n ,;:").replace("\n"," ") 
    else:
        bio_dict["occ"] = None
    
    # name
    if occ is not None:
        # remove occupation from name string by taking first item after split
        bio_dict["name"] = name_occ_list[i].split(occ.group())[0].replace("\n"," ") 
    else:
        # try to find name regardless using pattern
        name = re.search(r"[A-Z\., (?:Mc)]+(?=,)", name_occ_list[i])
        if name is not None:
            bio_dict["name"] =  name.group().replace("\n"," ")
        else:
            bio_dict["name"] = name_occ_list[i]
    
    # birth details: birthdate and birthplace
    birthdetails = re.search(r"(?<=b\.).*?;", person) 
    if birthdetails is not None:
        # date of birth + place of birth
        birthdate = re.search(r"[A-Z][a-zA-Z\. ]+\d+[,\.].*?\d{4}", birthdetails.group())
        if birthdate is not None:
            bio_dict["birthdate"] = birthdate.group().replace("\n"," ")
            # remove birthdate from birthdetails string by taking first item after split
            bio_dict["birthplace"] = birthdetails.group().split(birthdate.group())[0].strip(";, ").replace("\n"," ")
        else:
            bio_dict["birthdate"] = None
            bio_dict["birthplace"] = None  
    else:
        bio_dict["birthdate"] = None
        bio_dict["birthplace"] = None          
            
    # address
    # first check for Business[:]? address:....Home: OR Business[:]? address:...Home address:
    business_home_address = re.search(r"Business[:;]? address[:;]([\s\S]+?)Home(?: address)?[:;]([\s\S]+)", person)
    # if there are indeed 2 addresses
    if business_home_address is not None:
        # curaddress = business address, residence = home address
        bio_dict["curaddress"] = business_home_address.group(1).replace("\n"," ").strip(" ")
        bio_dict["residence"] = business_home_address.group(2).replace("\n"," ").strip(" ")
    else:
        # next check for Address: or Addresses:
        # assume this to be home residence
        bio_dict["curaddress"] = None
        addresses = re.search(r"Address(?:es)?[:;]([\s\S]+)", person)
        if addresses is not None:
            bio_dict["residence"] = addresses.group(1).replace("\n"," ").strip(" ")
        else:
            bio_dict["residence"] = None
    
    bio_data.append(bio_dict)

In [39]:
# convert to dataframe
biodf = pd.DataFrame(bio_data).fillna('')

# format/rearrange df columns
biodf.columns.values
biodf = biodf[['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence']]
biodf.index.name = "persid"

# correct Illinois OCR errors
illinois_errors = [',[ ]?[iI1lL]{3}[\.]?', ',[ ]?[HLDnU][iI1lL]\.', ',[ ]?IUL\.', ',[ ]?m.', ',[ ]?IU']
biodf['curaddress'] = biodf['curaddress'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['residence'] = biodf['residence'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['birthplace'] = biodf['birthplace'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)

# correct New York errors
biodf['curaddress'] = biodf['curaddress'].replace('New Yorl', 'New York')
biodf['residence'] = biodf['residence'].replace('New Yorl', 'New York')

In [40]:
# cleaning dataframe

# convert name col to combined string
name_string = ''.join(biodf['name'].tolist())
non_letters = re.findall("[^a-zA-Z,\. ]", name_string)
print(set(non_letters))

# find all names with -
biodf[biodf['name'].str.contains('-')] # from WIW pdf, only #2487 has '-' in name
# replace - for all other names
biodf.iloc[224]['name'] = biodf.iloc[224]['name'].replace('-', '')
biodf.iloc[2249]['name'] = biodf.iloc[2249]['name'].replace('-', '')

# find all names with 0,1,8
biodf[biodf['name'].str.contains(r'[0138]')]
# replace 1 with I
biodf['name'] = biodf['name'].str.replace("1", "I")
# replace 0 with O
biodf['name'] = biodf['name'].str.replace("0", "O")
# replace 8 with S
biodf['name'] = biodf['name'].str.replace("8", "S")
# replace (3 with GI
biodf['name'] = biodf['name'].str.replace("\(3", "GI")
# remove *
biodf['name'] = biodf['name'].str.replace("*", "")
# remove names with WHO'S WHO IN 
biodf['name'] = biodf['name'].str.replace("WHO'S WHO IN ", "")

# extract out bracketed names to name_maiden
biodf['name_maiden'] = biodf['name'].str.extract(r"\(([\s\S]+)\)")
biodf['name_maiden'] = biodf['name_maiden'].str.replace("nee ", "").replace("née ", "").replace("Mrs.", "")
biodf["name"] = biodf["name"].str.replace(r"\([\s\S]+\)", "").str.replace(r"  ", " ")

# extract out miscaptured occ strings in name and add to occ
# remove miscaptured b. strings first
biodf["name"] = biodf["name"].str.replace(";b\.", "")
biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'occ'] = (biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'name'].str.extract(r"; ([\s\S]+)", expand=False) + ", ") + biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'occ'].astype(str)
biodf["name"] = biodf["name"].str.replace(";.*", "")

{'8', ')', "'", '0', '*', '(', '-', ';', '3', '1', '\ufeff'}


In [41]:
# read state name csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
statedf = pd.read_csv("state_names.csv")

# create state abbreviation list
state_abbs = statedf[["SNAME1", "SNAME2", "SNAME3", "SNAME4", "SNAME5"]].stack().reset_index()[0].tolist()
state_abbs.remove("O.") # remove problematic abbreviation (in this case, output is unaffected)

# add . to any state abbreviation not ending with . to avoid false match
state_abbs_new = [abb + "." if abb[-1] != "." else abb for abb in state_abbs ]
        
# formulate regex argument
state_abbs_string = "|".join(state_abbs_new).replace(".", "\.")
print(state_abbs_string)

# cut address/residence to state 
# note that this might cut addresses with state abbreviations at start of address
# only alter matches with state name in string

# curaddress almost perfectly accurate, only 2 to be adjusted
biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'] = biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)
# residence is messier, requires more cleaning
biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'] = biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)

Ala\.|Ala\.|Alaska\.|Alaska\.|Alas\.|Ariz\.|Ariz\.|Az\.|Ark\.|Ark\.|Calif\.|Calif\.|Ca\.|Cal\.|Colo\.|Colo\.|Conn\.|Conn\.|Ct\.|Del\.|Del\.|De\.|D\.C\.|D\. C\.|Wash\. D\.C\.|Fla\.|Fla\.|Fl\.|Flor\.|Ga\.|Ga\.|Hawaii\.|Hawaii\.|H\.I\.|Idaho\.|Idaho\.|Id\.|Ida\.|Ill\.|Ill\.|Il\.|Ills\.|Ill's\.|Ind\.|Ind\.|In\.|Iowa\.|Iowa\.|Ia\.|Ioa\.|Kans\.|Kan\.|Ks\.|Ka\.|Ky\.|Ky\.|Ken\.|Kent\.|La\.|La\.|Maine\.|Maine\.|Me\.|Md\.|Md\.|Mass\.|Mass\.|Mich\.|Mich\.|Minn\.|Minn\.|Mn\.|Miss\.|Miss\.|Mo\.|Mo\.|Mont\.|Mont\.|Nebr\.|Neb\.|Nev\.|Nev\.|Nv\.|N\.H\.|N\. H\.|N\.J\.|N\. J\.|N\. Jersey\.|N\. Mex\.|N\. M\.|New M\.|N\.Y\.|N\. Y\.|N\. York\.|N\. Y\.|N\.C\.|N\. C\.|N\. Car\.|N\. Dak\.|N\. D\.|NoDak\.|N\.Dak\.|Ohio\.|Ohio\.|Oh\.|Okla\.|Okla\.|Ok\.|Oreg\.|Ore\.|Or\.|Pa\.|Pa\.|Penn\.|Penna\.|R\.I\.|R\. I\.|R I\. \.|R\. Isl\.|R\.Isl\.|S\.C\.|S\. C\.|S\. Car\.|S\. Dak\.|S\. D\.|SoDak\.|S\.Dak\.|Tenn\.|Tenn\.|Tex\.|Texas\.|Tx\.|Utah\.|Utah\.|Ut\.|Vt\.|Vt\.|Va\.|Va\.|Virg\.|Wash\.|Wash\.|Wa\.|Wn\.|W\. Va\.|W\.Va

In [42]:
# find invalid letters from combined string
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

address_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

{'&', ':', ')', '?', '*', '/', '^', '(', '#', ';', '$', '|', ']', '►'}
{'■', '?', '%', '~', '#', '<', ':', ')', '/', '!', '^', '"', '\ufeff', '&', '\\', '*', '(', ';', '$'}


In [43]:
# replace all redundant chars
for char in ['►', '\*', '\$', 'c/o', '\^', '\|', '\?', '\([\s\S]+?\)', '■', '<', '\)', '~', '\"', '%', '!', r'\\', '#']: 
    biodf['curaddress'] = biodf['curaddress'].str.replace(char, "")
    biodf['residence'] = biodf['residence'].str.replace(char, "")

biodf['curaddress'] = biodf['curaddress'].str.replace("\]", "l")
biodf['residence'] = biodf['residence'].str.replace("\]", "l")

# find the remaining list of non letters
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

address_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

{'/', '&', ':', ';'}
{'&', ':', '/', ';', '\ufeff'}


In [44]:
biodf = biodf.fillna("")
# replace all double spaces with single space
biodf = biodf.replace("  ", " ")

# strip any spaces 
for col in ['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence', 'name_maiden']:
    biodf[col] = biodf[col].str.strip(" ")

biodf.iloc[407]

name                         CARPENTER, MARCUS EDWARD
birthplace                                Jersey City
birthdate                         N.J. March 21, 1907
occ                                         physician
curaddress          99 Storms Ave., Jersey City, N.J.
residence      253 Monticello Ave., Jersey City, N.J.
name_maiden                                          
Name: 407, dtype: object

In [45]:
# Make the names easier for matching by expanding the commonly used acronyms 

# Make a list of common states acronyms (N.Y. is New York) from a csv file. Store it as dictionary.
state_acron={}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        for i in range(5):
            if row[i] != "": state_acron[row[i]]=row[5]

# Sometimes it's "N Y University", not "N.Y. University" in the data
# So next we create the same dictionary of states but without dots. 
    state_acron_no_dots={}
    with open('state_names_expanded_2.csv', mode='r') as infile:
        infile.readline()
        reader = csv.reader(infile)
        for row in reader:
            for i in range(5):
                row[i] = re.sub("\.", "", row[i]) 
                if row[i] != "": state_acron_no_dots[row[i]]=row[5]                

# create a function that expands the acronyms for a given column 
def expand_acronyms(list_institutions):
    # Define the regex patterns before looping
    pattern1 = re.compile(r'\b(' + '|'.join(sorted(re.escape(k) for k in state_acron)) + r')\b')
    pattern2 = re.compile(r'(\s|,|^)(' + '|'.join(re.escape(key) for key in state_acron_no_dots.keys()) + r')(\s|,|$)')
    # use the state acronyms dictionary to expand the names to the full ones.
    list_institutions_return = list_institutions.copy()
    for i, place in enumerate(list_institutions):
        if place:
            # there is no need to care about the acronyms being separate words, 
            # because at this point the acronyms in the dictionary always have dots.
            place = re.sub(pattern1, lambda m: state_acron.get(m.group(0)), place)
            #### use the acronym dictionary to make changes
            place = re.sub('\.', ' ', place)
            place = re.sub("[\(\)]", "", place)
            #remove multiple spaces and change "U S" -> "US", "Poly tech" -> "Polytech", "De Paul/w" -> "DePaul/w"
            place = re.sub(r'\s+', ' ', place)
            place = place.replace('U S', 
                                  'US')

            #### finally, use the state dictionary without dots
    
            # use the dictionary of state acronyms to clean the list
            place = re.sub(pattern2, lambda m: " " + state_acron_no_dots.get(m.group(0).strip(' ,')) + " ", place)
            #remove double, triple etc spaces
            place = re.sub(r'\s+', ' ', place).strip(' ,')
            #remove spaces before commas
            place = re.sub(r' ,', ',', place)
            #### "Nat. Sciences" is "Natural Sciences", but in all other cases "Nat" means "National".  
            place = re.sub('Nat Sci', 'Natural Sci', place)
            place = re.sub('Nat\s', 'National ', place)
            list_institutions_return[i] = place
#             if i<15 or i>82150:
#                 print(i, place, list_institutions[i])
    return(list_institutions_return)


In [46]:
# replace NaN values for empty cells with an empty string
biodf = biodf.fillna("")

# expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']]    

# cities with missing states usually have their states miscaptured in birthdate string
# for entries with expanded birthplace = birthplace AND state abb in birthdate, extract out state to birthplace
biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthplace"] += ", " + biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthdate"].str.extract(rf"({state_abbs_string})", expand=False)
# remove state abb from birthdate and clean entry
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.replace(rf"{state_abbs_string}", "")
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.strip(" ,.")

# again, expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']] 

# further expand unrecognized abbreviations/errors
missed_abbs = {r'W[Iil][Ss]': "Wisconsin", 
                   "Maas": "Massachusetts", 
                       "Miss/": "Mississippi",
                          "Coon": "Connecticut",
                              "fey": "Kentucky",
                                  "8 C": "South Carolina",
                                      r",[ ]?la": ", Iowa",
                                          r"Col$": "Colorado",
                                              "Teim": "Tennessee",
                                                  "N T": "New York",
                                                      "Mb": "Missouri",
                                                        "Term": "Tennessee",
                                                          "Tez": "Texas"}

# expand these wrong abbreviations
biodf['birthplace_exp'] = biodf['birthplace_exp'].replace(missed_abbs, regex=True)  
    
# remove remaining -s
# this will alter a few words where - was actually intended
biodf['birthplace_exp'] = biodf['birthplace_exp'].str.replace(r"[ ]*-[ ]*", "", regex = True)

biodf.iloc[407]

name                            CARPENTER, MARCUS EDWARD
name_maiden                                             
birthplace                             Jersey City, N.J.
birthplace_exp                   Jersey City, New Jersey
birthdate                                 March 21, 1907
occ                                            physician
curaddress             99 Storms Ave., Jersey City, N.J.
residence         253 Monticello Ave., Jersey City, N.J.
Name: 407, dtype: object

In [47]:
# split birthplace_exp into location and state name

# use expanded state name csv
expstate_df = pd.read_csv("state_names_expanded_2.csv").fillna("")

# function that only returns uniques from list (in order)
def unique(sequence):
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

# create state name list, also remove duplicates
state_names = unique(list(expstate_df["FULLSNAME"]))
        
# formulate regex argument
state_names_string = "|".join(state_names)
print(state_names_string)

# first extract only state name
biodf['birthplace_st'] = biodf['birthplace_exp'].str.extract(fr"\b({state_names_string})\b$")

# then extract only the string before state name (anything behind is removed) by splitting birthplace_exp using birthplace_st
# in this case note that FOREIGN state names apply to all outside US/Canada
# FOREIGN states will have varying formats in birthplace_loc and birthplace_st due to varying foreign birthplace formats in dataset
biodf['birthplace_loc'] = biodf.apply(lambda row : re.split(str(row['birthplace_st']), str(row['birthplace_exp']))[0], axis=1).str.strip(', ')

biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthplace_loc', 'birthplace_st', 'birthdate', 'occ', 'curaddress', 'residence']]  

Alberta|British Columbia|Manitoba|New Brunswick|Newfoundland and Labrador|Northwest Territories|Nova Scotia|Ontario|Prince Edward Island|Quebec|Saskatchewan|Alabama|Alaska|Arizona|Arkansas|California|Canada|Colorado|Connecticut|Delaware|District of Columbia|Florida|FOREIGN|Puerto Rico|US Virgin Islands|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|West Virginia|Virginia|Washington|Wisconsin|Wyoming|Yukon|Guam|American Samoa


In [48]:
# make a dictionary of foreign/non-foreign states
is_state_foreign={}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        if row[5] != "": is_state_foreign[row[5]] = (row[8])

# note the definition for FOREIGN column
# 0: US states, including US territories
# 1: foreign states, including Canada
# people without recorded birthplaces have empty entry
biodf['FOREIGN'] = biodf['birthplace_st'].map(is_state_foreign) 
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ', 'curaddress', 'residence']] 

In [49]:
# limit all columns to 100 characters
biodf = biodf.apply(lambda x: x.str.slice(0, 100))

# further limit all columns except curaddress and residence to 70 characters
biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']] = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']].apply(lambda x: x.str.slice(0, 70)) 

In [50]:
# write to csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
biodf.to_csv('1950_bio_data_3.csv', encoding="utf-8")

In [51]:
# count how many times a state appears in birthplace_st col
# original dataframe is sorted by count
state_count_df = biodf['birthplace_st'].value_counts().to_frame()
state_count_df = state_count_df.rename(columns = {"birthplace_st": "Count"})
state_count_df.index.name = "State"

# for now, this is sorted alphabetically, not by count
# for the sake of counting, Canadian states have been separated out of FOREIGN states
state_count_df.sort_index() # remove this line if you want alphabetical order
state_count_df.to_csv('1950_state_count.csv', encoding="utf-8")

In [52]:
# count total per vocation
# read vocational index
voca_string = open(r"index_voca.txt", "r", encoding="utf8").read()

# clean text string
voca_string = re.sub(r"U\.[ ]?[8t]\.", "U.S.", voca_string)
for string in ["Digitized by", "Google", "UNIVERSITY OF MICHIGAN", "Original from"]:
        voca_string = voca_string.replace(string, "")
voca_string = voca_string.replace("  ", " ")

# find all vocations
voca_list = re.findall(r"\n.*?[A-Z]{4,}.*?\n", voca_string)
print("Number of vocations:", len(voca_list))

# split base string into a list of strings, each starting with voca

# format vocas into voca|voca1|name2|...
voca_template = "({})".format("|".join(re.escape(s) for s in voca_list))

# split base string based on vocas
voca_split = re.split(voca_template, voca_string) 

# join back the vocas to the front of each string
voca_split = ["".join(x) for x in itertools.zip_longest([""] + voca_split[1::2], voca_split[::2], fillvalue='')]

Number of vocations: 122


In [53]:
# count per vocation
# creates a dataframe with just names, but can be easily edited to include vocation
# this can be copied for geographical distribution file as well
total = 0
name_list = []
for voca in voca_split[2:]: #start from first vocation
    # format: John, D. OR John, D.O. OR John, D.O.E.
    voca_names = re.findall(r"[A-Z].*?,[ ]?[A-Z0-9]\.[ ]?[A-Z0-9]?[\.]?[A-Z0-9]?[\.]?", voca)
    total += len(voca_names)
    df = pd.DataFrame({'name':name_list})
    name_list += voca_names

print("Total number of names:", total) 

# sort alphabetically and also account for names being listed under multiple occupations
# may exclude rare instances where more than one person has same name and initials
name_list = list(set(name_list))
name_list.sort()
voca_df = pd.DataFrame(name_list)
voca_df.columns = ["name"]
voca_df.index.name = "persid"
voca_df["name"] = voca_df["name"].str.upper()

print("Total number of unique people:", len(name_list))

# function to find duplicates
def list_duplicates(seq):
    seen = set()
    seen_add = seen.add
    # adds all elements it doesn't know yet to seen and all other to seen_twice
    seen_twice = set( x for x in seq if x in seen or seen_add(x) )
    # turn the set into a list
    return list(seen_twice)

Total number of names: 3143
Total number of unique people: 2865


In [54]:
# write name list to csv
voca_df.to_csv('1950_voca_names.csv', encoding="utf-8")

In [55]:
# verifying script
print("Total number of people in main text:", len(biodf))
print("Total number of unique people in vocational index:", len(name_list))
print("Extraction rate (%):", len(biodf)/len(set(name_list))*100)

Total number of people in main text: 2789
Total number of unique people in vocational index: 2865
Extraction rate (%): 97.34729493891797
