In [109]:
# python 3.7.6
import csv #version 1.0
import os 
import re #version 2.2.1
import pandas as pd #version 1.0.1
import itertools

In [110]:
# navigate to ocr_split folder
os.chdir(r"C:\Users\byron\Dropbox\WIW\1927_WIW\ocr_split")

# first process index to find total

# read summary index
file_string = open(r"{}".format("1927_ZZINDEX.txt"), "r", encoding="utf8").read()

# find all entries in index using ---
index_list = re.findall(r"\n.*[A-Z].*\n.*[\_\-\–\—\.]{3,}", file_string)
print("Total number of people:", len(index_list))

bio_index_data = []
for person_string in index_list:
    
    bio_dict = {}
    
    # split each entry into name+occ and residence
    info_split = re.split("\n|[\_\-\–\—\.]{3,}", person_string)
    
    # name + occupation
    name_occ_string = info_split[1]
    name_occ_split = re.split("—", name_occ_string)
    
    # if no split, only name but no occupation
    if len(name_occ_split) == 1:
        bio_dict["name"] = name_occ_split[0].strip("-–—\n ^,\/").replace("\n"," ")
        bio_dict["occ"] = None
    else:
        bio_dict["name"] = name_occ_split[0].strip("-–—\n ^,\/").replace("\n"," ")
        bio_dict["occ"] = name_occ_split[1].strip("-–—\n ,").replace("\n"," ")
    
    # residence
    bio_dict["residence"] = info_split[2]
    
    bio_index_data.append(bio_dict)

# use pandas dataframe
index_df = pd.DataFrame(bio_index_data)
index_df.index.name = "persid"
index_df

Total number of people: 1866


Unnamed: 0_level_0,name,occ,residence
persid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Abbott, Robert Sengstacke",Publisher-Editor.,"4847 Champlain Ave., Chicago, 111"
1,"Adams, Alton Augustus",Bandmaster.,"St. Thomas, Virgin Islands"
2,"Adams, C. P.",Principal.,"Qrambling, La"
3,"Adams, George W.",Physician.,"518 U St., N.W., Washington, D. C"
4,"Adams, James B.",Clergyman.,"170 Halsey St, Brooklyn, N. Y"
...,...,...,...
1861,"Roddy, B. M.",Vice-President.,"390 Beale St, Memphis, Tenn"
1862,"Rooks, William D.","Fraternal Officer. Newberry, S. C",
1863,"Rogers Mem. Baptist Church, Knoxville, Tenn. 2...",Clergyman.,"Drawer 406, Bristol, Tenn"
1864,"Williams, Adam D.",Clergyman.,"315 Auburn Ave., Atlanta, Oa"


In [111]:
# write to csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
index_df.to_csv('1927_bio_data_index.csv', encoding="utf-8")

In [112]:
# second, process main text

# navigate to ocr_split folder
os.chdir(r"C:\Users\byron\Dropbox\WIW\1927_WIW\ocr_split")

# clean and gather information from main text
bio_string = ""
for file in os.listdir()[1:-1]:
    file_string = open(r"{}".format(file), "r", encoding="utf8").read()
    
    # replace duplicates
    file_string = file_string.replace("..", ".")
    file_string = file_string.replace("  ", " ")
    file_string = file_string.replace("—", "-")

    # remove redundant strings and line breaks
    for string in ["Digitized by", "UNIVERSITY OF MICHIGAN", "Original from","-\n","_\n"]:
        file_string = file_string.replace(string, "") 
    
    file_string = file_string.replace("\n\n", "\n")
    
    # remove certain non-ASCII chars
    for apos in ["‘", "’"]:
        file_string = file_string.replace(apos, "\'")

    for apos in ["“", "”"]:
        file_string = file_string.replace(apos, "\"")

    for char in ["±", "■", "©", "™", "•", "£", "°", "«", "»"]:
        file_string = file_string.replace(char, "")
    
    # replace inaccurate OCRs
    file_string = file_string.replace("VICIL SAMUEL H^6.", "VICK, SAMUEL H.-b.")
    file_string = file_string.replace("Wil-liam", "William")
    file_string = file_string.replace("Bonnet Carr6", "Bonnet Carre")
     
    file_string = file_string.replace("„", ",")
    
    bio_string += file_string

# final check for non-ascii chars
rem_ascii = re.findall("[^\x00-\x7F]", bio_string)
print(set(rem_ascii))

{'\ufeff'}


In [113]:
# find all names+occupations using regex
name_occ_list = re.findall(r"[A-Z(?:Mc)]{4,}[ ]?[\., ][ ]?[A-Z(?:Mc)].+?[-–—]{1,}.+?(?=b\.|b,|\.)", bio_string)

print(len(name_occ_list))

# split base string into a list of strings, each starting with person name

# format names into name|name1|name2|...
name_template = "({})".format("|".join(re.escape(s) for s in name_occ_list))

# split base string based on names
bio_string_split = re.split(name_template, bio_string) 

# join back the names to the front of each string
bio_split = ["".join(x) for x in itertools.zip_longest([""] + bio_string_split[1::2], bio_string_split[::2], fillvalue='')]

1998


In [114]:
# extract required data from each person's string into a list of dictionaries
bio_data = []
for i, person in enumerate(bio_split[1:]): # 0th element unnecessary
    
    bio_dict = {}

    # occupation and name
    # match any char except b (if b. is found that implies no occupation listed)
    occ = re.search(r"[-–—]{1,}[^b]+", name_occ_list[i]) 
    if occ is not None:
        # strip irrelevant chars from sides, then replace any remaining \n within string
        bio_dict["occ"] = occ.group().strip("-–—\n ,;:").replace("\n"," ") 
    else:
        bio_dict["occ"] = None
    
    # name
    if occ is not None:
        # remove occupation from name string by taking first item after split
        bio_dict["name"] = name_occ_list[i].split(occ.group())[0].replace("\n"," ") 
    else:
        # try to find name regardless using namestring pattern
        name = re.search(r"[A-Z\., (?:Mc)]+(?=,)", name_occ_list[i])
        if name is not None:
            bio_dict["name"] =  name.group().replace("\n"," ")
        else:
            bio_dict["name"] = name_occ_list[i]
    
    # birth details: birthdate and birthplace
    birthdetails = re.search(r"(?<=b\.|b,).*?;", person) 
    if birthdetails is not None:
        # date of birth + place of birth
        birthdate = re.search(r"[A-Z][a-zA-Z\. ]+\d+[,\.].*?\d{4}", birthdetails.group())
        
        if birthdate is not None:
            bio_dict["birthdate"] = birthdate.group().replace("\n"," ")
            # remove birthdate from birthdetails string by taking second item after split
            bio_dict["birthplace"] = birthdetails.group().split(birthdate.group())[1].strip(";, ").replace("\n"," ")
        else:
            bio_dict["birthdate"] = None
            birthplace = re.search(r"[A-Z][A-Za-z\., ]+,[ ]?[A-Z][A-Za-z\., ]+", birthdetails.group())
            if birthplace is not None:
                bio_dict["birthplace"] = birthplace.group().strip(";, ").replace("\n"," ")
            else:
                bio_dict["birthplace"] = None
        
    # office address and home residence
    # first search for most common: single address
    address = re.search(r";[ ]?(?:Address|Residence),(.*)", person)
    
    if address is not None:
        # assume single address/residence to be home residence
        bio_dict["curaddress"] = None
        bio_dict["residence"] = address.group(1).strip(";, ").replace("\n"," ")
        
    else:
        # now search for office address AND home residence
        # 1st term is always Business Address/Office, 2nd term is always Home Address/Residence
        address_residence = re.search(r"(?:Address,|Office,)(.*?)(?:Address,|Residence,)(.*)", person) 
        
        if address_residence is not None:
            bio_dict["curaddress"] = address_residence.group(1).strip(";, ").replace("\n"," ")
            bio_dict["residence"] = address_residence.group(2).strip(";, ").replace("\n"," ")
        
        else:
            bio_dict["curaddress"] = None
            bio_dict["residence"] = None
    
    bio_data.append(bio_dict)

In [115]:
# convert to dataframe
biodf = pd.DataFrame(bio_data).fillna("")

# rearrange columns
biodf.columns.values
biodf = biodf[['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence']]
biodf.index.name = "persid"

# correct Illinois OCR errors
illinois_errors = ['[,\.][ ]?[TiI1lL]{3}[ \.]', '[,\.][ ]?[HLDnU][TiI1lL][ \.]', '[,\.][ ]?IUL[ \.]', '[,\.][ ]?m.', ',[ ]?IU', ', I1L', ', tit']
biodf['curaddress'] = biodf['curaddress'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['residence'] = biodf['residence'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['birthplace'] = biodf['birthplace'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)

# correct NY/NC errors
biodf['curaddress'] = biodf['curaddress'].replace('N[ ]?[,\.][ ]?Y[ ]?[\s\S]', 'N. Y.', regex = True)
biodf['residence'] = biodf['residence'].replace('N[ ]?[,\.][ ]?Y[ ]?[\s\S]', 'N. Y.', regex = True)
biodf['curaddress'] = biodf['curaddress'].replace(', L\. lv ', ', N. Y.', regex = True)
biodf['residence'] = biodf['residence'].replace(', L\. lv ', ', N. Y.', regex = True)
biodf['curaddress'] = biodf['curaddress'].replace('N[ ]?[,\.][ ]?C[ ]?[\s\S]', 'N. C.', regex = True)
biodf['residence'] = biodf['residence'].replace('N[ ]?[,\.][ ]?C[ ]?[\s\S]', 'N. C.', regex = True)

In [116]:
# clean names

# convert name col to combined string
name_string = ''.join(biodf['name'].tolist())
non_letters = re.findall("[^a-zA-Z,\. ]", name_string)
print(set(non_letters))

biodf['name'] = biodf['name'].str.replace("1", "I")
biodf['name'] = biodf['name'].str.replace("*", ".")
biodf['name'] = biodf['name'].str.replace("\^", ".")
biodf['name'] = biodf['name'].str.replace("!", "")
biodf['name'] = biodf['name'].str.replace("-b", "")

# extract out bracketed names to name_maiden
biodf['name_maiden'] = biodf['name'].str.extract(r"\(([\s\S]+)\)")

for string in ["nee", "née", r"M[rR][sS]\.", "M[iI][sS]{2}", "Mr\.", "Mme\."]:
    biodf['name_maiden'] = biodf['name_maiden'].str.replace(string, "")
biodf['name_maiden'] = biodf['name_maiden'].str.strip(" .,")    
biodf["name"] = biodf["name"].str.replace(r"\([\s\S]+\)", "").str.replace(r"  ", " ").str.strip(" .,")

# find all names with -
biodf[biodf['name'].str.contains('[^a-zA-Z,\. ]')] 

# final check
name_string = ''.join(biodf['name'].tolist())
non_letters = re.findall("[^a-zA-Z,\. ]", name_string)
print(set(non_letters))

{'!', '-', '^', ')', '"', '*', "'", '('}
{"'"}


In [117]:
# clean address and residence

biodf['curaddress'] = biodf['curaddress'].str.replace(r"[;:,]?[ ]?Home", "")

# read state name csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
statedf = pd.read_csv("state_names.csv")

# create state abbreviation list
state_abbs = statedf[["SNAME1", "SNAME2", "SNAME3", "SNAME4", "SNAME5"]].stack().reset_index()[0].tolist()
state_abbs.remove("O.") # remove problematic abbreviation (in this case, output is unaffected)

# add . to any state abbreviation not ending with . to avoid false match
state_abbs_new = [abb + "." if abb[-1] != "." else abb for abb in state_abbs ]
        
# formulate regex argument
state_abbs_string = "|".join(state_abbs_new).replace(".", "\.")
print(state_abbs_string)

# cut address/residence to state 
# note that this might cut addresses with state abbreviations at start of address
# only alter matches with state name in string

# curaddress
biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'] = biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)
# residence
biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'] = biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)

Ala\.|Ala\.|Alaska\.|Alaska\.|Alas\.|Ariz\.|Ariz\.|Az\.|Ark\.|Ark\.|Calif\.|Calif\.|Ca\.|Cal\.|Colo\.|Colo\.|Conn\.|Conn\.|Ct\.|Del\.|Del\.|De\.|D\.C\.|D\. C\.|Wash\. D\.C\.|Fla\.|Fla\.|Fl\.|Flor\.|Ga\.|Ga\.|Hawaii\.|Hawaii\.|H\.I\.|Idaho\.|Idaho\.|Id\.|Ida\.|Ill\.|Ill\.|Il\.|Ills\.|Ill's\.|Ind\.|Ind\.|In\.|Iowa\.|Iowa\.|Ia\.|Ioa\.|Kans\.|Kan\.|Ks\.|Ka\.|Ky\.|Ky\.|Ken\.|Kent\.|La\.|La\.|Maine\.|Maine\.|Me\.|Md\.|Md\.|Mass\.|Mass\.|Mich\.|Mich\.|Minn\.|Minn\.|Mn\.|Miss\.|Miss\.|Mo\.|Mo\.|Mont\.|Mont\.|Nebr\.|Neb\.|Nev\.|Nev\.|Nv\.|N\.H\.|N\. H\.|N\.J\.|N\. J\.|N\. Jersey\.|N\. Mex\.|N\. M\.|New M\.|N\.Y\.|N\. Y\.|N\. York\.|N\. Y\.|N\.C\.|N\. C\.|N\. Car\.|N\. Dak\.|N\. D\.|NoDak\.|N\.Dak\.|Ohio\.|Ohio\.|Oh\.|Okla\.|Okla\.|Ok\.|Oreg\.|Ore\.|Or\.|Pa\.|Pa\.|Penn\.|Penna\.|R\.I\.|R\. I\.|R I\. \.|R\. Isl\.|R\.Isl\.|S\.C\.|S\. C\.|S\. Car\.|S\. Dak\.|S\. D\.|SoDak\.|S\.Dak\.|Tenn\.|Tenn\.|Tex\.|Texas\.|Tx\.|Utah\.|Utah\.|Ut\.|Vt\.|Vt\.|Va\.|Va\.|Virg\.|Wash\.|Wash\.|Wa\.|Wn\.|W\. Va\.|W\.Va

In [118]:
# find invalid letters from combined string
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

residence_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", residence_string)
print(set(non_letters))

# replace all redundant chars
for char in ['►', '\*', '\$', 'c/o', '\^', '\|', '\?', r'\([\s\S]+?\)', '■', '<', '\)', '~', '\"', '%', '!', '\\', '#', '\t']: 
    biodf['curaddress'] = biodf['curaddress'].str.replace(char, "")
    biodf['residence'] = biodf['residence'].str.replace(char, "")

# final check
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

residence_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", residence_string)
print(set(non_letters))

{'|', '#', ';', '^', '&', '\\', ')', '*', '(', '/'}
{'#', '!', '\t', ';', '^', '&', '*', '/'}
{'&', ';'}
{'&', ';', '/'}


In [119]:
biodf = biodf.fillna("")
# replace all double spaces with single space
biodf = biodf.replace("  ", " ")

# strip any spaces 
for col in ['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence', 'name_maiden']:
    biodf[col] = biodf[col].str.strip(" ")

biodf.head()

Unnamed: 0_level_0,name,birthplace,birthdate,occ,curaddress,residence,name_maiden
persid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,"ABBOTT, ROBERT SENGSTACKE","Savannah, Ga.","Nov. 24, 1870",Pu,"3435 Indiana Ave., Chicago, Ill.","4847 Champlain Ave., Chicago, Ill.",
1,"ADAMS, ALTON AUGUSTUS","St. Thomas, Virgin Islands","Nov. 4, 1889",Bandmaster,"545 P.O. Box, St. Thomas, Virgin Islands","St. Thomas, Virgin Islands.",
2,"ADAMS, C. P",,,Principal,,"Grambling, La.",
3,"ADAMS, GEORGE W",,,Physician,,"518 U St., N. W., Washington, D. C.",
4,"ADAMS, JAMES B","Montezuma, Ga.","Dec. 2, 1892",Clergyman,,"170 Halsey St., Brooklyn, N. Y.",


In [120]:
# Make the names easier for matching by expanding the commonly used acronyms 

#Make a list of common states acronyms (N.Y. is New York) from a csv file. Store it as dictionary.
state_acron={}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        for i in range(5):
            if row[i] != "": state_acron[row[i]]=row[5]

# Sometimes it's "N Y University", not "N.Y. University" in the data
# So next we create the same dictionary of states but without dots. 
    state_acron_no_dots={}
    with open('state_names_expanded_2.csv', mode='r') as infile:
        infile.readline()
        reader = csv.reader(infile)
        for row in reader:
            for i in range(5):
                row[i] = re.sub("\.", "", row[i]) 
                if row[i] != "": state_acron_no_dots[row[i]]=row[5]                

#create a function that expands the acronyms for a given column 
def expand_acronyms(list_institutions):
    # Define the regex patterns before looping
    pattern1 = re.compile(r'\b(' + '|'.join(sorted(re.escape(k) for k in state_acron)) + r')\b')
    pattern2 = re.compile(r'(\s|,|^)(' + '|'.join(re.escape(key) for key in state_acron_no_dots.keys()) + r')(\s|,|$)')
    # use the state acronyms dictionary to expand the names to the full ones.
    list_institutions_return = list_institutions.copy()
    for i, place in enumerate(list_institutions):
        if place:
            # there is no need to care about the acronyms being separate words, 
            # because at this point the acronyms in the dictionary always have dots.
            place = re.sub(pattern1, lambda m: state_acron.get(m.group(0)), place)
            #### use the acronym dictionary to make changes
            place = re.sub('\.', ' ', place)
            place = re.sub("[\(\)]", "", place)
            #remove multiple spaces and change "U S" -> "US", "Poly tech" -> "Polytech", "De Paul/w" -> "DePaul/w"
            place = re.sub(r'\s+', ' ', place)
            place = place.replace('U S', 
                                  'US')

            #### finally, use the state dictionary without dots
    
            # use the dictionary of state acronyms to clean the list
            place = re.sub(pattern2, lambda m: " " + state_acron_no_dots.get(m.group(0).strip(' ,')) + " ", place)
            #remove double, triple etc spaces
            place = re.sub(r'\s+', ' ', place).strip(' ,')
            #remove spaces before commas
            place = re.sub(r' ,', ',', place)
            #### "Nat. Sciences" is "Natural Sciences", but in all other cases "Nat" means "National".  
            place = re.sub('Nat Sci', 'Natural Sci', place)
            place = re.sub('Nat\s', 'National ', place)
            list_institutions_return[i] = place
#             if i<15 or i>82150:
#                 print(i, place, list_institutions[i])
    return(list_institutions_return)

In [121]:
# replace NaN values for empty cells with an empty string
biodf = biodf.fillna("")

# expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']]    

# cities with missing states usually have their states miscaptured in birthdate string
# for entries with expanded birthplace = birthplace AND state abb in birthdate, extract out state to birthplace
biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthplace"] += ", " + biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthdate"].str.extract(rf"({state_abbs_string})", expand=False)
# remove state abb from birthdate and clean entry
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.replace(rf"{state_abbs_string}", "")
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.strip(" ,.")

# again, expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']]   

# further expand unrecognized abbreviations/errors
missed_abbs = {r'W[Iil][Ss]': "Wisconsin", 
                   "Maas": "Massachusetts", 
                       "Miss/": "Mississippi",
                          "Coon": "Connecticut",
                              "fey": "Kentucky",
                                  "8 C": "South Carolina",
                                      r"Col$": "Colorado",
                                          "Teim": "Tennessee",
                                              "N T": "New York",
                                                  "Mb": "Missouri",
                                                      "N G": "North Carolina",
                                                          ", la": ", Iowa",
                                                           ", ly": ", Kentucky",
                                                              "S G": "South Carolina",
                                                                  "R L": "Rhode Island"}
# expand these abbs
biodf['birthplace_exp'] = biodf['birthplace_exp'].replace(missed_abbs, regex=True)

# remove remaining -s
# this will alter a few words where - was actually intended
biodf['birthplace_exp'] = biodf['birthplace_exp'].str.replace(r"[ ]*-[ ]*", "", regex = True)
    
biodf.head()

Unnamed: 0_level_0,name,name_maiden,birthplace,birthplace_exp,birthdate,occ,curaddress,residence
persid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,"ABBOTT, ROBERT SENGSTACKE",,"Savannah, Ga.","Savannah, Georgia","Nov. 24, 1870",Pu,"3435 Indiana Ave., Chicago, Ill.","4847 Champlain Ave., Chicago, Ill."
1,"ADAMS, ALTON AUGUSTUS",,"St. Thomas, Virgin Islands","St Thomas, Virgin Islands","Nov. 4, 1889",Bandmaster,"545 P.O. Box, St. Thomas, Virgin Islands","St. Thomas, Virgin Islands."
2,"ADAMS, C. P",,,,,Principal,,"Grambling, La."
3,"ADAMS, GEORGE W",,,,,Physician,,"518 U St., N. W., Washington, D. C."
4,"ADAMS, JAMES B",,"Montezuma, Ga.","Montezuma, Georgia","Dec. 2, 1892",Clergyman,,"170 Halsey St., Brooklyn, N. Y."


In [122]:
# split birthplace_exp into location and state name

# use expanded state name csv
expstate_df = pd.read_csv("state_names_expanded_2.csv").fillna("")

# function that only returns uniques from list (in order)
def unique(sequence):
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

# create state name list, also remove duplicates
state_names = unique(list(expstate_df["FULLSNAME"]))
        
# formulate regex argument
state_names_string = "|".join(state_names)
print(state_names_string)

# first extract only state name
biodf['birthplace_st'] = biodf['birthplace_exp'].str.extract(fr"\b({state_names_string})\b$")

# then extract only the string before state name (anything behind is removed) by splitting birthplace_exp using birthplace_st
# in this case note that FOREIGN state names apply to all outside US/Canada
# FOREIGN states will have varying formats in birthplace_loc and birthplace_st due to varying foreign birthplace formats in dataset
biodf['birthplace_loc'] = biodf.apply(lambda row : re.split(str(row['birthplace_st']), str(row['birthplace_exp']))[0], axis=1).str.strip(', ')

biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthplace_loc', 'birthplace_st', 'birthdate', 'occ', 'curaddress', 'residence']]  

Alberta|British Columbia|Manitoba|New Brunswick|Newfoundland and Labrador|Northwest Territories|Nova Scotia|Ontario|Prince Edward Island|Quebec|Saskatchewan|Alabama|Alaska|Arizona|Arkansas|California|Canada|Colorado|Connecticut|Delaware|District of Columbia|Florida|FOREIGN|Puerto Rico|US Virgin Islands|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|West Virginia|Virginia|Washington|Wisconsin|Wyoming|Yukon|Guam|American Samoa


In [123]:
# make a dictionary of foreign/non-foreign states
is_state_foreign={}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        if row[5] != "": is_state_foreign[row[5]] = (row[8])

# note the definition for FOREIGN column
# 0: US states, including US territories
# 1: foreign states, including Canada
# people without recorded birthplaces have empty entry
biodf['FOREIGN'] = biodf['birthplace_st'].map(is_state_foreign) 
biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ', 'curaddress', 'residence']] 

In [124]:
# limit all columns to 100 characters
biodf = biodf.apply(lambda x: x.str.slice(0, 100))

# further limit all columns except curaddress and residence to 70 characters
biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']] = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']].apply(lambda x: x.str.slice(0, 70)) 

In [125]:
# write to csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
biodf.to_csv('1927_bio_data_3.csv', encoding="utf-8")

In [126]:
# count how many times a state appears in birthplace_st col
# original dataframe is sorted by count
state_count_df = biodf['birthplace_st'].value_counts().to_frame()
state_count_df = state_count_df.rename(columns = {"birthplace_st": "Count"})
state_count_df.index.name = "State"

# for now, this is sorted alphabetically, not by count
# for the sake of counting, Canadian states have been separated out of FOREIGN states
state_count_df.sort_index() # remove this line if you want alphabetical order
state_count_df.to_csv('1927_state_count.csv', encoding="utf-8")

In [127]:
# verifying script
# index strangely lower than main, perhaps because index script is simple and may not have matched every person
# main text also has possible false matches for capitalized names
print("Total number of people in index:", len(index_list)) 
print("Total number of people in main text:", len(biodf))

Total number of people in index: 1866
Total number of people in main text: 1998
