In [14]:
# python 3.7.6
import csv # version 1.0
import os 
import re # version 2.2.1
import pandas as pd # version 1.0.1
import itertools

In [15]:
# open,read,and slice file
# this uses the new updated file
# the context manager closes the file handle as soon as the text has been read
with open(r"1944_ocr_updated.txt", "r", encoding="utf8") as ocr_file:
    fh = ocr_file.read()
# slice out introduction and end
segment = fh[17354:-4770]

# Literal (non-regex) substitutions, applied strictly in the order listed.
# Later entries act on the output of earlier ones, so do not re-sort this table.
ocr_replacements = [
    ("..", "."),
    ("  ", " "),
    ("—", "-"),
    # "-\n" indicates words broken by lines, so replace it with empty string
    ("-\n", ""),
    ("_\n", ""),
    # curly quotes -> plain ASCII quotes
    ("‘", "'"),
    ("’", "'"),
    ("“", '"'),
    ("”", '"'),
]

# remove redundant strings and certain non-ASCII chars
# (the line-break markers are removed a second time: a single str.replace pass
#  can leave a fresh "-\n" behind, e.g. "--\n\n" becomes "-\n")
ocr_replacements += [
    (junk, "")
    for junk in ["-\n", "_\n", "±", "■", "©", "™", "•", "£", "°", "«", "»", "§"]
]

# replace misOCRed letters
ocr_replacements += [
    ("Ö", "O"),
    ("Ó", "O"),
    ("ô", "o"),
    ("ü", "u"),
    ("ū", "u"),
    ("é", "e"),
    ("ë", "e"),
    ("É", "E"),
    ("Ā", "A"),
    ("ä", "a"),
    ("Č", "C"),
    ("Ś", "S"),
    ("š", "s"),
    ("í", "i"),
    ("ź", "z"),
    ("€", "C"),
    ("„", ".,"),
]

for old, new in ocr_replacements:
    segment = segment.replace(old, new)

# final check for non-ascii chars
rem_ascii = re.findall("[^\x00-\x7F]", segment)
print(set(rem_ascii))

{'−', '–'}


In [16]:
# find all names using regex
# note that this method misses names without dashes at end
# Pattern, piece by piece:
#   [A-Z(?:Mc)]{4,}  four or more characters from a *character class*. Inside [...]
#                    "(?:Mc)" is not a group, so the class is A-Z plus the literal
#                    characters ( ? : M c ).
#   [\.\,][ ]?       a '.' or ',' followed by an optional space
#   [A-Z(?:Mc)]      one more character from the same class (start of the given name)
#   [\s\S]+?[-–—]    lazily anything (newlines included) up to the first dash
# NOTE(review): "—" was already replaced by "-" when the text was cleaned, so the
# em dash alternative presumably never matches here - confirm before relying on it.
name_list = re.findall(r"[A-Z(?:Mc)]{4,}[\.\,][ ]?[A-Z(?:Mc)][\s\S]+?[-–—]", segment)

print("Names found:" , len(name_list))

# split base string into a list of strings, each starting with person name

# format names into name|name1|name2|...
# every name is re.escape()d so it is matched literally; the outer parentheses form
# a capturing group, which makes re.split keep the matched names in its result
name_template = "({})".format("|".join(re.escape(s) for s in name_list))

# split base string based on names
# result layout: [text_before_first_name, name_1, text_1, name_2, text_2, ...]
segment_split = re.split(name_template, segment) 

# join back the names to the front of each string
# segment_split[1::2] are the names and segment_split[::2] the texts; prepending ""
# to the names pairs each name with the text that follows it, so bio_split[0] is
# the text before the first name and bio_split[k] is name_k + text_k
bio_split = ["".join(x) for x in itertools.zip_longest([""] + segment_split[1::2], segment_split[::2], fillvalue='')]
# spot-check one entry
print(bio_split[150])

Names found: 2050
BILLUPS, POPE BARROW-Lawyer.
b. Oct. 11, 1889, Athens, Ga.; s. William D. and
Elizabeth (Tucker) Blllups; m. Edna Pierre
Lartigue, April 4, 1919; one adopted child, William Pope Robinson; educ. Fla. Baptist Acad.,
1904-10; Fla. A. & M. Coll., 1911-12; N. Y. Univ.
Law Sch., 1913-16; LL.B., 1916; Stenographer,
Jacksonville, Fla., 1908-14; member of editorial
staff, Atlantic City Review, 1914-15; admitted to
the bar, 1917; practicing lawyer since 1917;
elected Member of Assembly, N. Y. State Legislature, Nov., 1924, and served for term of 1925;
mem. Monarch Lodge No. 45, I. B. P. O. E. of
W. (Exalted Ruler); K. of P. (Grand Lodge
Atty); I. B. P. O. of Moose (Supreme Lodge
Atty.); J. R. McGill Lodge G. U. O. of O. F.;
Clubmen's Beneficial League, Coachmen's Beneficial League, Sampson Lodge No. 65, F. & A.
M.; Ires., 1927-Jan. 1929; A^sn. of Trade and
Commerce; Met. Museum of Art; Museum of
Natural History; Pol. Republican; Relig. A. M.
E. Church; Office, 206 Broadway; Res

In [17]:
# revised approach
# Each entry of bio_split[1:] is parsed into one dict with a fixed set of keys:
# name, occ, birthdate, birthplace, curaddress, residence (None when not found).

# patterns are compiled once instead of on every loop iteration
# occupation: text between the dash after the name and the "b." birth marker
OCC_AFTER_EM_DASH_RE = re.compile(r"(?<=—)[\s\S]*?(?=b\.)")
# note that "—" might have been recognized as "-", so "-" is the fallback
OCC_AFTER_HYPHEN_RE = re.compile(r"(?<=-)[\s\S]*?(?=b\.)")
# birth details: everything after "b." up to the first ";" on the same line
BIRTH_DETAILS_RE = re.compile(r"(?<=b\.).*?;")
BIRTHDATE_RE = re.compile(r"[A-Z][a-zA-Z\. ]+\d+[,\.].*?\d{4}")
BIRTHPLACE_RE = re.compile(r"[A-Z][A-Za-z\.\s]+,\s[A-Za-z].*;")
# two addresses: address/office followed by address/residence
TWO_ADDRESSES_RE = re.compile(r"(?:Address|Office)[,\.]([\s\S]+?)(?:Address|Residence)[,\.]([\s\S]+)")
ONE_ADDRESS_RE = re.compile(r"(?:Address|Office|Residence)[,\.]([\s\S]+)")


def _clean_address(text):
    """Put an address on a single line and trim separator characters from its ends."""
    return text.replace("\n", " ").strip(";, ")


bio_data = []
for i, person in enumerate(bio_split[1:]): # 0th element unnecessary

    # every record starts with all keys present, so the single-address branch
    # below no longer yields a dict without "curaddress"
    bio_dict = {
        "name": name_list[i].strip("-–—").replace("\n", " "),
        "occ": None,
        "birthdate": None,
        "birthplace": None,
        "curaddress": None,
        "residence": None,
    }

    # occupation: prefer the em dash form, then fall back to "-"
    # (match objects are always truthy, so `or` selects the first successful search)
    occ = OCC_AFTER_EM_DASH_RE.search(person) or OCC_AFTER_HYPHEN_RE.search(person)
    if occ is not None:
        # strip irrelevant chars from sides, then replace any remaining \n within string
        bio_dict["occ"] = occ.group().strip("-–—.\n ,").replace("\n", " ")

    # birth details: birthdate and birthplace
    birthdetails = BIRTH_DETAILS_RE.search(person)
    if birthdetails is not None:
        details = birthdetails.group()

        # date of birth
        birthdate = BIRTHDATE_RE.search(details)
        if birthdate is not None:
            bio_dict["birthdate"] = birthdate.group()

        # place of birth
        birthplace = BIRTHPLACE_RE.search(details)
        if birthplace is not None:
            bio_dict["birthplace"] = birthplace.group().strip("; ")

    # address
    # first search for two addresses: address + residence
    address_residence = TWO_ADDRESSES_RE.search(person)
    if address_residence is not None:
        # first group is curaddress, second group is residence
        bio_dict["curaddress"] = _clean_address(address_residence.group(1))
        bio_dict["residence"] = _clean_address(address_residence.group(2))
    else:
        # else, try for one address
        address = ONE_ADDRESS_RE.search(person)
        if address is not None:
            # assume that single address is residence
            bio_dict["residence"] = _clean_address(address.group(1))

    bio_data.append(bio_dict)

In [18]:
# dataframe
biodf = pd.DataFrame(bio_data).fillna('')

# rearrange columns
biodf = biodf[['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence']]
biodf.index.name = "persid"

# correct Illinois OCR errors
# (raw strings keep the regex escapes intact without invalid-escape warnings)
illinois_errors = [r'[,\.][ ]?[TiI1lL]{3}[ \.]', r'[,\.][ ]?[HLDnU][TiI1lL][ \.]', r'[,\.][ ]?IUL[ \.]', r'[,\.][ ]?m.', ',[ ]?IU', ', I1L', ', tit']
for column in ['curaddress', 'residence', 'birthplace']:
    biodf[column] = biodf[column].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)

# (pattern, replacement) pairs applied in order to both address columns
address_state_fixes = [
    # correct NY/NC errors
    (r'N[ ]?[,\.][ ]?Y[ ]?[\s\S]', 'N. Y.'),
    (r', L\. lv ', ', N. Y.'),
    # "N. Carolina." has to be handled BEFORE the generic N.C. rule below: that rule
    # also matches the "N. Ca" prefix and used to turn it into "N. C.rolina.", so this
    # entry never fired. The replacement now also keeps the leading ", " like every
    # other rule in this table.
    (r', N\. Carolina\.', ', N. C.'),
    (r'N[ ]?[,\.][ ]?C[ ]?[\s\S]', 'N. C.'),
    # additional uncaught states
    (' 9a', ', Ga'),
    (', Mass,', ', Mass.'),
    (r', Louisiana\.', ', La.'),
    (r', FU\.', ', Fla.'),
    (r'Ha\.', 'Pa.'),
    # everything after "Washington, D." is dropped by this rule
    (r'Washington[\.,][ ]?D[\.,][\s\S]+', 'Washington, D. C.'),
    (r'Washington[\s\S]{0,10}C[ \.,]', 'Washington, D. C.'),
]
for column in ['curaddress', 'residence']:
    for pattern, replacement in address_state_fixes:
        biodf[column] = biodf[column].replace(pattern, replacement, regex = True)

In [19]:
# cleaning dataframe

# convert name col to combined string
name_string = ''.join(biodf['name'].tolist())
non_letters = re.findall(r"[^a-zA-Z,\. ]", name_string)
print(set(non_letters))

# single-character OCR fixes: 8 -> S, 1 -> I, and drop ^ and !
# regex=False is explicit because "^" is a regex anchor and the default value of
# `regex` differs between pandas versions
for wrong, right in [("8", "S"), ("1", "I"), ("^", ""), ("!", "")]:
    biodf['name'] = biodf['name'].str.replace(wrong, right, regex=False)

# extract out bracketed names to name_maiden
biodf['name_maiden'] = biodf['name'].str.extract(r"\(([\s\S]+)\)")
# Every prefix is removed with .str.replace (substring replacement). The original
# chained Series.replace, which only matches cells that are *entirely* equal to the
# search value, so "née " and "Mrs." were never removed from inside a name.
for prefix in ["nee ", "née ", "Mrs."]:
    biodf['name_maiden'] = biodf['name_maiden'].str.replace(prefix, "", regex=False)
biodf["name"] = biodf["name"].str.replace(r"\([\s\S]+\)", "", regex=True).str.replace("  ", " ", regex=False)
# clean residual spaces
biodf["name"] = biodf["name"].str.strip(" ")
biodf['name_maiden'] = biodf['name_maiden'].str.strip(" ").fillna('')

# show all remaining names with non valid letters
# should be just apostrophes left
biodf[biodf['name'].str.contains(r'[^a-zA-Z,\. ]')]

# for now, names are not fully cleaned: some names have captured parts of other people

{'^', ']', '|', '6', '*', ';', ':', '4', '1', '(', '2', '>', '_', '8', '~', '#', '$', '9', '!', '0', ')', '7', '5', '?', '"', '\x0c', '3', "'", '/'}


Unnamed: 0_level_0,name,birthplace,birthdate,occ,curaddress,residence,name_maiden
persid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
17,"ALEXANDER, ERNEST RPhysician. b. Nashville, Te...","Nashville, Tenn.",,,,"234 W. 139th St., New York, N. Y. He completed...",
31,"ALLISON, ANDREW JAC ""SON","Nashville, Tenn.","Sept. 2, 1892",Alumn: Secretary,Fisk University,"923 18th Ave., N., Nashville, Tenn. In 1918 he...",
70,"WWSW, Pittsburg, Pa. During I933 he made radio...",,Aug. 1. 1887,Clergyman,3301 Indiana Ave.; Home,"3932 Grand Blvd., Chicago, Ill. While Pastor i...",
78,"BAILEY, JOHN ALEXANDER HOLMES Dentist. b. May ...",,May 11. 1894,,120 Hamilton Ave.,"444 S. Central Ave., Columbus, Ohio.",Williams
157,"BLACKWELL, JR., JAMES HEYWARDPhysician. b. Mar...",,Dec. 10. 1922,,,,Prof.) James Heyward and Annie Estelle (Jardon
...,...,...,...,...,...,...,...
1908,"WILSON, ARTHUR JEWELL b. Oct. 30, I900, Omaha,...","Omaha, Neb.","Oct. 30, 1900",,3621 State St.,"5825 Michigan Blvd., Chicago, Ill.",Jewell
1981,"DISHOND, MRS. GERALDYN HODGES *. July 29, IS94...","Sch., Chicago 111., lM-^",,"'12; Univ. of Chicago. 1912'15; Ph.B., 1915; T...","2370 Seventh Avenue, New York, N. Y.","245 West 139th St. New York, N. Y. First Negro...",Powell
1989,"FAULKNER, GEORGIA M. DE BAPTISTESocial Worker....",,,,,"3736 S. Michigan Ave., Chicago, Ill.",Brisco
2025,"PRICE, WILLARD J. b. Sept. I9, ISSI, Danville,...","Danville, Va.","Sept. 19, 1881","Executive Secretary, Urban League",,"40 Putnam Ave., Brooklyn, N. Y. RAGLAND, JOHN ...",#603


In [20]:
# clean address and residence columns

# read state name csv
# NOTE(review): machine-specific absolute path; every later relative read/write in
# this notebook depends on this working directory.
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
statedf = pd.read_csv("state_names.csv")

# create state abbreviation list
state_abbs = statedf[["SNAME1", "SNAME2", "SNAME3", "SNAME4", "SNAME5"]].stack().reset_index()[0].tolist()
state_abbs.append("Kansas")

# add . to any state abbreviation not ending with . to avoid false match
state_abbs_new = [abb + "." if abb[-1] != "." else abb for abb in state_abbs ]

# formulate regex argument
state_abbs_string = "|".join(state_abbs_new).replace(".", r"\.")
print(state_abbs_string)


def cut_to_pattern(column, rows, pattern):
    """Truncate biodf[column] in the selected rows to the text captured by `pattern`.

    `rows` is a boolean mask over biodf and `pattern` must contain exactly one
    capture group. Rows where the pattern does not match keep their current value:
    str.extract returns NaN for them, and assigning that NaN back would erase the
    cell before the later fallback passes get a chance to look at it.
    """
    current = biodf.loc[rows, column]
    biodf.loc[rows, column] = current.str.extract(pattern, expand=False).fillna(current)


def is_long(column):
    """Boolean mask of the cells in biodf[column] that are longer than 200 characters."""
    return biodf[column].str.len() > 200


# cut address/residence to state 
# note that this might cut addresses with state abbreviations at start of address
# only alter matches with state name in string

# first cut to state abbreviation
abbs_pattern = rf"(^[\s\S]*?(?:{state_abbs_string}))"
# curaddress
# not very useful for curaddress, as many do not list state names after address
cut_to_pattern('curaddress', is_long('curaddress'), abbs_pattern)
# residence
cut_to_pattern('residence', biodf['residence'].str.contains(state_abbs_string), abbs_pattern)

# next, for remaining uncleaned cells, cut to state name
state_names = statedf["FULLSNAME"].tolist()
state_names_string = "|".join(state_names).replace(".", r"\.")
print(state_names_string)
names_pattern = rf"(^[\s\S]*?(?:{state_names_string}))"
for column in ['curaddress', 'residence']:
    cut_to_pattern(column, is_long(column), names_pattern)

# finally, for remaining uncleaned cells, cut to state abbreviation without '.'
# slice out '\.' from end of every abb
state_abbs_cut = [abb[:-1] for abb in state_abbs_new]
print(state_abbs_cut)
# Build a real alternation from the list. Interpolating the list itself put its
# repr ("['Ala', 'Ala', ...]") into the regex, where the square brackets turned it
# into a character class that matched any single one of those characters.
# Longest abbreviations go first so that e.g. "Alaska" is not cut short at "Ala",
# and the lookahead stops an abbreviation from matching the start of a longer word.
state_abbs_cut_string = "|".join(
    re.escape(abb) for abb in sorted(set(state_abbs_cut), key=lambda abb: (-len(abb), abb))
)
abbs_cut_pattern = rf"(^[\s\S]*?,[ ]?(?:{state_abbs_cut_string})(?![A-Za-z]))"
for column in ['curaddress', 'residence']:
    cut_to_pattern(column, is_long(column), abbs_cut_pattern)

Ala\.|Ala\.|Alaska\.|Alaska\.|Alas\.|Ariz\.|Ariz\.|Az\.|Ark\.|Ark\.|Calif\.|Calif\.|Ca\.|Cal\.|Colo\.|Colo\.|Conn\.|Conn\.|Ct\.|Del\.|Del\.|De\.|D\.C\.|D\. C\.|Wash\. D\.C\.|Fla\.|Fla\.|Fl\.|Flor\.|Ga\.|Ga\.|Hawaii\.|Hawaii\.|H\.I\.|Idaho\.|Idaho\.|Id\.|Ida\.|Ill\.|Ill\.|Il\.|Ills\.|Ill's\.|Ind\.|Ind\.|In\.|Iowa\.|Iowa\.|Ia\.|Ioa\.|Kans\.|Kan\.|Ks\.|Ka\.|Ky\.|Ky\.|Ken\.|Kent\.|La\.|La\.|Maine\.|Maine\.|Me\.|Md\.|Md\.|Mass\.|Mass\.|Mich\.|Mich\.|Minn\.|Minn\.|Mn\.|Miss\.|Miss\.|Mo\.|Mo\.|Mont\.|Mont\.|Nebr\.|Neb\.|Nev\.|Nev\.|Nv\.|N\.H\.|N\. H\.|N\.J\.|N\. J\.|N\. Jersey\.|N\. Mex\.|N\. M\.|New M\.|N\.Y\.|N\. Y\.|N\. York\.|N\. Y\.|N\.C\.|N\. C\.|N\. Car\.|N\. Dak\.|N\. D\.|NoDak\.|N\.Dak\.|Ohio\.|Ohio\.|O\.|Oh\.|Okla\.|Okla\.|Ok\.|Oreg\.|Ore\.|Or\.|Pa\.|Pa\.|Penn\.|Penna\.|R\.I\.|R\. I\.|R I\. \.|R\. Isl\.|R\.Isl\.|S\.C\.|S\. C\.|S\. Car\.|S\. Dak\.|S\. D\.|SoDak\.|S\.Dak\.|Tenn\.|Tenn\.|Tex\.|Texas\.|Tx\.|Utah\.|Utah\.|Ut\.|Vt\.|Vt\.|Va\.|Va\.|Virg\.|Wash\.|Wash\.|Wa\.|Wn\.|W\. Va\.|W

In [21]:
def print_non_letters(column):
    """Print the set of characters in biodf[column] other than letters, digits, space and , . - '"""
    address_string = ''.join(biodf[column].astype(str).tolist())
    print(set(re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)))


# find invalid letters from combined string
for column in ['curaddress', 'residence']:
    print_non_letters(column)

biodf = biodf.fillna('')

# replace all redundant chars
# Every entry is a regular expression. regex=True is passed explicitly because the
# default of Series.str.replace is not the same in every pandas version, and the
# escaped entries only work when they are interpreted as patterns.
redundant_patterns = ['►', r'\*', r'\$', 'c/o', r'\^', r'\|', r'\?', r'\([\s\S]+?\)', ':', '<', '>', r'\)', r'\(', '~', '"', '%', '!', r'\\', '\x0c', '#', r'\}', '_', '–']
for string in redundant_patterns:
    for column in ['curaddress', 'residence']:
        biodf[column] = biodf[column].str.replace(string, "", regex=True)

# find the remaining list of non letters
for column in ['curaddress', 'residence']:
    print_non_letters(column)

{'(', '^', '|', '%', ')', '"', '\x0c', '*', '/', ';', '&', ':', '\\', '#'}
{'^', '*', ';', ':', '<', '(', '>', '_', '$', '#', '%', '!', '–', ')', '&', '?', '}', '"', '\x0c', '/', '\\'}
{'&', '/', ';'}
{'&', '/', ';'}


In [22]:
# Make the names easier for matching by expanding the commonly used acronyms 

# Make a list of common states acronyms (N.Y. is New York) from a csv file. Store it as dictionary.
# Sometimes it's "N Y University", not "N.Y. University" in the data,
# so the same dictionary is also built with the dots removed from every acronym.
# Both dictionaries are filled in a single pass over the file:
# columns 0-4 of each row hold the acronyms, column 5 holds the expansion.
state_acron = {}
state_acron_no_dots = {}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    for row in csv.reader(infile):
        for acronym in row[:5]:
            if acronym != "":
                state_acron[acronym] = row[5]
            # str.replace removes the literal dots; an acronym made only of dots is skipped
            acronym_no_dots = acronym.replace(".", "")
            if acronym_no_dots != "":
                state_acron_no_dots[acronym_no_dots] = row[5]

#create a function that expands the acronyms for a given column 
def expand_acronyms(list_institutions, acronyms=None, acronyms_no_dots=None):
    """Expand state acronyms in a list of place/institution strings.

    Parameters
    ----------
    list_institutions : list of str
        Strings to clean. Falsy entries (e.g. "") are returned unchanged.
    acronyms : dict, optional
        Maps acronyms as written (e.g. "Tex.") to their expansion. Defaults to the
        module-level ``state_acron``.
    acronyms_no_dots : dict, optional
        Maps acronyms with the dots removed (e.g. "N Y") to their expansion.
        Defaults to the module-level ``state_acron_no_dots``.

    Returns
    -------
    list
        A new list of the same length; the input list is not modified.

    Besides the acronym expansion, each string has its dots turned into spaces,
    parentheses removed, whitespace collapsed, "U S" joined to "US" and
    "Nat" expanded to "Natural" (before "Sci") or "National".
    """
    if acronyms is None:
        acronyms = state_acron
    if acronyms_no_dots is None:
        acronyms_no_dots = state_acron_no_dots

    # Define the regex patterns before looping.
    # An empty dictionary would produce a pattern that matches the empty string,
    # so the corresponding step is skipped instead.
    dotted_pattern = None
    if acronyms:
        # \b needs a word character on one side, so an acronym ending in "." is only
        # matched here when a word character follows it; the dot-less pass below
        # handles the remaining cases once dots have become spaces.
        dotted_pattern = re.compile(r'\b(' + '|'.join(sorted(re.escape(k) for k in acronyms)) + r')\b')
    no_dots_pattern = None
    if acronyms_no_dots:
        no_dots_pattern = re.compile(r'(\s|,|^)(' + '|'.join(re.escape(key) for key in acronyms_no_dots) + r')(\s|,|$)')

    list_institutions_return = list_institutions.copy()
    for i, place in enumerate(list_institutions):
        if not place:
            continue

        # expand acronyms exactly as they are written in the dictionary
        if dotted_pattern is not None:
            place = dotted_pattern.sub(lambda m: acronyms[m.group(1)], place)

        # dots become spaces, parentheses are dropped
        place = place.replace('.', ' ')
        place = re.sub(r"[\(\)]", "", place)
        # remove multiple spaces and change "U S" -> "US"
        place = re.sub(r'\s+', ' ', place)
        place = place.replace('U S', 'US')

        # finally, use the state dictionary without dots; group 2 is the acronym
        # itself, so the lookup cannot miss because of the surrounding separators
        if no_dots_pattern is not None:
            place = no_dots_pattern.sub(lambda m: " " + acronyms_no_dots[m.group(2)] + " ", place)

        # remove double, triple etc spaces
        place = re.sub(r'\s+', ' ', place).strip(' ,')
        # remove spaces before commas
        place = re.sub(r' ,', ',', place)
        # "Nat. Sciences" is "Natural Sciences", but in all other cases "Nat" means "National".
        place = re.sub('Nat Sci', 'Natural Sci', place)
        place = re.sub(r'Nat\s', 'National ', place)
        list_institutions_return[i] = place
    return list_institutions_return

In [23]:
# quick check: the trailing "Tex." is expanded while "Kentuckytown" is left untouched
expand_acronyms(["Kentuckytown, Tex."])

['Kentuckytown, Texas']

In [24]:
# replace NaN values for empty cells with an empty string
biodf = biodf.fillna("")

expanded_column_order = ['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']

# expand birthplace column state names
biodf['birthplace_exp'] = expand_acronyms(list(biodf['birthplace']))
biodf = biodf[expanded_column_order]

# cities with missing states usually have their states miscaptured in birthdate string
# for entries with expanded birthplace = birthplace AND state abb in birthdate, extract out state to birthplace
state_in_birthdate = biodf["birthdate"].str.contains(state_abbs_string)
state_missing = (biodf["birthplace"] == biodf["birthplace_exp"]) & state_in_birthdate
biodf.loc[state_missing, "birthplace"] = (
    biodf.loc[state_missing, "birthplace"]
    + ", "
    + biodf.loc[state_missing, "birthdate"].str.extract(rf"({state_abbs_string})", expand=False)
)
# remove state abb from birthdate and clean entry
# The mask is computed once, before the removal. Recomputing it afterwards selects
# nothing (the abbreviation is already gone), so the strip was never applied.
biodf.loc[state_in_birthdate, "birthdate"] = (
    biodf.loc[state_in_birthdate, "birthdate"]
    .str.replace(state_abbs_string, "", regex=True)
    .str.strip(" ,.")
)

# again, expand birthplace column state names
biodf['birthplace_exp'] = expand_acronyms(list(biodf['birthplace']))
biodf = biodf[expanded_column_order]

# further expand unrecognized abbreviations/errors
# keys are regular expressions (see regex=True below), values are the replacement text
missed_abbs = {
    # word boundaries keep this from matching the "Wis" at the start of "Wisconsin",
    # which would otherwise be rewritten to "Wisconsinconsin"
    r"\bW[Iil][Ss]\b": "Wisconsin",
    "Maas": "Massachusetts",
    "Miss/": "Mississippi",
    "Coon": "Connecticut",
    "fey": "Kentucky",
    "8 C": "South Carolina",
    "S, C": "South Carolina",
    ", la": ", Iowa",
    r"Col$": "Colorado",
    "Teim": "Tennessee",
    ", Twin": ", Tennessee",
    ", Term": ", Tennessee",
    "N T": "New York",
    "Mb": "Missouri",
    "X C": "North Carolina",
    "N G": "North Carolina",
    # the replacement is plain text: a "\b" in it is a backspace character, not a word boundary
    r"North C\b": "North Carolina",
    ", Vs": ", Virginia",
    ", Kims": ", Kansas",
    "Los Angeles, Cat": "Los Angeles, California",
    ", Arte": ", Arkansas",
    "L I": "New York",
}

# expand these abbs
biodf['birthplace_exp'] = biodf['birthplace_exp'].replace(missed_abbs, regex=True)

# remove remaining -s
# this will alter a few words where - was actually intended
biodf['birthplace_exp'] = biodf['birthplace_exp'].str.replace(r"[ ]*-[ ]*", "", regex = True)

# cut to state name
# remove Washington from state_names_string to prevent false match
state_names_string_2 = state_names_string.replace("|Washington", "")
# Rows where the state name is not found as a whole word keep their value;
# str.extract returns NaN for them and would otherwise blank the cell.
cut_to_state = biodf['birthplace_exp'].str.extract(rf"(^[\s\S]*?\b(?:{state_names_string_2})\b)", expand=False)
biodf['birthplace_exp'] = cut_to_state.fillna(biodf['birthplace_exp'])

In [25]:
# split birthplace_exp into location and state name

# load the expanded state name csv, then turn blank cells into empty strings
expstate_df = pd.read_csv("state_names_expanded_2.csv")
expstate_df = expstate_df.fillna("")

# function that only returns uniques from list (in order)
def unique(sequence):
    """Return the distinct items of `sequence` as a list, in order of first appearance."""
    seen = set()
    ordered = []
    for item in sequence:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered

# create state name list, also remove duplicates
state_names = unique(list(expstate_df["FULLSNAME"]))

# formulate regex argument
state_names_string = "|".join(state_names)
print(state_names_string)

# first extract only state name (it has to be the last thing in the string)
biodf['birthplace_st'] = biodf['birthplace_exp'].str.extract(fr"\b({state_names_string})\b$", expand=False)


def _location_before_state(row):
    """Return the part of birthplace_exp that precedes the state name in birthplace_st.

    The text is cut at the LAST occurrence of the state name, so a city that shares
    its name with the state ("Kansas City, Kansas") keeps its location. When no
    state was extracted the whole birthplace is returned; a missing birthplace
    gives an empty string.
    """
    place = row['birthplace_exp']
    state = row['birthplace_st']
    if not isinstance(place, str):
        return ''
    if isinstance(state, str) and state:
        cut_at = place.rfind(state)
        if cut_at >= 0:
            return place[:cut_at]
    return place


# then extract only the string before state name (anything behind is removed)
# in this case note that FOREIGN state names apply to all outside US/Canada
# FOREIGN states will have varying formats in birthplace_loc and birthplace_st due to varying foreign birthplace formats in dataset
biodf['birthplace_loc'] = biodf.apply(_location_before_state, axis=1).str.strip(', ')

biodf = biodf[['name', 'name_maiden', 'birthplace', 'birthplace_exp', 'birthplace_loc', 'birthplace_st', 'birthdate', 'occ', 'curaddress', 'residence']]
biodf.iloc[696]

Alberta|British Columbia|Manitoba|New Brunswick|Newfoundland and Labrador|Northwest Territories|Nova Scotia|Ontario|Prince Edward Island|Quebec|Saskatchewan|Alabama|Alaska|Arizona|Arkansas|California|Canada|Colorado|Connecticut|Delaware|District of Columbia|Florida|FOREIGN|Puerto Rico|US Virgin Islands|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|West Virginia|Virginia|Washington|Wisconsin|Wyoming|Yukon|Guam|American Samoa


name              EMMA. C. W. GRAY, A.B., A.M., PH.B.   
 ## p. ...
name_maiden                                                    #2I6
birthplace                                  Fraziers Bottom, W. Va.
birthplace_exp                       Fraziers Bottom, West Virginia
birthplace_loc                                      Fraziers Bottom
birthplace_st                                         West Virginia
birthdate                                            April 21, 1885
occ               ss. 
  ## p. 211 (#217) ######################...
curaddress                                                         
residence         West uS Lakin, Mason  w for Colored Boys, He v...
Name: 696, dtype: object

In [26]:
# make a dictionary of foreign/non-foreign states
# key: column 5 of the csv (skipped when empty), value: column 8
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    is_state_foreign = {
        row[5]: row[8]
        for row in csv.reader(infile)
        if row[5] != ""
    }

# note the definition for FOREIGN column
# 0: US states, including US territories
# 1: foreign states, including Canada
# people without recorded birthplaces have empty entry
biodf['FOREIGN'] = biodf['birthplace_st'].map(is_state_foreign)
final_columns = ['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ', 'curaddress', 'residence']
biodf = biodf[final_columns]

In [27]:
# limit all columns to 100 characters
biodf = biodf.apply(lambda col: col.str[:100])

# further limit all columns except curaddress and residence to 70 characters
short_columns = ['name', 'name_maiden', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']
biodf[short_columns] = biodf[short_columns].apply(lambda col: col.str[:70])

In [28]:
# write the cleaned biography table to csv (UTF-8, index column included)
biodf.to_csv(
    '1944_bio_data_3.csv',
    encoding="utf-8",
)

In [29]:
# count how many times a state appears in birthplace_st col
# value_counts returns the states sorted by count
# naming the Series before to_frame() gives a "Count" column without depending on
# what the column would otherwise be called
state_count_df = biodf['birthplace_st'].value_counts().rename("Count").to_frame()
state_count_df.index.name = "State"

# for now, this is sorted alphabetically, not by count
# for the sake of counting, Canadian states have been separated out of FOREIGN states
# sort_index() returns a new frame, so the result has to be assigned to take effect
state_count_df = state_count_df.sort_index() # remove this line to keep the order by count
state_count_df.to_csv('1944_state_count.csv', encoding="utf-8")

In [30]:
# one row per person, so the row count is the number of people parsed
print("Total number of people in main text:", biodf.shape[0])

Total number of people in main text: 2050
