# Script to convert 1915 text into biographical dataset (csv)

In [167]:
# python 3.7.6
import re # version 2.2.1
import itertools
import os
import pandas as pd # version 1.0.1
import csv # version 1.0

## Collating all text files into one string

In [168]:
# navigate to ocr_split folder
os.chdir(r"C:\Users\byron\Dropbox\WIW\1915_WIW\ocr_split")

# create overall string
segment = ""
# omit first file, which is addenda
for file in os.listdir()[1:]:
    file_string = open(r"{}".format(file), "r", encoding="utf8").read()
    segment += file_string
    
# cleaning
# remove redundant strings and certain non-ASCII chars
for apos in ["‘", "’"]:
    segment = segment.replace(apos, "\'")

for apos in ["“", "”"]:
    segment = segment.replace(apos, "\"")

segment = segment.replace("..", ".")
segment = segment.replace("—", "-")
segment = segment.replace(",,", ",")
segment = segment.replace(".:", ";")

# any "8" surrounded by letters is usually an "s" or "S"
def convert_eight_to_s(text):
    text = re.sub(r"(?<=[A-Z])8(?=[A-Z])", "S", text)
    text = re.sub(r"(?<=[a-z])8(?=[a-z])", "s", text)
    text = re.sub(r"(?<=[a-z])8\b", "s", text)
    text = re.sub(r"(?<=[A-Z]{2})8", "S", text)
    text = re.sub(r"(?<=[a-z]{2})8", "s", text)
    text = re.sub(r"8(?=[a-z]{2})", "S", text)
    text = re.sub(r"8(?=[A-Z]{2})", "S", text)
    return text

# run function twice: for 8 and 88
segment = convert_eight_to_s(segment)
segment = convert_eight_to_s(segment)
# also convert "8t" to "St"
segment = re.sub(r"(?<=\b)8(?=t)", "S", segment)


# "O" is commonly misrecognized as "0"
segment = re.sub(r"(?<=[A-Z])0(?=[A-Z])", "O", segment)

# "S" misrecognized as ""&"
segment = re.sub(r"(?<=[A-Z])&(?=[A-Z])", "S", segment)

# correct certain misrecognized names
segment = segment.replace("MAR LEY", "MARLEY")
segment = segment.replace("PiNN", "PINN")
segment = segment.replace("Jones, Grace", "JONES, Grace")
segment = segment.replace("Jackaon, William", "JACKAON, William")
segment = segment.replace("McCurdy, Theodore", "MCCURDY, Theodore")
segment = segment.replace("ABBOTT, Ebenezer Augustus, Jr.v writer, lecturer;", "ABBOTT, Ebenezer Augustus, Jr., writer, lecturer;")

# "Digitized by..." is often mixed in with actual text
# make this case-insensitive
phrases = ["Who's Who of the Colored Race", "digitized by google", "digitized by", "digitized"]
for phrase in phrases:
    pattern = re.compile(phrase, re.IGNORECASE) # some phrases are poorly recognized and will not be captured here
    segment = pattern.sub("", segment)

for string in ["-\n", "_\n", "«", "»", "•", "□", "°", "£", "\t", "■", "™", "©", "®", "§", "►", "▲", "▼", "„", "€", "¥", "✓", "*", "#", "~"]:
    segment = segment.replace(string, "")

segment = segment.replace("\n\n", "\n")
segment = segment.replace("  ", " ")

## Splitting string into biographical dataset per person

In [169]:
# final check for non-ascii chars
rem_ascii = re.findall("[^\x00-\x7F]", segment)
print(set(rem_ascii))

{'\ufeff'}


In [170]:
# pattern: 1st name is all caps, following names are not
# eg. JOHN, James, physician; born in/at 

# find all names + occupations
# try to include uncaptured 3 letter first names manually
name_occ_list = re.findall(r"(?:[A-Z(?:Mc)']{4,}|COX|RAY|BUM|ISH|POE|LEE)[\.,; ][ ]?[A-Z'\(\)][\s\S]+?[;:]", segment)

print(len(name_occ_list))
print(pd.DataFrame(name_occ_list))

# split base string into a list of strings, each starting with person name

# format names into name|name1|name2|...
name_template = "({})".format("|".join(re.escape(s) for s in name_occ_list))

# split base string based on names
segment_split = re.split(name_template, segment) 

# join back the names to the front of each string
bio_split = ["".join(x) for x in itertools.zip_longest([""] + segment_split[1::2], segment_split[::2], fillvalue='')]

bio_split[500]

1301
                                                      0
0     ABBOTT, Ebenezer Augustus, Jr., writer, lecturer;
1     ABINGTON, George Sexton, principal public school;
2                      ABNER, David, college president;
3                              ADAMS, Luclen, druggist;
4     ADAMS, Moses Samuel, insurance born at Greenvi...
...                                                 ...
1296                         YOUNG, George, bookseller;
1297   YOUNG, Isaac Wilhelm, mayor.\nphysician, editor;
1298          YOUNG, Joseph A., Jr., editor, publisher;
1299  YOUNG, Joseph Franklin, city councilman, real ...
1300      YOUNQ, Nathan Benjamin, college iJapresident;

[1301 rows x 1 columns]


"GENTRY, Emery Marcus, teacher; born at Winchester, Ky., Oct. 14, 1880; son of Jacob and Amelia , (Barnes) Gentry; ed. public schools ' in Clark County, and Berea College,\n' Ky.; B.S., Fisk Univ., Nashville, Tenn., 1905; attended Univ. of Mich., summer school; married Mary . Frankie Whaley, of Maysville, Ky.,\n: June 22, 1910; 2 children: Annamelia,\n: Emery, Jr. Teacher in Clark County, i Ky., 18981903, at Mays Lick, 19058;\n) principal Western High School, Paris, ; Ky., 190810, at 11th Street School, 1 Portsmouth, O., since 1910. Republi-i can. Baptist. Address: 1312 Kinny t St., Portsmouth, Ohio.\n3 "

In [171]:
# extract required data from each person's string into a list of dictionaries
bio_data = []
for i, person in enumerate(bio_split[1:]): # 0th element is unnecessary
    
    bio_dict = {}

    # name and occupation
    # occ always starts with lower caps, in contrast to names
    occ = re.search(r"(?<=,)[ ]?[^A-Z0-9]+?[;:]", name_occ_list[i]) 
    if occ is not None:
        # strip irrelevant chars from sides, then replace any remaining \n within string
        bio_dict["occ"] = occ.group().strip("-–—\n ,;:'").replace("\n"," ") 
    else:
        bio_dict["occ"] = None
    
    # name
    # search for repeating groups with first letter capitalized
    name = re.search(r"^(?:[A-Z][A-Za-z\. \(\)]+,[ ]?)+", name_occ_list[i]) # might need to edit pattern
    if name is not None:
        bio_dict["name"] =  name.group().strip("-–—\n ,;:").replace("\n"," ")
    else:
        bio_dict["name"] = None # can change this later: this is causing a few blank names in csv
    
    # birth details: birthdate and birthplace
    # don't capture preceding "born at/in..."
    birthdetails = re.search(r"(?:born|bom)[\s\S]*?([A-Z][\s\S]+?);", person) 
    if birthdetails is not None:
        # date of birth + place of birth
        birthdate = re.search(r"[A-Z][a-z]{2,3}[\., ]+?\d+[,\.][\., ]+?\d{4}", birthdetails.group(1))
        if birthdate is not None:
            bio_dict["birthdate"] = birthdate.group().replace("\n"," ")
            # remove birthdate from birthdetails string by taking first item after split
            bio_dict["birthplace"] = birthdetails.group(1).split(birthdate.group())[0].strip(" :;, }{'\\<13\^)(").replace("\n"," ")
        else:
            bio_dict["birthdate"] = None
            bio_dict["birthplace"] = None  
    else:
        bio_dict["birthdate"] = None
        bio_dict["birthplace"] = None 
    
    # address
    # permutations: Home: OR Home and office: OR Home: Office: OR Home: Address: OR Address:
    
    # first check for 2 locations: Home:...Office: OR Home:...Address:
    dual_addresses = re.search(r"Home[:;]([\s\S]+?)(?:Office|Address|office|address)[:;]([\s\S]+)", person)
    # if 2 addresses
    if dual_addresses is not None:
        # curaddress = office/address 
        # residence = home
        bio_dict["curaddress"] = dual_addresses.group(1).replace("\n"," ").strip(" ")
        bio_dict["residence"] = dual_addresses.group(2).replace("\n"," ").strip(" ")
    else:
        # else check for single location
        # assume location to be home residence
        bio_dict["curaddress"] = None
        address = re.search(r"(?:Home and office|Address|Home)[:;]([\s\S]+)", person)
        if address is not None:
            bio_dict["residence"] = address.group(1).replace("\n"," ").strip(" ")
        else:
            bio_dict["residence"] = None
    
    bio_data.append(bio_dict)

## Cleaning + formatting dataframe

In [173]:
# convert to dataframe
biodf = pd.DataFrame(bio_data).fillna('')

# format/rearrange df columns
biodf.columns.values
biodf = biodf[['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence']]
biodf.index.name = "persid"

# correct Illinois OCR errors
illinois_errors = [',[ ]?[iI1lL]{3}[\.]?', ',[ ]?[HLDnU][iI1lL]\.', ',[ ]?IUL\.', ',[ ]?m.', ',[ ]?IU']
biodf['curaddress'] = biodf['curaddress'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['residence'] = biodf['residence'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)
biodf['birthplace'] = biodf['birthplace'].replace(to_replace = illinois_errors, value = ', Ill.', regex = True)

# correct New York errors
biodf['curaddress'] = biodf['curaddress'].replace('New Yorl', 'New York')
biodf['residence'] = biodf['residence'].replace('New Yorl', 'New York')

In [174]:
# cleaning dataframe: name

# convert name col to combined string
name_string = ''.join(biodf['name'].tolist())
non_letters = re.findall("[^a-zA-Z,\. ]", name_string)
print(set(non_letters))

# find all names with 0,1,8
biodf[biodf['name'].str.contains(r'[0138]')]
# replace 1 with I
biodf['name'] = biodf['name'].str.replace("1", "I")
# replace 0 with O
biodf['name'] = biodf['name'].str.replace("0", "O")
# replace 8 with S
biodf['name'] = biodf['name'].str.replace("8", "S")
# replace (3 with GI
biodf['name'] = biodf['name'].str.replace("\(3", "GI")
# remove *
biodf['name'] = biodf['name'].str.replace("*", "")
# remove names with WHO'S WHO IN 
biodf['name'] = biodf['name'].str.replace("WHO'S WHO IN ", "")

# no maiden names to extract

# extract out miscaptured occ strings in name and add to occ
# remove miscaptured b. strings first
biodf["name"] = biodf["name"].str.replace(";b\.", "")
biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'occ'] = (biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'name'].str.extract(r"; ([\s\S]+)", expand=False) + ", ") + biodf.loc[biodf['name'].str.contains(r"; [\s\S]+"), 'occ'].astype(str)
biodf["name"] = biodf["name"].str.replace(";.*", "")

{'(', ')'}


In [175]:
# cleaning dataframe: occ
biodf[biodf['occ'].str.contains('[^A-Za-z\., -]')] 
for char in ['►', '\*', '\$', 'c/o', '\^', '\|', '\?', '\([\s\S]+?\)', '■', '<', '\)', '~', '\"', '%', '!', r'\\', '#', '- > ', '->', '_', '\'', '[', ']', '>', '{', '}', '&']:
    biodf['occ'] = biodf['occ'].str.replace(char, "")
    
# replace any resultant double spaces
biodf['occ'] = biodf['occ'].str.replace("  ", " ")

biodf[biodf['occ'].str.contains('[^A-Za-z\., -]')] 

Unnamed: 0_level_0,name,birthplace,birthdate,occ,curaddress,residence
persid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [176]:
# read state name csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
statedf = pd.read_csv("state_names.csv")

# create state abbreviation list
state_abbs = statedf[["SNAME1", "SNAME2", "SNAME3", "SNAME4", "SNAME5"]].stack().reset_index()[0].tolist()
state_abbs.remove("O.") # remove problematic abbreviation (in this case, output is unaffected)

# add . to any state abbreviation not ending with . to avoid false match
state_abbs_new = [abb + "." if abb[-1] != "." else abb for abb in state_abbs ]
        
# formulate regex argument
state_abbs_string = "|".join(state_abbs_new).replace(".", "\.")
print(state_abbs_string)

# cut address/residence to state 
# note that this might cut addresses with state abbreviations at start of address
# only alter matches with state name in string

# curaddress
biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'] = biodf.loc[biodf['curaddress'].str.len() > 200, 'curaddress'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)
# residence is messier, requires more cleaning
biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'] = biodf.loc[biodf['residence'].str.contains(rf"{state_abbs_string}"), 'residence'].str.extract(rf"(^[\s\S]*?(?:{state_abbs_string}))", expand=False)

Ala\.|Ala\.|Alaska\.|Alaska\.|Alas\.|Ariz\.|Ariz\.|Az\.|Ark\.|Ark\.|Calif\.|Calif\.|Ca\.|Cal\.|Colo\.|Colo\.|Conn\.|Conn\.|Ct\.|Del\.|Del\.|De\.|D\.C\.|D\. C\.|Wash\. D\.C\.|Fla\.|Fla\.|Fl\.|Flor\.|Ga\.|Ga\.|Hawaii\.|Hawaii\.|H\.I\.|Idaho\.|Idaho\.|Id\.|Ida\.|Ill\.|Ill\.|Il\.|Ills\.|Ill's\.|Ind\.|Ind\.|In\.|Iowa\.|Iowa\.|Ia\.|Ioa\.|Kans\.|Kan\.|Ks\.|Ka\.|Ky\.|Ky\.|Ken\.|Kent\.|La\.|La\.|Maine\.|Maine\.|Me\.|Md\.|Md\.|Mass\.|Mass\.|Mich\.|Mich\.|Minn\.|Minn\.|Mn\.|Miss\.|Miss\.|Mo\.|Mo\.|Mont\.|Mont\.|Nebr\.|Neb\.|Nev\.|Nev\.|Nv\.|N\.H\.|N\. H\.|N\.J\.|N\. J\.|N\. Jersey\.|N\. Mex\.|N\. M\.|New M\.|N\.Y\.|N\. Y\.|N\. York\.|N\. Y\.|N\.C\.|N\. C\.|N\. Car\.|N\. Dak\.|N\. D\.|NoDak\.|N\.Dak\.|Ohio\.|Ohio\.|Oh\.|Okla\.|Okla\.|Ok\.|Oreg\.|Ore\.|Or\.|Pa\.|Pa\.|Penn\.|Penna\.|R\.I\.|R\. I\.|R I\. \.|R\. Isl\.|R\.Isl\.|S\.C\.|S\. C\.|S\. Car\.|S\. Dak\.|S\. D\.|SoDak\.|S\.Dak\.|Tenn\.|Tenn\.|Tex\.|Texas\.|Tx\.|Utah\.|Utah\.|Ut\.|Vt\.|Vt\.|Va\.|Va\.|Virg\.|Wash\.|Wash\.|Wa\.|Wn\.|W\. Va\.|W\.Va

In [177]:
# find invalid letters from combined string of address and residence
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

residence_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", residence_string)
print(set(non_letters))

{'(', '"', '|', '^', '!', '>', ':', '$', ']', '&', ')', '\\', '?', ';', '<'}
{'^', '%', '_', '\\', ';', '<', ')', '(', '/', '\ufeff', '>', '?', '"', '|', '!', ':', '$', '&', ']'}


In [178]:
# replace all redundant chars
for char in ['►', '\*', '\$', 'c/o', '\^', '\|', '\?', '\([\s\S]+?\)', '■', '<', '\)', '~', '\"', '%', '!', r'\\', '#', '- > ', '_']: 
    biodf['curaddress'] = biodf['curaddress'].str.replace(char, "")
    biodf['residence'] = biodf['residence'].str.replace(char, "")

biodf['curaddress'] = biodf['curaddress'].str.replace("\]", "l")
biodf['residence'] = biodf['residence'].str.replace("\]", "l")

# find the remaining list of non letters
address_string = ''.join(biodf['curaddress'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

address_string = ''.join(biodf['residence'].astype(str).tolist())
non_letters = re.findall(r"[^a-zA-Z,\. 0-9\-\']", address_string)
print(set(non_letters))

{'(', '>', ':', '&', ';'}
{'(', '/', '>', ':', '\ufeff', '&', ';'}


In [179]:
biodf = biodf.fillna("")
# replace all double spaces with single space
biodf = biodf.replace("  ", " ")

# strip any spaces 
for col in ['name', 'birthplace', 'birthdate', 'occ', 'curaddress', 'residence']:
    biodf[col] = biodf[col].str.strip(" ")

biodf.iloc[407]

name                                         DETT, R. Nathaniel
birthplace                                                     
birthdate                                                      
occ                                          composer,t pianist
curaddress                 362 Second St., Niagara Falls, N. Y.
residence     Hampton Normal and Agricultural Institute, Ham...
Name: 407, dtype: object

In [180]:
# Make the names easier for matching by expanding the commonly used acronyms 

# Make a list of common states acronyms (N.Y. is New York) from a csv file. Store it as dictionary.
state_acron={}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        for i in range(5):
            if row[i] != "": state_acron[row[i]]=row[5]

# Sometimes it's "N Y University", not "N.Y. University" in the data
# So next we create the same dictionary of states but without dots. 
    state_acron_no_dots={}
    with open('state_names_expanded_2.csv', mode='r') as infile:
        infile.readline()
        reader = csv.reader(infile)
        for row in reader:
            for i in range(5):
                row[i] = re.sub("\.", "", row[i]) 
                if row[i] != "": state_acron_no_dots[row[i]]=row[5]                

# create a function that expands the acronyms for a given column 
def expand_acronyms(list_institutions):
    # Define the regex patterns before looping
    pattern1 = re.compile(r'\b(' + '|'.join(sorted(re.escape(k) for k in state_acron)) + r')\b')
    pattern2 = re.compile(r'(\s|,|^)(' + '|'.join(re.escape(key) for key in state_acron_no_dots.keys()) + r')(\s|,|$)')
    # use the state acronyms dictionary to expand the names to the full ones.
    list_institutions_return = list_institutions.copy()
    for i, place in enumerate(list_institutions):
        if place:
            # there is no need to care about the acronyms being separate words, 
            # because at this point the acronyms in the dictionary always have dots.
            place = re.sub(pattern1, lambda m: state_acron.get(m.group(0)), place)
            #### use the acronym dictionary to make changes
            place = re.sub('\.', ' ', place)
            place = re.sub("[\(\)]", "", place)
            #remove multiple spaces and change "U S" -> "US", "Poly tech" -> "Polytech", "De Paul/w" -> "DePaul/w"
            place = re.sub(r'\s+', ' ', place)
            place = place.replace('U S', 
                                  'US')

            #### finally, use the state dictionary without dots
    
            # use the dictionary of state acronyms to clean the list
            place = re.sub(pattern2, lambda m: " " + state_acron_no_dots.get(m.group(0).strip(' ,')) + " ", place)
            #remove double, triple etc spaces
            place = re.sub(r'\s+', ' ', place).strip(' ,')
            #remove spaces before commas
            place = re.sub(r' ,', ',', place)
            #### "Nat. Sciences" is "Natural Sciences", but in all other cases "Nat" means "National".  
            place = re.sub('Nat Sci', 'Natural Sci', place)
            place = re.sub('Nat\s', 'National ', place)
            list_institutions_return[i] = place
#             if i<15 or i>82150:
#                 print(i, place, list_institutions[i])
    return(list_institutions_return)


In [181]:
# replace NaN values for empty cells with an empty string
biodf = biodf.fillna("")

# clean birthplace column
# some entries have stray commas/periods with single letters following
biodf['birthplace'] = biodf['birthplace'].str.replace(r"[.,][ ]?[a-z][\s\S]*", "", regex=True)

# expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']]    

# cities with missing states usually have their states miscaptured in birthdate string
# for entries with expanded birthplace = birthplace AND state abb in birthdate, extract out state to birthplace
biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthplace"] += ", " + biodf.loc[(biodf["birthplace"] == biodf["birthplace_exp"]) & (biodf["birthdate"].str.contains(rf"{state_abbs_string}")), "birthdate"].str.extract(rf"({state_abbs_string})", expand=False)
# remove state abb from birthdate and clean entry
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.replace(rf"{state_abbs_string}", "")
biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"] = biodf.loc[biodf["birthdate"].str.contains(rf"{state_abbs_string}"), "birthdate"].str.strip(" ,.")

# again, expand birthplace column state names
bp_expanded = expand_acronyms(list(biodf['birthplace']))
biodf['birthplace_exp'] = bp_expanded
biodf = biodf[['name', 'birthplace', 'birthplace_exp', 'birthdate', 'occ', 'curaddress', 'residence']] 

# further expand unrecognized abbreviations/errors
missed_abbs = {r'W[Iil][Ss]': "Wisconsin", 
                   "Maas": "Massachusetts", 
                       "British West Indes": "British West Indies",
                       "Miss/": "Mississippi",
                          "Coon": "Connecticut",
                              "fey": "Kentucky",
                                  "8 C": "South Carolina",
                                    "N \^ C": "North Carolina",
                                        "N > C": "North Carolina",
                                        r",[ ]?IN C\b": ", North Carolina",
                                          r",[ ]?la": ", Iowa",
                                            r",[ ]?ky": ", Kentucky",
                                               r",[ ]?Kjr": ", Kentucky",
                                                   r",[ ]?Xy": ", Kentucky",
                                                       r",[ ]?Da": ", Georgia",
                                                           "G&": "Georgia",
                                                               "V&": "Virginia",
                                                                   r"Virginia, I\b": "Virginia",
                                                                   "M&": "Maryland",
                                                                      r"Col$": "Colorado",
                                                                          "Teim": "Tennessee",
                                                                           "TeniL": "Tennessee",
                                                                               "Term": "Tennessee",
                                                                                   "Tenm": "Tennessee",
                                                                                    "Tenn'": "Tennessee",
                                                                                       "Tenth": "Tennessee",
                                                                                          "N T": "New York",
                                                                                              "Mb": "Missouri",
                                                                                                   r",[ ]?0": ", Ohio",
                                                                                                    "Ohio I I": "Ohio",
                                                                                                      "Tez": "Texas"}
# expand these missed abbreviations
biodf['birthplace_exp'] = biodf['birthplace_exp'].replace(missed_abbs, regex=True)  
    
# remove remaining -s
# this will alter a few words where - was actually intended
biodf['birthplace_exp'] = biodf['birthplace_exp'].str.replace(r"[ ]*-[ ]*", "", regex = True)

In [182]:
# split birthplace_exp into location and state name

# use expanded state name csv
expstate_df = pd.read_csv("state_names_expanded_2.csv").fillna("")

# function that returns uniques from list (in order)
def unique(sequence):
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

# create state name list, also remove duplicates
state_names = unique(list(expstate_df["FULLSNAME"]))
        
# formulate regex argument
state_names_string = "|".join(state_names)
print(state_names_string)

# first extract only state name
biodf['birthplace_st'] = biodf['birthplace_exp'].str.extract(fr"\b({state_names_string})\b$")

# then extract only the string before state name (anything behind is removed) by splitting birthplace_exp using birthplace_st
# in this case note that FOREIGN state names apply to all outside US/Canada
# FOREIGN states will have varying formats in birthplace_loc and birthplace_st due to varying foreign birthplace formats in dataset
biodf['birthplace_loc'] = biodf.apply(lambda row : re.split(str(row['birthplace_st']), str(row['birthplace_exp']))[0], axis=1).str.strip(', ')

biodf = biodf[['name', 'birthplace', 'birthplace_exp', 'birthplace_loc', 'birthplace_st', 'birthdate', 'occ', 'curaddress', 'residence']]  

Alberta|British Columbia|Manitoba|New Brunswick|Newfoundland and Labrador|Northwest Territories|Nova Scotia|Ontario|Prince Edward Island|Quebec|Saskatchewan|Alabama|Alaska|Arizona|Arkansas|California|Canada|Colorado|Connecticut|Delaware|District of Columbia|Florida|FOREIGN|Puerto Rico|US Virgin Islands|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|West Virginia|Virginia|Washington|Wisconsin|Wyoming|Yukon|Guam|American Samoa


In [183]:
# make a dictionary of foreign/non-foreign states
is_state_foreign = {}
with open('state_names_expanded_2.csv', mode='r') as infile:
    # skip the first line (header)
    infile.readline()
    reader = csv.reader(infile)
    for row in reader:
        if row[5] != "": is_state_foreign[row[5]] = (row[8])

# note the definition for FOREIGN column
# 0: US states, including US territories
# 1: foreign states, including Canada
# people without recorded birthplaces have empty entry
biodf['FOREIGN'] = biodf['birthplace_st'].map(is_state_foreign) 
biodf = biodf[['name', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ', 'curaddress', 'residence']] 

In [184]:
# limit all columns to 100 characters
biodf = biodf.apply(lambda x: x.str.slice(0, 100))

# further limit all columns except curaddress and residence to 70 characters
biodf[['name', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']] = biodf[['name', 'birthplace', 'birthplace_loc', 'birthplace_st', 'FOREIGN', 'birthdate', 'occ']].apply(lambda x: x.str.slice(0, 70)) 

In [185]:
# write to csv
os.chdir(r"C:\Users\byron\OneDrive\Documents\University\US\NYU\SPUR\WIW")
biodf.to_csv('1915_bio_data_2.csv', encoding="utf-8")

## Tracking state count

In [186]:
# count how many times a state appears in birthplace_st col
# original dataframe is sorted by count
state_count_df = biodf['birthplace_st'].value_counts().to_frame()
state_count_df = state_count_df.rename(columns = {"birthplace_st": "Count"})
state_count_df.index.name = "State"

# for now, this is sorted alphabetically, not by count
# for the sake of counting, Canadian states have been separated out of FOREIGN states
state_count_df.sort_index() # remove this line if you want alphabetical order
state_count_df.to_csv('1915_state_count.csv', encoding="utf-8")

In [188]:
# checking script output
print("Total number of people retrieved from main text:", len(biodf))

Total number of people retrieved from main text: 1301
