In [1]:
import gzip
import json
import random
import re
ls = []
for line in gzip.open("all_records_4488912.txt.gz", "rt"):
    l = json.loads(line)['description']['fileUnit']
    try: 
        id = l['localIdentifier']
    except KeyError:
        id = None
    ls.append((l['scopeAndContentNote'], l['title'], l['naId'], id))


In [3]:
random.shuffle(ls)

renamings = {
    "date of birth": "birth date",
    "alias name(s)": "alias",
    "alias name": "alias",
    "country of birth": "country",
    "Father's Name": "father",
    "Mother's Name": "mother"
}

def parse(scope_and_content):
    output = {}
    if scope_and_content.startswith("The following"):
        l = scope_and_content.split("\n")[1:]
        for line in l:
            if (len(line) < 2):
                continue
            k, v = line.rstrip().split(":")
            k = k.strip().lower()
            if k in renamings:
                k = renamings[k]

            output[k] = v.strip()
    if scope_and_content.startswith("This file"):
        l = scope_and_content.split(".  ")[1:]
        for line in l:
            if (len(line) < 2):
                continue
            line = line.rstrip()
            k, v = re.split(r" is listed as | listed as | is ", line)
            k = k.strip().lower()
            if k in renamings:
                k = renamings[k]
            output[k] = v.strip().rstrip(".")
    if len(output) < 1:
        print(scope_and_content)
        raise

    return output
all_rows = []
for row in ls:
    try:
        dicto = parse(row[0])
        dicto['name'] = row[1].split(" for ", 1)[-1]
        dicto['naid'] = row[2]
        dicto['id'] = row[3]
        all_rows.append(dicto)
    except:
        print(row)
        continue

Dockets covering Chicago criminal cases 19786-19940 (1929).
('Dockets covering Chicago criminal cases 19786-19940 (1929).', 'VEO Under Edit Test Description - 02', '176237757', None)
('This file consists of an alien case file for Calogera Alongi.  Date of birth is listed as 0/319/1888.  Country is listed as Italy.  Alias name Calogera Cristi.', 'Alien Case File for Calogera Alongi', '5235742', 'A2574053/085-09-4368/Box 295')
Dockets covering Chicago criminal cases 19786-19940 (1929).
('Dockets covering Chicago criminal cases 19786-19940 (1929).', 'VEO Under Edit Test Description - 01', '176237756', None)


In [4]:
import pandas as pd

In [5]:
g = pd.DataFrame(all_rows)

In [6]:
import random
random.choice(all_rows)

{'birth date': '12/07/1879',
 'country': 'Italy',
 'name': 'Joseph Stellita',
 'naid': '5195283',
 'id': 'A1556888/085-09-4368/Box 207'}

In [7]:
g.columns

Index(['birth date', 'country', 'name', 'naid', 'id', 'date of entry',
       'port of entry', 'father', 'mother', 'alias', 'naturalization date',
       'naturalization location', 'sex', 'father's name', 'mother's name'],
      dtype='object')

In [36]:
cache = {}

In [4]:
import pyarrow as pa
from pyarrow import feather, parquet

feather.read_table("cleaned.feather")

pyarrow.Table
birth date: date32[day]
name: string
country: dictionary<values=string, indices=int16, ordered=0>
sex: dictionary<values=string, indices=int8, ordered=0>
port of entry: dictionary<values=string, indices=int16, ordered=0>
date of entry: date32[day]
naturalization date: date32[day]
alias: string
naid: int64

In [38]:

def reload_cache():
    previous = pd.read_parquet("cache.feather")
    previous[~previous.index.duplicated(keep='first')]

    if (len(cache)):
        combined = pd.concat([pd.DataFrame(cache.values()).set_index("q"), previous])
        combined.to_parquet("cache.parquet")
        return combined
    else:
        return previous
seen = reload_cache()

In [43]:
coords = seen[["lng", "lat"]]
coords

Unnamed: 0_level_0,lng,lat
q,Unnamed: 1_level_1,Unnamed: 2_level_1
Philadelphia,-75.16379,39.95233
,,
"San Ysidro, CA",-117.04308,32.552
"VANCEBORO, ME",-67.42972,45.5634
"Niagara Falls, NY",-79.05671,43.0945
...,...,...
"LAS VEGAS, NV",-115.13722,36.17497
"LAWRENCE, MA",-71.16311,42.70704
"Las Vegas, Nevada",-115.13722,36.17497
"Louisville, KY",-85.75941,38.25424


In [75]:
g.sample(n=100).set_index("country").join(coords).reset_index()\
.rename(columns={'index': 'country', 'lng': 'country.lng', 'lat': "country.lat"})\
.set_index("port of entry").join(coords).reset_index()\
.rename(columns={'index': 'port of entry', 'lng': 'port of entry.lng', 'lat': "port of entry.lat"})\
.set_index("port of entry").join(coords).reset_index()\
.rename(columns={'index': 'naturalization location', 'lng': 'naturalization location.lng', 'lat': "naturalization location.lat"})


Unnamed: 0,naturalization location,country,birth date,name,naid,id,date of entry,father,mother,alias,...,naturalization location.1,sex,father's name,mother's name,country.lng,country.lat,port of entry.lng,port of entry.lat,naturalization location.lng,naturalization location.lat
0,"Boston, MA (IA)",Canada,6/17/1882,Henry Bois,40109405,A2853034/566-016-0023/384,2/8/1884,,,,...,,,,,-113.64258,60.10867,,,,
1,"Boston, MA (IA)",Canada,11/12/1910,Vera Cook,40094669,A2475993/566-016-0023/224,8/28/1928,,,,...,,,,,-113.64258,60.10867,,,,
2,"Buffalo, NY (IA)",Ireland,1/10/1892,Annie Hesson,40174507,A4973934/566-016-0023/1525,5/25/1921,,,,...,"Buffalo, NY",,,,-8,53,,,,
3,"Detroit, MI",Canada,6/23/1898,Myrtle West,7188135,A4349474/566-12-88/Bx 710,11/19/1914,,,,...,,,,,-113.64258,60.10867,-83.04575,42.33143,-83.04575,42.33143
4,"El Paso, TX (IA)",Mexico,2/16/1868,Julia De La Rosa,40070701,A1863531/566-016-0023/599,10/24/1921,,,,...,,,,,-102,23,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208,,,05/09/1882,Axel Johansen,5291880,A4193774/085-08-0110/Box 71,,,,,...,,,,,,,,,,
1209,,,05/09/1882,Axel Johansen,5291880,A4193774/085-08-0110/Box 71,,,,,...,,,,,,,,,,
1210,,,05/09/1882,Axel Johansen,5291880,A4193774/085-08-0110/Box 71,,,,,...,,,,,,,,,,
1211,,,05/09/1882,Axel Johansen,5291880,A4193774/085-08-0110/Box 71,,,,,...,,,,,,,,,,


In [17]:
from urllib.request import urlopen 
from urllib.parse import urlencode
import json

replacements = [("Curacoa", "Curacao"), ("hiladelhia", 'hiladelphia')]
def search(string, featureclass):
    for junk in [" (Airport)", " (IA)", " (MA)"]:
        string = string.replace(junk, "")
    
    if string in cache:
        return cache[string]
    if string in seen:
        return
    query = {"featureClass":"A", "q":"Mexico", "maxRows": 1, "style":"LONG", "username":"benmschmidt"}
    for replacement in replacements:
        query['q'] = string.replace(*replacement)
    parsed = urlencode(query, doseq=False)    
    remote = urlopen(f"http://api.geonames.org/searchJSON?" + parsed + "&featureClass=P")
    data = json.load(remote)
    cache[string] = data['geonames'][0]
    cache[string]['q'] = string
    return cache[string]

In [18]:
for NA in ['Stateless', "Unknown POE", "Unknown", "UNKNOWN POE", "Born On Board Ship", "Arabian Peninsula"]:
    cache[NA] = {}

In [19]:
for field in ['country']:
    for k in g.groupby(field)['name'].count().sort_values(ascending=False).head(550).index:
        print(k, end="; ")
        try:
            search(k, "A")
        except:
            print("\n\n**" + k + "\n")
            

for field in ['port of entry', 'naturalization location']:
    for k in g.groupby(field)['name'].count().sort_values(ascending=False).head(250).index:
        print(k, end="; ")
        try:
            search(k, "P")
        except:
            print("\n**" + k + "**")

Mexico; Canada; Italy; United Kingdom; Poland; Cuba; Germany; Russia; Austria; Philippines; Ireland; Hungary; Portugal; Japan; Spain; Greece; United States; Lithuania; China, People's Republic Of; Sweden; Turkey; Finland; Yugoslavia; Netherlands; Norway; Czechoslovakia; France; Soviet Union; Romania; Korea; China, People's Republic of; Latvia; Denmark; Ussr; Jamaica; Syria; Belgium; Switzerland; Vietnam; Iran; Colombia; India; The Bahamas; El Salvador; Argentina; China, People'S Republic Of; Nicaragua; Stateless; Lebanon; Ukraine; Australia; Ecuador; Brazil; Peru; Guatemala; Albania; Estonia; Unknown; Dominican Republic; Indonesia; Barbados; Egypt; Costa Rica; Chile; Honduras; Bulgaria; Haiti; Panama; Us; Venezuela; Taiwan; Trinidad And Tobago; Bahamas, The; USSR; Armenia; South Africa; Israel; Guyana; Iraq; Cape Verde; New Zealand; Laos; Jordan; Cambodia; Belize; Malta; Puerto Rico; Bermuda; French Polynesia; Pakistan; Hong Kong; British Virgin Islands; Bolivia; Thailand; Cyprus; St. 

In [139]:
g.groupby('port of entry')['name'].count().sort_values(ascending=False).head(90).sample(10)

port of entry
EAGLE PASS, TX (MA)                                              486
San Ysidro, California                                           513
Laredo Gateway Bridge/Laredo, Texas (Juarez-Lincoln Bridge)      779
Houlton, Maine                                                   761
Port Huron, MI (MA)                                              505
El Paso, TX                                                     1343
San Pedro, CA                                                   1035
ROUSES POINT, NY (SEAPLANE                                       707
NEW YORK, NY (IA)                                              11134
New Orleans, La                                                  456
Name: name, dtype: int64

In [82]:
g.groupby('naturalization location')['name'].count().sort_values(ascending=False).head(20)

naturalization location
Los Angeles, CA              61754
Newark, New Jersey           19813
Boston, MA                   16029
Miami, Florida               15810
Detroit, MI                   4439
El Paso, TX                   4229
New York, NY                  3589
Buffalo, NY                   3550
St. Paul, Minnesota           2743
Denver, Colorado              2707
BOSTON, MA                    1459
Portland, Oregon              1403
San Francisco, California      862
Seattle, WA                    814
Houston, Texas                 783
New Orleans, Louisiana         710
Atlanta, Georgia               618
Portland, Maine                460
Philadelhia, Pennsylvania      420
Newark, NJ                     321
Name: name, dtype: int64

In [14]:
g.groupby('sex')['name'].count().sort_values(ascending=False).head(20)

sex
Female    5711
Male      4247
Name: name, dtype: int64