In [2]:
import pyarrow as pa
from pyarrow import csv
from pyarrow import feather

In [3]:
# Load the source CSV with pyarrow's CSV reader (the ".gz" suffix suggests a
# gzip-compressed file). The path is relative to the kernel's working directory.
csvs = csv.read_csv("all_a-files.csv.gz")

In [4]:
# Convert the Arrow table into a pandas DataFrame for the column clean-up that follows.
df = csvs.to_pandas()

In [5]:
# Store the `naid` column as integers.
df['naid'] = df['naid'].astype("int")

# Derive a numeric `anum` column from the composite `id` string,
# e.g. "A18239004/085-08-0653/Box 90" -> 18239004.
df['anum'] = (
    df['id']
    .str.replace("/.*", "", regex = True)    # drop the first "/" and what follows it
    .str.replace("A", "", regex = True)      # remove the "A" marker (every "A" is removed)
    .str.replace("^$", "-1", regex = True)   # nothing left -> sentinel value -1
    .astype("int")
)

In [7]:
# Inspect the frame after the `naid` cast and the new `anum` column.
df

Unnamed: 0,Unnamed: 1,birth date,country,port of entry,date of entry,name,naid,id,naturalization date,naturalization location,alias,father,mother,father's name,mother's name,sex,anum
0,0,03/13/1907,Canada,"Detroit, Michigan",09/27/1968,Annetta Acton,5480529,A18239004/085-08-0653/Box 90,,,,,,,,,18239004
1,1,09/10/1910,Canada,,,William Coulombe,158507169,A8735152/19-0201/Box 558,04/12/1961,"Los Angeles, CA",,,,,,,8735152
2,2,10/30/1911,Cuba,"Miami, FL",08/29/1966,Rosa Maria Pujol De Herrera,158540255,A17374871/19-0201/Box 1184,,,,,,,,,17374871
3,3,8/11/1910,Italy,"New York, NY (IA)",10/28/1954,Domenic Finelli,40225746,A8771184/566-016-0023/1420,7/10/1961,"Boston, MA",,,,,,,8771184
4,4,07/03/1908,Russia,,,Stanislaw Niewiarowski,5400573,A10362869/085-09-4367/Box 138,11/27/1967,"Newark, New Jersey",,,,,,,10362869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185929,1185929,02/08/1908,Canada,,,Hugh Munro,5189933,A1415015/085-09-4367/Box 5,,,,,,,,,1415015
1185930,1185930,2/10/1900,Ireland,,,John Calvert,146847175,A5193128/566-18-0228/Box 782,,,,,,,,,5193128
1185931,1185931,08/03/1907,Cuba,"Miami, Florida",08/16/1965,Carmen Verde Diaz,5450940,A13314386/085-09-0085/Box 69,,,Carmen Perez,,,,,,13314386
1185932,1185932,3/25/1913,Greece,,,James Kodros,146779755,A2869939/566-18-0228/Box 304,,,,,,,,,2869939


In [8]:
import pandas as pd

# Parse the birth-date strings; errors="coerce" turns unparseable values into NaT
# instead of raising.
# NOTE(review): no explicit `format=` is passed, so parsing relies on pandas'
# inference. The column mixes zero-padded and unpadded values
# (e.g. "03/13/1907" and "8/11/1910") — confirm the NaT count is as expected.
dates = pd.to_datetime(df['birth date'], errors="coerce")

In [9]:
# Preview the first five rows of `df`.
df.head()

Unnamed: 0,Unnamed: 1,birth date,country,port of entry,date of entry,name,naid,id,naturalization date,naturalization location,alias,father,mother,father's name,mother's name,sex,anum
0,0,03/13/1907,Canada,"Detroit, Michigan",09/27/1968,Annetta Acton,5480529,A18239004/085-08-0653/Box 90,,,,,,,,,18239004
1,1,09/10/1910,Canada,,,William Coulombe,158507169,A8735152/19-0201/Box 558,04/12/1961,"Los Angeles, CA",,,,,,,8735152
2,2,10/30/1911,Cuba,"Miami, FL",08/29/1966,Rosa Maria Pujol De Herrera,158540255,A17374871/19-0201/Box 1184,,,,,,,,,17374871
3,3,8/11/1910,Italy,"New York, NY (IA)",10/28/1954,Domenic Finelli,40225746,A8771184/566-016-0023/1420,7/10/1961,"Boston, MA",,,,,,,8771184
4,4,07/03/1908,Russia,,,Stanislaw Niewiarowski,5400573,A10362869/085-09-4367/Box 138,11/27/1967,"Newark, New Jersey",,,,,,,10362869


In [10]:
# Parse the two remaining date columns the same way as the birth dates:
# values that cannot be interpreted as dates are coerced to NaT.
entry_date, naturalization_date = (
    pd.to_datetime(df[column], errors="coerce")
    for column in ('date of entry', 'naturalization date')
)

In [11]:
# Build a compact, typed frame from selected columns of `df`:
# parsed dates replace the raw strings, and the repetitive text columns
# (country, port of entry, sex) are stored as pandas categoricals.
data = pd.DataFrame(
    {
        "birth": dates,
        "name": df["name"],
        "country": df["country"].astype("category"),
        "port": df["port of entry"].astype("category"),
        "date of entry": entry_date,
        "sex": df["sex"].astype("category"),
        "naturalization date": naturalization_date,
        "alias": df["alias"],
        "naid": df["naid"],
    }
)

In [13]:
# Attach the parsed dates to `df` before it is written out.
# NOTE(review): 'birth_date' (underscore) creates a NEW column and leaves the raw
# 'birth date' strings in place, whereas the next two lines overwrite the existing
# 'date of entry' / 'naturalization date' columns — confirm this asymmetry is intended.
df['birth_date'] = dates
df['date of entry'] = entry_date
df['naturalization date'] = naturalization_date


In [15]:
import pyarrow.parquet as pq
import pyarrow as pa

# Persist the full frame (all columns of `df` as they stand at this point) to
# Parquet, using zstd compression at level 19.
pq.write_table(pa.Table.from_pandas(df), "fuller_data.parquet", compression="zstd", compression_level = 19)

In [34]:
# Assemble the slimmed-down Arrow table that is exported to Feather.
#
# * Date columns are converted explicitly to date32; values that were coerced
#   to NaT during parsing become nulls.
# * Repetitive text columns are passed as pandas categoricals.
# * The identifier columns are narrowed to int32 by Arrow rather than by
#   pandas: `Series.astype("int32")` silently wraps values that do not fit,
#   while `pa.Array.from_pandas(..., type=pa.int32())` performs a checked
#   cast (safe=True is the default) and raises instead of corrupting IDs.
#   For in-range data the resulting column is the same int32 array.
tab = pa.table({
    'birth date': pa.Array.from_pandas(dates, type = pa.date32()),
    'name': df['name'],
    'country': df['country'].astype("category"),
    'sex': df['sex'].astype("category"),
    'port of entry': df['port of entry'].astype("category"),
    'date of entry': pa.Array.from_pandas(entry_date, type = pa.date32()),
    'naturalization date': pa.Array.from_pandas(naturalization_date, type = pa.date32()),
    'alias': df['alias'],
    'naid': pa.Array.from_pandas(df['naid'], type = pa.int32()),
    'anum': pa.Array.from_pandas(df['anum'], type = pa.int32())
})

In [35]:
# Spot-check the `anum` column of the Arrow table.
tab['anum']

<pyarrow.lib.ChunkedArray object at 0x7ffaa7f18220>
[
  [
    18239004,
    8735152,
    17374871,
    8771184,
    10362869,
    14569589,
    7202432,
    4517853,
    12480197,
    10042720,
    ...
    18212878,
    3482791,
    2817621,
    4169835,
    2679919,
    1415015,
    5193128,
    13314386,
    2869939,
    4028766
  ]
]

In [36]:
# Write the Arrow table to a Feather file with compression disabled.
feather.write_feather(tab, "cleaned.feather", compression="uncompressed")

In [11]:
# Stand-alone conversion of the parsed birth dates to an Arrow date32 array
# (the same expression that feeds the 'birth date' column of `tab`).
# NOTE(review): this cell's execution count (In [11]) follows In [36], so it looks
# like it was run in a different kernel session — confirm it still belongs here.
g = pa.Array.from_pandas(dates, type = pa.date32())

In [12]:
# Display the date32 array to eyeball the converted values.
g

<pyarrow.lib.Date32Array object at 0x1324d6590>
[
  1907-03-13,
  1910-09-10,
  1911-10-30,
  1910-08-11,
  1908-07-03,
  1914-10-01,
  1896-10-13,
  1892-12-21,
  1904-09-09,
  1894-03-24,
  ...
  1909-10-31,
  1896-09-27,
  1893-01-29,
  1898-12-13,
  1909-06-20,
  1908-02-08,
  1900-02-10,
  1907-08-03,
  1913-03-25,
  1902-03-07
]