In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("census_2010_processed.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic,argmax,ethni_act
0,0,Smith,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4,pctwhite,nh_white
1,1,Johnson,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36,pctwhite,nh_white
2,2,Williams,3,1625252,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49,pctblack,nh_black
3,3,Brown,4,1437026,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52,pctwhite,nh_white
4,4,Jones,5,1425470,483.24,3004.8,55.19,38.48,0.44,1.0,2.61,2.29,pctwhite,nh_white


In [6]:
df = df[['name','ethni_act']]

In [8]:
df['race_code'] = df.ethni_act.factorize()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['race_code'] = df.ethni_act.factorize()[0]


In [9]:
df.head()

Unnamed: 0,name,ethni_act,race_code
0,Smith,nh_white,0
1,Johnson,nh_white,0
2,Williams,nh_black,1
3,Brown,nh_white,0
4,Jones,nh_white,0


In [11]:
df = df.rename(columns={'ethni_act': 'race','name':'name_last'})

In [12]:
df.head()

Unnamed: 0,name_last,race,race_code
0,Smith,nh_white,0
1,Johnson,nh_white,0
2,Williams,nh_black,1
3,Brown,nh_white,0
4,Jones,nh_white,0


In [13]:
from sklearn.model_selection import train_test_split

train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])
val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])

In [14]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(129802, 3)
(16225, 3)
(16226, 3)


In [15]:
train_df.to_csv("data/census_ln_train.csv",index=False)
val_df.to_csv("data/census_ln_val.csv",index=False)
test_df.to_csv("data/census_ln_test.csv",index=False)

In [23]:
!find . -name "census_ln_*.csv" | xargs du -sh

300K	./data/census_ln_val.csv
2.4M	./data/census_ln_train.csv
300K	./data/census_ln_test.csv


## Augmenting data

In [24]:
df.head()

Unnamed: 0,name_last,race,race_code
0,Smith,nh_white,0
1,Johnson,nh_white,0
2,Williams,nh_black,1
3,Brown,nh_white,0
4,Jones,nh_white,0


In [29]:
ln_train_df = pd.read_csv("data/fl_2022_LastName_train.csv.gz")
ln_val_df = pd.read_csv("data/fl_2022_LastName_val.csv.gz")
ln_test_df = pd.read_csv("data/fl_2022_LastName_test.csv.gz")

In [31]:
ln_df = pd.concat([ln_train_df,ln_val_df,ln_test_df])

In [37]:
ln_df = ln_df[['name_last', 'race', 'race_code']]

In [38]:
ln_df.shape

(1056645, 3)

In [39]:
df.shape

(162253, 3)

In [40]:
df_all = ln_df.merge(df, on=['name_last','name_last'], how='left', indicator=True)

In [41]:
df_all.head()

Unnamed: 0,name_last,race_x,race_code_x,race_y,race_code_y,_merge
0,Neaman-Piotrowicz,nh_white,3,,,left_only
1,Chaet,nh_white,3,nh_white,0.0,both
2,Veloz Perez,hispanic,1,,,left_only
3,Spalter,nh_white,3,nh_white,0.0,both
4,Kearns-Edwards,nh_white,3,,,left_only


In [51]:
not_in_df = df_all[df_all['_merge'] == 'left_only'][['name_last','race_x','race_code_x']]

In [55]:
not_in_df = not_in_df.rename(columns={'race_x':'race', 'race_code_x':'race_code'})

In [56]:
not_in_df.head()

Unnamed: 0,name_last,race,race_code
0,Neaman-Piotrowicz,nh_white,3
2,Veloz Perez,hispanic,1
4,Kearns-Edwards,nh_white,3
5,Forgit-Talano,nh_white,3
6,Chotta,nh_white,3


In [57]:
## Sync the race_code
race_id_df = df[['race', 'race_code']].drop_duplicates().sort_values('race_code')
race_to_id = dict(race_id_df.values)

In [58]:
race_to_id

{'nh_white': 0, 'nh_black': 1, 'hispanic': 2, 'asian': 3, 'other': 4}

In [63]:
not_in_df['race_code'] = not_in_df.apply(lambda x: race_to_id[x['race']] ,axis=1)

In [64]:
not_in_df.head()

Unnamed: 0,name_last,race,race_code
0,Neaman-Piotrowicz,nh_white,0
2,Veloz Perez,hispanic,2
4,Kearns-Edwards,nh_white,0
5,Forgit-Talano,nh_white,0
6,Chotta,nh_white,0


In [65]:
final_df = pd.concat([train_df, not_in_df])

In [66]:
final_df.shape

(1034440, 3)

In [67]:
final_df.to_csv("data/census_ln_train_w_ln_ds.csv",index=False)