### Last Name Preprocessing (Train/Validation/Test)

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Read the raw voter name/race extract (pandas infers gzip from the extension)
# and show its dimensions as the cell output.
raw_path = 'data/fl_reg_name_race_2022.csv.gz'
df = pd.read_csv(raw_path)
df.shape

(15455110, 3)

In [3]:
# Remove rows with a missing first or last name.
# Rebinding (rather than inplace=True) keeps the step a plain expression.
df = df.dropna(subset=['name_first', 'name_last'])
print("Size after dropping missing first and last names:", df.shape)

# We assume unknown as missing at random, so those rows are simply excluded.
# `~mask` is the idiomatic negation of a boolean mask (instead of `== False`).
sdf = df[~df['race'].isin(['unknown'])]
print("Size after dropping unknown:", sdf.shape)
del df  # the unfiltered frame is not used again in this notebook section

# Drop cases where last name is less than 2 chars
too_short = sdf['name_last'].str.len() < 2
sdf = sdf.drop(index=sdf.index[too_short])
print("Size after dropping last names less than 2 chars:", sdf.shape)

Size after dropping missing first and last names: (15454979, 3)
Size after dropping unknown: (15009244, 3)
Size after dropping last names less than 2 chars: (14933334, 3)


In [4]:
# Normalise last names: title-case first, then strip every character that is
# not an ASCII letter, apostrophe, space or hyphen.
# NOTE(review): the <2-character filter ran before this cleanup, so stripping
# characters here can leave names shorter than 2 characters — confirm whether
# a second length filter is needed.
cleaned_last = (
    sdf['name_last']
    .str.title()
    .str.replace("[^a-zA-Z' -]", '', regex=True)
)
sdf['name_last'] = cleaned_last

In [5]:
# Recode race: fold the two small categories into a single 'other' bucket
mapping = dict.fromkeys(['multi_racial', 'native_indian'], 'other')
recoded_race = sdf['race'].replace(mapping)
sdf['race'] = recoded_race

In [6]:
# Count registrants for every (last name, race) combination
pair_counts = sdf.groupby(['name_last', 'race'], as_index=False)['race'].agg(['count'])

# Reshape to one row per last name and one column per race, holding the number
# of people of that race with that last name. A combination that never occurs
# comes out as NaN, which means zero people, so it is filled with 0.
wide_counts = pair_counts.pivot_table(index='name_last', columns='race', values='count').fillna(0)

# Row total = number of people carrying the last name
wide_counts['total_n'] = wide_counts.sum(axis=1)
gdf = wide_counts.reset_index()

# Convert the per-race counts (every column between name_last and total_n)
# into row-wise proportions
race_cols = list(gdf.columns[1:-1])
gdf[race_cols] = gdf[race_cols].div(gdf['total_n'], axis=0)
gdf

race,name_last,asian,hispanic,nh_black,nh_white,other,total_n
0,A Arup,0.0,0.0,0.0,1.0,0.0,1.0
1,A Bitang,0.0,0.0,1.0,0.0,0.0,1.0
2,A De Feria,0.0,1.0,0.0,0.0,0.0,1.0
3,A F R Stephenson,0.0,0.0,0.0,1.0,0.0,1.0
4,A Felix,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
1056640,Zyzanski,0.0,0.0,0.0,1.0,0.0,1.0
1056641,Zyzdryn,0.0,0.0,0.0,1.0,0.0,2.0
1056642,Zyznomyrsky,0.0,0.0,0.0,1.0,0.0,1.0
1056643,Zzaman,0.0,0.0,0.0,0.0,1.0,2.0


In [7]:
# Alphabetically ordered list of the race labels present after recoding;
# a label's position in this list is used as its integer code.
distinct_races = sdf['race'].unique().tolist()
races = sorted(distinct_races)
print(races)

def get_race_idx(val, races):
    """Return the position of ``val`` within ``races``.

    Parameters
    ----------
    val : label to look up.
    races : list of labels; the first matching position is returned.

    Raises
    ------
    ValueError
        If ``val`` does not occur in ``races``.
    """
    return races.index(val)

['asian', 'hispanic', 'nh_black', 'nh_white', 'other']


In [9]:
# For one set of analyses, the 'true' race/ethnicity of a last name is defined
# as its modal race, i.e. the race column with the highest proportion.
# idxmax returns the first maximal column, so ties go to the earliest entry in `races`.
modal_race = gdf[races].idxmax(axis=1)
gdf['race'] = modal_race
# Integer label = position of the modal race within `races`
gdf['race_code'] = gdf['race'].map(lambda label: get_race_idx(label, races))

In [10]:
# Persist the full per-last-name table (proportions, total_n, modal race and its code).
# NOTE(review): this writes under train_validation_test/ while the split files
# are written to data/ — confirm the directory is intended and exists.
full_table_path = "train_validation_test/fl_2022_lastname.csv.gz"
gdf.to_csv(full_table_path, index=False, compression="gzip")

## Split dataset into train, validation, and test sets

In [11]:
# Keep 80% of the last names for training; the remaining 20% is divided into
# validation and test afterwards. Stratifying on race_code keeps the modal-race
# mix the same in both parts.
train_df, rest_df = train_test_split(
    gdf,
    test_size=0.2,
    random_state=42,
    stratify=gdf['race_code'],
)

In [12]:
# Halve the held-out part into validation and test (10% of all names each),
# again stratified on race_code.
val_df, test_df = train_test_split(
    rest_df,
    test_size=0.5,
    random_state=42,
    stratify=rest_df['race_code'],
)

In [13]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited from gdf
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

In [14]:
# Report (rows, columns) for train, validation and test, in that order
for split_df in (train_df, val_df, test_df):
    print(split_df.shape)

(845316, 9)
(105664, 9)
(105665, 9)


In [15]:
# Number of distinct last names per modal race in the training split
train_df.groupby('race')[['name_last']].nunique()

race,name_last
race,Unnamed: 1_level_1
asian,29184
hispanic,259689
nh_black,83227
nh_white,450098
other,23118


In [16]:
# Number of distinct last names per modal race in the validation split
val_df.groupby('race')[['name_last']].nunique()

race,name_last
race,Unnamed: 1_level_1
asian,3648
hispanic,32461
nh_black,10403
nh_white,56262
other,2890


In [17]:
# Number of distinct last names per modal race in the test split
test_df.groupby('race')[['name_last']].nunique()

race,name_last
race,Unnamed: 1_level_1
asian,3648
hispanic,32461
nh_black,10404
nh_white,56262
other,2890


## Save datasets

In [18]:
# Write each split as a gzip-compressed CSV without the index column
for split_frame, out_path in (
    (train_df, "data/fl_2022_LastName_train.csv.gz"),
    (val_df, "data/fl_2022_LastName_val.csv.gz"),
    (test_df, "data/fl_2022_LastName_test.csv.gz"),
):
    split_frame.to_csv(out_path, index=False, compression="gzip")