### Full Name Dataset (Train/Validation/Test)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# All three columns (name_first, name_last, race) hold text, so read them as
# strings and treat only empty fields as missing.  With pandas' default NA
# parsing, a name spelled exactly like one of its NA tokens (e.g. "NA", "NULL",
# "None", "NaN") is silently converted to NaN, and the next cell would then
# drop that person as having a missing name.
df = pd.read_csv(
    'data/fl_reg_name_race_2022.csv.gz',
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)
df.shape

(15455110, 3)

In [3]:
# Remove rows with a missing first or last name
df = df.dropna(subset=['name_first', 'name_last'])
print("Size after dropping missing first and last names:", df.shape)

# We treat 'unknown' race as missing at random, so those rows are simply dropped
sdf = df[~df['race'].isin(['unknown'])]
print("Size after dropping unknown:", sdf.shape)
del df  # release the full frame; only `sdf` is used from here on

# Drop cases where the last name is shorter than 2 chars.  The length is measured
# on the whitespace-stripped value because names are stripped before use, so a
# padded one-letter name such as "A " must not slip through this filter.
too_short = sdf['name_last'].str.strip().str.len() < 2
sdf = sdf.drop(index=sdf.index[too_short])
print("Size after dropping last names less than 2 chars:", sdf.shape)

Size after dropping missing first and last names: (15454979, 3)
Size after dropping unknown: (15009244, 3)
Size after dropping last names less than 2 chars: (14933334, 3)


In [4]:
# Normalise both name parts: trim surrounding whitespace, then title-case
for part in ('name_first', 'name_last'):
    sdf[part] = sdf[part].str.strip().str.title()

# Full name is "<last> <first>"; afterwards remove every character that is not
# an ASCII letter, apostrophe, space or hyphen
combined = sdf['name_last'] + ' ' + sdf['name_first']
sdf['full_name'] = combined.str.replace("[^a-zA-Z' -]", '', regex=True)

In [5]:
# Recode race: fold 'multi_racial' and 'native_indian' into 'other'
mapping = dict.fromkeys(('multi_racial', 'native_indian'), 'other')
sdf['race'] = sdf['race'].replace(to_replace=mapping)

In [6]:
# Count the rows for each (full name, race) combination and spread the races
# into columns, giving one row per full name.  A combination that never occurs
# is filled with 0: nobody with that full name identifies with that race.
gdf = (
    sdf.groupby(['full_name', 'race'])
    .size()
    .unstack(fill_value=0)
    .astype(float)
)
race_cols = gdf.columns.tolist()

# Number of rows (people) carrying each full name
gdf['total_n'] = gdf[race_cols].sum(axis=1)
# Convert the per-race counts into proportions of the name's total.  The race
# columns are addressed by name rather than by position so that adding or
# reordering columns cannot silently change which columns get normalised.
gdf[race_cols] = gdf[race_cols].div(gdf['total_n'], axis=0)
gdf = gdf.reset_index()

gdf

race,full_name,asian,hispanic,nh_black,nh_white,other,total_n
0,A Arup Erik,0.0,0.0,0.0,1.0,0.0,1.0
1,A Bitang Ahmad,0.0,0.0,1.0,0.0,0.0,1.0
2,A De Feria Graciela,0.0,1.0,0.0,0.0,0.0,1.0
3,A F R Stephenson John Alexander,0.0,0.0,0.0,1.0,0.0,1.0
4,A Felix Noehmi,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
9018613,Zyzdryn Krzysztof,0.0,0.0,0.0,1.0,0.0,1.0
9018614,Zyznomyrsky John,0.0,0.0,0.0,1.0,0.0,1.0
9018615,Zzaman Md,0.0,0.0,0.0,0.0,1.0,1.0
9018616,Zzaman Mohammad,0.0,0.0,0.0,0.0,1.0,1.0


In [7]:
# Alphabetically ordered list of the distinct race labels present in sdf
races = sdf['race'].unique().tolist()
races.sort()
print(races)

def get_race_idx(val, races):
    """Return the zero-based position of ``val`` in the list ``races``.

    Raises ``ValueError`` (from ``list.index``) when ``val`` is not in ``races``.
    """
    return races.index(val)

['asian', 'hispanic', 'nh_black', 'nh_white', 'other']


In [9]:
# For one set of analyses we define the 'true' race/ethnicity of a name as its
# modal race, i.e. the race column holding the largest proportion.
# NOTE: idxmax returns the first maximal column, so when several races tie the
# name is assigned to whichever of them comes first in `races`.
gdf['race'] = gdf[races].idxmax(axis=1)
# Integer label = position of the race in `races`.  A dict lookup through .map
# replaces a Python-level list search executed once per row.
race_to_code = {race: code for code, race in enumerate(races)}
gdf['race_code'] = gdf['race'].map(race_to_code)

In [10]:
# Peek at the first rows to check the race and race_code columns
gdf.head(n=5)

race,full_name,asian,hispanic,nh_black,nh_white,other,total_n,race.1,race_code
0,A Arup Erik,0.0,0.0,0.0,1.0,0.0,1.0,nh_white,3
1,A Bitang Ahmad,0.0,0.0,1.0,0.0,0.0,1.0,nh_black,2
2,A De Feria Graciela,0.0,1.0,0.0,0.0,0.0,1.0,hispanic,1
3,A F R Stephenson John Alexander,0.0,0.0,0.0,1.0,0.0,1.0,nh_white,3
4,A Felix Noehmi,0.0,1.0,0.0,0.0,0.0,1.0,hispanic,1


In [11]:
# Persist the full per-name table as a gzip-compressed CSV, without the row index
full_table_path = "train_validation_test/gdf_fullname.csv.gz"
gdf.to_csv(full_table_path, compression="gzip", index=False)

## Split the dataset into train, validation, and test sets

In [13]:
# First carve off 20% of the names, stratified on race_code so every race keeps
# its share; the remaining 80% is the training set.
train_df, rest_df = train_test_split(
    gdf,
    test_size=0.2,
    stratify=gdf['race_code'],
    random_state=42,
)
# Then halve the held-out 20% (again stratified) into validation and test sets.
val_df, test_df = train_test_split(
    rest_df,
    test_size=0.5,
    stratify=rest_df['race_code'],
    random_state=42,
)

In [14]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited from gdf
train_df, val_df, test_df = [
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
]

In [15]:
# Report (rows, columns) of the train, validation and test splits, one per line
print(train_df.shape, val_df.shape, test_df.shape, sep="\n")

(7214894, 9)
(901862, 9)
(901862, 9)


In [16]:
# Number of distinct full names per modal race in the training split
train_by_race = train_df.groupby('race')
train_by_race.agg({'full_name': 'nunique'})

race,full_name
race,Unnamed: 1_level_1
asian,206042
hispanic,1308198
nh_black,1067770
nh_white,4421898
other,210986


In [17]:
# Number of distinct full names per modal race in the validation split
val_by_race = val_df.groupby('race')
val_by_race.agg({'full_name': 'nunique'})

race,full_name
race,Unnamed: 1_level_1
asian,25755
hispanic,163525
nh_black,133471
nh_white,552738
other,26373


In [18]:
# Number of distinct full names per modal race in the test split
test_by_race = test_df.groupby('race')
test_by_race.agg({'full_name': 'nunique'})

race,full_name
race,Unnamed: 1_level_1
asian,25756
hispanic,163525
nh_black,133471
nh_white,552737
other,26373


## Save datasets

In [19]:
# Write each split to its own gzip-compressed CSV, without the row index
split_outputs = {
    "data/fl_2022_FullName_train.csv.gz": train_df,
    "data/fl_2022_FullName_val.csv.gz": val_df,
    "data/fl_2022_FullName_test.csv.gz": test_df,
}
for out_path, split_df in split_outputs.items():
    split_df.to_csv(out_path, index=False, compression="gzip")