In [34]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Preprocessing data

In [35]:
df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')

In [36]:
df.head()

Unnamed: 0,name_last,name_first,race
0,Hessler-Smith,Jason,nh_white
1,Rogers,Renee,nh_white
2,Bartolome,Crystal,nh_white
3,Bailey,Donna,nh_white
4,Carlson,Greggory,nh_white


In [37]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454992,15455022,15455110
unique,1341195,641103,8
top,Smith,Michael,nh_white
freq,79362,153753,9446851


In [38]:
df['race'].unique()

array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',
       'native_indian', 'unknown', 'multi_racial'], dtype=object)

## Drop None Values

In [39]:
df.dropna(subset=['name_first', 'name_last'], inplace=True)

In [40]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454908,15454908,15454908
unique,1341176,641095,8
top,Smith,Michael,nh_white
freq,79362,153753,9446749


## Drop Last name and first name of length 1

In [41]:
df = df.drop(df[df['name_last'].str.len() < 2].index)

In [42]:
df = df.drop(df[df['name_first'].str.len() < 2].index)

In [43]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15366690,15366690,15366690
unique,1340617,641055,8
top,Smith,Michael,nh_white
freq,79297,153752,9383680


## Make all names title case

In [44]:
df['name_first'] = df['name_first'].str.title()
df['name_last'] = df['name_last'].str.title()

## Remove Special Characters

In [45]:
df['name_last'] = df['name_last'].str.replace("[^a-zA-Z' -]", '', regex=True)

In [46]:
df.head()

Unnamed: 0,name_last,name_first,race
0,Hessler-Smith,Jason,nh_white
1,Rogers,Renee,nh_white
2,Bartolome,Crystal,nh_white
3,Bailey,Donna,nh_white
4,Carlson,Greggory,nh_white


## Drop duplicates

In [47]:
df[df[['name_last','race']].duplicated()]

Unnamed: 0,name_last,name_first,race
52,Gruber,Linda,nh_white
122,Taylor,Robert,nh_white
127,Bailey,Pamela,nh_white
138,Johnson,Ashley,nh_black
146,Mobley,Robert,nh_black
...,...,...,...
15455105,Ballew,Christina,nh_white
15455106,Watts,Mark,nh_white
15455107,Mcrae,Evelyn,nh_white
15455108,Ward,Stephanie,nh_white


In [48]:
df[df['name_last'] == "Porter"]

Unnamed: 0,name_last,name_first,race
136,Porter,Paula,nh_white
550,Porter,Paula,nh_black
7329,Porter,Wendell,nh_white
7557,Porter,Anthony,nh_white
9200,Porter,Kevin,nh_white
...,...,...,...
15448598,Porter,William,nh_white
15448772,Porter,Kyle,nh_white
15451135,Porter,Jean,nh_white
15451767,Porter,Annette,nh_white


In [49]:
df = df.drop_duplicates(['name_last','race'],keep= 'last')

In [50]:
df[df['name_last'] == "Porter"]

Unnamed: 0,name_last,name_first,race
14952661,Porter,Marisyd,asian
15029071,Porter,Amber,multi_racial
15222442,Porter,Anna,other
15337979,Porter,Dennis,unknown
15369699,Porter,Lila,native_indian
15378779,Porter,Cristopher,hispanic
15438806,Porter,Orrick,nh_black
15454870,Porter,Ashley,nh_white


In [51]:
df.shape

(1471841, 3)

In [52]:
len(df)

1471841

## Drop and merge columns

In [53]:
# dropping unknown column
df = df.drop(df[df['race'] == 'unknown'].index)

In [54]:
# combine multi_racial and native_indian to other
mapping = {'multi_racial': 'other', 'native_indian': 'other'}
df['race'] = df['race'].replace(mapping)

In [55]:
df.groupby('race').agg({'name_last':'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,68672
hispanic,389609
nh_black,137271
nh_white,609707
other,115218


In [56]:
df['race_code'] = df.race.factorize()[0]

## Split dataset in train, validation and test

In [57]:
train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])

In [58]:
val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])

In [59]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [60]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(1079199, 4)
(134900, 4)
(134900, 4)


In [61]:
df.groupby('race').agg({'name_last':'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,68672
hispanic,389609
nh_black,137271
nh_white,609707
other,115218


In [62]:
train_df.groupby('race').agg({'name_last':'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,54938
hispanic,311687
nh_black,109817
nh_white,487765
other,95708


In [63]:
val_df.groupby('race').agg({'name_last':'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,6867
hispanic,38961
nh_black,13727
nh_white,60971
other,14068


In [64]:
test_df.groupby('race').agg({'name_last':'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,6867
hispanic,38961
nh_black,13727
nh_white,60971
other,14060


## Save datasets

In [65]:
train_df.to_csv("data/fl_2022_LastName_train.csv.gz",index=False,compression="gzip")
val_df.to_csv("data/fl_2022_LastName_val.csv.gz",index=False,compression="gzip")
test_df.to_csv("data/fl_2022_LastName_test.csv.gz",index=False,compression="gzip")

In [66]:
!du -sh data/fl_2022_LastName_*

1.3M	data/fl_2022_LastName_test.csv.gz
11M	data/fl_2022_LastName_train.csv.gz
1.3M	data/fl_2022_LastName_val.csv.gz
