In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Preprocessing data

In [45]:
df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')

In [46]:
df.head()

Unnamed: 0,name_last,name_first,race
0,Hessler-Smith,Jason,nh_white
1,Rogers,Renee,nh_white
2,Bartolome,Crystal,nh_white
3,Bailey,Donna,nh_white
4,Carlson,Greggory,nh_white


In [47]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454992,15455022,15455110
unique,1341195,641103,8
top,Smith,Michael,nh_white
freq,79362,153753,9446851


In [48]:
df['race'].unique()

array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',
       'native_indian', 'unknown', 'multi_racial'], dtype=object)

## Drop None Values

In [49]:
df.dropna(subset=['name_first', 'name_last'], inplace=True)

In [50]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454908,15454908,15454908
unique,1341176,641095,8
top,Smith,Michael,nh_white
freq,79362,153753,9446749


## Drop Last name and first name of length 1

In [51]:
df = df.drop(df[df['name_last'].str.len() < 2].index)

In [52]:
df = df.drop(df[df['name_first'].str.len() < 2].index)

In [53]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15366690,15366690,15366690
unique,1340617,641055,8
top,Smith,Michael,nh_white
freq,79297,153752,9383680


## Make all names title case

In [54]:
df['name_first'] = df['name_first'].str.title()
df['name_last'] = df['name_last'].str.title()

## Remove Special Characters

In [56]:
df['full_name'] = df['name_last'] + ' ' + df['name_first']

In [57]:
df['full_name'] = df['full_name'].str.replace("[^a-zA-Z' -]", '', regex=True)

In [58]:
df.head()

Unnamed: 0,name_last,name_first,race,full_name
0,Hessler-Smith,Jason,nh_white,Hessler-Smith Jason
1,Rogers,Renee,nh_white,Rogers Renee
2,Bartolome,Crystal,nh_white,Bartolome Crystal
3,Bailey,Donna,nh_white,Bailey Donna
4,Carlson,Greggory,nh_white,Carlson Greggory


## Drop duplicates

In [59]:
df[df[['full_name','race']].duplicated()]

Unnamed: 0,name_last,name_first,race,full_name
837,Moser,Patricia,nh_white,Moser Patricia
928,Johnson,Tiffany,nh_black,Johnson Tiffany
1247,Perry,Charles,nh_white,Perry Charles
2120,Johnson,Ashley,nh_black,Johnson Ashley
2285,Johnson,Clayton,nh_white,Johnson Clayton
...,...,...,...,...
15455104,Ballentine,Robert,nh_white,Ballentine Robert
15455106,Watts,Mark,nh_white,Watts Mark
15455107,Mcrae,Evelyn,nh_white,Mcrae Evelyn
15455108,Ward,Stephanie,nh_white,Ward Stephanie


In [60]:
df[df['full_name'] == "Porter Paula"]

Unnamed: 0,name_last,name_first,race,full_name
136,Porter,Paula,nh_white,Porter Paula
550,Porter,Paula,nh_black,Porter Paula
263636,Porter,Paula,nh_white,Porter Paula
1527456,Porter,Paula,nh_white,Porter Paula
7563599,Porter,Paula,nh_white,Porter Paula
7631191,Porter,Paula,nh_white,Porter Paula
8383292,Porter,Paula,nh_white,Porter Paula
8945658,Porter,Paula,nh_white,Porter Paula
9402546,Porter,Paula,nh_white,Porter Paula
10682106,Porter,Paula,nh_white,Porter Paula


In [61]:
df = df.drop_duplicates(['full_name','race'],keep= 'last')

In [62]:
df[df['full_name'] == "Porter Paula"]

Unnamed: 0,name_last,name_first,race,full_name
550,Porter,Paula,nh_black,Porter Paula
14637476,Porter,Paula,nh_white,Porter Paula


In [63]:
df.shape

(10001779, 4)

In [64]:
len(df)

10001779

## Drop and merge columns

In [65]:
# dropping unknown column
df = df.drop(df[df['race'] == 'unknown'].index)

In [66]:
# combine multi_racial and native_indian to other
mapping = {'multi_racial': 'other', 'native_indian': 'other'}
df['race'] = df['race'].replace(mapping)

In [67]:
df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,278290
hispanic,1690573
nh_black,1492989
nh_white,5734701
other,390648


In [68]:
df['race_code'] = df.race.factorize()[0]

## Split dataset in train, validation and test

In [69]:
train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])

In [70]:
val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])

In [71]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [72]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(7678780, 5)
(959847, 5)
(959848, 5)


In [73]:
df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,278290
hispanic,1690573
nh_black,1492989
nh_white,5734701
other,390648


In [74]:
train_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,222632
hispanic,1352458
nh_black,1194391
nh_white,4587761
other,314216


In [75]:
val_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,27829
hispanic,169057
nh_black,149299
nh_white,573470
other,40061


In [76]:
test_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,27829
hispanic,169058
nh_black,149299
nh_white,573470
other,40068


## Save datasets

In [77]:
train_df.to_csv("data/fl_2022_FullName_train.csv.gz",index=False,compression="gzip")
val_df.to_csv("data/fl_2022_FullName_val.csv.gz",index=False,compression="gzip")
test_df.to_csv("data/fl_2022_FullName_test.csv.gz",index=False,compression="gzip")

In [78]:
!du -sh data/fl_2022_FullName_*

13M	data/fl_2022_FullName_test.csv.gz
101M	data/fl_2022_FullName_train.csv.gz
13M	data/fl_2022_FullName_val.csv.gz


In [79]:
train_df.head()

Unnamed: 0,name_last,name_first,race,full_name,race_code
0,Baxla,Phyllis,nh_white,Baxla Phyllis,0
1,Ludwin,Ron,nh_white,Ludwin Ron,0
2,Signer Welton,Jessica,nh_white,Signer Welton Jessica,0
3,Stamps,Joshua,nh_white,Stamps Joshua,0
4,Vassell,Lillie,nh_black,Vassell Lillie,1
