In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Preprocessing data

In [2]:
df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')

In [3]:
df.head()

Unnamed: 0,name_last,name_first,race
0,Hessler-Smith,Jason,nh_white
1,Rogers,Renee,nh_white
2,Bartolome,Crystal,nh_white
3,Bailey,Donna,nh_white
4,Carlson,Greggory,nh_white


In [4]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454992,15455022,15455110
unique,1341195,641103,8
top,Smith,Michael,nh_white
freq,79362,153753,9446851


In [5]:
df['race'].unique()

array(['nh_white', 'nh_black', 'other', 'hispanic', 'asian',
       'native_indian', 'unknown', 'multi_racial'], dtype=object)

## Drop None Values

In [11]:
df.dropna(subset=['name_first', 'name_last'], inplace=True)

In [12]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15454908,15454908,15454908
unique,1341176,641095,8
top,Smith,Michael,nh_white
freq,79362,153753,9446749


## Drop Last name and first name of length 1

In [7]:
df = df.drop(df[df['name_last'].str.len() < 2].index)

In [8]:
df = df.drop(df[df['name_first'].str.len() < 2].index)

In [9]:
df.describe()

Unnamed: 0,name_last,name_first,race
count,15366773,15366804,15366891
unique,1340636,641063,8
top,Smith,Michael,nh_white
freq,79297,153752,9383782


## Remove Special Characters

In [10]:
df['full_name'] = df['name_last'] + ' ' + df['name_first']

In [11]:
df['full_name'] = df['full_name'].str.replace("[^a-zA-Z' -]", '', regex=True)

In [12]:
df.head()

Unnamed: 0,name_last,name_first,race,full_name
0,Hessler-Smith,Jason,nh_white,Hessler-Smith Jason
1,Rogers,Renee,nh_white,Rogers Renee
2,Bartolome,Crystal,nh_white,Bartolome Crystal
3,Bailey,Donna,nh_white,Bailey Donna
4,Carlson,Greggory,nh_white,Carlson Greggory


## Drop duplicates

In [13]:
df[df[['full_name','race']].duplicated()]

Unnamed: 0,name_last,name_first,race,full_name
837,Moser,Patricia,nh_white,Moser Patricia
928,Johnson,Tiffany,nh_black,Johnson Tiffany
1247,Perry,Charles,nh_white,Perry Charles
2120,Johnson,Ashley,nh_black,Johnson Ashley
2285,Johnson,Clayton,nh_white,Johnson Clayton
...,...,...,...,...
15455101,Jones,Margaret,nh_white,Jones Margaret
15455102,Hartley,Brandon,nh_white,Hartley Brandon
15455104,Ballentine,Robert,nh_white,Ballentine Robert
15455106,Watts,Mark,nh_white,Watts Mark


In [14]:
df[df['full_name'] == "Porter Paula"]

Unnamed: 0,name_last,name_first,race,full_name
136,Porter,Paula,nh_white,Porter Paula
550,Porter,Paula,nh_black,Porter Paula
1527456,Porter,Paula,nh_white,Porter Paula
7563599,Porter,Paula,nh_white,Porter Paula
7631191,Porter,Paula,nh_white,Porter Paula
8383292,Porter,Paula,nh_white,Porter Paula
10682106,Porter,Paula,nh_white,Porter Paula


In [15]:
df = df.drop_duplicates(['full_name','race'],keep= 'last')

In [17]:
df[df['full_name'] == "Porter Paula"]

Unnamed: 0,name_last,name_first,race,full_name
550,Porter,Paula,nh_black,Porter Paula
10682106,Porter,Paula,nh_white,Porter Paula


In [18]:
df.shape

(10811419, 4)

In [19]:
len(df)

10811419

## Drop and merge columns

In [20]:
# dropping unknown column
df = df.drop(df[df['race'] == 'unknown'].index)

In [21]:
# combine multi_racial and native_indian to other
mapping = {'multi_racial': 'other', 'native_indian': 'other'}
df['race'] = df['race'].replace(mapping)

In [22]:
df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,288197
hispanic,1789315
nh_black,1592766
nh_white,6318521
other,399865


In [None]:
df['race_code'] = df.race.factorize()[0]

## Split dataset in train, validation and test

In [29]:
train_df, rest_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['race_code'])

In [30]:
val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42, stratify=rest_df['race_code'])

In [31]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [32]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(8317660, 5)
(1039708, 5)
(1039708, 5)


In [33]:
df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,288197
hispanic,1789315
nh_black,1592766
nh_white,6318521
other,399865


In [34]:
train_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,230558
hispanic,1431452
nh_black,1274213
nh_white,5054817
other,321190


In [35]:
val_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,28819
hispanic,178932
nh_black,159277
nh_white,631852
other,40735


In [36]:
test_df.groupby('race').agg({'full_name':'nunique'})

Unnamed: 0_level_0,full_name
race,Unnamed: 1_level_1
asian,28820
hispanic,178931
nh_black,159276
nh_white,631852
other,40744


## Save datasets

In [37]:
train_df.to_csv("data/fl_2022_FullName_train.csv.gz",index=False,compression="gzip")
val_df.to_csv("data/fl_2022_FullName_val.csv.gz",index=False,compression="gzip")
test_df.to_csv("data/fl_2022_FullName_test.csv.gz",index=False,compression="gzip")

In [38]:
!du -sh data/fl_2022_FullName_*

15M	data/fl_2022_FullName_test.csv.gz
115M	data/fl_2022_FullName_train.csv.gz
15M	data/fl_2022_FullName_val.csv.gz


In [39]:
train_df.head()

Unnamed: 0,name_last,name_first,race,full_name,race_code
0,Peirce,Christine,nh_white,Peirce Christine,0
1,PONTARI,KENNETH,nh_white,PONTARI KENNETH,0
2,Gregory,Alexandrea,nh_black,Gregory Alexandrea,1
3,Lloyd,Josiane,nh_white,Lloyd Josiane,0
4,Gonzalez,Mavis,hispanic,Gonzalez Mavis,3
