# LSTM model to do naam parser

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

## Data preprocessing

### Extract data from Florida dataset

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Colab/parsernaam/data/fl_reg_data.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,name_first,name_last,gender,birth_date,race
0,0,Kathryn,Binkley,F,05/03/1976,"White, Not Hispanic"
1,1,Lakaya,Brock,F,11/23/1982,"Black, Not Hispanic"
2,2,Charles,Fontaine,M,11/11/1982,"White, Not Hispanic"
3,3,Suzanne,Posselt,F,08/20/1954,"White, Not Hispanic"
4,4,Bala,Haeseler,M,11/13/1980,"White, Not Hispanic"


In [4]:
df = df[['name_first','name_last']]
df.head()

Unnamed: 0,name_first,name_last
0,Kathryn,Binkley
1,Lakaya,Brock
2,Charles,Fontaine
3,Suzanne,Posselt
4,Bala,Haeseler


In [5]:
df.shape

(8953005, 2)

In [6]:
df_last_name = pd.DataFrame(df['name_last'].unique(), columns=['name'])
df_first_name = pd.DataFrame(df['name_first'].unique(), columns=['name'])

print(df_last_name.shape)
print(df_first_name.shape)

(873292, 1)
(428417, 1)


### Extract data from census

In [7]:
cn_train_df = pd.read_csv("/content/drive/MyDrive/Colab/ethnicolor/data/census_ln_train.csv")
cn_val_df = pd.read_csv("/content/drive/MyDrive/Colab/ethnicolor/data/census_ln_val.csv")
cn_test_df = pd.read_csv("/content/drive/MyDrive/Colab/ethnicolor/data/census_ln_test.csv")

In [8]:
cn_df = pd.concat([cn_train_df, cn_val_df, cn_test_df])
cn_df = cn_df.drop(['race','race_code'], axis=1)
cn_df = cn_df.rename(columns={'name_last': 'name'})
cn_df.head()

Unnamed: 0,name
0,Tamashiro
1,Pitkin
2,Gaydos
3,Ramaglia
4,Pampinella


In [9]:
df_last_name = pd.concat([df_last_name, cn_df])
df_last_name.drop_duplicates(inplace=True)
df_first_name.drop_duplicates(inplace=True)

print(df_last_name.shape)
print(df_first_name.shape)

(898679, 1)
(428417, 1)


In [10]:
df_last_name['name_type'] = 0
df_first_name['name_type'] = 1

final_df = pd.concat([df_last_name, df_first_name])
print(final_df.shape)

(1327096, 2)


In [11]:
final_df.dropna(inplace=True)
final_df.reset_index(drop=True, inplace=True)

final_df = final_df.drop(final_df[final_df['name'].str.len() < 2].index)
final_df['name'] = final_df['name'].str.replace("[^a-zA-Z' -]", '', regex=True)
final_df['name'] = final_df.name.str.strip().str.title()

print(final_df.shape)

(1327043, 2)


In [12]:
final_df.head()

Unnamed: 0,name,name_type
0,Binkley,0
1,Brock,0
2,Fontaine,0
3,Posselt,0
4,Haeseler,0


## Split data to train, val and test

In [13]:
from sklearn.model_selection import train_test_split

train_df, rest_df = train_test_split(final_df, test_size=0.2, random_state=42, stratify=final_df['name_type'])
val_df, test_df = train_test_split(final_df, test_size=0.5, random_state=42, stratify=final_df['name_type'])

In [14]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(1061634, 2)
(663521, 2)
(663522, 2)


In [15]:
train_df.to_csv('/content/drive/MyDrive/Colab/parsernaam/data/naamparser_train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/Colab/parsernaam/data/naamparser_val.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/Colab/parsernaam/data/naamparser_test.csv', index=False)