### Name Parser: Predict First Name or Last

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import math
import random
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz',
                 usecols = ['name_first', 'name_last'], nrows = 10000)

df.dropna(inplace=True)
print("Size after dropping missing first or last names:", df.shape)

# Drop cases where first name is less than 2 chars
df.drop(df[df['name_first'].str.len() < 2].index, inplace = True)
print("Size after dropping first names less than 2 chars:", df.shape)

# Drop cases where first name is less than 2 chars
df.drop(df[df['name_last'].str.len() < 2].index, inplace = True)
print("Size after dropping last names less than 2 chars:", df.shape)

# Drop duplicates
df.drop_duplicates(inplace = True)
print("Size after dropping duplicates:", df.shape)

Size after dropping missing first or last names: (100, 2)
Size after dropping first names less than 2 chars: (99, 2)
Size after dropping last names less than 2 chars: (99, 2)
Size after dropping duplicates: (99, 2)


In [8]:
df_long = pd.melt(df, value_vars=['name_last', 'name_first'], 
                  var_name='name_type', 
                  value_name='name')

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# Preprocess the data
all_characters = list(set(''.join(df_long['name'].values)))
num_characters = len(all_characters)
char_to_idx = {char: i for i, char in enumerate(all_characters)}
idx_to_char = {i: char for i, char in enumerate(all_characters)}

# Convert the names to sequences of character indices
df_long['name_indices'] = df_long['name'].apply(lambda x: [char_to_idx[char] for char in x])

# Define the LSTM classifier model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        embedded = self.embedding(input)
        embedded = embedded.permute(1, 0, 2)  # Transpose dimensions
        output, _ = self.lstm(embedded)
        output = self.fc(output[-1])
        return output

# Set the hyperparameters
# Set the hyperparameters
input_size = hidden_size  # Update the input size to match the hidden size
hidden_size = 128
output_size = 2  # Binary classification: 'name_last' or 'name_first'
num_epochs = 10
batch_size = 16
learning_rate = 0.01

# Create the model
model = LSTMClassifier(input_size, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert the name sequences to PyTorch tensors
name_indices = list(df_long['name_indices'])
max_seq_length = max(len(name) for name in name_indices)
padded_name_indices = [name + [0] * (max_seq_length - len(name)) for name in name_indices]
input_data = torch.LongTensor(padded_name_indices)
target_data = torch.LongTensor(df_long['name_type'].map({'name_last': 0, 'name_first': 1}).values)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for i in range(0, len(input_data), batch_size):
        # Get the mini-batch
        inputs = input_data[i:i+batch_size]
        targets = target_data[i:i+batch_size]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Calculate loss
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print the average loss for this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / (len(input_data) / batch_size):.4f}')

Epoch [1/10], Loss: 3.1351
Epoch [2/10], Loss: 0.5805
Epoch [3/10], Loss: 1.4548
Epoch [4/10], Loss: 0.7841
Epoch [5/10], Loss: 1.0268
Epoch [6/10], Loss: 0.9406
Epoch [7/10], Loss: 0.9339
Epoch [8/10], Loss: 0.9489
Epoch [9/10], Loss: 0.9295
Epoch [10/10], Loss: 0.9464


In [20]:
idx_to_label = {0: 'name_last', 1: 'name_first'}

In [24]:
# Define the inference function
def predict_name_type(model, name):
    # Preprocess the input name
    name_indices = [char_to_idx[char] for char in name]
    padded_name_indices = name_indices + [0] * (max_seq_length - len(name_indices))
    input_data = torch.LongTensor(padded_name_indices).unsqueeze(0)

    # Set the model to evaluation mode
    model.eval()

    # Perform the forward pass
    with torch.no_grad():
        output = model(input_data)

    # Apply softmax to convert output logits into probabilities
    probabilities = nn.functional.softmax(output, dim=1)

    # Get the predicted label index
    predicted_label_index = torch.argmax(probabilities, dim=1).item()

    # Map the predicted label index to the original name type
    predicted_name_type = idx_to_label[predicted_label_index]

    return predicted_name_type

# Example usage
name = "John"
predicted_type = predict_name_type(model, name)
print(f"The predicted name type for '{name}' is: {predicted_type}")

The predicted name type for 'John' is: name_first
