### Name Parser: Predict First Name or Last

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz',
                 usecols = ['name_first', 'name_last'], nrows = 1000)

df.dropna(inplace=True)
print("Size after dropping missing first or last names:", df.shape)

# Drop cases where first name is less than 2 chars
df.drop(df[df['name_first'].str.len() < 2].index, inplace = True)
print("Size after dropping first names less than 2 chars:", df.shape)

# Drop cases where first name is less than 2 chars
df.drop(df[df['name_last'].str.len() < 2].index, inplace = True)
print("Size after dropping last names less than 2 chars:", df.shape)

# Drop duplicates
df.drop_duplicates(inplace = True)
print("Size after dropping duplicates:", df.shape)

Size after dropping missing first or last names: (1000, 2)
Size after dropping first names less than 2 chars: (994, 2)
Size after dropping last names less than 2 chars: (994, 2)
Size after dropping duplicates: (990, 2)


In [15]:
df_long = pd.melt(df, value_vars=['name_last', 'name_first'], 
                  var_name='name_type', 
                  value_name='name')

In [16]:
# Preprocess the data
all_characters = list(set(''.join(df_long['name'].values)))
num_characters = len(all_characters)
char_to_idx = {char: i for i, char in enumerate(all_characters)}
idx_to_char = {i: char for i, char in enumerate(all_characters)}

# Convert the names to sequences of character indices
df_long['name_indices'] = df_long['name'].apply(lambda x: [char_to_idx[char] for char in x])

In [17]:
# Convert the name sequences to PyTorch tensors
name_indices = list(df_long['name_indices'])
max_seq_length = max(len(name) for name in name_indices)
padded_name_indices = [name + [0] * (max_seq_length - len(name)) for name in name_indices]
input_data = torch.LongTensor(padded_name_indices)
target_data = torch.LongTensor(df_long['name_type'].map({'name_last': 0, 'name_first': 1}).values)

In [18]:
train_input_data, test_input_data, train_target_data, test_target_data = train_test_split(
    input_data, target_data, test_size=0.2, random_state=42)
train_input_data, val_input_data, train_target_data, val_target_data = train_test_split(
    train_input_data, train_target_data, test_size=0.2, random_state=42)

In [21]:
# Define the LSTM classifier model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        embedded = self.embedding(input)
        embedded = embedded.permute(1, 0, 2)  # Transpose dimensions
        output, _ = self.lstm(embedded)
        output = self.fc(output[-1])
        return output

In [24]:
# Set the hyperparameters
hidden_size = 128
input_size = hidden_size
output_size = 2  # Binary classification: 'name_last' or 'name_first'
num_epochs = 10
batch_size = 64
learning_rate = 0.01

# Create the model
model = LSTMClassifier(input_size, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
# Training loop with early stopping
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement in validation loss
num_epochs_without_improvement = 0
for epoch in range(num_epochs):
    running_loss = 0.0
    for i in range(0, len(train_input_data), batch_size):
        # Get the mini-batch
        inputs = train_input_data[i:i+batch_size]
        targets = train_target_data[i:i+batch_size]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Calculate loss
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print the average training loss for this epoch
    train_loss = running_loss / (len(train_input_data) / batch_size)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}')

    # Evaluate on the validation set
    with torch.no_grad():
        val_outputs = model(val_input_data)
        val_loss = criterion(val_outputs, val_target_data)
        val_loss = val_loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}')

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        num_epochs_without_improvement = 0
    else:
        num_epochs_without_improvement += 1
        if num_epochs_without_improvement == patience:
            print(f'Early stopping triggered after {epoch+1} epochs without improvement.')
            break

Epoch [1/10], Train Loss: 0.7317
Epoch [1/10], Validation Loss: 0.6913
Epoch [2/10], Train Loss: 0.7009
Epoch [2/10], Validation Loss: 0.6917
Epoch [3/10], Train Loss: 0.7006
Epoch [3/10], Validation Loss: 0.6920
Epoch [4/10], Train Loss: 0.7003
Epoch [4/10], Validation Loss: 0.6934
Early stopping triggered after 4 epochs without improvement.


In [26]:
idx_to_label = {0: 'name_last', 1: 'name_first'}

In [28]:
# Define the inference function
def predict_name_type(model, name):
    # Preprocess the input name
    name_indices = [char_to_idx[char] for char in name]
    padded_name_indices = name_indices + [0] * (max_seq_length - len(name_indices))
    input_data = torch.LongTensor(padded_name_indices).unsqueeze(0)

    # Set the model to evaluation mode
    model.eval()

    # Perform the forward pass
    with torch.no_grad():
        output = model(input_data)

    # Apply softmax to convert output logits into probabilities
    probabilities = nn.functional.softmax(output, dim=1)

    # Get the predicted label index
    predicted_label_index = torch.argmax(probabilities, dim=1).item()

    # Map the predicted label index to the original name type
    predicted_name_type = idx_to_label[predicted_label_index]

    return predicted_name_type

# Example usage
name = "John"
predicted_type = predict_name_type(model, name)
print(f"The predicted name type for '{name}' is: {predicted_type}")

The predicted name type for 'John' is: name_first


In [29]:
with torch.no_grad():
    test_outputs = model(test_input_data)
    test_loss = criterion(test_outputs, test_target_data)
    predicted_labels = torch.argmax(test_outputs, dim=1)
    accuracy = (predicted_labels == test_target_data).sum().item() / len(test_target_data)
    print(f'Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

Test Loss: 0.6932, Accuracy: 0.4899
