### Name Parsing With Context

In [1]:
import pandas as pd
import random
import time
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
df = pd.read_csv("../data/fl_reg_name_race_2022.csv.gz")
df.shape

(8909970, 3)

In [3]:
df.head()

Unnamed: 0,last_name,first_name,race
0,Binkley,Kathryn,5.0
1,Brock,Lakaya,3.0
2,Fontaine,Charles,5.0
3,Posselt,Suzanne,5.0
4,Haeseler,Bala,5.0


In [4]:
df.drop_duplicates(subset=['last_name', 'first_name'], inplace = True)

In [7]:
tokenized_sentences = []
pos_labels = []

random.seed(42)

for _, row in df.sample(n = 1000000).iterrows():
  if random.random() < 0.5:
    tokenized_sentences.append([row['last_name'], row['first_name']])
    pos_labels.append(['last_name', 'first_name'])
  else:
    tokenized_sentences.append([row['first_name'], row['last_name']])
    pos_labels.append(['first_name', 'last_name'])

In [8]:
print(tokenized_sentences[:10])
print(pos_labels[:10])

[['Borinquen', 'Medina-Torres'], ['Uddin', 'Asma'], ['Mecoli', 'Donald'], ['Ibargoyen', 'Yansari'], ['Judy', 'Bales'], ['Marlyn', 'Diaz Alvarado'], ['Holly', 'Korman'], ['Masso', 'Justina'], ['Drake', 'Brandon'], ['Swabowicz', 'Michael']]
[['first_name', 'last_name'], ['last_name', 'first_name'], ['last_name', 'first_name'], ['last_name', 'first_name'], ['first_name', 'last_name'], ['first_name', 'last_name'], ['first_name', 'last_name'], ['last_name', 'first_name'], ['last_name', 'first_name'], ['last_name', 'first_name']]


In [9]:
vocab = {word: idx for idx, word in enumerate(set(word for sentence in tokenized_sentences for word in sentence))}
pos_tags = {tag: idx for idx, tag in enumerate(set(tag for label_set in pos_labels for tag in label_set))}

In [10]:
# Convert words and tags to indices
tokenized_sentences_idx = [[vocab[word] for word in sentence] for sentence in tokenized_sentences]
pos_labels_idx = [[pos_tags[tag] for tag in label_set] for label_set in pos_labels]

In [11]:
# Split the data into training and testing sets (90% train, 10% test)
train_sentences, test_sentences, train_pos_labels, test_pos_labels = train_test_split(
    tokenized_sentences_idx, pos_labels_idx, test_size=0.1, random_state=42
)

# Check the shapes of the sets
print("Training set shapes:", len(train_sentences), len(train_pos_labels))
print("Testing set shapes:", len(test_sentences), len(test_pos_labels))

Training set shapes: 900000 900000
Testing set shapes: 100000 100000


In [12]:
# Hyperparameters
embedding_dim = 100
hidden_dim = 128
vocab_size = len(vocab)
output_size = len(pos_tags)

# Simple BiLSTM model for POS tagging
class BiLSTMPOSTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super(BiLSTMPOSTagger, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2pos = nn.Linear(hidden_dim * 2, output_size)
    
    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        pos_space = self.hidden2pos(lstm_out.view(len(sentence), -1))
        pos_scores = F.log_softmax(pos_space, dim=1)
        return pos_scores

In [13]:
# Initialize the model, loss function, and optimizer
model = BiLSTMPOSTagger(embedding_dim, hidden_dim, vocab_size, output_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
log_interval = 100  # Log every 100 iterations

# Training loop
num_epochs = 3
best_test_loss = float('inf')
patience = 2  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()  # Set the model to training mode
    total_loss = 0.0

    # Training
    for sentence, tags in zip(train_sentences, train_pos_labels):
        model.zero_grad()
        sentence_in = torch.tensor(sentence, dtype=torch.long)
        targets = torch.tensor(tags, dtype=torch.long)

        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Log at specified intervals
        if i % log_interval == 0 and i > 0:
            avg_loss = total_loss / log_interval
            print(f"Iteration {i}/{len(train_sentences)}, Avg. Loss: {avg_loss:.4f}")
            total_loss = 0.0
            
    # Calculate average training loss for this epoch
    avg_train_loss = total_loss / len(train_sentences)
    print(f"Epoch {epoch + 1}/{num_epochs}, Time: {epoch_time:.2f} seconds")

    # Evaluate on the test set
    model.eval()  # Set the model to evaluation mode
    total_test_loss = 0.0

    with torch.no_grad():
        for sentence, tags in zip(test_sentences, test_pos_labels):
            sentence_in = torch.tensor(sentence, dtype=torch.long)
            targets = torch.tensor(tags, dtype=torch.long)

            tag_scores = model(sentence_in)
            loss = loss_function(tag_scores, targets)
            total_test_loss += loss.item()

    # Calculate average test loss for this epoch
    avg_test_loss = total_test_loss / len(test_sentences)

    # Print the losses for this epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Avg. Training Loss: {avg_train_loss:.4f}, Avg. Test Loss: {avg_test_loss:.4f}")

    # Check for early stopping
    if avg_test_loss < best_test_loss:
        best_test_loss = avg_test_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. Test loss hasn't improved for", patience, "epochs.")
            break

Epoch 1/3, Avg. Training Loss: 0.2509, Avg. Test Loss: 0.1945
Epoch 2/3, Avg. Training Loss: 0.1574, Avg. Test Loss: 0.1728
Epoch 3/3, Avg. Training Loss: 0.1187, Avg. Test Loss: 0.1708


In [15]:
torch.save(model.state_dict(), 'naamparser_pos_model.pth')

In [14]:
def predict_pos_tags(model, vocab, pos_tags, sentences):
    """
    Predicts POS tags for tokenized sentences.

    Args:
        model (nn.Module): Trained POS tagging model.
        vocab (dict): Vocabulary mapping words to indices.
        pos_tags (dict): POS tag mapping.
        sentences (list of list): Tokenized sentences.

    Returns:
        list of list: Predicted POS tags for each word in each sentence.
    """
    # Convert words to indices
    sentences_idx = [[vocab.get(word, 0) for word in sentence] for sentence in sentences]

    # Predict POS tags
    predicted_tags = []
    with torch.no_grad():
        model.eval()  # Set the model to evaluation mode
        for sentence_idx in sentences_idx:
            sentence_in = torch.tensor(sentence_idx, dtype=torch.long)
            tag_scores = model(sentence_in)
            _, predicted = torch.max(tag_scores, dim=1)
            predicted_tags.append([list(pos_tags.keys())[list(pos_tags.values()).index(tag)] for tag in predicted.numpy()])

    return predicted_tags

# Example usage
sample_sentences = [['Jon', 'Smith'], ['Rodriguez', 'Julia']]
predicted_tags = predict_pos_tags(model, vocab, pos_tags, sample_sentences)

print("Predicted POS tags:")
for i in range(len(sample_sentences)):
    print("Sentence:", ' '.join(sample_sentences[i]))
    print("POS tags:", ' '.join(predicted_tags[i]))
    print()

Predicted POS tags:
Sentence: Jon Smith
POS tags: first_name last_name

Sentence: Rodriguez Julia
POS tags: last_name first_name

