### Naamkaran: Name Generator

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import math
import random
import pandas as pd
import numpy as np

In [2]:
# Load only the first-name column of the registration file.
df = pd.read_csv('data/fl_reg_name_race_2022.csv.gz',
                 usecols=['name_first'])

# Drop rows whose first name is missing
df = df.dropna(subset=['name_first'])
print("Size after dropping missing first names:", df.shape)

# Drop cases where first name is less than 2 chars
# (negated mask: rows whose length cannot be computed are kept, as before)
df = df[~(df['name_first'].str.len() < 2)]
print("Size after dropping first names less than 2 chars:", df.shape)

# Drop duplicates, keeping the first occurrence of each name
df = df.drop_duplicates('name_first')
print("Size after dropping first name duplicates:", df.shape)

Size after dropping missing first names: (15455080, 1)
Size after dropping first names less than 2 chars: (15366955, 1)
Size after dropping first name duplicates: (641077, 1)


In [3]:
# Set the seed of NumPy's global random number generator
np.random.seed(1234)

# Take a random sample of 10000 rows.
# No random_state is passed to sample(), so repeatability of this draw
# relies on the global NumPy seed set just above.
df_sample = df.sample(n=10000)

In [4]:
# Preprocess the data: concatenate the sampled names into one string and
# build the character <-> index lookup tables.
text = ''.join(df_sample['name_first'].tolist())

# Sort the vocabulary so every run assigns the same index to each character;
# iterating a bare set of strings does not guarantee a stable order between
# interpreter sessions.
chars = sorted(set(text))
char_to_idx = {char: i for i, char in enumerate(chars)}
char_to_idx['<eos>'] = len(char_to_idx)

# Invert the finished mapping so the '<eos>' index can be decoded as well.
idx_to_char = {i: char for char, i in char_to_idx.items()}

In [5]:
# Prepare the training data
# seq_length is the number of characters in each input/target window that
# prepare_training_data slices out of the text.
seq_length = 30

def prepare_training_data(text, seq_length, vocab=None):
    """Slice ``text`` into overlapping next-character training windows.

    For every start offset, the input window is ``seq_length`` characters and
    the target window is the same span shifted one character to the right.
    Windows containing a character that is missing from the vocabulary (in
    either the input or the target) are skipped.

    Args:
        text: The string to slice.
        seq_length: Number of characters per window.
        vocab: Mapping from character to integer index. Defaults to the
            module-level ``char_to_idx``.

    Returns:
        A tuple ``(input_seqs, target_seqs)`` of equally long lists; each
        element is a list of ``seq_length`` integer indices.
    """
    if vocab is None:
        vocab = char_to_idx

    input_seqs = []
    target_seqs = []
    for start in range(len(text) - seq_length):
        # seq_length + 1 characters cover the input and its shifted target.
        window = text[start:start + seq_length + 1]
        try:
            encoded = [vocab[char] for char in window]
        except KeyError:
            # Unknown character anywhere in the window, including the final
            # target character that the input window does not contain.
            continue
        input_seqs.append(encoded[:seq_length])
        target_seqs.append(encoded[1:seq_length + 1])
    return input_seqs, target_seqs

# Encode the text into (input, target) index windows and wrap each of the
# two resulting lists in a PyTorch LongTensor.
input_seqs, target_seqs = map(
    torch.LongTensor,
    prepare_training_data(text, seq_length),
)

In [6]:
class CharTransformer(nn.Module):
    """Character-level next-token model built on ``nn.TransformerDecoder``.

    ``forward`` takes integer character indices shaped ``(batch, seq_len)``
    and returns logits shaped ``(batch, seq_len, output_size)``. A causal
    mask keeps each position from attending to later positions.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Build the embedding, positional encoding, decoder stack and head.

        Args:
            input_size: Number of distinct input indices (embedding rows).
            hidden_size: Model width; must be divisible by the 8 attention
                heads used by each decoder layer.
            output_size: Number of logits produced per position.
        """
        super(CharTransformer, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.positional_encoding = PositionalEncoding(hidden_size)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(hidden_size, nhead=8),
            num_layers=6
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Return next-character logits for a batch of index sequences.

        Args:
            input: LongTensor of character indices, shaped (batch, seq_len).

        Returns:
            Contiguous float tensor shaped (batch, seq_len, output_size).
        """
        batch_size, seq_len = input.shape

        # The decoder layers (batch_first is left at its default) and
        # PositionalEncoding both index positions along dim 0, so move the
        # time axis to the front: (batch, seq) -> (seq, batch, hidden).
        embedded = self.embedding(input.transpose(0, 1))
        embedded = self.positional_encoding(embedded)

        memory = self.init_hidden(batch_size)

        # -inf above the diagonal: position t may attend to positions <= t
        # only, so the model cannot read the character it has to predict.
        causal_mask = torch.triu(
            torch.full(
                (seq_len, seq_len),
                float('-inf'),
                dtype=embedded.dtype,
                device=embedded.device,
            ),
            diagonal=1,
        )

        output = self.transformer_decoder(embedded, memory, tgt_mask=causal_mask)

        # Back to batch-first; contiguous() so callers may .view() the logits.
        output = output.transpose(0, 1).contiguous()
        return self.fc(output)

    def init_hidden(self, batch_size):
        """Return the all-zero placeholder memory for the decoder.

        The tensor is shaped (6, batch_size, hidden_size) and is created on
        the same device as the model's output layer.
        """
        return torch.zeros(
            6, batch_size, self.hidden_size, device=self.fc.weight.device
        )

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a tensor, then apply dropout.

    Positions are indexed along dim 0 of the input, i.e. the input is
    expected to be shaped (seq_len, batch, d_model).
    """

    def __init__(self, d_model, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: Size of the feature dimension (even or odd).
            max_len: Largest sequence length the table covers.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones,
        # so trim the angles to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Stored as (max_len, 1, d_model) so it broadcasts over the batch
        # dimension; a buffer follows the module across devices but is not
        # a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions, after dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Define the hyperparameters
input_size = len(chars)
hidden_size = 128
output_size = len(chars)
num_epochs = 1
batch_size = 64
learning_rate = 0.001

# Create the model
model = CharTransformer(input_size, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # Make training mode explicit so dropout is active
    model.train()

    # Shuffle the training data
    indices = torch.randperm(len(input_seqs))
    input_shuffled = input_seqs[indices]
    target_shuffled = target_seqs[indices]

    # Running totals used to report the mean loss over the whole epoch
    epoch_loss = 0.0
    num_batches = 0

    # Mini-batch gradient descent
    for i in range(0, len(input_seqs), batch_size):
        # Get the mini-batch
        inputs = input_shuffled[i:i+batch_size]
        targets = target_shuffled[i:i+batch_size]

        # Forward pass
        outputs = model(inputs)

        # Flatten targets to one class index per predicted position
        targets = targets.reshape(-1)

        # Calculate loss; reshape (rather than view) also accepts logits
        # that are not laid out contiguously in memory
        loss = criterion(outputs.reshape(-1, output_size), targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Print the mean training loss for this epoch (max() guards the case of
    # no mini-batches at all, where there is no loss to report)
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}")

In [None]:
def generate_sequence(model, seed_sequence, length, temperature=1.0):
    """Sample up to ``length`` characters that continue ``seed_sequence``.

    The model keeps no state between calls, so on every step the whole
    sequence generated so far is fed back in and the next character is
    sampled from the logits at the last position.

    Args:
        model: Callable taking a LongTensor of indices shaped (1, seq_len)
            and returning logits shaped (1, seq_len, vocab_size).
        seed_sequence: Non-empty string of characters present in
            ``char_to_idx``.
        length: Maximum number of characters to append.
        temperature: Positive divisor applied to the logits before softmax.

    Returns:
        ``seed_sequence`` followed by the sampled characters. Generation
        stops early if the '<eos>' index is sampled.

    Raises:
        ValueError: If ``seed_sequence`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``seed_sequence`` contains an unknown character.
    """
    if not seed_sequence:
        raise ValueError("seed_sequence must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set the model to evaluation mode

    # Build inputs on the same device as the model's weights
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    eos_idx = char_to_idx.get('<eos>')
    indices = [char_to_idx[char] for char in seed_sequence]
    generated = []

    with torch.no_grad():  # No gradients are needed for sampling
        for _ in range(length):
            input_tensor = torch.tensor([indices], dtype=torch.long, device=device)
            output = model(input_tensor)

            # Apply temperature to the logits of the last position
            output_logits = output[0, -1, :] / temperature
            probabilities = torch.softmax(output_logits, dim=0)

            # Sample the next character from the probability distribution
            selected_char_idx = torch.multinomial(probabilities, num_samples=1).item()

            # Stop when the sampled character is the <eos> token
            if selected_char_idx == eos_idx:
                break

            indices.append(selected_char_idx)
            generated.append(idx_to_char[selected_char_idx])

    return seed_sequence + ''.join(generated)

# Set the random seed for reproducibility
# (seeds both Python's `random` module and PyTorch's default generator)
seed = 42
random.seed(seed)
torch.manual_seed(seed)

# Set the initial character and generate a sequence, requesting 20 characters.
# NOTE: the `length=` keyword matches the generate_sequence defined directly
# above; the later cell redefines the function with `max_length` instead.
initial_character = 'H'
generated_sequence = generate_sequence(model, initial_character, length=20)
print(generated_sequence)

In [None]:
def generate_sequence(model, seed_sequence, max_length, temperature=1.0):
    """Sample up to ``max_length`` characters that continue ``seed_sequence``.

    Transformer models keep no hidden state between calls, so on every step
    the whole sequence generated so far is fed back in and the next
    character is sampled from the logits at the last position.

    Args:
        model: Callable taking a LongTensor of indices shaped (1, seq_len)
            and returning logits shaped (1, seq_len, vocab_size).
        seed_sequence: Non-empty string of characters present in
            ``char_to_idx``.
        max_length: Maximum number of characters to append.
        temperature: Positive divisor applied to the logits before softmax.

    Returns:
        ``seed_sequence`` followed by the sampled characters. Generation
        stops early if the '<eos>' index is sampled.

    Raises:
        ValueError: If ``seed_sequence`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``seed_sequence`` contains an unknown character.
    """
    if not seed_sequence:
        raise ValueError("seed_sequence must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set the model to evaluation mode

    # Build inputs on the same device as the model's weights
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    eos_idx = char_to_idx.get('<eos>')

    # Convert the seed sequence to indices; this list grows as we sample
    context = [char_to_idx[char] for char in seed_sequence]
    new_chars = []

    with torch.no_grad():  # No gradients are needed for sampling
        for _ in range(max_length):
            # Forward pass over the full context, not just the last character
            input_tensor = torch.tensor([context], dtype=torch.long, device=device)
            output = model(input_tensor)

            # Consider only the last predicted character, scaled by temperature
            output_logits = output[0, -1, :] / temperature
            probabilities = torch.softmax(output_logits, dim=0)

            # Sample the next character from the probability distribution
            selected_char_idx = torch.multinomial(probabilities, num_samples=1).item()

            # Stop when the sampled character is the <eos> token
            if selected_char_idx == eos_idx:
                break

            # Append the selected character to the context and the result
            context.append(selected_char_idx)
            new_chars.append(idx_to_char[selected_char_idx])

    return seed_sequence + ''.join(new_chars)

# Set the initial seed sequence and generate a new sequence, requesting
# 20 sampled characters after the seed, then print the result
seed_sequence = 'H'
generated_sequence = generate_sequence(model, seed_sequence, max_length=20)
print(generated_sequence)