In [None]:
# For tips on running notebooks in Google Colab, see
# https://pytorch.org/tutorials/beginner/colab
%matplotlib inline

In [None]:
!wget https://birg.dev/name_gender_dataset.csv

--2024-11-25 04:00:10--  https://birg.dev/name_gender_dataset.csv
Resolving birg.dev (birg.dev)... 35.222.21.171
Connecting to birg.dev (birg.dev)|35.222.21.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3774591 (3.6M) [text/csv]
Saving to: ‘name_gender_dataset.csv’


2024-11-25 04:00:11 (4.04 MB/s) - ‘name_gender_dataset.csv’ saved [3774591/3774591]



In [None]:
import pandas as pd
# Load the downloaded name/gender table; index_col=0 makes the first CSV column
# the row index of the DataFrame.
data = pd.read_csv('name_gender_dataset.csv',index_col=0)
# Show the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0_level_0,Gender,Count,Probability
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
James,M,5304407,0.014517
John,M,5260831,0.014398
Robert,M,4970386,0.013603
Michael,M,4579950,0.012534
William,M,4226608,0.011567


In [None]:
data['Gender'].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
F,89749
M,57520


In [None]:
from io import open
import glob
import os

import unicodedata
import string

# Alphabet the model can represent: ASCII letters followed by a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the plain-ASCII characters found in `all_letters`.

    The string is NFD-normalized so accented characters split into a base
    character plus combining marks; combining marks (category 'Mn') and any
    character outside `all_letters` are then dropped.
    Approach taken from https://stackoverflow.com/a/518232/2809427

    Args:
        s: The input string.

    Returns:
        A new string made only of characters from `all_letters`.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent left over from the decomposition.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

Turning Names into Tensors
==========================

Now that we have all the names organized, we need to turn them into
Tensors to make any use of them.

To represent a single letter, we use a "one-hot vector" of size
`<1 x n_letters>`. A one-hot vector is filled with 0s except for a 1 at
the index of the current letter, e.g. `"b" = <0 1 0 0 0 ...>`.

To make a word we join a bunch of those into a 3D tensor
`<line_length x 1 x n_letters>`.

That extra 1 dimension is because PyTorch assumes everything is in
batches - we're just using a batch size of 1 here.


In [None]:
import torch

def letterToIndex(letter):
    """Return the position of `letter` within `all_letters`, e.g. "a" = 0.

    The lookup uses `str.find`, so a letter that is not part of
    `all_letters` yields -1 rather than raising.
    """
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Turn a single letter into a <1 x n_letters> one-hot Tensor (demonstration helper).

    Args:
        letter: The character to encode.

    Returns:
        A float tensor of shape (1, n_letters) with a 1 at the letter's index.
        A letter that is not in `all_letters` produces an all-zero tensor.
    """
    tensor = torch.zeros(1, n_letters)
    index = letterToIndex(letter)
    # letterToIndex reports unknown letters as -1; writing at index -1 would
    # silently mark the LAST alphabet slot, so leave the vector empty instead.
    if index >= 0:
        tensor[0][index] = 1
    return tensor

def lineToTensor(line, desired_length=None):
    """Turn a line into a <desired_length x 1 x n_letters> stack of one-hot letter vectors.

    Args:
        line: The string to encode, one row per character.
        desired_length: Number of rows in the result; rows past the end of
            `line` stay all-zero (padding). Defaults to `len(line)`.

    Returns:
        A float tensor of shape (desired_length, 1, n_letters).

    Raises:
        IndexError: If `line` has a known letter at a position >= `desired_length`.
    """
    if desired_length is None:
        desired_length = len(line)
    tensor = torch.zeros(desired_length, 1, n_letters)
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # Unknown letters come back as -1; indexing with -1 would wrongly mark
        # the last alphabet slot. Leave the row all-zero instead, which the
        # RNN's forward pass handles the same way as padding.
        if index < 0:
            continue
        tensor[li][0][index] = 1
    return tensor

print(letterToTensor('J'))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


In [None]:
max_length = max(data.index.str.len()) # need this for our padding
max_length

25

In [None]:
# Encode every name in the index as a padded one-hot tensor and stack the
# results along a new leading dimension (one entry per name).
vectors = torch.stack([lineToTensor(name, max_length) for name in data.index])
vectors.shape

torch.Size([147269, 25, 1, 57])

In [None]:
import numpy as np
values = pd.get_dummies(data['Gender']).astype(np.float32).values

In [None]:
y = torch.tensor(values)
y.shape

torch.Size([147269, 2])

In [None]:
# vectors.shape says we have 147,269 names, each padded to the max length of 25; y holds one two-column one-hot gender label per name

In [None]:
# Demonstration: wrap the tensors in a Dataset/DataLoader and inspect one batch.
from torch.utils.data import TensorDataset, DataLoader

# Pair every encoded name with its label.
dataset = TensorDataset(vectors, y)

# Serve the pairs in shuffled mini-batches of 20.
dataloader = DataLoader(dataset, batch_size=20, shuffle=True)

# Pull only the first batch and report its shapes.
batch_features, batch_targets = next(iter(dataloader))
print("Batch Features:", batch_features.shape)
print("Batch Targets:", batch_targets.shape)

Batch Features: torch.Size([20, 25, 1, 57])
Batch Targets: torch.Size([20, 2])


Creating the Network
====================

Before autograd, creating a recurrent neural network in Torch involved
cloning the parameters of a layer over several timesteps. The layers
held hidden state and gradients which are now entirely handled by the
graph itself. This means you can implement an RNN in a very "pure" way,
as regular feed-forward layers.

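This RNN module implements a "vanilla RNN" and is just 3 linear layers
(`i2h`, `h2h` and `h2o`) which operate on an input and hidden state. The
output is returned as raw logits, with no `LogSoftmax` applied, because the
`BCEWithLogitsLoss` criterion used for training expects logits.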


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    """Vanilla RNN cell applied one timestep at a time, with padding-aware carry-over.

    Each call to `forward` processes a single sequence position for a whole
    batch. Rows whose input vector is entirely zero are treated as padding:
    for those rows the previous output and hidden state are passed through
    unchanged instead of being updated.

    Args:
        input_size: Width of each per-step input vector.
        hidden_size: Width of the hidden state.
        output_size: Number of output logits per row (defaults to 2).
    """

    def __init__(self, input_size, hidden_size, output_size=2):
        super().__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, last_output, last_hidden):
        """Advance the RNN by one step.

        Args:
            input: (batch, input_size) tensor for the current position.
            hidden: (batch, hidden_size) hidden state fed into this step.
            last_output: Output to keep for padded rows; must broadcast
                against (batch, output_size).
            last_hidden: (batch, hidden_size) hidden state to keep for padded rows.

        Returns:
            Tuple `(output, hidden)` of raw logits (batch, output_size) and the
            new hidden state (batch, hidden_size).
        """
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)

        # 1.0 where the row holds a real letter (any non-zero entry), 0.0 where
        # it is padding. `any` keeps this a strict 0/1 flag even when a row has
        # more than one non-zero entry; counting the non-zeros would not.
        is_real_step = (input != 0).any(dim=1, keepdim=True).to(output.dtype)
        is_padding = 1 - is_real_step

        # The (batch, 1) flags broadcast across the feature dimension.
        output_matrix = is_real_step * output + is_padding * last_output
        hidden_matrix = is_real_step * hidden + is_padding * last_hidden
        return output_matrix, hidden_matrix

    def initHidden(self,num=1):
        """Return an all-zero initial hidden state of shape (num, hidden_size)."""
        return torch.zeros(num, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden)

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Place the model's parameters on the chosen device.
rnn = rnn.to(device)

Using device: cpu


To run a step of this network we need to pass an input (in our case, the
Tensor for the current letter of every name in the batch) and a previous
hidden state (which we initialize as zeros at first), plus the previous
output and hidden state to carry forward wherever the current position is
padding. We'll get back the output (a score for each gender) and a next
hidden state (which we keep for the next step).


In [None]:
%%timeit
# Benchmark: time the forward pass over ONE shuffled batch of 1000 names.
# The Dataset/DataLoader construction and the device transfers below sit inside
# the timed cell too, so they are part of the measurement.
from torch.utils.data import TensorDataset, DataLoader

# Create a Dataset
dataset = TensorDataset(vectors.to(device), y.to(device))

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=1000, shuffle=True)

# Iterate through the DataLoader
for batch_features, batch_targets in dataloader:
    # Fresh all-zero hidden state, one row per name in the batch.
    next_hidden = rnn.initHidden(num=batch_features.shape[0]).to(device)

    # Placeholder "previous output" with a single column; it is only used for
    # rows that are padding at the current position.
    output = torch.zeros(batch_features.shape[0],1).to(device)
    # Walk the sequence one position at a time (dim 1 of the batch tensor).
    for i in range(batch_features.shape[1]):
        input = batch_features[:,i,:,:].squeeze(1) # drop the singleton dim so each row is one name's letter vector
        # The current output/hidden are passed again as the values to keep for padded rows.
        output, next_hidden = rnn(input, next_hidden, output, next_hidden)

    #print(output)
    # Only the first batch is processed.
    break

KeyboardInterrupt: 

In [None]:
%%timeit
# Benchmark: time the forward pass over `times` shuffled batches of size 1,
# i.e. 1000 names processed one at a time, for comparison with the batched run.
# The Dataset/DataLoader construction and the device transfers below sit inside
# the timed cell too, so they are part of the measurement.
from torch.utils.data import TensorDataset, DataLoader

# Create a Dataset
dataset = TensorDataset(vectors.to(device), y.to(device))

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Iterate through the DataLoader
times = 1000  # number of single-name batches to process before stopping
j = 1         # 1-based count of batches processed so far
for batch_features, batch_targets in dataloader:
    # Fresh all-zero hidden state, one row per name in the batch.
    next_hidden = rnn.initHidden(num=batch_features.shape[0]).to(device)

    # Placeholder "previous output" with a single column; it is only used for
    # rows that are padding at the current position.
    output = torch.zeros(batch_features.shape[0],1).to(device)
    # Walk the sequence one position at a time (dim 1 of the batch tensor).
    for i in range(batch_features.shape[1]):
        input = batch_features[:,i,:,:].squeeze(1) # drop the singleton dim so each row is one name's letter vector
        # The current output/hidden are passed again as the values to keep for padded rows.
        output, next_hidden = rnn(input, next_hidden, output, next_hidden)

    #print(output)
    # Stop once `times` batches have been processed.
    if j == times:
      break
    j+=1

6.07 s ± 325 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


For the sake of efficiency we don't want to be creating a new Tensor
for every step, so we will use `lineToTensor` instead of
`letterToTensor` and use slices. This could be further optimized by
precomputing batches of Tensors.


Training
========

Preparing for Training
----------------------


We will also want a quick way to get a training example (a name and its
gender):


Training the Network
====================


In [None]:
criterion = nn.BCEWithLogitsLoss()
import torch.optim as optim

In [None]:
# Build a fresh, larger model for the actual training run.
n_hidden = 256
rnn = RNN(n_letters, n_hidden).to(device)

from torch.utils.data import DataLoader, TensorDataset, random_split

# Create a Dataset holding every encoded name with its one-hot gender label.
dataset = TensorDataset(vectors.to(device), y.to(device))

# Define split sizes
train_size = int(0.7 * len(dataset))  # for training
test_size = len(dataset) - train_size  # Remaining for testing

# Split the dataset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders for both subsets
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Print the sizes of the splits
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

optimizer = optim.Adam(rnn.parameters(), lr=0.001)

# NOTE(review): 10000 full passes over the training split is a very long run and
# the loss is only reported every 100 epochs — confirm this is intended.
num_epochs = 10000

for epoch in range(num_epochs):
    rnn.train()  # Set model to training mode
    running_loss = 0.0

    # Iterate through the DataLoader
    for batch_features, batch_targets in train_loader:
        batch_size = batch_features.shape[0]
        next_hidden = rnn.initHidden(num=batch_size).to(device)
        # Zero the gradient buffers
        optimizer.zero_grad()

        # Placeholder "previous output"; only used for rows that are padding
        # at the current position.
        output = torch.zeros(batch_size, 1).to(device)
        # Feed the batch through the RNN one sequence position at a time.
        for i in range(batch_features.shape[1]):
            # Named step_input (not `input`) to avoid shadowing the builtin.
            step_input = batch_features[:, i, :, :].squeeze(1)  # (batch, n_letters)
            output, next_hidden = rnn(step_input, next_hidden, output, next_hidden)
        loss = criterion(output, batch_targets)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item()

    if (epoch + 1) % 100 == 0:
        # Average over the batches actually iterated this epoch (train_loader),
        # not over an unrelated loader left behind by an earlier cell.
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

Train dataset size: 103088
Test dataset size: 44181


KeyboardInterrupt: 

Evaluating the Results
======================


In [None]:
# Run the trained model over the held-out split and collect logits and labels.
rnn.eval()
outputs = []
targets = []
# No gradients are needed for evaluation; without no_grad every stored output
# would keep its whole autograd graph alive.
with torch.no_grad():
    for batch_features, batch_targets in test_loader:
        batch_size = batch_features.shape[0]
        next_hidden = rnn.initHidden(num=batch_size).to(device)

        # Placeholder "previous output"; only used for rows that are padding
        # at the current position.
        output = torch.zeros(batch_size, 1).to(device)
        # Feed the batch through the RNN one sequence position at a time.
        for i in range(batch_features.shape[1]):
            step_input = batch_features[:, i, :, :].squeeze(1)  # (batch, n_letters)
            output, next_hidden = rnn(step_input, next_hidden, output, next_hidden)

        outputs.append(output)
        targets.append(batch_targets)

In [None]:
# Concatenate the per-batch tensors along dim 0 and convert them to NumPy arrays.
predictions = torch.cat(outputs,0).detach().cpu().numpy()
# NOTE(review): this rebinds `targets` from a list of tensors to an array, so
# re-running this cell without first re-running the evaluation loop will
# likely fail inside torch.cat.
targets = torch.cat(targets,0).detach().cpu().numpy()
# NOTE(review): `scaler` is not defined in any visible cell of this notebook, and
# the labels built earlier are one-hot gender columns rather than ratings. These
# two lines look like leftovers from a regression notebook — confirm, and
# replace them with a classification metric if so.
predicted_rating = scaler.inverse_transform(predictions)
actual_rating = scaler.inverse_transform(targets)

In [None]:
np.median(np.abs(predicted_rating - actual_rating))

0.86075974

In [None]:
np.median(np.abs(np.mean(actual_rating) - actual_rating))

0.78233147

In [None]:
data.head()

Unnamed: 0,professor_name,star_rating,last,first
0,Mimi Kline,5.0,Kline,Mimi
1,Dennis Jones,4.3,Jones,Dennis
2,Mario Perez,2.8,Perez,Mario
3,Eddie Tapia,3.1,Tapia,Eddie
4,S Saini,4.8,Saini,S
