The goal of this notebook is to build a parity predictor using an RNN.

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim


# Demo: a freshly constructed module is not on the GPU until it is moved there explicitly.
if not torch.cuda.is_available():
    print("PyTorch does not support CUDA.")
else:
    print("PyTorch supports CUDA!")
    rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=1)
    print("showing how you have to turn on CUDA to take advantage of it")
    # Before the move: none of the parameters report is_cuda.
    print("CUDA is turned on", any(weight.is_cuda for weight in rnn.parameters()))
    # .cuda() moves the module's parameters to the GPU.
    rnn.cuda()
    print("CUDA is turned on", any(weight.is_cuda for weight in rnn.parameters()))
    # NOTE: a global alternative that was considered but left disabled:
    #torch.cuda.set_device(0)
    # The point here is first to see how much faster CUDA makes things on a laptop.


PyTorch supports CUDA!
showing how you have to turn on CUDA to take advantage of it
CUDA is turned on False
CUDA is turned on True


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

import random

def generate_parity_data(seq_length=8, batch_size=16, device='cpu'):
    """
    Sample a batch of random bit sequences together with their running parity.

    Args:
      seq_length: number of bits in each sequence.
      batch_size: number of sequences in the batch.
      device: torch device on which the random bits are created.

    Returns:
      (x, y), both float tensors of shape [batch_size, seq_length, 1]:
        x: the random 0/1 bits
        y: y[b, t] is the parity (sum mod 2) of x[b, 0..t]
    """
    shape = (batch_size, seq_length, 1)
    bits = torch.randint(0, 2, shape, device=device).float()
    # Count the ones seen so far along the time axis; that count mod 2 is the parity.
    ones_so_far = torch.cumsum(bits, dim=1)
    parity = torch.remainder(ones_so_far, 2)
    return bits, parity


In [30]:
# Sanity check: print one 8-bit sequence and, below it, its running parity.
x, y = generate_parity_data(seq_length=8, batch_size=1)
print(x, y, sep="\n")

tensor([[[1.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [1.]]])
tensor([[[1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.]]])


In [35]:
class SimpleParityRNN(nn.Module):
    """Hand-unrolled recurrent model that emits one logit per time step."""

    def __init__(self, input_size=1, hidden_size=4, output_size=1, use_cuda=False):
        super().__init__()

        # Recurrent cell: folds the current input into the hidden state.
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        # Read-out layer: turns a hidden state into the output logit(s).
        self.fc = nn.Linear(hidden_size, output_size)

        # Optionally move every parameter to the GPU.
        if use_cuda:
            self.cuda()

    def forward(self, x):
        """
        Step the cell through the sequence and predict at every step.

        Args:
          x: tensor of shape [batch_size, seq_len, input_size]

        Returns:
          logits of shape [batch_size, seq_len, output_size]
        """
        num_sequences, num_steps, _ = x.shape
        # The initial hidden state must live on the same device as the input,
        # otherwise the cell raises a device-mismatch error.
        hidden = torch.zeros(num_sequences, self.rnn_cell.hidden_size, device=x.device)

        step_logits = []
        for step in range(num_steps):
            # Feed the input at this time step, carrying the hidden state forward.
            hidden = self.rnn_cell(x.select(1, step), hidden)
            step_logits.append(self.fc(hidden))

        # Insert the time axis back at position 1.
        return torch.stack(step_logits, dim=1)

In [36]:
def doit(num_epochs, seq_length, batch_size, learning_rate=0.01, use_cuda=False, print_progress=False, eval_every=200):
    """
    Train a fresh SimpleParityRNN on random parity data and report its accuracy.

    Every epoch draws a new random batch, so there is no fixed training set.

    Args:
      num_epochs: number of optimisation steps (one new batch per step).
      seq_length: number of bits per sequence.
      batch_size: number of sequences per batch.
      learning_rate: learning rate passed to Adam.
      use_cuda: place the model and the generated data on the GPU.
      print_progress: print loss and accuracy each time accuracy is measured.
      eval_every: measure accuracy every this many epochs. Accuracy is also
        always measured on the last epoch, so short runs (or runs whose length
        is not a multiple of eval_every) still return an up-to-date value.

    Returns:
      Fraction of correctly predicted bits on the most recently measured
      training batch, or 0.0 if num_epochs is 0.

    Raises:
      ValueError: if eval_every is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be >= 1")

    device = 'cuda' if use_cuda else 'cpu'
    model = SimpleParityRNN(use_cuda=use_cuda)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    accuracy = 0.0

    for epoch in range(num_epochs):
        # 1) Generate random data
        x, y = generate_parity_data(seq_length, batch_size, device=device)

        # 2) Forward pass
        preds = model(x)  # preds shape: [batch_size, seq_length, 1]

        # 3) Compute loss
        loss = criterion(preds.view(-1, 1), y.view(-1, 1))

        # 4) Backprop + update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # --- Measure accuracy periodically and always on the final epoch ---
        # The predictions used here were computed before this epoch's update.
        is_last_epoch = (epoch + 1) == num_epochs
        if (epoch + 1) % eval_every == 0 or is_last_epoch:
            with torch.no_grad():
                # Convert logits to 0/1 predictions
                preds_binary = (torch.sigmoid(preds) > 0.5).float()
                # Compare with ground truth
                correct = (preds_binary == y).float().sum().item()
            total = y.numel()  # total number of bits predicted
            accuracy = correct / total

            if print_progress:
                print(f"Epoch {epoch+1}/{num_epochs}, "
                      f"Loss: {loss.item():.4f}, "
                      f"Accuracy: {accuracy*100:.2f}%")
    return accuracy

    


In [37]:
import time
# Time 10 independent training runs of doit on each device.
for use_cuda in [True, False]:
    if use_cuda and not torch.cuda.is_available():
        # Moving the model to CUDA would fail here; skip instead of aborting the cell.
        print("Using CUDA: True (skipped: CUDA is not available)")
        continue
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    cur_time = time.perf_counter()
    print(f"Using CUDA: {use_cuda}")
    accuracies = [doit(num_epochs = 2000, seq_length = 8, batch_size = 16, use_cuda=use_cuda) for _ in range(10)]
    dt = time.perf_counter() - cur_time
    print("avg. accuracy", sum(accuracies) / len(accuracies), "took", dt, "seconds")


Using CUDA: True
avg. accuracy 0.7828125 took 129.87962079048157 seconds
Using CUDA: False
avg. accuracy 0.80390625 took 35.75992679595947 seconds


I'm seeing roughly 80% accuracy across a few different sequences, which is not bad. Can we get closer to 100%?

As usual, there are a bazillion hyperparameters, plus we're only using 1 layer. Let's just try some variations.

For timing:
* Interestingly, running with CUDA is much slower than running without it: ~130s vs ~36s in the run above. I'm guessing the GPU overhead dominates for a model and batch this small.

First, let's switch from nn.RNNCell to nn.RNN so that the loop over time steps is taken care of for us and we can add layers.

In [38]:
class ParityRNN2(nn.Module):
    """Per-time-step predictor built on nn.RNN, which consumes the whole sequence in one call."""

    def __init__(self, input_size=1, hidden_size=4, output_size=1, num_layers=1, use_cuda=False):
        super().__init__()

        # batch_first=True means tensors are laid out as [batch_size, seq_len, features].
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Read-out applied to the hidden_size-long vector the RNN emits at each time step.
        self.fc = nn.Linear(hidden_size, output_size)

        # Optionally move every parameter to the GPU.
        if use_cuda:
            self.cuda()

    def forward(self, x):
        """
        Map an input sequence to one logit vector per time step.

        Args:
          x: tensor of shape [batch_size, seq_len, input_size] (because batch_first=True)

        Returns:
          logits of shape [batch_size, seq_len, output_size]

        The RNN call yields two values:
          per-step outputs: [batch_size, seq_len, hidden_size]
          final hidden:     [num_layers, batch_size, hidden_size]
        Only the per-step outputs are needed here.
        """
        per_step_hidden, _ = self.rnn(x)
        # nn.Linear acts on the last dimension, so every time step is projected independently.
        return self.fc(per_step_hidden)


In [46]:
def doit2(num_epochs, seq_length, batch_size, learning_rate=0.01, num_layers=1, use_cuda=False):
    """
    Train a fresh ParityRNN2 on random parity data and return its final accuracy.

    Every epoch draws a new random batch. After training, accuracy is measured
    once on one additional freshly generated batch of the same size.

    Args:
      num_epochs: number of optimisation steps (one new batch per step).
      seq_length: number of bits per sequence.
      batch_size: number of sequences per batch (training and evaluation).
      learning_rate: learning rate passed to Adam.
      num_layers: number of stacked RNN layers in the model.
      use_cuda: place the model and the generated data on the GPU.

    Returns:
      Fraction of correctly predicted bits on the evaluation batch.
    """
    device = 'cuda' if use_cuda else 'cpu'
    model = ParityRNN2(use_cuda=use_cuda, num_layers=num_layers)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for _ in range(num_epochs):
        # 1) Generate random data
        x, y = generate_parity_data(seq_length, batch_size, device=device)

        # 2) Forward pass
        preds = model(x)  # preds shape: [batch_size, seq_length, 1]

        # 3) Compute loss
        loss = criterion(preds.view(-1, 1), y.view(-1, 1))

        # 4) Backprop + update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Final accuracy on a batch the model has not been trained on.
    x, y = generate_parity_data(seq_length, batch_size, device=device)

    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        preds = model(x)  # preds shape: [batch_size, seq_length, 1]

    # Convert logits to 0/1 predictions
    preds_binary = (torch.sigmoid(preds) > 0.5).float()
    # Compare with ground truth
    correct = (preds_binary == y).float().sum().item()
    total = y.numel()  # total number of bits predicted
    return correct / total

    


In [47]:
# apples-to-apples comparison with the previous (RNNCell) timing: same epochs, sequence length and batch size
import time
# Time 10 independent training runs of doit2 on each device.
for use_cuda in [True, False]:
    if use_cuda and not torch.cuda.is_available():
        # Moving the model to CUDA would fail here; skip instead of aborting the cell.
        print("Using CUDA: True (skipped: CUDA is not available)")
        continue
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    cur_time = time.perf_counter()
    print(f"Using CUDA: {use_cuda}")
    accuracies = [doit2(num_epochs = 2000, seq_length = 8, batch_size = 16, use_cuda=use_cuda) for _ in range(10)]
    dt = time.perf_counter() - cur_time
    print("avg. accuracy", sum(accuracies) / len(accuracies), "took", dt, "seconds")


Using CUDA: True
avg. accuracy 0.7265625 took 24.932602167129517 seconds
Using CUDA: False
avg. accuracy 0.67265625 took 20.379727363586426 seconds


So using nn.RNN instead of nn.RNNCell puts CUDA at ~25s and CPU at ~20s, which is almost on par. Clearly there was some hidden back-and-forth on every time step with the RNNCell, which makes sense.

Next: let's see what happens if we reduce the epochs but make the batch size a lot bigger:

In [53]:
# same comparison as the previous cell, but with fewer epochs (500) and a much larger batch (4096)
import time
# Time 10 independent large-batch training runs of doit2 on each device.
for use_cuda in [True, False]:
    if use_cuda and not torch.cuda.is_available():
        # Moving the model to CUDA would fail here; skip instead of aborting the cell.
        print("Using CUDA: True (skipped: CUDA is not available)")
        continue
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    cur_time = time.perf_counter()
    print(f"Using CUDA: {use_cuda}")
    accuracies = [doit2(num_epochs = 500, seq_length = 8, batch_size = 4096, use_cuda=use_cuda) for _ in range(10)]
    dt = time.perf_counter() - cur_time
    print("avg. accuracy", sum(accuracies) / len(accuracies), "took", dt, "seconds")


Using CUDA: True
avg. accuracy 0.7861114501953125 took 8.18805742263794 seconds
Using CUDA: False
avg. accuracy 0.793572998046875 took 20.55060124397278 seconds


Okay: ~8s on CUDA vs ~21s on CPU, but the accuracy is still bad (~79%).

# Wrapup

* Good RNN refresher. 
* Make sure to take GPU parallelism into account: the GPU only paid off here once the batch size was large.
