In [5]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms


In [6]:
# Set device
if torch.cuda.is_available():
    device = "gpu"
elif torch.has_mps:
    device = "mps"
else:
    device = "cpu"
device = torch.device(device)
print(device)


mps


<img src="images/bilstm.jpeg" style="width:600px;height:300px;">


# Bidirectional LSTM

In [7]:
# Hyperparameters
input_size = 28  # Number of features for each time-step but we don't have to explicitely say this
sequence_length = 28
num_layers = 2
num_classes = 10
hidden_size = 256  # Number of nodes in each time-step
learning_rate = 0.001
batch_size = 64
num_epochs = 2


In [26]:
# Create a basic RNN
class BRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes) -> None:
        super(BRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
        )
        self.fc = nn.Linear(
            hidden_size * 2, num_classes
        )  # hidden_size*2: because there is forward and backward

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        # *2 because there is forward and backward
        # x.size(0): batch size
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        out, (hidden_state, cell_state) = self.lstm(x, (h0, c0))
        out = self.fc(
            out[:, -1, :]
        )  # take the last hidden state to send into linear layer
        return out


In [27]:
# Load data
train_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)


In [28]:
# Initialize network
model = BRNN(input_size, hidden_size, num_layers, num_classes).to(device)
model


BRNN(
  (lstm): LSTM(28, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=10, bias=True)
)

In [29]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [30]:
criterion


CrossEntropyLoss()

In [31]:
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [32]:
# This cell took ~18 minutes for 2 epochs on MacBook Pro (16-inch, 2021) Apple M1 Pro 16 GB

# Train Network loop
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Get the data to CUDA of MPS if possible
        data = data.to(device=device).squeeze(1)  # Nx1x28x28 -> Nx28x28
        targets = targets.to(device=device)

        # Forward
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Gardient descent or adam step
        optimizer.step()


In [33]:
# Check accuracy on training data and test to see how good our model is
def check_accuracy(loader, model):
    if loader.dataset.train:
        print("Checking accuracy on training data.")
    else:
        print("Checking accuracy on test data.")

    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device).squeeze(1)
            y = y.to(device=device)
            # x = x.reshape(x.shape[0], -1)

            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

        print(
            f"Got {num_correct} / {num_samples} with accuracy \
            {float(num_correct)/float(num_samples)*100:.2f}"
        )

    model.train()


In [34]:
# This cell took 30 seconds for 10 epochs on MacBook Pro (16-inch, 2021) Apple M1 Pro 16 GB

check_accuracy(train_loader, model)
check_accuracy(test_loader, model)


Checking accuracy on training data.
Got 0 / 60000 with accuracy             0.00
Checking accuracy on test data.
Got 0 / 10000 with accuracy             0.00


In [None]:
# Why do I get 0? :))