In [1]:
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.optim.lr_scheduler import StepLR


In [8]:
# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------
random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

# ---------------------------------------------------------------------------
# Device configuration
# ---------------------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---------------------------------------------------------------------------
# Optimisation / logging hyperparameters
# ---------------------------------------------------------------------------
n_epochs = 2
batch_size_train = 64
batch_size_test = 1000
learning_rate = 0.01
momentum = 0.5
log_interval = 10

# ---------------------------------------------------------------------------
# Sequence-model dimensions: every image is later reshaped to
# (sequence_length, input_size) before it is fed to the LSTM.
# ---------------------------------------------------------------------------
sequence_length = 28
input_size = 28
hidden_size = 128
num_layers = 2
num_classes = 10
batch_size = 100
num_epochs = 2

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
os.makedirs('files', exist_ok=True)


def _mnist_loader(train, download, batch_size):
    """Build a shuffling DataLoader over one MNIST split stored under 'files/'.

    Images are converted to tensors and normalised with (0.1307,) / (0.3081,)
    -- presumably the MNIST mean / std; confirm before reusing elsewhere.
    """
    preprocessing = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ])
    split = torchvision.datasets.MNIST(
        'files/', train=train, download=download, transform=preprocessing)
    return torch.utils.data.DataLoader(split, batch_size=batch_size, shuffle=True)


# The training split triggers the download; the test split reuses those files.
train_loader = _mnist_loader(train=True, download=True, batch_size=batch_size_train)

# NOTE(review): the test split is shuffled too, exactly as before.
test_loader = _mnist_loader(train=False, download=False, batch_size=batch_size_test)

In [9]:
# Recurrent neural network (many-to-one)
class Net(nn.Module):
    """Stacked LSTM whose last time-step output is mapped to class logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x``.

        Args:
            x: Tensor of shape (batch, seq_length, input_size) (batch first).
        """
        # Zero initial hidden and cell states, allocated on the same device and
        # with the same dtype as the input. This removes the dependency on a
        # module-level ``device`` global, so the model keeps working after it
        # is moved to another device or imported outside this notebook.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: (batch_size, seq_length, hidden_size)

        # Decode the hidden state of the last time step
        return self.fc(out[:, -1, :])
        

# Instantiate the model with hyperparameters
model = Net(input_size, hidden_size, num_layers, num_classes).to(device)

# Trace the model with a dummy batch of 3 sequences. The example shape is
# derived from the configured dimensions instead of hard-coded 28s, so it stays
# in sync if `sequence_length` / `input_size` are changed.
# The tensor is created on the CPU and then moved, as before, so the CPU RNG
# stream (and therefore the data shuffling order) is unaffected.
example_batch = torch.rand(3, sequence_length, input_size).to(device)
traced_model = torch.jit.trace(model, example_batch)

# Loss and optimizer
# The optimizer is built from `model.parameters()`; the traced module reuses
# those same parameters, so stepping the optimizer also trains `traced_model`.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [10]:
import time


# Train the model
#
# Full-precision training only: the previously instantiated (but never used)
# torch.cuda.amp.GradScaler and the commented-out mixed-precision steps were
# removed, since the scaler was dead state that also warns when CUDA is absent.
total_step = len(train_loader)
for epoch in range(num_epochs):
    # perf_counter is monotonic, so the epoch duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    for i, (images, labels) in enumerate(train_loader):
        # Reshape each batch to (batch, sequence_length, input_size) for the LSTM.
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        optimizer.zero_grad(set_to_none=True)

        # Forward pass through the traced module
        outputs = traced_model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Gradient value clipping: clamp every gradient element to [-10, 10].
        # (Norm-based alternative: nn.utils.clip_grad_norm_ with max_norm=2.0.)
        nn.utils.clip_grad_value_(model.parameters(), clip_value=10)

        optimizer.step()

        # Report the loss of the current batch every 100 steps.
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    end = time.perf_counter()
    print(f'Epoch #{epoch + 1} took {end - start} sec(s)')

Epoch [1/2], Step [100/938], Loss: 0.6650
Epoch [1/2], Step [200/938], Loss: 0.3047
Epoch [1/2], Step [300/938], Loss: 0.0894
Epoch [1/2], Step [400/938], Loss: 0.4946
Epoch [1/2], Step [500/938], Loss: 0.2050
Epoch [1/2], Step [600/938], Loss: 0.3227
Epoch [1/2], Step [700/938], Loss: 0.0832
Epoch [1/2], Step [800/938], Loss: 0.0634
Epoch [1/2], Step [900/938], Loss: 0.1024
Epoch #1 took 45.64427042007446 sec(s)
Epoch [2/2], Step [100/938], Loss: 0.1422
Epoch [2/2], Step [200/938], Loss: 0.1408
Epoch [2/2], Step [300/938], Loss: 0.2437
Epoch [2/2], Step [400/938], Loss: 0.3446
Epoch [2/2], Step [500/938], Loss: 0.0675
Epoch [2/2], Step [600/938], Loss: 0.0800
Epoch [2/2], Step [700/938], Loss: 0.1066
Epoch [2/2], Step [800/938], Loss: 0.1555
Epoch [2/2], Step [900/938], Loss: 0.0688
Epoch #2 took 46.03595018386841 sec(s)


In [75]:
class ConvNet(torch.nn.Module):
    """A single 3x3 convolution (3 -> 10 channels) followed by ReLU."""

    def __init__(self) -> None:
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 10, 3)

    def forward(self, x):
        """Return ``relu(conv(x))`` for a (batch, 3, H, W) input tensor."""
        # `torch.nn.ReLU(...)` is a module constructor: calling it with the
        # feature map built a ReLU *module* (the tensor was taken as the
        # `inplace` argument) and returned that module instead of activations.
        # Apply the functional ReLU to the convolution output instead.
        return torch.relu(self.conv(x))

In [84]:
# Benchmark fixture: a random (10, 3, 50, 50) batch and a traced ReLU module.
example_input = torch.rand(10, 3, 50, 50)
n = torch.nn.ReLU()
traced = torch.jit.trace(n, (example_input,))


In [85]:
import time


def run_tests(model, input, n_iters=int(1e4)):
    """Call ``model(input)`` repeatedly and print the total elapsed seconds.

    Args:
        model: Any callable taking a single positional argument.
        input: The argument passed to ``model`` on every call.
        n_iters: Number of calls to time. Defaults to 10,000, the value that
            was previously hard-coded.

    Returns:
        None. The elapsed time is printed, not returned.
    """
    # perf_counter is a monotonic, high-resolution clock, which suits short
    # benchmarks better than wall-clock time.
    start = time.perf_counter()
    for _ in range(n_iters):
        model(input)
    end = time.perf_counter()

    print(end - start)

In [89]:
# Time repeated forward passes of the traced ReLU on the example batch;
# run_tests prints the total elapsed seconds.
run_tests(traced, example_input)

0.16025280952453613
