In [1]:
import copy, numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [2]:
# STEP 1: LOADING DATASET

# Training split of MNIST, stored under ./data (download requested if needed).
# ToTensor() converts each image to a float tensor.
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Test split, read from the same root directory; no download is requested here.
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

In [3]:
# STEP 2: MAKING DATASET ITERABLE

batch_size = 50
n_iters = 6000

# One epoch takes len(train_dataset) / batch_size iterations, so the number of
# epochs is the iteration budget divided by that, truncated to an integer.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [4]:
class GRU(nn.Module):
    """Stacked GRU written out with explicit weight matrices (no bias terms).

    For every layer ``l`` the module owns six matrices in ``self.weights``:
    ``x_u{l}``/``h_u{l}`` (update gate), ``x_r{l}``/``h_r{l}`` (reset gate) and
    ``x_n{l}``/``h_n{l}`` (candidate state). Each step computes::

        r = sigmoid(x @ W_xr + h_prev @ W_hr)
        z = sigmoid(x @ W_xu + h_prev @ W_hu)
        n = tanh((r * h_prev) @ W_hn + x @ W_xn)
        h = (1 - z) * h_prev + z * n

    Note that ``z`` weights the *candidate* state here, which is the mirror
    image of the convention documented for ``torch.nn.GRU``.

    Args:
        input_dim: feature size of the input to the first layer.
        hidden_dim: hidden-state size of every layer.
        layer_dim: number of stacked layers.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim):
        super(GRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        weights = {}
        for l in range(layer_dim):
            # update gate
            weights[f'x_u{l}'] = nn.Parameter(torch.empty(input_dim, hidden_dim))
            weights[f'h_u{l}'] = nn.Parameter(torch.empty(hidden_dim, hidden_dim))

            # reset gate
            weights[f'x_r{l}'] = nn.Parameter(torch.empty(input_dim, hidden_dim))
            weights[f'h_r{l}'] = nn.Parameter(torch.empty(hidden_dim, hidden_dim))

            # candidate ("new") hidden state
            weights[f'x_n{l}'] = nn.Parameter(torch.empty(input_dim, hidden_dim))
            weights[f'h_n{l}'] = nn.Parameter(torch.empty(hidden_dim, hidden_dim))

            # Every layer above the first consumes the hidden state of the
            # layer below it, so its input size is hidden_dim.
            input_dim = hidden_dim

        self.weights = nn.ParameterDict(weights)
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform init for matrices, zeros for any 1-D parameter."""
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)
            else:
                nn.init.zeros_(p.data)

    def forward(self, x, h0=None):
        """Run the stacked GRU over a whole sequence.

        Args:
            x: 3-D tensor indexed as ``(batch, time, feature)``.
            h0: optional sequence with one ``(batch, hidden_dim)`` tensor per
                layer. It is not modified. When omitted, zero states matching
                ``x``'s device and dtype are used.

        Returns:
            ``(h_last, h_all)``: the final hidden state of the top layer and a
            new list with the final hidden state of every layer
            (``h_last is h_all[-1]``).

        Raises:
            ValueError: if ``h0`` does not hold exactly ``layer_dim`` states.
        """
        _, seq_len, _ = x.shape

        if h0 is None:
            hidden = [x.new_zeros(x.size(0), self.hidden_dim)
                      for _ in range(self.layer_dim)]
        else:
            if len(h0) != self.layer_dim:
                raise ValueError(
                    f'expected {self.layer_dim} initial hidden states, got {len(h0)}')
            # Shallow copy: per-layer states are rebound below, and the
            # caller's list must keep pointing at its original tensors.
            hidden = list(h0)

        for t in range(seq_len):
            layer_in = x[:, t, :]

            for l in range(self.layer_dim):
                h_prev = hidden[l]
                r_t = torch.sigmoid(layer_in @ self.weights[f'x_r{l}']
                                    + h_prev @ self.weights[f'h_r{l}'])
                z_t = torch.sigmoid(layer_in @ self.weights[f'x_u{l}']
                                    + h_prev @ self.weights[f'h_u{l}'])
                n_t = torch.tanh((r_t * h_prev) @ self.weights[f'h_n{l}']
                                 + layer_in @ self.weights[f'x_n{l}'])
                h_t = (1 - z_t) * h_prev + z_t * n_t
                hidden[l] = h_t
                # The new state of this layer is the input of the next one.
                layer_in = h_t

        return hidden[-1], hidden
            


In [5]:
class GRUModel(nn.Module):
    """Sequence classifier: a stacked GRU followed by a linear readout.

    Args:
        input_dim: feature size of each time step of the input.
        hidden_dim: hidden-state size of every GRU layer.
        layer_dim: number of stacked GRU layers.
        output_dim: size of the readout (number of logits).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(GRUModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        self.gru = GRU(input_dim, hidden_dim, layer_dim)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the readout of the top layer's final hidden state.

        Args:
            x: 3-D input tensor whose first dimension is the batch size.

        Returns:
            Tensor of shape ``(x.size(0), output_dim)``.
        """
        # Zero initial state, one tensor per layer. It is created on the
        # input's own device and dtype rather than keyed on
        # torch.cuda.is_available(), so a CPU model keeps working on a machine
        # that happens to have a GPU, and a .double() model gets matching states.
        h0 = [
            torch.zeros(x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)
            for _ in range(self.layer_dim)
        ]

        out, _ = self.gru(x, h0)

        return self.fc(out)

In [6]:
# STEP 4: INSTANTIATE MODEL CLASS

input_dim = 28     # features per time step
hidden_dim = 100   # hidden-state size of every GRU layer
layer_dim = 3      # number of stacked GRU layers
output_dim = 10    # number of output logits

model = GRUModel(input_dim=input_dim,
                 hidden_dim=hidden_dim,
                 layer_dim=layer_dim,
                 output_dim=output_dim)

# Move the model's parameters to the GPU whenever one is available.
if torch.cuda.is_available():
    model = model.cuda()

In [7]:
# STEP 5: INSTANTIATE LOSS CLASS
criterion = nn.CrossEntropyLoss()

# STEP 6: INSTANTIATE OPTIMIZER CLASS
learning_rate = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# STEP 7: TRAIN THE MODEL

# Number of steps to unroll
seq_dim = 28

# Batches go to the GPU exactly when one is available, matching the condition
# under which the model itself is moved with model.cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): `iter` shadows the Python builtin; the name is kept because
# cells outside this view may read it.
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Reshape every batch to (batch, seq_dim, input_dim) on either device.
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> batch_size, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 200 == 0:
            # Calculate accuracy on the whole test set.
            correct = 0
            total = 0
            model.eval()
            # No gradients are needed for evaluation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                # Separate names so the training batch is not overwritten.
                for test_images, test_labels in test_loader:
                    # The reshape must happen on CPU as well as on GPU:
                    # the model expects a 3-D (batch, seq, feature) input.
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)
                    test_labels = test_labels.to(device)

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Get predictions from the maximum value
                    predicted = test_outputs.argmax(dim=1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions, accumulated as a Python int
                    correct += (predicted == test_labels).sum().item()
            model.train()

            # Plain Python float, so it prints as a number, not a tensor.
            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

Iteration: 200. Loss: 1.4411970376968384. Accuracy: 53
Iteration: 400. Loss: 0.7872260212898254. Accuracy: 72
Iteration: 600. Loss: 0.5232601761817932. Accuracy: 82
Iteration: 800. Loss: 0.5487017035484314. Accuracy: 86
Iteration: 1000. Loss: 0.4447202980518341. Accuracy: 89
Iteration: 1200. Loss: 0.4258045554161072. Accuracy: 91
Iteration: 1400. Loss: 0.16629232466220856. Accuracy: 92
Iteration: 1600. Loss: 0.1530718356370926. Accuracy: 94
Iteration: 1800. Loss: 0.2008625864982605. Accuracy: 94
Iteration: 2000. Loss: 0.10680635273456573. Accuracy: 95
Iteration: 2200. Loss: 0.28648844361305237. Accuracy: 95
Iteration: 2400. Loss: 0.12206625193357468. Accuracy: 95
Iteration: 2600. Loss: 0.14958076179027557. Accuracy: 95
Iteration: 2800. Loss: 0.17163632810115814. Accuracy: 95
Iteration: 3000. Loss: 0.28216132521629333. Accuracy: 96
Iteration: 3200. Loss: 0.10992380231618881. Accuracy: 96
Iteration: 3400. Loss: 0.07329907268285751. Accuracy: 96
Iteration: 3600. Loss: 0.1497219204902649. 