In [1]:
import copy, numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [2]:
'''
STEP 1: LOADING DATASET
'''
# Settings shared by both MNIST splits: on-disk location and the
# image -> tensor conversion applied to every sample.
mnist_root = './data'
to_tensor = transforms.ToTensor()

# Training split; download=True fetches the data into mnist_root if needed.
train_dataset = dsets.MNIST(root=mnist_root, train=True,
                            transform=to_tensor, download=True)

# Held-out test split, read from the same root directory.
test_dataset = dsets.MNIST(root=mnist_root, train=False,
                           transform=to_tensor)

In [3]:
'''
STEP 2: MAKING DATASET ITERABLE
'''

batch_size = 50
n_iters = 6000

# One epoch consumes len(train_dataset) / batch_size mini-batches, so the
# epoch count is the iteration budget divided by that, truncated to an int.
batches_per_epoch = len(train_dataset) / batch_size
num_epochs = int(n_iters / batches_per_epoch)

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Test batches are served in a fixed order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
class LSTM(nn.Module):
    """Multi-layer LSTM implemented with explicit per-gate parameters.

    Every layer ``l`` owns, for each gate ``g`` in ``f`` (forget), ``i``
    (input), ``C`` (candidate) and ``o`` (output), three parameters stored in
    ``self.weights``:

    * ``x_{g}{l}`` -- input-to-hidden matrix, shape ``(in_features, hidden_dim)``
    * ``h_{g}{l}`` -- hidden-to-hidden matrix, shape ``(hidden_dim, hidden_dim)``
    * ``b_{g}{l}`` -- bias vector, shape ``(hidden_dim,)``

    ``in_features`` is ``input_dim`` for layer 0 and ``hidden_dim`` for every
    layer above it, because each layer consumes the hidden state of the layer
    below.

    Args:
        input_dim: size of the feature vector presented at each time step.
        hidden_dim: size of the hidden and cell state of every layer.
        layer_dim: number of stacked layers.
    """

    # Gate suffixes, in the order their parameters are registered per layer.
    _GATES = ('f', 'i', 'C', 'o')

    def __init__(self, input_dim, hidden_dim, layer_dim):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        weights = {}
        in_features = input_dim
        for l in range(layer_dim):
            for gate in self._GATES:
                # torch.empty allocates without initialising; the real values
                # are written by init_weights() / init_biases() below.
                weights[f'x_{gate}{l}'] = nn.Parameter(torch.empty(in_features, hidden_dim))
                weights[f'h_{gate}{l}'] = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
                weights[f'b_{gate}{l}'] = nn.Parameter(torch.empty(hidden_dim))
            # Layers above the first read the previous layer's hidden state.
            in_features = hidden_dim

        self.weights = nn.ParameterDict(weights)
        self.init_weights()
        self.init_biases()

    def init_weights(self):
        """Xavier-uniform initialise every matrix and zero every vector."""
        for p in self.parameters():
            if p.dim() >= 2:
                nn.init.xavier_uniform_(p)
            else:
                nn.init.zeros_(p)

    def init_biases(self):
        """Set every forget-gate bias (``b_f{l}``) to 1.

        Each gate has its own bias vector in this implementation, so the whole
        ``b_f{l}`` vector is filled rather than a slice of a fused bias. All
        other biases keep the zeros written by ``init_weights``.
        """
        for name, param in self.weights.items():
            if name.startswith('b_f'):
                nn.init.ones_(param)

    def forward(self, x, init_states):
        """Run the stacked LSTM over a whole sequence.

        Args:
            x: tensor of shape ``(batch, seq_len, input_dim)``.
            init_states: pair ``(h0, c0)``; each is a sequence holding one
                ``(batch, hidden_dim)`` tensor per layer. The sequences passed
                in are not modified.

        Returns:
            ``(out, (h_n, c_n))`` where ``out`` is the last layer's hidden
            state after the final time step and ``h_n`` / ``c_n`` are lists of
            the per-layer hidden / cell states after the final time step.
        """
        h0, c0 = init_states
        # Work on shallow copies so the caller's containers are left untouched.
        h = list(h0)
        c = list(c0)
        _, seq_len, _ = x.shape
        w = self.weights

        for t in range(seq_len):
            layer_in = x[:, t, :]

            for l in range(self.layer_dim):
                h_prev, c_prev = h[l], c[l]
                f_t = torch.sigmoid(layer_in @ w[f'x_f{l}'] + h_prev @ w[f'h_f{l}'] + w[f'b_f{l}'])
                i_t = torch.sigmoid(layer_in @ w[f'x_i{l}'] + h_prev @ w[f'h_i{l}'] + w[f'b_i{l}'])
                o_t = torch.sigmoid(layer_in @ w[f'x_o{l}'] + h_prev @ w[f'h_o{l}'] + w[f'b_o{l}'])
                g_t = torch.tanh(layer_in @ w[f'x_C{l}'] + h_prev @ w[f'h_C{l}'] + w[f'b_C{l}'])
                c_t = f_t * c_prev + i_t * g_t
                h_t = o_t * torch.tanh(c_t)
                h[l], c[l] = h_t, c_t
                # This layer's output is the next layer's input at step t.
                layer_in = h_t

        return h[-1], (h, c)
            


In [5]:
class LSTMModel(nn.Module):
    """Sequence classifier: a stacked ``LSTM`` followed by a linear readout.

    Args:
        input_dim: size of the feature vector presented at each time step.
        hidden_dim: size of the hidden/cell state of every LSTM layer.
        layer_dim: number of stacked LSTM layers.
        output_dim: number of values produced by the readout layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        self.lstm = LSTM(input_dim, hidden_dim, layer_dim)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def _zero_states(self, x):
        """Return one zero ``(batch, hidden_dim)`` tensor per layer.

        ``new_zeros`` creates the tensors with the same device and dtype as
        ``x``, so the states always match the input regardless of whether
        CUDA happens to be available on the machine.
        """
        return [x.new_zeros(x.size(0), self.hidden_dim) for _ in range(self.layer_dim)]

    def forward(self, x):
        """Map a batch of sequences to readout values.

        Args:
            x: tensor whose first dimension is the batch size; it is passed
                unchanged to ``self.lstm``.

        Returns:
            ``self.fc`` applied to the first value returned by ``self.lstm``.
        """
        # Hidden and cell states both start from zeros.
        h0 = self._zero_states(x)
        c0 = self._zero_states(x)

        # The final per-layer states are not needed for classification.
        out, _ = self.lstm(x, (h0, c0))

        return self.fc(out)

In [6]:
'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 28     # size of the feature vector fed to the LSTM at each time step
hidden_dim = 100   # size of every layer's hidden state
layer_dim = 3      # number of stacked LSTM layers
output_dim = 10    # size of the readout layer's output

model = LSTMModel(input_dim=input_dim,
                  hidden_dim=hidden_dim,
                  layer_dim=layer_dim,
                  output_dim=output_dim)

# Move the parameters to the GPU whenever one is available.
if torch.cuda.is_available():
    model = model.cuda()

In [7]:
'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()

'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

'''
STEP 7: TRAIN THE MODEL
'''

# Number of steps to unroll
seq_dim = 28

# Send every batch to whichever device the model's parameters live on, so the
# CPU and GPU paths share exactly the same code.
device = next(model.parameters()).device

# NOTE: `iter` shadows the builtin of the same name; the name is kept because
# other cells may read this global counter.
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Reshape each batch to (batch, seq_dim, input_dim) before the model
        # sees it.
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> batch_size, output_dim
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 200 == 0:
            # Calculate accuracy over the whole test set.
            correct = 0
            total = 0
            # No gradients are needed for evaluation; skipping them avoids
            # building an autograd graph for every test batch.
            with torch.no_grad():
                # Separate names keep the training batch (and the `loss`
                # printed below) untouched by the evaluation pass.
                for test_images, test_labels in test_loader:
                    # The reshape must happen on every device, not only on GPU.
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)
                    test_labels = test_labels.to(device)

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(test_outputs, 1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions, accumulated as a Python int
                    correct += (predicted == test_labels).sum().item()

            # Whole-percent accuracy on plain ints, so the printed value does
            # not depend on how the installed torch divides integer tensors.
            accuracy = 100 * correct // total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

Iteration: 200. Loss: 1.557484745979309. Accuracy: 54
Iteration: 400. Loss: 0.8189647793769836. Accuracy: 77
Iteration: 600. Loss: 0.76825350522995. Accuracy: 85
Iteration: 800. Loss: 0.38425514101982117. Accuracy: 89
Iteration: 1000. Loss: 0.42029285430908203. Accuracy: 90
Iteration: 1200. Loss: 0.4237569570541382. Accuracy: 91
Iteration: 1400. Loss: 0.2881256937980652. Accuracy: 92
Iteration: 1600. Loss: 0.2643578052520752. Accuracy: 93
Iteration: 1800. Loss: 0.16959862411022186. Accuracy: 92
Iteration: 2000. Loss: 0.1939341425895691. Accuracy: 93
Iteration: 2200. Loss: 0.2385956197977066. Accuracy: 94
Iteration: 2400. Loss: 0.16800548136234283. Accuracy: 94
Iteration: 2600. Loss: 0.12149292975664139. Accuracy: 95
Iteration: 2800. Loss: 0.08006231486797333. Accuracy: 95
Iteration: 3000. Loss: 0.2669249176979065. Accuracy: 95
Iteration: 3200. Loss: 0.1236453652381897. Accuracy: 95
Iteration: 3400. Loss: 0.30171868205070496. Accuracy: 95
Iteration: 3600. Loss: 0.15748625993728638. Accu