In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# hyper parameters
input_size = 8 # we have a p+1 dimensional vector for each time step
hidden_size = 2
num_epochs = 10
learning_rate = 0.1
n_classes = 8

In [3]:
class RecurrentNeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RecurrentNeuralNet, self).__init__()
        self.rnn1 = nn.RNN(input_size=input_size,hidden_size=hidden_size,batch_first=True)
        self.rnn2 = nn.RNN(input_size=hidden_size,hidden_size=4,batch_first=True)
        self.rnn3 = nn.RNN(input_size=4,hidden_size=8,batch_first=True)
        
        self.l1 = nn.Linear(8, n_classes)
    
    def forward(self, x):
        out,h_n = self.rnn1(x)
        out,h_n = self.rnn2(out)
        out,h_n = self.rnn3(out)
        out = self.l1(out)
        
        return out


In [4]:
class LSTMNeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMNeuralNet, self).__init__()
        self.lstm1 = nn.LSTM(input_size=input_size,hidden_size=hidden_size,batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_size,hidden_size=4,batch_first=True)
        self.lstm3 = nn.LSTM(input_size=4,hidden_size=8,batch_first=True)
        
        self.l1 = nn.Linear(8, n_classes)
    
    def forward(self, x):
        out,(h_n,c_n) = self.lstm1(x)
        out,(h_n,c_n) = self.lstm2(out)
        out,(h_n,c_n) = self.lstm3(out)
        
        out = self.l1(out)
        
        return out


In [5]:
class AttnRecurrentNeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AttnRecurrentNeuralNet, self).__init__()
        self.rnn1 = nn.RNN(input_size=input_size,hidden_size=hidden_size,batch_first=True)
        self.rnn2 = nn.RNN(input_size=hidden_size,hidden_size=4,batch_first=True)
        self.rnn3 = nn.RNN(input_size=4,hidden_size=8,batch_first=True)
        self.cos = nn.CosineSimilarity(dim=2, eps=1e-6)
        self.sig = nn.Sigmoid()
        self.l1 = nn.Linear(8, n_classes)
    
    def forward(self, x):
        out,h_n = self.rnn1(x)
        out,h_n = self.rnn2(out)
        out,h_n = self.rnn3(out) # O_T vector
        e = self.cos(out, x)

        
        alpha = torch.transpose(torch.div(torch.transpose(self.sig(e),0,1),torch.sum(self.sig(e),1)),0,1) # attention weights

        out = self.l1((x*alpha.unsqueeze(-1)))
        
        return out


In [6]:
# create one hot encodings
def generate_inputs_labels(n_batches, batch_size):
    encodings = {'E' : 0, 'a' : 1, 'b' : 2, 'c' : 3, 'd' : 4, 'X' : 5, 'Y' : 6, 'B' : 7}
    labels = ['XXX'  , 'XXY'  , 'XYX'  , 'XYY' , 'YXX'  , 'YXY'  , 'YYX'  , 'YYY' ]

    input_lis = []
    label_lis = []
    for i in range(n_batches):
        input_lis.append([])
        label_lis.append([])

        for k in range(batch_size):
            input_lis[i].append([])
            label_lis[i].append([])

            seq_len = np.random.randint(100,111)
            t1 = np.random.randint(9,20)
            t2 = np.random.randint(32,43)
            t3 = np.random.randint(65,76)
            label = np.random.choice(labels)
            if seq_len < 110:                                                     
                for j in range(seq_len, 110):
                    input_lis[i][k].append(F.one_hot(torch.tensor(encodings['E']), len(encodings)).numpy())
            for j in range(seq_len):
                if j == 0:
                    c = 'E'
                elif j == t1:
                    c = label[0]
                elif j == t2:
                    c = label[1]
                elif j == t3:
                    c = label[2]
                elif j == seq_len-1:
                    c = 'B'
                else:
                    c = np.random.choice(['a','b','c','d'])
                input_lis[i][k].append(F.one_hot(torch.tensor(encodings[c]), len(encodings)).numpy())
            label_lis[i][k] = F.one_hot(torch.tensor(labels.index(label)), n_classes).numpy()   
            input_lis[i][k] = np.array(input_lis[i][k])
            label_lis[i][k] = np.array(label_lis[i][k])

        input_lis[i] = np.array(input_lis[i])
        label_lis[i] = np.array(label_lis[i])
    input_lis = np.array(input_lis)
    label_lis = np.array(label_lis)

    input_lis = torch.from_numpy(input_lis)
    label_lis = torch.from_numpy(label_lis)
    input_lis,label_lis = input_lis.type(torch.FloatTensor),label_lis.type(torch.LongTensor)
    return input_lis,label_lis

In [7]:
n_batches = 500
batch_size = 20

input_lis,label_lis = generate_inputs_labels(n_batches, batch_size)

In [8]:
print(input_lis.shape)

torch.Size([500, 20, 110, 8])


In [9]:
model_rnn = RecurrentNeuralNet(input_size, hidden_size)

# adjust network parameters
for name,param in model_rnn.named_parameters():
    if name == 'rnn1.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -2.0))
    elif name == 'rnn3.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -6.0))
    elif name == 'rnn2.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -4.0))
    elif name == 'rnn1.bias_hh_l0' or name == 'rnn2.bias_hh_l0' or name == 'rnn3.bias_hh_l0':
        pass
    else:   
        param.data = nn.parameter.Parameter(nn.init.uniform_(param, -0.1,0.1))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=learning_rate)

In [10]:
# training_loop
rnn_loss_lis = []
rnn_step_lis = []
steps = 1
n_total_steps = len(input_lis)
for epoch in range(num_epochs):
    for i in range(len(input_lis)):
        
        # forward
        outputs =  model_rnn(input_lis[i])
        loss = criterion(outputs, label_lis[i])
        
        # backwards 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        steps += 1        
        if (i+1)%100 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')    
            rnn_loss_lis.append(loss.item())
            rnn_step_lis.append(steps)


epoch 1 / 10, step 100/500, loss = 0.3786
epoch 1 / 10, step 200/500, loss = 0.3853
epoch 1 / 10, step 300/500, loss = 0.3749
epoch 1 / 10, step 400/500, loss = 0.3813
epoch 1 / 10, step 500/500, loss = 0.3760
epoch 2 / 10, step 100/500, loss = 0.3773
epoch 2 / 10, step 200/500, loss = 0.3823
epoch 2 / 10, step 300/500, loss = 0.3742
epoch 2 / 10, step 400/500, loss = 0.3806
epoch 2 / 10, step 500/500, loss = 0.3759
epoch 3 / 10, step 100/500, loss = 0.3778
epoch 3 / 10, step 200/500, loss = 0.3841
epoch 3 / 10, step 300/500, loss = 0.3736
epoch 3 / 10, step 400/500, loss = 0.3801
epoch 3 / 10, step 500/500, loss = 0.3758
epoch 4 / 10, step 100/500, loss = 0.3782
epoch 4 / 10, step 200/500, loss = 0.3822
epoch 4 / 10, step 300/500, loss = 0.3726
epoch 4 / 10, step 400/500, loss = 0.3805
epoch 4 / 10, step 500/500, loss = 0.3741
epoch 5 / 10, step 100/500, loss = 0.3766
epoch 5 / 10, step 200/500, loss = 0.3832
epoch 5 / 10, step 300/500, loss = 0.4100
epoch 5 / 10, step 400/500, loss =

In [11]:
model_lstm = LSTMNeuralNet(input_size, hidden_size)

# adjust network parameters
for name,param in model_lstm.named_parameters():
    if name == 'lstm1.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -2.0))
    elif name == 'lstm3.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -6.0))
    elif name == 'lstm2.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -4.0))
    elif name == 'lstm1.bias_hh_l0' or name == 'lstm2.bias_hh_l0' or name == 'lstm3.bias_hh_l0':
        pass
    else:   
        param.data = nn.parameter.Parameter(nn.init.uniform_(param, -0.1,0.1))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=learning_rate)

In [None]:
# training_loop
lstm_loss_lis = []
lstm_step_lis = []
steps = 1
n_total_steps = len(input_lis)
for epoch in range(num_epochs):
    for i in range(len(input_lis)):
        
        # forward
        outputs =  model_lstm(input_lis[i])
        loss = criterion(outputs, label_lis[i])
        
        # backwards 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        steps += 1        
        if (i+1)%100 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')    
            lstm_loss_lis.append(loss.item())
            lstm_step_lis.append(steps)


epoch 1 / 10, step 100/500, loss = 0.3771
epoch 1 / 10, step 200/500, loss = 0.3883
epoch 1 / 10, step 300/500, loss = 0.3755
epoch 1 / 10, step 400/500, loss = 0.3819
epoch 1 / 10, step 500/500, loss = 0.3762
epoch 2 / 10, step 100/500, loss = 0.3769
epoch 2 / 10, step 200/500, loss = 0.3853
epoch 2 / 10, step 300/500, loss = 0.3736
epoch 2 / 10, step 400/500, loss = 0.3810
epoch 2 / 10, step 500/500, loss = 0.3749
epoch 3 / 10, step 100/500, loss = 0.3769
epoch 3 / 10, step 200/500, loss = 0.3839
epoch 3 / 10, step 300/500, loss = 0.3732
epoch 3 / 10, step 400/500, loss = 0.3804
epoch 3 / 10, step 500/500, loss = 0.3747
epoch 4 / 10, step 100/500, loss = 0.3769
epoch 4 / 10, step 200/500, loss = 0.3832
epoch 4 / 10, step 300/500, loss = 0.3733
epoch 4 / 10, step 400/500, loss = 0.3798
epoch 4 / 10, step 500/500, loss = 0.3748
epoch 5 / 10, step 100/500, loss = 0.3769
epoch 5 / 10, step 200/500, loss = 0.3827
epoch 5 / 10, step 300/500, loss = 0.3730
epoch 5 / 10, step 400/500, loss =

In [None]:
model_AttnRnn = AttnRecurrentNeuralNet(input_size, hidden_size)

# adjust network parameters
for name,param in model_AttnRnn.named_parameters():
    if name == 'rnn1.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -2.0))
    elif name == 'rnn3.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -6.0))
    elif name == 'rnn2.bias_ih_l0':
        param.data = nn.parameter.Parameter(nn.init.constant_(param, -4.0))
    elif name == 'rnn1.bias_hh_l0' or name == 'rnn2.bias_hh_l0' or name == 'rnn3.bias_hh_l0':
        pass
    else:
        param.data = nn.parameter.Parameter(nn.init.uniform_(param, -0.1,0.1))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_AttnRnn.parameters(), lr=learning_rate)

In [None]:
# training_loop
AttnRnn_loss_lis = []
AttnRnn_step_lis = []
steps = 1
n_total_steps = len(input_lis)
for epoch in range(num_epochs):
    for i in range(len(input_lis)):
        
        # forward
        outputs =  model_AttnRnn(input_lis[i])
        loss = criterion(outputs, label_lis[i])
        
        # backwards 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        steps += 1        
        if (i+1)%100 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')    
            AttnRnn_loss_lis.append(loss.item())
            AttnRnn_step_lis.append(steps)


### How many input sequences were generated in the training phase before it meets the stopping condition
#### Answer:
#### RNN
 - We ran the network with a variety of differnet batch sizes and number of batches with the initial configuration consisting of 3200 input sequences (100 batches of size 32) run for 10 epochs.
 - The loss seemed to stop reducing at 0.37 following which it fluctuated between 0.37 and 0.38
 - The model was run again with 10000 input sequences (500 batches of size 20) with the loss behaving in the same way.
 - As an additonal step, I put all the temporal values responsible for determining the sequence i.e., X,Y, at the sequence end in one trial and at the beginging in one trial with no improvements, the model seemed to learn nothing.
 - The RNN and RNN+attention model was run over 600 times with adjusted parameters and variations of test sequences, including constant sequence length, paddings at various locations, using zero padding as well as padding with the start symbol at both the beginging and the end with the same result.
 - We used the RNN from the previous question with 1, 2 and 3 layers and hidden size set to 128 followed by a linear layer with no improvement.
 - We tested with softmax activation and obtained worse results, loss stabalized at 0.9 with adhoc predictions.
 - We tested by removing the linear layer and obtained worse results.
 
#### LSTM
 - We ran the network with a variety of differnet batch sizes and number of batches with the initial configuration consisting of 3200 input sequences (100 batches of size 32) run for 10 epochs.
 - The loss seemed to stop reducing at 0.37 following which it fluctuated between 0.37 and 0.38
 - The model was run again with 10000 input sequences (500 batches of size 20) with the loss behaving in the same way.
 
#### RNN+Attention 
 - Attention mechanism was applied by following the paper, we first calculated the $e_i$ as the cosine similarity score between input symbol embedding (one-hot) $V_i$ and the network output at time T $O_T$ as $O_T \odot V_i$, these were calculated for each batch.
 - After this, we calculated the Attention weights by taking the sigmoid of the $e_i$ scores over the sum of all $e_i's$ over $T$, the total number of sequences  as $ \frac{\sigma(e_i)}{\sum_{i=1}^{T}{\sigma(e_i)}}$
 - Finally, before sending to a fully connected linear layer, we multipy each embedding $V_i$ with the corresponding $\alpha_i$ and take the sum, note that this is done over an entire batch of size 20.
 - The network was run with 10000 input sequences (500 batches of size 20) for over 30 epochs, However convergence was not achieved.
 - The loss seemed to reduce constantly for the first 20 epochs after which its minimum stabilized at around 0.6, fluctuation between 0.56 and 0.8

### Plot the number of input sequences passed through the network versus training error (for RNN, LSTM and RNN+Attention)

In [None]:
plt.plot(lstm_step_lis,lstm_loss_lis, color='r')
plt.plot(rnn_step_lis,rnn_loss_lis,color='b')
plt.plot(AttnRnn_step_lis,AttnRnn_loss_lis,color='g')

plt.legend(['LSTM convergence', 'RNN convergence', 'AttnRnn convergence'])
plt.xlabel('Number of input sequences')
plt.ylabel('Loss')
plt.show()

### Testing
### Report the average number of wrong predictions on the test set in 10 different trials (for RNN, LSTM and RNN+Attention using 3000 test sequences)

In [None]:
# generate 3000 test sequences
n_batches = 3000
batch_size = 1
num_trials = 10
test_input_lis,test_label_lis = generate_inputs_labels(n_batches, batch_size)


In [None]:
# testing rnn
print("===RNN Test===")
with torch.no_grad():
    for trial in range(num_trials):
        n_correct = 0
        n_samples = 0
        for i in range(len(test_input_lis)):
            
            # forward
            outputs =  model_rnn(test_input_lis[i])
            loss = criterion(outputs, test_label_lis[i])
            # value, index|
            _, prediction = torch.max(outputs, 1)
            n_samples += test_label_lis[i].shape[1]
            if (prediction == test_label_lis[i]).sum().item() == 8:
                n_correct += (prediction == test_label_lis[i]).sum().item()
            else:
                n_correct += 0
        acc = 100.0 * n_correct / n_samples
        print(f'accuracy in trial {trial+1} of {num_trials}: {acc}')

In [None]:
# testing lstm
print("===LSTM Test===")
with torch.no_grad():
    for trial in range(num_trials):
        n_correct = 0
        n_samples = 0
        for i in range(len(test_input_lis)):
            
            # forward
            outputs =  model_lstm(test_input_lis[i])
            loss = criterion(outputs, test_label_lis[i])
            # value, index|
            _, prediction = torch.max(outputs, 1)
            
            n_samples += test_label_lis[i].shape[1]
            if (prediction == test_label_lis[i]).sum().item() == 8:
                n_correct += (prediction == test_label_lis[i]).sum().item()
            else:
                n_correct += 0
        acc = 100.0 * n_correct / n_samples
        print(f'accuracy in trial {trial+1} of {num_trials}: {acc}')

In [None]:
# testing Attention RNN
print("===Attention+RNN Test===")
with torch.no_grad():
    for trial in range(num_trials):
        n_correct = 0
        n_samples = 0
        for i in range(len(test_input_lis)):
            
            # forward
            outputs =  model_AttnRnn(test_input_lis[i])
            loss = criterion(outputs, test_label_lis[i])
            # value, index|
            _, prediction = torch.max(outputs, 1)
            
            n_samples += test_label_lis[i].shape[1]
            if (prediction == test_label_lis[i]).sum().item() == 8:
                n_correct += (prediction == test_label_lis[i]).sum().item()
            else:
                n_correct += 0

        acc = 100.0 * n_correct / n_samples
        print(f'accuracy in trial {trial+1} of {num_trials}: {acc}')

#### Average number of wrong predictions:
To convert the outputs to meaningful one-hot vectors we use determine the index of the max values as follows:

$$ \_, prediction = torch.max(outputs, 1) $$

#### RNN
 - Our RNN seems to predict all 0s and the accuracy is thus 0%.
 
#### LSTM
 - Our LSTM seems to predict all 0s and the accuracy is thus 0%.

#### RNN+Attention
 - Our RNN+Attention seems to predict all 0s and the accuracy is thus 0%.

All of these models were run with a wide variety of hyperparameters and tested as described previously but only adhoc predictions were obtained as if the network learned nothing, convergence was never attained by either of the models.