In [1]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
# datasets
from datasets import *

In [2]:
# Initialize the device which to run the model on
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0")
print("[INFO]: Using device", device)

[INFO]: Using device cpu


In [3]:
seq_length = 11

In [4]:
dataset = RandomCombinationsDataset(seq_length)
dataloader =  DataLoader(dataset, 8, num_workers=1, drop_last=True)

x,y = next(iter(dataloader))


In [5]:
"""
This module implements a LSTM model in PyTorch.
You should fill in code into indicated sections.
"""


"""
use this for weight init torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
use 
stdv = 1.0 / math.sqrt(self._hidden_dim)

"""

class LSTM(nn.Module):

    def __init__(self, seq_length, input_dim, hidden_dim, num_classes,
                 batch_size, device):

        # super(LSTM, self).__init__()
        super().__init__()

        self.embeddings = nn.Embedding(seq_length, input_dim)
        self.seq_length = seq_length
        self.hidden_dim = hidden_dim
        self.device = device

        # Input modulation gate
        self.W_gx = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_gh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_g = nn.Parameter(torch.Tensor(hidden_dim))

        # Input gate
        self.W_ix = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_ih = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_i = nn.Parameter(torch.Tensor(hidden_dim))

        # Forget gate
        self.W_fx = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_fh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_f = nn.Parameter(torch.Tensor(hidden_dim))

        # Output gate
        self.W_ox = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_oh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_o = nn.Parameter(torch.Tensor(hidden_dim))

        # Output calculation
        self.W_ph = nn.Parameter(torch.Tensor(hidden_dim, num_classes))
        self.b_p = nn.Parameter(torch.Tensor(num_classes))

        self.kaiming_init()
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def kaiming_init(self):
        for p in self.parameters():
            if len(p.size()) == 2:
                p.data.normal_(0, 1 / math.sqrt(self.hidden_dim))
            else:
                p.data.fill_(0)

    def forward(self, x):

        h_t = torch.zeros(self.hidden_dim).to(self.device)
        c_t = torch.zeros(self.hidden_dim).to(self.device)
        print
        x = self.embeddings(x.long())

        for time in range(self.seq_length - 1):
            x_t = x[:, time, :]
            # Input modulation gate
            g_t = torch.tanh(x_t @ self.W_gx + h_t @ self.W_gh + self.b_g)
            # Input gate
            i_t = torch.sigmoid(x_t @ self.W_ix + h_t @ self.W_ih + self.b_i)
            # Forget gate
            f_t = torch.sigmoid(x_t @ self.W_fx + h_t @ self.W_fh + self.b_f)
            # Output gate
            o_t = torch.sigmoid(x_t @ self.W_ox + h_t @ self.W_oh + self.b_o)
            # Cell state
            c_t = g_t * i_t + c_t * f_t
            h_t = torch.tanh(c_t) * o_t

        out = self.logSoftmax((h_t @ self.W_ph + self.b_p))
        return out

In [6]:
lstm = LSTM(seq_length=seq_length, input_dim=1, hidden_dim = 128, num_classes= seq_length,
            batch_size = 128, device = device)

In [7]:
embeddings = nn.Embedding(11, 3)
embeddings(x.long()).shape

torch.Size([8, 10, 3])

In [8]:
lstm(x).shape

torch.Size([8, 11])

## Bi-directional LSTM

In [9]:
class biLSTM(nn.Module):

    def __init__(self, seq_length, input_dim, hidden_dim, num_classes,
                 batch_size, device):
        super(biLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.device = device

        self.lstm_cell = LSTMCell(seq_length, input_dim, hidden_dim, num_classes,
                                  batch_size, device)

        # Output calculation
        self.W_ph = nn.Parameter(torch.Tensor(2 * hidden_dim, num_classes))
        self.b_p = nn.Parameter(torch.Tensor(num_classes))

        self.kaiming_init()
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def kaiming_init(self):
        for p in self.parameters():
            if len(p.size()) == 2:
                p.data.normal_(0, 1 / math.sqrt(2*self.hidden_dim)) # 2 beceause W_ph is a combination of forward and backward pass
            else:
                p.data.fill_(0)

    def forward(self, x):
        x_forward = x
        x_backward = torch.flip(x, dims=[1])

        # Forward step
        c_forward = torch.zeros(self.hidden_dim).to(self.device)
        h_forward = torch.zeros(self.hidden_dim).to(self.device)
        c_T, h_T = self.lstm_cell(x_forward, c_forward, h_forward)

        # Backward step
        c_0, h_0 = self.lstm_cell(x_backward, c_T, h_T)

        H = torch.cat((h_T, h_0), -1)

        out = self.logSoftmax((H @ self.W_ph + self.b_p))
        # print(out.shape)
        return out


class LSTMCell(nn.Module):

    def __init__(self, seq_length, input_dim, hidden_dim, num_classes,
                 batch_size, device):

        super(LSTMCell, self).__init__()

        self.embeddings = nn.Embedding(seq_length, input_dim)
        self.seq_length = seq_length
        self.hidden_dim = hidden_dim
        self.device = device

        # Input modulation gate
        self.W_gx = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_gh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_g = nn.Parameter(torch.Tensor(hidden_dim))

        # Input gate
        self.W_ix = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_ih = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_i = nn.Parameter(torch.Tensor(hidden_dim))

        # Forget gate
        self.W_fx = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_fh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_f = nn.Parameter(torch.Tensor(hidden_dim))

        # Output gate
        self.W_ox = nn.Parameter(torch.Tensor(input_dim, hidden_dim))
        self.W_oh = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        self.b_o = nn.Parameter(torch.Tensor(hidden_dim))

        self.kaiming_init()
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def kaiming_init(self):
        for p in self.parameters():
            if len(p.size()) == 2:
                p.data.normal_(0, 1 / math.sqrt(self.hidden_dim))
            else:
                p.data.fill_(0)

    def forward(self, x, c, h):

        h_t = h
        c_t = c
        x = self.embeddings(x.long())

        for time in range(self.seq_length - 1):
            x_t = x[:, time, :]
            # Input modulation gate
            g_t = torch.tanh(x_t @ self.W_gx + h_t @ self.W_gh + self.b_g)
            # Input gate
            i_t = torch.sigmoid(x_t @ self.W_ix + h_t @ self.W_ih + self.b_i)
            # Forget gate
            f_t = torch.sigmoid(x_t @ self.W_fx + h_t @ self.W_fh + self.b_f)
            # Output gate
            o_t = torch.sigmoid(x_t @ self.W_ox + h_t @ self.W_oh + self.b_o)
            # Cell state
            c_t = g_t * i_t + c_t * f_t
            h_t = torch.tanh(c_t) * o_t

        return c_t, h_t

In [10]:
biLstm = biLSTM(seq_length=seq_length, input_dim=1, hidden_dim = 128, num_classes= seq_length,
            batch_size = 128, device = device)

In [11]:
biLstm(x)

tensor([[-2.3980, -2.3980, -2.3978, -2.3973, -2.3980, -2.3978, -2.3980, -2.3975,
         -2.3983, -2.3975, -2.3986],
        [-2.3979, -2.3978, -2.3979, -2.3979, -2.3982, -2.3978, -2.3978, -2.3977,
         -2.3983, -2.3976, -2.3977],
        [-2.3983, -2.3975, -2.3971, -2.3964, -2.3977, -2.3978, -2.3978, -2.3984,
         -2.3959, -2.3994, -2.4006],
        [-2.3981, -2.3979, -2.3980, -2.3967, -2.3973, -2.3979, -2.3980, -2.3978,
         -2.3981, -2.3975, -2.3993],
        [-2.3977, -2.3986, -2.3988, -2.3981, -2.3979, -2.3978, -2.3983, -2.3968,
         -2.4002, -2.3959, -2.3968],
        [-2.3981, -2.3984, -2.3976, -2.3965, -2.3981, -2.3975, -2.3983, -2.3970,
         -2.3977, -2.3977, -2.3998],
        [-2.3982, -2.3977, -2.3976, -2.3970, -2.3979, -2.3978, -2.3979, -2.3979,
         -2.3974, -2.3982, -2.3992],
        [-2.3980, -2.3978, -2.3979, -2.3979, -2.3979, -2.3979, -2.3978, -2.3979,
         -2.3984, -2.3976, -2.3978]], grad_fn=<LogSoftmaxBackward>)

In [65]:
torch.flip(x, dims = [1])

tensor([[ 4.,  1., 10.,  0.,  7.,  6.,  2.,  3.,  8.,  5.],
        [ 7.,  4.,  6.,  5.,  9.,  8., 10.,  2.,  0.,  3.],
        [ 3.,  0.,  5.,  1.,  9.,  7.,  2., 10.,  6.,  8.],
        [ 7.,  8.,  5.,  9.,  0.,  4.,  1.,  3.,  2.,  6.],
        [ 6.,  8.,  3.,  0.,  5., 10.,  7.,  2.,  4.,  1.],
        [ 6.,  9., 10.,  1.,  0.,  8.,  3.,  5.,  2.,  7.],
        [ 1.,  2.,  5.,  0.,  8.,  7.,  6.,  4.,  9., 10.],
        [ 9.,  7.,  5., 10.,  1.,  0.,  3.,  6.,  8.,  2.]])

In [23]:
embeddings = nn.Embedding(10, 3)
embeddings(torch.tensor(0))

tensor([ 0.8575, -0.5294, -1.3109], grad_fn=<EmbeddingBackward>)

In [98]:
embeddings(torch.tensor(range(0,10)))

tensor([[-0.0090, -0.6284, -1.3574],
        [-1.7949,  0.1815, -0.4776],
        [-0.5935,  0.6689,  0.8285],
        [-1.5750, -0.9543, -0.8172],
        [ 0.6617,  0.8185,  1.6829],
        [-0.5433,  1.3335, -0.2157],
        [-2.3346,  1.6737, -0.9637],
        [-0.5777, -0.3100,  0.7148],
        [-0.6781,  0.0526,  1.1361],
        [ 0.1203,  0.2913, -0.3643]], grad_fn=<EmbeddingBackward>)

In [99]:
embeddings(x.long()).shape

torch.Size([8, 5, 3])

In [100]:
embeddings(x.long())[:, 0, :].shape

torch.Size([8, 3])

In [134]:
torch.tanh(embeddings(x.long())[:, 0, :]@torch.Tensor(3,128)  + torch.zeros(128)@torch.Tensor(128, 128))

tensor([[ 0.0000e+00,  0.0000e+00,  2.0739e-43,  ...,  0.0000e+00,
          3.7975e-42,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00, -4.9466e-43,  ...,  0.0000e+00,
         -9.0398e-42,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00, -1.8637e-43,  ...,  0.0000e+00,
         -3.4066e-42,  0.0000e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00, -2.8026e-45,  ...,  0.0000e+00,
         -5.1848e-44,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00, -5.6332e-43,  ...,  0.0000e+00,
         -1.0302e-41,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00, -4.9466e-43,  ...,  0.0000e+00,
         -9.0398e-42,  0.0000e+00]], grad_fn=<TanhBackward>)

In [132]:
torch.zeros(128)@torch.Tensor(128, 128)

torch.Size([128])

In [105]:
(torch.Tensor(128, 3)@embeddings(x.long())[:, 0, :].T) .shape

torch.Size([128, 8])

In [107]:
bs, seq_len, feat_sz, hidden_sz = 5, 10, 32, 16
arr = torch.randn(bs, seq_len, feat_sz)

bs, seq_sz, _ = arr.size()
for t in range(seq_sz): # iterate over the time steps
    x_t = arr[:, t, :]
    print(x_t.shape)
    break

torch.Size([5, 32])


In [111]:
from torch.nn import Parameter

In [121]:
class NaiveLSTM(nn.Module):
    def __init__(self, input_sz: int, hidden_sz: int):
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz
        # input gate
        self.W_ii = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hi = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_i = Parameter(torch.Tensor(hidden_sz))
        # forget gate
        self.W_if = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hf = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_f = Parameter(torch.Tensor(hidden_sz))
        # ???
        self.W_ig = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hg = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_g = Parameter(torch.Tensor(hidden_sz))
        # output gate
        self.W_io = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_ho = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_o = Parameter(torch.Tensor(hidden_sz))
        
        self.init_weights()
    
    def init_weights(self):
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)
            else:
                nn.init.zeros_(p.data)
        
    def forward(self, x: torch.Tensor, 
                init_states = None) :
        """Assumes x is of shape (batch, sequence, feature)"""
        bs, seq_sz, _ = x.size()
        hidden_seq = []
        if init_states is None:
            h_t, c_t = torch.zeros(self.hidden_size).to(x.device), torch.zeros(self.hidden_size).to(x.device)
        else:
            h_t, c_t = init_states
        for t in range(seq_sz): # iterate over the time steps
            x_t = x[:, t, :]
            i_t = torch.sigmoid(x_t @ self.W_ii + h_t @ self.W_hi + self.b_i)
            f_t = torch.sigmoid(x_t @ self.W_if + h_t @ self.W_hf + self.b_f)
            g_t = torch.tanh(x_t @ self.W_ig + h_t @ self.W_hg + self.b_g)
            #print((x_t @ self.W_ig).shape)
            print()
            print((h_t @ self.W_hg).shape)
            o_t = torch.sigmoid(x_t @ self.W_io + h_t @ self.W_ho + self.b_o)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t.unsqueeze(Dim.batch))
        hidden_seq = torch.cat(hidden_seq, dim=Dim.batch)
        # reshape from shape (sequence, batch, feature) to (batch, sequence, feature)
        hidden_seq = hidden_seq.transpose(Dim.batch, Dim.seq).contiguous()
        return hidden_seq, (h_t, c_t)
    
bs, seq_len, feat_sz, hidden_sz = 5, 10, 32, 16
arr = torch.randn(bs, seq_len, feat_sz)
lstm = NaiveLSTM(feat_sz, hidden_sz)

In [122]:
hs, (hn, cn) = lstm(arr)

torch.Size([16])


NameError: name 'Dim' is not defined

In [69]:
torch.LongTensor([x[:,0], x[:,0] 

TypeError: only integer tensors of a single element can be converted to an index

In [70]:
x[:, 0].item()

ValueError: only one element tensors can be converted to Python scalars

In [63]:
x[:,0]
embeddings(torch.tensor(list(x[:,0])))

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [50]:
embedding = nn.Embedding(10, 3)
# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
embedding(input)

tensor([[[-1.0236,  0.0157, -0.2074],
         [-0.5139,  1.2556,  0.0493],
         [-0.5753,  0.3371, -0.4927],
         [-0.9977, -0.0603, -0.6731]],

        [[-0.5753,  0.3371, -0.4927],
         [ 1.7524, -1.6599,  1.2259],
         [-0.5139,  1.2556,  0.0493],
         [-0.8980,  0.1093,  1.0010]]], grad_fn=<EmbeddingBackward>)

In [54]:
torch.LongTensor(list(x[:,0]))

TypeError: only integer tensors of a single element can be converted to an index

In [54]:
batch_size, seq_length = x.size()

In [61]:
nn.Embedding(num_embeddings=10, embedding_dim=1)(torch.tensor(0))

tensor([-0.2372], grad_fn=<EmbeddingBackward>)

In [67]:
x.shape

torch.Size([8, 9])

In [32]:


dataset1 = datasets.RandomCombinationsDataset(10)
dataset2 = datasets.BinaryPalindromeDataset(10)
dataset3 = datasets.BaumSweetSequenceDataset(10)

data_loader1 = DataLoader(dataset1, 8, num_workers=1, drop_last=True)
data_loader2 = DataLoader(dataset2, 8, num_workers=1, drop_last=True)
data_loader3 = DataLoader(dataset3, 8, num_workers=1, drop_last=True)

values1, _ = next(iter(data_loader1))
values2, _ = next(iter(data_loader2))
values3, _ = next(iter(data_loader3))

print("RandomCombinations Shape:", values1.shape)
print("BinaryPalindrome Shape:", values2.shape)
print("BaumSweetSequence Shape:", values3.shape)

RandomCombinations Shape: torch.Size([8, 9])
BinaryPalindrome Shape: torch.Size([8, 41, 1])
BaumSweetSequence Shape: torch.Size([8, 1, 40])


In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7ffb7f95c8b0>

In [52]:
x.shape

torch.Size([8, 9])

In [47]:
bs, seq_len, feat_sz, hidden_sz = 5, 10, 32, 16
arr = torch.randn(bs, seq_len, feat_sz)

In [49]:
arr.shape

torch.Size([5, 10, 32])