In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Let:

$fn$ = Number of features

$hs$ = Number of output nodes (hidden size)

$bs$ = Batch size

Then:
 * Each $W_{something}$ matrix below has the shape $(fn, hs)$;
 * Each $U_{something}$ matrix below has the shape $(hs, hs)$;
 * Each $b_{something}$ matrix below has the shape $(1, hs)$;
 * The $x_t$ matrix below has shape $(bs, fn)$, corresponding to the element at index $t$ of each sequence in the batch; and
 * The $h_t$ matrix below has shape $(bs, hs)$, corresponding to the hidden state at time $t$ of each sequence in the batch.

And:

$f_t = \sigma(x_t \ W_f + h_{t-1} \ U_f + b_f)$

$i_t = \sigma(x_t \ W_i + h_{t-1} \ U_i + b_i)$

$o_t = \sigma(x_t \ W_o + h_{t-1} \ U_o + b_o)$

$g_t = \tanh \ (x_t \ W_g + h_{t-1} \ U_g + b_g)$ a.k.a. $\tilde{c}_t$

$c_t = f_t \circ c_{t-1} + i_t \circ g_t$

$h_t = o_t \circ \tanh \ (c_t)$

In [255]:
import math
class LSTM(nn.Module):
    """Single-layer LSTM unrolled in Python over a batch-first sequence.

    For every time step ``t`` the layer computes

        f_t = sigmoid(x_t @ W_f + h_{t-1} @ U_f + b_f)
        i_t = sigmoid(x_t @ W_i + h_{t-1} @ U_i + b_i)
        o_t = sigmoid(x_t @ W_o + h_{t-1} @ U_o + b_o)
        g_t = tanh   (x_t @ W_c + h_{t-1} @ U_c + b_c)
        c_t = f_t * c_{t-1} + i_t * g_t
        h_t = o_t * tanh(c_t)

    Args:
        input_size: number of features of each sequence element.
        hidden_size: number of hidden units (size of ``h_t`` and ``c_t``).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Registration order is kept (f, i, o, c) so that init_weights draws
        # random numbers in the same order as before for a given seed.
        # Forget gate.
        self.w_f = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_f = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.empty(1, hidden_size))
        # Input gate.
        self.w_i = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_i = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.empty(1, hidden_size))
        # Output gate.
        self.w_o = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_o = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.empty(1, hidden_size))
        # Candidate cell state (g_t).
        self.w_c = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_c = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_c = nn.Parameter(torch.empty(1, hidden_size))
        self.init_weights()

    def init_weights(self):
        """Fill every parameter with U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdv, stdv)

    def forward(self, x, init_states=None):
        """Run the LSTM over a whole batch of sequences.

        Args:
            x: tensor of shape ``(batch, seq, input_size)``.
            init_states: optional ``(h_0, c_0)`` pair, each of shape
                ``(batch, hidden_size)``. Zeros are used when ``None``.

        Returns:
            ``(hidden_seq, (h_t, c_t))`` where ``hidden_seq`` has shape
            ``(batch, seq, hidden_size)`` and ``h_t`` / ``c_t`` are the states
            after the last time step.
        """
        bs, seq, _ = x.size()
        if init_states is None:
            # Create the initial state next to the input so the layer also
            # works for non-default dtypes and non-CPU devices.
            h_t = torch.zeros(bs, self.hidden_size, dtype=x.dtype, device=x.device)
            c_t = torch.zeros_like(h_t)
        else:
            h_t, c_t = init_states

        hidden_seq = []
        for t in range(seq):
            x_t = x[:, t, :]
            f_t = torch.sigmoid(x_t @ self.w_f + h_t @ self.u_f + self.b_f)
            i_t = torch.sigmoid(x_t @ self.w_i + h_t @ self.u_i + self.b_i)
            o_t = torch.sigmoid(x_t @ self.w_o + h_t @ self.u_o + self.b_o)
            g_t = torch.tanh(x_t @ self.w_c + h_t @ self.u_c + self.b_c)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t)
        # Stack along dim 1 to obtain the batch-first (batch, seq, hidden) layout.
        hidden_seq = torch.stack(hidden_seq, dim=1)
        return hidden_seq, (h_t, c_t)
        

In [246]:
class RNN(nn.Module):
    """Single-layer tanh recurrent layer unrolled over a batch-first sequence.

    For every time step: ``h_t = tanh(x_t @ W_ih + h_{t-1} @ W_hh + b_h)``.

    Args:
        input_size: number of features of each sequence element.
        hidden_size: number of hidden units.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.w_ih = nn.Parameter(torch.empty(input_size, hidden_size))
        self.w_hh = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_h = nn.Parameter(torch.empty(1, hidden_size))
        self.init_weights()

    def init_weights(self):
        """Fill every parameter with U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdev = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdev, stdev)

    def forward(self, x, init_states=None):
        """Run the recurrent layer over a whole batch of sequences.

        Args:
            x: tensor of shape ``(batch, seq, input_size)``.
            init_states: optional initial hidden state of shape
                ``(batch, hidden_size)``. Zeros are used when ``None``.

        Returns:
            ``(hidden_seqs, h_t)`` where ``hidden_seqs`` has shape
            ``(batch, seq, hidden_size)`` and ``h_t`` is the last hidden state.
        """
        batch_size, seq, _ = x.size()
        # `init_states` is a tensor here, so it must be compared with None:
        # truth-testing a multi-element tensor raises a RuntimeError.
        if init_states is None:
            h_t = torch.zeros(batch_size, self.hidden_size, dtype=x.dtype, device=x.device)
        else:
            h_t = init_states
        hidden_seqs = []
        for t in range(seq):
            x_t = x[:, t, :]
            h_t = torch.tanh(x_t @ self.w_ih + h_t @ self.w_hh + self.b_h)
            hidden_seqs.append(h_t)
        # Stack along dim 1 to obtain the batch-first (batch, seq, hidden) layout.
        hidden_seqs = torch.stack(hidden_seqs, dim=1)
        return hidden_seqs, h_t

In [247]:
# LSTM layer taking 30 input features and producing 5 hidden units.
lstm = LSTM(input_size=30, hidden_size=5)

In [248]:
# Plain recurrent layer taking 30 input features and producing 5 hidden units.
rnn = RNN(input_size=30, hidden_size=5)

In [249]:
# Dummy batch of ones: 1 sequence, 5 time steps, 30 features per step.
inp = torch.ones(1, 5, 30)
# Displays (hidden_seq, (h_t, c_t)) as the cell output.
lstm(inp)

(tensor([[[ 0.0383, -0.0926,  0.5642,  0.0537, -0.0438],
          [ 0.0467, -0.0919,  0.7110,  0.0895, -0.0592],
          [ 0.0499, -0.0883,  0.7483,  0.0999, -0.0666],
          [ 0.0510, -0.0867,  0.7611,  0.1023, -0.0699],
          [ 0.0514, -0.0861,  0.7666,  0.1030, -0.0714]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[ 0.0514, -0.0861,  0.7666,  0.1030, -0.0714]], grad_fn=<MulBackward0>),
  tensor([[ 0.2105, -0.4219,  2.2219,  0.5070, -0.4391]], grad_fn=<AddBackward0>)))

In [250]:
# Dummy batch of ones: 1 sequence, 5 time steps, 30 features per step.
inp = torch.ones(1, 5, 30)
# Displays (hidden_seqs, h_t) as the cell output.
rnn(inp)

(tensor([[[ 0.7895, -0.8852, -0.9403, -0.9718,  0.3570],
          [ 0.9516, -0.8796, -0.9175, -0.9619, -0.0694],
          [ 0.9428, -0.9149, -0.8845, -0.9634, -0.0624],
          [ 0.9449, -0.9145, -0.8886, -0.9648, -0.0710],
          [ 0.9446, -0.9151, -0.8877, -0.9648, -0.0699]]],
        grad_fn=<TransposeBackward0>),
 tensor([[ 0.9446, -0.9151, -0.8877, -0.9648, -0.0699]], grad_fn=<TanhBackward>))

In [251]:
class Net(nn.Module):
    """Sequence classifier: recurrent layer followed by a linear read-out.

    Args:
        input_size: number of features per time step (default 30).
        hidden_size: number of hidden units of the recurrent layer (default 5).
        num_classes: number of output logits (default 3).
    """

    def __init__(self, input_size=30, hidden_size=5, num_classes=3):
        super().__init__()
        # NOTE: the attribute is called `lstm_layer` but holds the `RNN` layer;
        # the name is kept so existing references and state_dict keys still work.
        self.lstm_layer = RNN(input_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits computed from the hidden state of the last time step.

        Args:
            x: batch-first sequence tensor passed straight to the recurrent layer.
        """
        hidden_seq, _ = self.lstm_layer(x)
        # Only the last time step feeds the classifier head.
        return self.fc1(hidden_seq[:, -1, :])

In [252]:
# Build the sequence classifier (recurrent layer + linear head) with fresh weights.
classifier = Net()

In [253]:
# Smoke test: run the untrained classifier on `inp` (defined in an earlier cell)
# and display the resulting logits.
classifier(inp)

tensor([[ 0.4908,  0.4611, -1.1691]], grad_fn=<AddmmBackward>)

In [254]:
device = "cpu"
classifier = Net().to(device)
# Original author's note on the learning rate: 0.002 gives 85% acc.
optimizer = optim.Adam(classifier.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()

category = [0, 1, 2]
# Each sample is (sequence of feature indices, class label).
data = [
    ([1, 2, 3, 4, 5], 0),
    ([2, 5, 11, 13, 15], 1),
    ([12, 15, 22, 24, 25], 2),
    ([5, 6, 3, 9, 11], 1),
    ([23, 15, 26, 27], 2),
]

test_data = [
    ([1, 5, 7, 8, 3], 0),
    ([11, 23, 14, 16, 17], 1),
    ([23, 25, 12, 28, 29], 2),
]
num_epochs = 100


def _encode_sample(indices, label):
    """One-hot encode a sample as a (1, 5, 30) input and a (1,) long target.

    Sequences shorter than 5 leave the remaining time steps as all-zero rows.
    """
    one_hot = torch.zeros((1, 5, 30))
    for step, index in enumerate(indices):
        one_hot[0, step, index] = 1
    target = torch.tensor([label], dtype=torch.long)
    return one_hot.to(device), target.to(device)


for epoch in range(num_epochs):
    # One optimizer step per training sample (batch size 1).
    for sequence, label in data:
        inputs, targets = _encode_sample(sequence, label)
        optimizer.zero_grad()
        loss = criterion(classifier(inputs), targets)
        print(loss.item())
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out samples after every epoch.
    acc = 0
    with torch.no_grad():
        for sequence, label in test_data:
            inputs, targets = _encode_sample(sequence, label)
            predicted = classifier(inputs).argmax(dim=1)
            acc += (predicted == targets).float().sum().cpu().item()
    acc /= len(test_data)
    print("test accuracy:",acc)

0.9302617907524109
0.988351047039032
1.1443642377853394
0.9165745973587036
1.0967761278152466
test accuracy: 0.3333333333333333
0.8754047751426697
0.9321969151496887
1.119646430015564
0.8730838298797607
1.079025149345398
test accuracy: 0.6666666666666666
0.8516170978546143
0.8815810680389404
1.0946593284606934
0.8311192393302917
1.0590115785598755
test accuracy: 0.6666666666666666
0.8286897540092468
0.8322532176971436
1.0703446865081787
0.7902989983558655
1.0384382009506226
test accuracy: 0.6666666666666666
0.8037112951278687
0.7837380170822144
1.0462753772735596
0.7504451274871826
1.0172709226608276
test accuracy: 0.6666666666666666
0.775713324546814
0.7357447147369385
1.0217498540878296
0.7113040685653687
0.9950616955757141
test accuracy: 0.6666666666666666
0.7443580627441406
0.6880583167076111
0.9960266351699829
0.6726559400558472
0.9712073802947998
test accuracy: 0.6666666666666666
0.709659218788147
0.6405701637268066
0.9682766795158386
0.6343910694122314
0.9449901580810547
test ac

In [335]:
class LSTMCell(nn.Module):
    """One LSTM time step (the body of the loop used by a full LSTM layer).

    Given ``x_t`` and ``(h_{t-1}, c_{t-1})`` it computes

        f_t = sigmoid(x_t @ W_f + h_{t-1} @ U_f + b_f)
        i_t = sigmoid(x_t @ W_i + h_{t-1} @ U_i + b_i)
        o_t = sigmoid(x_t @ W_o + h_{t-1} @ U_o + b_o)
        g_t = tanh   (x_t @ W_c + h_{t-1} @ U_c + b_c)
        c_t = f_t * c_{t-1} + i_t * g_t
        h_t = o_t * tanh(c_t)

    Args:
        input_size: number of input features.
        hidden_size: number of hidden units.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Registration order is kept (f, i, o, c) so that init_weights draws
        # random numbers in the same order as before for a given seed.
        self.w_f = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_f = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.empty(1, hidden_size))
        self.w_i = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_i = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.empty(1, hidden_size))
        self.w_o = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_o = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.empty(1, hidden_size))
        self.w_c = nn.Parameter(torch.empty(input_size, hidden_size))
        self.u_c = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_c = nn.Parameter(torch.empty(1, hidden_size))
        self.init_weights()

    def init_weights(self):
        """Fill every parameter with U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdv, stdv)

    def forward(self, x_t, h_t_c_t):
        """Advance the cell by one step.

        Args:
            x_t: input of shape ``(batch, input_size)``.
            h_t_c_t: ``(h, c)`` pair from the previous step, each of shape
                ``(batch, hidden_size)``.

        Returns:
            The new ``(h_t, c_t)`` pair.
        """
        h_t, c_t = h_t_c_t
        f_t = torch.sigmoid(x_t @ self.w_f + h_t @ self.u_f + self.b_f)
        i_t = torch.sigmoid(x_t @ self.w_i + h_t @ self.u_i + self.b_i)
        o_t = torch.sigmoid(x_t @ self.w_o + h_t @ self.u_o + self.b_o)
        g_t = torch.tanh(x_t @ self.w_c + h_t @ self.u_c + self.b_c)
        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * torch.tanh(c_t)
        return (h_t, c_t)

    def init_hidden_cell_state(self, batch_size=1):
        """Return zero ``(h_0, c_0)`` tensors of shape ``(batch_size, hidden_size)``.

        The tensors use the dtype and device of the cell's parameters so they
        can be fed to ``forward`` after ``.double()`` / ``.to(device)``.
        """
        h_0 = torch.zeros(
            batch_size, self.hidden_size, dtype=self.w_f.dtype, device=self.w_f.device
        )
        return h_0, torch.zeros_like(h_0)
        

In [336]:
# Smoke test of the hand-written cell: one step on an all-ones input.
lstmcell = LSTMCell(10, 5)
inp = torch.ones(1, 10)
h_t, c_t = lstmcell.init_hidden_cell_state()
# Call the module itself rather than `.forward` so registered hooks run.
h_t, c_t = lstmcell(inp, (h_t, c_t))

In [339]:
class LSTMNet(nn.Module):
    """Step-wise model: ``nn.LSTMCell`` followed by a linear output layer.

    Args:
        input_size: number of input features per step.
        hidden_size: number of hidden units of the LSTM cell.
        output_size: number of outputs produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.LSTMCell(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x_t, h_t, c_t):
        """Advance one time step.

        Returns:
            ``(o_t, h_t, c_t)``: the linear layer's output for this step and the
            updated hidden and cell states.
        """
        h_t, c_t = self.rnn(x_t, (h_t, c_t))
        o_t = self.fc(h_t)
        return o_t, h_t, c_t

    def init_hidden_cell_state(self, batch_size=1):
        """Return zero ``(h_0, c_0)`` tensors of shape ``(batch_size, hidden_size)``.

        The tensors use the dtype and device of the model's parameters so they
        stay compatible after ``.double()`` / ``.to(device)``.
        """
        ref = self.fc.weight
        h_0 = torch.zeros(batch_size, self.hidden_size, dtype=ref.dtype, device=ref.device)
        return h_0, torch.zeros_like(h_0)


In [340]:
# Smoke test of LSTMNet: a single step on an all-ones input of 10 features.
rnn = LSTMNet(input_size=10, hidden_size=5, output_size=2)
inp = torch.ones(1, 10)
h_t, c_t = rnn.init_hidden_cell_state()
o_t, h_t, c_t = rnn(inp, h_t, c_t)

In [341]:
class RNNCell(nn.Module):
    """One step of a tanh recurrent layer.

    Computes ``h_t = tanh(x_t @ W_ih + h_{t-1} @ W_hh + b_h)``.

    Args:
        input_size: number of input features.
        hidden_size: number of hidden units.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.w_ih = nn.Parameter(torch.empty(input_size, hidden_size))
        self.w_hh = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_h = nn.Parameter(torch.empty(1, hidden_size))
        self.init_weights()

    def init_weights(self):
        """Fill every parameter with U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdev = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdev, stdev)

    def init_hidden_state(self, batch_size=1):
        """Return a zero hidden state of shape ``(batch_size, hidden_size)``.

        The tensor uses the dtype and device of the cell's parameters so it can
        be fed to ``forward`` after ``.double()`` / ``.to(device)``.
        """
        return torch.zeros(
            batch_size, self.hidden_size, dtype=self.w_ih.dtype, device=self.w_ih.device
        )

    def forward(self, x_t, h_t):
        """Return the next hidden state given input ``x_t`` and previous state ``h_t``."""
        h_t = torch.tanh(x_t @ self.w_ih + h_t @ self.w_hh + self.b_h)
        return h_t

In [342]:
# Smoke test of the hand-written cell: one step on an all-ones input.
rnncell = RNNCell(10, 5)
inp = torch.ones(1, 10)
hidden_state = rnncell.init_hidden_state()
# Call the module itself rather than `.forward` so registered hooks run.
hidden_state = rnncell(inp, hidden_state)

In [343]:
class RNNNet(nn.Module):
    """Step-wise model: ``nn.RNNCell`` followed by a linear output layer.

    Args:
        input_size: number of input features per step.
        hidden_size: number of hidden units of the recurrent cell.
        output_size: number of outputs produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNNCell(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x_t, h_t):
        """Advance one time step.

        Returns:
            ``(o_t, h_t)``: the linear layer's output for this step and the
            updated hidden state.
        """
        h_t = self.rnn(x_t, h_t)
        o_t = self.fc(h_t)
        return o_t, h_t

    def init_hidden_state(self, batch_size=1):
        """Return a zero hidden state of shape ``(batch_size, hidden_size)``.

        The tensor uses the dtype and device of the model's parameters so it
        stays compatible after ``.double()`` / ``.to(device)``.
        """
        ref = self.fc.weight
        return torch.zeros(batch_size, self.hidden_size, dtype=ref.dtype, device=ref.device)

In [344]:
# Smoke test of RNNNet: a single step on an all-ones input of 10 features.
rnn = RNNNet(input_size=10, hidden_size=5, output_size=2)
inp = torch.ones(1, 10)
h_t = rnn.init_hidden_state()
o_t, h_t = rnn(inp, h_t)

In [345]:
def get_input_tensor(x, num_classes=30):
    """Return a one-hot row vector of shape ``(1, num_classes)`` with a 1 at index ``x``.

    Args:
        x: index of the active position.
        num_classes: width of the one-hot vector (defaults to 30, the value
            previously hard-coded).

    Raises:
        IndexError: if ``x`` is out of range for ``num_classes``.
    """
    tensor = torch.zeros(1, num_classes)
    tensor[0, x] = 1
    return tensor

In [347]:
# Train LSTMNet to predict the next token index of short integer sequences,
# using plain SGD applied by hand.
rnn = LSTMNet(30, 5, 30)
learning_rate = 0.05
criterion = nn.CrossEntropyLoss()
# Each entry is [input token indices, target token indices].
inp_list = [[(1,2,3),(2,3,4)],[(2,3,4),(3,4,5)],[(5,6,7),(6,7,8)]]
for epoch in range(100):
    for inp, out in inp_list:
        # Fresh hidden/cell state and gradients for every sequence.
        h_t, c_t = rnn.init_hidden_cell_state()
        rnn.zero_grad()
        total_loss = 0
        # Distinct names for the inner loop so the epoch counter is not shadowed.
        for token, target in zip(inp, out):
            inp_ten = get_input_tensor(token)
            o_t, h_t, c_t = rnn(inp_ten, h_t, c_t)
            category_tensor = torch.tensor([target], dtype=torch.long)
            total_loss += criterion(o_t, category_tensor)
        # Mean loss per step (sequence length instead of a hard-coded 3).
        print(total_loss.item()/len(inp))
        total_loss.backward()
        # Manual SGD step; no_grad keeps the update out of the autograd graph.
        with torch.no_grad():
            for p in rnn.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-learning_rate)


3.278469721476237
3.143273035685221
3.224144617716471
3.207681973775228
3.072141965230306
3.1923481623331704
3.1395816802978516
3.004084269205729
3.1620969772338867
3.074049631754557
2.9389527638753257
3.1332858403523765
3.0109875996907554
2.8766180674235025
3.1058197021484375
2.9503164291381836
2.816965103149414
3.0796111424764
2.8919690450032554
2.7598915100097656
3.054576555887858
2.8358866373697915
2.705301602681478
3.0306371053059897
2.7820138931274414
2.6531054178873696
3.007713953653971
2.7302964528401694
2.6032158533732095
2.9857301712036133
2.6806774139404297
2.5555454889933267
2.9646072387695312
2.63309637705485
2.5100056330362954
2.9442675908406577
2.5874862670898438
2.4665053685506186
2.9246317545572915
2.5437746047973633
2.4249507586161294
2.9056215286254883
2.5018819173177085
2.3852438926696777
2.887157440185547
2.461721738179525
2.347283363342285
2.869159698486328
2.423201243082682
2.3109636306762695
2.8515491485595703
2.386221249898275
2.2761764526367188
2.8342485427856