In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import numpy as np


#### Attention+CNN Encoder

In [113]:
class _Encoder_SentAttCNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size , batch_size,
                 dropoutRate = 0.5, window_size = 5, filterNum = 100, max_SentNum = 100, max_WordNum = 100,
                 embeddings = None, verbose = False):
        super(_Encoder_AttCNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = self.hidden_size
        self.output_size = self.output_size
        self.batch_size = batch_size
        self.window_size = window_size
        self.filterNum = filterNum
        self.max_SentNum = max_SentNum
        self.max_WordNum = max_WordNum
        sefl.dropoutRate = dropoutRate
        
        #Embedding Layer define --> threeDEmbedding
        self.embed = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=0)#(9) Word Representation Embedding layer
        if embeddings is not None:
            self.embed.weight = nn.Parameter(embeddings)
        if freeze_embeddings:
            self.embed.weight.requires_grad = False
        self.dropout = nn.Dropout(p = self.dropoutRate)  #Dropout
        
        #Convolutional Layer define --> Word_CNN
        self.Sent_Conv = nn.Conv1d(in_channels= self.embedding_dim, 
                                   out_channels = self.filterNum, 
                                   kernel_size=self.window_size,
                                   stride = 1,
                                   padding = 0, 
                                   bias = True)
        
        #Attention Layer define --> Word_Attn
        self.word_attn_v = F.tanh(nn.Linear(self.filterNum, self.filterNum)) 
        self.word_attn_w = F.softmax(nn.Linear(self.filterNum, 1))
        
        self.init_weights()
                                   
    def _threeDEmbedding_U(self, x):
        #x : (Batchsize, SentNum, seq_len, index(1))
        #Output Size: (Batch, SenNum, seq_len, embdding_dim )
        print("Original Input size: ", x.size())
        ipt_shape = x.size()
        bsz = x.size(0)
        SentNum = x.size(1)
        Seq_len = x.size(2)
        #WordNum = x.size(3)
        x = x.view(bsz, SentNum*Seq_len)  # (BatchSize, SentNum*seq_len , index)
        x = self.embed(self.vocab_size, self.embedding_dim, padding_idx = 0) #(BatchSize, SentNum*seq_len, embedding_dim, )
        x = x.view(bsz, SentNum, Seq_len, -1)  # Embedding_dim representation for each word
        print("Size after Word Embedding :", x.size()) 
        return x
                                   
    def _Word_CNN_U(self, x):
        print("Size after Word Embedding and Word_CNN Input : ", x.size())
        #Word CNN is only on single word inner level, just another representation for each word
        #Output size should be same as the input size, except the feature dimmension for each word
        #Output Size: (BatchSize, SentNum, Seq_len, filterNum)
        #x : (Batchsize, SentNum, Seq_len, Embedding_dim)
        
        x = self.Sent_Conv(x)
        print("Size of Word representation after Convolutional layer: ", x.size())
        return x
    
    
    def _Word_Attn_U(self, x):
        #Word Attention layer use the sum of weighted word representations as the sentense representation
        #Output size should be (BatchSize, SentNum, filterNum)
        #x : (Batch, SentNum, Seq_len, filterNum)
        ipt_shape = x.size()
        bsz = x.size(0)
        SentNum = x.size(1)
        Seq_len = x.size(2)
        f_Num = x.size(3)
        mask = x # (BatchSize, SentNum, Seq_len, filterNum)
        mask = mask.view(bsz, SentNum*Seq_len, -1) # (BatchSize, SentNum*seq_len , filterNum)
        attn_weights = self.word_attn_w(self.word_attn_v(mask)) #(BatchSize, SentNum*seq_len, weightValue(1))
        attn_weights = attn_weights.view(bsz, SentNum, Seq_len, -1)# (BatchSize, SentNum, Seq_len, weightValue(1)))
        output = attn_weights * x #Broadcast product in the last dimension  # (BatchSize, SentNum, Seq_len, FilterNum(weighted values))
        output = output.sum(2) #(13) #(BatchSize, SentNum, FilterNum(sum of weighted values of different z_i))
        return output
            
                                   
    def forward(self, x):
        x_emb = _threeDEmbedding_U(x)
        x_Conv = _Word_CNN_U(x_emb)
        x_attn = _Word_Attn_U(x_Conv)
        return x_attn
    
    def init_weights(self):
        initrange = 1
        lin_layers = [self.Sent_Conv, self.word_attn_v]
        lin_layers_nobias = [self.word_attn_w]
        for layer in lin_layers + lin_layers_nobias:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)
        
        
        
        

#### Attention + LSTM Decoder

In [126]:
class _Decoder_AttLSTM(nn.Module):
    
    def __init__(self, filterNum, hidden_size, output_size, batch_size,
                 dropoutRate = 0.5, max_SentNum = 100, max_WordNum = 100):
        super(_Decoder_AttLSTM, self).__init__()
        self.filterNum = filterNum
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.dropoutRate = dropoutRate
        self.max_SentNum = max_SentNum
        self.max_WordNum = max_WordNum
        
        #LSTM
        self.linear_f = nn.Linear(self.filterNum + self.hidden_size, self.hidden_size)
        self.linear_i = nn.Linear(self.filterNum + self.hidden_size, self.hidden_size)
        self.linear_ctilde = nn.Linear(self.filterNum + self.hidden_size, self.hidden_size)
        self.linear_o = nn.Linear(self.filterNum + self.hidden_size, self.hidden_size)
        
        #BiLSTM
        
        #Attention 
        self.attn_v = F.tanh(nn.Linear(self.hidden_size, self.hidden_size))
        self.attn_w = F.softmax(nn.Linear(self.hidden_size, 1))
        
        #Dropout
        self.dropout = nn.Dropout(p = self.dropoutRate)
        
        
        #Decoder
        self.decoder = nn.Linear(self.hidden_size, self.output_size)
        
        self.init_weights()
        
    
    def forward(self, x, hidden, c):
        hiddens = []
        for i in range(self.max_SentNum):
            hidden, c = self._lstm_step(x[i].squeeze(), hidden, c)
            hiddens.append(hidden)
            hidden_drop = self.dropout(hidden)
            
        output = torch.stack(hiddens)
        text_rep = self._Sent_Attn_U(output)
        
        scaled_score = F.sigmoid(self.decoder(text_rep))
        return scaled_score
        
        
        
    def _lstm_step(self, ipt_s, hid, c_t):
        ipt_s = self.dropout(ipt_s)
        combined = torch.cat((hid, ipt_s), 1)
        f = F.sigmoid(self.linear_f(combined))
        i = F.sigmoid(self.linear_i(combined))
        c_tilde = F.tanh(self.linear_ctilde(combined))
        c_t = f * c_t + i * c_tilde
        o = F.sigmoid(self.linear_o(combined))
        hid = o * F.tanh(c_t)
        return hid, c_t
    
    def _Sent_Attn_U(self, x):
        ipt_shape = x.size()
        bsz = x.size(0)
        SentNum = x.size(1)
        Seq_f = x.size(2)
        mask = x # (BatchSize, SentNum, Seq_f)
        attn_weights = self.attn_w(self.attn_v(mask))# (BatchSize, SentNum, 1)
        output = attn_weights * x # (BatchSize, SentNum, Seq_f)
        output = output.sum(1) # (BatchSize, Seq_f) #Final text representation
        return output
    
    def init_weights(self):
        initrange = 1
        lin_layers = [self.linear_f, self.linear_i, self.linear_ctilde, self.linear_o. self.attn_v]
        lin_layers_nobias = [self.attn_w]
        for layer in lin_layers + lin_layers_nobias:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)
    
    def init_hidden(self,batch_size):
        h0 = Variable(torch.zeros(batch_size, self.hidden_size))
        c0 = Variable(torch.zeros(batch_size, self.hidden_size))
        return h0, c0
        
        

In [122]:
ls = []
a = torch.rand([10,1])
b = torch.rand([10,1])
ls.append(a)
ls.append(b)

In [123]:
ls

[
  0.2535
  0.3492
  0.7258
  0.0170
  0.7659
  0.1883
  0.2387
  0.6542
  0.6160
  0.9839
 [torch.FloatTensor of size 10x1], 
  0.2146
  0.7935
  0.9036
  0.3694
  0.6767
  0.4687
  0.2924
  0.9960
  0.6635
  0.0904
 [torch.FloatTensor of size 10x1]]

In [124]:
torch.stack(ls)


(0 ,.,.) = 
  0.2535
  0.3492
  0.7258
  0.0170
  0.7659
  0.1883
  0.2387
  0.6542
  0.6160
  0.9839

(1 ,.,.) = 
  0.2146
  0.7935
  0.9036
  0.3694
  0.6767
  0.4687
  0.2924
  0.9960
  0.6635
  0.0904
[torch.FloatTensor of size 2x10x1]

In [102]:
x = torch.floor(torch.rand(32,10,10)*3).long()
x


(0 ,.,.) = 
   0   2   0  ...    0   0   0
   0   1   2  ...    1   2   2
   1   1   1  ...    2   2   0
     ...       ⋱       ...    
   1   1   0  ...    0   2   2
   0   0   0  ...    0   2   0
   2   0   2  ...    0   2   1

(1 ,.,.) = 
   1   1   2  ...    0   2   1
   0   2   0  ...    1   0   1
   2   1   2  ...    2   0   1
     ...       ⋱       ...    
   2   0   2  ...    1   0   2
   0   0   2  ...    1   1   1
   2   2   0  ...    1   2   0

(2 ,.,.) = 
   0   0   0  ...    2   0   0
   2   0   2  ...    0   2   0
   2   0   1  ...    0   2   1
     ...       ⋱       ...    
   2   0   1  ...    2   0   1
   1   0   1  ...    2   2   1
   0   1   1  ...    2   2   2
...

(29,.,.) = 
   0   1   1  ...    1   0   0
   0   0   0  ...    1   2   2
   1   2   2  ...    0   1   0
     ...       ⋱       ...    
   0   2   1  ...    1   2   2
   2   0   0  ...    1   0   1
   2   1   1  ...    0   0   2

(30,.,.) = 
   0   2   0  ...    1   2   0
   1   0   0  ...    2   2   0
 

In [95]:
numpy_mtx

array([[          0,           0, 30064771075,  4564281360,           0]])

In [96]:
embed_layer.weight.data


 0.0000e+00  0.0000e+00  3.0065e+10  4.5643e+09  0.0000e+00
[torch.FloatTensor of size 1x5]

In [94]:
embed_layer = nn.Embedding(1,5)
numpy_mtx = np.ndarray(shape = (1,5), dtype = int)
embed_layer.weight.data.copy_(torch.from_numpy(numpy_mtx))
embed_layer.weight.requires_grad = False

input_shape = x.size()
print(x.size())
bs = x.size(0)
seqnum = x.size(1)
seq_len = x.size(2)
#word_len = x.size(3)
#x = x.view(bs,seqnum*seq_len, -1)# (N*seq_len, word_len)
x = x.view(bs, seqnum*seq_len)
print(x.size())
#x = embed_layer(x) # (N*seq_len, word_len, embd_size)
x = x.view(bs, seqnum, seq_len, 1)
print(x.size())
x = x.view(bs,seqnum,seq_len, -1) # (N, seq_len, word_len, embd_size)
print(x.size())
x = x.sum(2) # (N, seq_len, embd_size)
print(x.size())

torch.Size([32, 10, 10])
torch.Size([32, 100])
torch.Size([32, 10, 10, 1])
torch.Size([32, 10, 10, 1])
torch.Size([32, 10, 1])


In [71]:
np.ones(400).dot(numpy_mtx)

array([  6.46103893e+18,   8.99664134e+18,   9.06173763e+18,
         1.15735443e+19,   9.01007606e+18])

In [62]:
np.ndarray(shape = (4000,50), dtype = int)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
x


(0 ,.,.) = 
   4
   4
   6
   9
   4
   4
   6

(1 ,.,.) = 
   4
   9
   3
   3
   6
   7
   3

(2 ,.,.) = 
   6
   6
   5
   6
   5
   5
   6
[torch.LongTensor of size 3x7x1]

In [100]:
F.softmax(torch.rand([2,3]))

Variable containing:
 0.4294  0.2548  0.3158
 0.2820  0.2318  0.4862
[torch.FloatTensor of size 2x3]

In [106]:
a = torch.rand([2,1])
print(a)
b = torch.rand([2,3])
print(b)


 0.7976
 0.6155
[torch.FloatTensor of size 2x1]


 0.7866  0.9444  0.5443
 0.4137  0.2194  0.1141
[torch.FloatTensor of size 2x3]



In [108]:
c = a*b
print(c)


 0.6274  0.7532  0.4341
 0.2546  0.1350  0.0703
[torch.FloatTensor of size 2x3]



In [109]:
c.sum(0)


 0.8820
 0.8882
 0.5043
[torch.FloatTensor of size 3]