In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchinfo import summary

In [2]:
class viterbinet(nn.Module):
    """LSTM-based per-sample classifier: LSTM -> tanh -> Linear -> ReLU -> Linear -> softmax.

    Args:
        input_size: number of features per LSTM time step.
        hidden_unit_size: LSTM hidden size; the first dense layer halves it.
        num_layers: number of stacked LSTM layers.
        class_size: number of output classes.
    """

    def __init__(self, input_size, hidden_unit_size, num_layers, class_size):
        super(viterbinet, self).__init__()

        self.num_layers = num_layers
        self.hidden_unit_size = hidden_unit_size

        # With the notebook's configuration (1, 100, 1, 16) the layer sizes are
        # 1 x 100 (LSTM), 100 x 50 (fc1), 50 x 16 (fc2).
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_unit_size,
                            num_layers=num_layers, batch_first=True)

        self.fc1 = nn.Linear(in_features=hidden_unit_size, out_features=hidden_unit_size // 2)

        self.fc2 = nn.Linear(in_features=hidden_unit_size // 2, out_features=class_size)

    def forward(self, x):
        """Classify every sample of ``x`` independently.

        Args:
            x: tensor whose element count is a multiple of the LSTM input size.
               It is reshaped to ``(n, 1, input_size)``, i.e. ``n`` sequences of
               length one, so any layout holding the same elements is accepted.

        Returns:
            Tensor of shape ``(n, class_size)`` whose rows are softmax
            probabilities.
        """
        # Derive the batch size from the data instead of hard-coding 5000.
        x = x.reshape(-1, 1, self.lstm.input_size)

        # Omitting the initial states makes nn.LSTM use zeros of the right
        # shape, dtype and device for any num_layers / hidden size.
        _, (hn, _) = self.lstm(x)

        # hn is (num_layers, n, hidden); keep only the last layer's final state.
        out = torch.tanh(hn[-1])
        out = F.relu(self.fc1(out))
        out = self.fc2(out)
        return F.softmax(out, dim=1)
    
# Build the single-layer ViterbiNet-style classifier (1 input feature,
# 100 hidden units, 16 classes) and display its layer-by-layer summary.
net = viterbinet(
    input_size=1,
    hidden_unit_size=100,
    num_layers=1,
    class_size=16,
)
summary(net, input_size=(1, 1, 5000))

Layer (type:depth-idx)                   Output Shape              Param #
viterbinet                               --                        --
├─LSTM: 1-1                              [5000, 1, 100]            41,200
├─Linear: 1-2                            [5000, 50]                5,050
├─Linear: 1-3                            [5000, 16]                816
Total params: 47,066
Trainable params: 47,066
Non-trainable params: 0
Total mult-adds (M): 235.33
Input size (MB): 0.02
Forward/backward pass size (MB): 6.64
Params size (MB): 0.19
Estimated Total Size (MB): 6.85

In [3]:
class CNN(nn.Module):
    """Three-stage 1-D CNN that emits 16 sigmoid scores per pooled time position.

    Each stage is Conv1d(kernel 7, padding 3) -> ReLU -> MaxPool1d(2), so the
    sequence length is halved three times while the channel count grows
    1 -> 32 -> 64 -> 128. A shared linear layer maps the 128 channels of every
    remaining position to 16 outputs.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1d = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=7, stride=1, padding=3)
        self.conv1d2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=7, stride=1, padding=3)
        self.conv1d3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=7, stride=1, padding=3)
        self.maxpool = nn.MaxPool1d(2)
        self.fc = nn.Linear(128, 16)

    def forward(self, x):
        """Run the network.

        Args:
            x: tensor of shape ``(batch, 1, length)``.

        Returns:
            Tensor of shape ``(batch * pooled_length, 16)`` with values in
            [0, 1]; row ``b * pooled_length + t`` belongs to position ``t`` of
            batch item ``b``.
        """
        # layer 1
        x = self.maxpool(F.relu(self.conv1d(x)))
        # layer 2
        x = self.maxpool(F.relu(self.conv1d2(x)))
        # layer 3
        x = self.maxpool(F.relu(self.conv1d3(x)))

        # The conv output is (batch, channels, length). Move channels last
        # before flattening so every row handed to the linear layer holds the
        # 128 channel features of ONE position. A plain view(-1, 128) would
        # instead slice 128 consecutive memory elements, mixing time steps of
        # a single channel.
        x = x.transpose(-2, -1).reshape(-1, self.fc.in_features)

        # feed to the fully-connected layer
        x = self.fc(x)
        return torch.sigmoid(x)
    
# Instantiate the CNN and display its summary for an input of shape
# (8, 1, 5000): batch 8, one channel, length 5000.
net = CNN()
summary(net, input_size=(8, 1, 5000))

Layer (type:depth-idx)                   Output Shape              Param #
CNN                                      --                        --
├─Conv1d: 1-1                            [8, 32, 5000]             256
├─MaxPool1d: 1-2                         [8, 32, 2500]             --
├─Conv1d: 1-3                            [8, 64, 2500]             14,400
├─MaxPool1d: 1-4                         [8, 64, 1250]             --
├─Conv1d: 1-5                            [8, 128, 1250]            57,472
├─MaxPool1d: 1-6                         [8, 128, 625]             --
├─Linear: 1-7                            [5000, 16]                2,064
Total params: 74,192
Trainable params: 74,192
Non-trainable params: 0
Total mult-adds (M): 883.28
Input size (MB): 0.16
Forward/backward pass size (MB): 31.36
Params size (MB): 0.30
Estimated Total Size (MB): 31.82

In [4]:
class multiLSTM(nn.Module):
    """Two chained LSTMs followed by a sigmoid-activated linear classifier.

    Both LSTMs read the same input ``x``; they are coupled only through their
    state: the final (h, c) of the first LSTM is the initial state of the
    second.

    Args:
        input_size: number of features per time step.
        hidden_unit_size: hidden size of both LSTMs.
        num_layers: number of stacked layers inside each LSTM.
        class_size: number of outputs.
    """

    def __init__(self, input_size, hidden_unit_size, num_layers, class_size):
        super(multiLSTM, self).__init__()

        self.num_layers = num_layers
        self.hidden_unit_size = hidden_unit_size

        # With the notebook's configuration (1, 100, 1, 16): two LSTMs with
        # 100 hidden units each, then a 100 x 16 linear layer.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_unit_size,
                            num_layers=num_layers, batch_first=True)

        self.lstm2 = nn.LSTM(input_size=input_size, hidden_size=hidden_unit_size,
                             num_layers=num_layers, batch_first=True)

        self.fc2 = nn.Linear(in_features=hidden_unit_size, out_features=class_size)

    def forward(self, x):
        """Compute per-sequence outputs.

        Args:
            x: tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, class_size)`` with values in [0, 1].
        """
        # No RNG reseeding here: a forward pass must not reset the global
        # NumPy random state as a side effect.

        # Omitting the initial state makes nn.LSTM start from zeros on the
        # same device/dtype as the input, so this works on CPU and GPU alike.
        _, state1 = self.lstm(x)

        # The second LSTM re-reads x, starting from the first LSTM's final state.
        _, (h2, _) = self.lstm2(x, state1)

        # h2 is (num_layers, batch, hidden); use the last layer only so the
        # output keeps one row per sequence for any num_layers.
        out = torch.tanh(h2[-1])
        out = self.fc2(out)
        return torch.sigmoid(out)

# Build the two-LSTM model (1 input feature, 100 hidden units, 16 outputs)
# and display its summary for 5000 sequences of length 1.
net = multiLSTM(
    input_size=1,
    hidden_unit_size=100,
    num_layers=1,
    class_size=16,
)
summary(net, input_size=(5000, 1, 1))

Layer (type:depth-idx)                   Output Shape              Param #
multiLSTM                                --                        --
├─LSTM: 1-1                              [5000, 1, 100]            41,200
├─LSTM: 1-2                              [5000, 1, 100]            41,200
├─Linear: 1-3                            [5000, 16]                1,616
Total params: 84,016
Trainable params: 84,016
Non-trainable params: 0
Total mult-adds (M): 420.08
Input size (MB): 0.02
Forward/backward pass size (MB): 8.64
Params size (MB): 0.34
Estimated Total Size (MB): 9.00

In [5]:
class multiGRU(nn.Module):
    """Two chained GRUs followed by a sigmoid-activated linear classifier.

    Both GRUs read the same input ``x``; the final hidden state of the first
    GRU is the initial hidden state of the second.

    Args:
        input_size: number of features per time step.
        hidden_unit_size: hidden size of both GRUs.
        num_layers: number of stacked layers inside each GRU.
        class_size: number of outputs.
    """

    def __init__(self, input_size, hidden_unit_size, num_layers, class_size):
        super(multiGRU, self).__init__()

        self.num_layers = num_layers
        self.hidden_unit_size = hidden_unit_size

        # With the notebook's configuration (1, 100, 1, 16): two GRUs with
        # 100 hidden units each, then a 100 x 16 linear layer.
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_unit_size,
                          num_layers=num_layers, batch_first=True)

        self.gru2 = nn.GRU(input_size=input_size, hidden_size=hidden_unit_size,
                           num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(in_features=hidden_unit_size, out_features=class_size)

    def forward(self, x):
        """Compute per-sequence outputs.

        Args:
            x: tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, class_size)`` with values in [0, 1].
        """
        # Omitting the initial state makes nn.GRU start from zeros on the same
        # device/dtype as the input.
        _, h1 = self.gru(x)

        # h1 must stay attached to the graph: it is the only path from the
        # output back to the first GRU, so detaching it would leave that GRU's
        # weights without gradients.
        output2, _ = self.gru2(x, h1)

        # Last time step of the top layer.
        out = output2[:, -1, :]

        out = self.fc(out)
        return torch.sigmoid(out)

# Build the two-GRU model (1 input feature, 100 hidden units, 16 outputs)
# and display its summary for 5000 sequences of length 1.
net = multiGRU(
    input_size=1,
    hidden_unit_size=100,
    num_layers=1,
    class_size=16,
)
summary(net, input_size=(5000, 1, 1))

Layer (type:depth-idx)                   Output Shape              Param #
multiGRU                                 --                        --
├─GRU: 1-1                               [5000, 1, 100]            30,900
├─GRU: 1-2                               [5000, 1, 100]            30,900
├─Linear: 1-3                            [5000, 16]                1,616
Total params: 63,416
Trainable params: 63,416
Non-trainable params: 0
Total mult-adds (M): 317.08
Input size (MB): 0.02
Forward/backward pass size (MB): 8.64
Params size (MB): 0.25
Estimated Total Size (MB): 8.91

In [6]:
import math

# Taken from:
# https://github.com/hyunwoongko/transformer

class ScaleDotProductAttention(nn.Module):
    """
    Compute scaled dot-product attention.

    Query : the items we are attending from
    Key   : the items every Query is compared against
    Value : the items that are averaged, aligned with Key
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """Apply attention.

        Args:
            q, k, v: 4-D tensors ``[batch_size, head, length, d_tensor]``.
            mask: optional tensor broadcastable to the score matrix
                ``[batch_size, head, length, length]``; positions where it
                equals 0 receive zero attention weight.
            e: kept only for backward compatibility and ignored. Masked
                scores used to be set to ``-e`` (about 0 by default), which
                did not actually mask anything.

        Returns:
            ``(output, score)``: the attended values and the attention weights.
        """
        # input is a 4-dimensional tensor; only the feature width is needed
        _, _, _, d_tensor = k.size()

        # 1. dot product Query with Key^T to compute similarity
        k_t = k.transpose(2, 3)
        score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product

        # 2. apply masking (optional): push masked scores to the most negative
        #    representable value so softmax assigns them zero weight. A finite
        #    value (rather than -inf) keeps fully masked rows free of NaNs.
        if mask is not None:
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        # 3. softmax over keys to get weights in [0, 1]
        score = self.softmax(score)

        # 4. weighted sum of the values
        v = score @ v

        return v, score

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention block with a linear classification head.

    Unlike textbook multi-head attention, the three linear layers are applied
    one after another (each followed by ReLU) to the same tensor, and the
    result serves as query, key and value at once.

    Args:
        d_model: feature width of the input; must be divisible by ``n_head``.
        n_head: number of attention heads.
        class_size: width of the final linear layer's output.
    """

    def __init__(self, d_model=150, n_head=5, class_size=16):
        super(MultiHeadAttention, self).__init__()
        if d_model % n_head != 0:
            raise ValueError(
                "d_model (%d) must be divisible by n_head (%d)" % (d_model, n_head)
            )
        self.d_model = d_model
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, class_size)

    def forward(self, q, mask=None):
        """Run self-attention and classify every position.

        Args:
            q: tensor ``[batch_size, length, d_model]``.
            mask: optional attention mask forwarded to the attention module.

        Returns:
            Tensor ``[batch_size * length, class_size]`` (raw, un-activated
            outputs of the final linear layer).
        """
        # 1. three stacked projections, each followed by ReLU
        q = F.relu(self.w_q(q))
        q = F.relu(self.w_k(q))
        q = F.relu(self.w_v(q))

        # 2. split tensor by number of heads
        q = self.split(q)

        # 3. scaled dot-product self-attention (same tensor as q, k and v)
        out, attention = self.attention(q, q, q, mask=mask)

        # 4. merge the heads and flatten to one row per position; the feature
        #    width is read from the tensor rather than hard-coded to 150
        out = self.concat(out)
        out = out.reshape(-1, out.size(-1))
        out = self.fc(out)

        # 5. visualize attention map
        # TODO : we should implement visualization

        return out

    def split(self, tensor):
        """
        split tensor by number of head
        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        # it is similar with group convolution (split by number of heads)

        return tensor

    def concat(self, tensor):
        """
        inverse function of self.split(tensor : torch.Tensor)
        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor
    
# Instantiate the attention model with its defaults and display its summary
# for an input of shape (5000, 1, 150).
net = MultiHeadAttention()
summary(net, input_size=(5000, 1, 150))

Layer (type:depth-idx)                   Output Shape              Param #
MultiHeadAttention                       --                        --
├─Linear: 1-1                            [5000, 1, 150]            22,650
├─Linear: 1-2                            [5000, 1, 150]            22,650
├─Linear: 1-3                            [5000, 1, 150]            22,650
├─ScaleDotProductAttention: 1-4          [5000, 5, 1, 30]          --
│    └─Softmax: 2-1                      [5000, 5, 1, 1]           --
├─Linear: 1-5                            [5000, 16]                2,416
Total params: 70,366
Trainable params: 70,366
Non-trainable params: 0
Total mult-adds (M): 351.83
Input size (MB): 3.00
Forward/backward pass size (MB): 18.64
Params size (MB): 0.28
Estimated Total Size (MB): 21.92

In [8]:
class Attention(nn.Module):
    """Additive attention pooling over the time axis.

    A learned vector scores every time step, the scores are normalised across
    the sequence, and the input is summed over time using those weights.

    Args:
        feature_dim: size of the last dimension of the input.
        step_dim: number of per-step bias terms, i.e. the longest sequence
            the layer accepts when ``bias`` is True. Shorter sequences use the
            first ``steps`` bias entries.
        bias: whether to add a learned per-step bias to the scores.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Pool ``x`` over its time axis.

        Args:
            x: tensor ``(batch, steps, feature_dim)``.
            mask: optional ``(batch, steps)`` tensor multiplied into the
                un-normalised weights (0 removes a step).

        Returns:
            Tensor ``(batch, feature_dim)``.

        Raises:
            ValueError: if ``bias`` is enabled and ``steps > step_dim``.
        """
        batch_size, steps = x.size(0), x.size(1)
        if self.bias and steps > self.step_dim:
            raise ValueError(
                "sequence length %d exceeds step_dim %d" % (steps, self.step_dim)
            )

        # One score per (batch, step). Reshaping back to (batch, steps) is
        # required: leaving the scores as (batch * steps, 1) would make the
        # bias addition broadcast into a spurious extra axis.
        eij = torch.mm(x.contiguous().view(-1, self.feature_dim), self.weight)
        eij = eij.view(batch_size, steps)

        if self.bias:
            eij = eij + self.b[:steps]

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator, guarding against division by
        # zero when every step is masked out.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)


class MyLSTM(nn.Module):
    """LSTM whose outputs are pooled by three attention heads and classified.

    The three attention results are summed, passed through a linear layer with
    16 outputs, and squashed with a sigmoid.

    Args:
        hidden_dim: LSTM hidden size (also the attention feature width).
        lstm_layer: number of stacked LSTM layers.
    """

    def __init__(self, hidden_dim=100, lstm_layer=1):
        super(MyLSTM, self).__init__()

        # Honour the constructor arguments instead of hard-coding 100 / 1.
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=hidden_dim,
                             num_layers=lstm_layer, batch_first=True)

        # Second argument is Attention's step_dim (size of its per-step bias).
        self.atten1 = Attention(hidden_dim, 100)
        self.atten2 = Attention(hidden_dim, 100)
        self.atten3 = Attention(hidden_dim, 100)

        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=16)

    def forward(self, x):
        """Compute per-sequence outputs.

        Args:
            x: tensor ``(batch, seq_len, 1)``.

        Returns:
            Tensor ``(batch, 16)`` with values in [0, 1].
        """
        # Omitting the initial state makes nn.LSTM start from zeros on the
        # same device/dtype as the input, for any hidden size / layer count.
        out1, _ = self.lstm1(x)

        # Three attention poolings over the same LSTM outputs, summed.
        ans1 = self.atten1(out1)
        ans2 = self.atten2(out1)
        ans3 = self.atten3(out1)
        z = ans1 + ans2 + ans3

        out = self.fc1(z)
        return torch.sigmoid(out)

    
# Instantiate the attention-pooled LSTM with its defaults and display its
# summary for 5000 sequences of length 1 with one feature each.
net = MyLSTM()
summary(net, input_size=(5000, 1, 1))

Layer (type:depth-idx)                   Output Shape              Param #
MyLSTM                                   --                        --
├─LSTM: 1-1                              [5000, 1, 100]            41,200
├─Attention: 1-2                         [5000, 100]               200
├─Attention: 1-3                         [5000, 100]               200
├─Attention: 1-4                         [5000, 100]               200
├─Linear: 1-5                            [5000, 16]                1,616
Total params: 43,416
Trainable params: 43,416
Non-trainable params: 0
Total mult-adds (M): 215.58
Input size (MB): 0.02
Forward/backward pass size (MB): 16.64
Params size (MB): 0.17
Estimated Total Size (MB): 16.83