In [1]:
from src.datasets.nli import *
from src.model.nli_models import *
from src.utils.nli_utils import *

In [2]:
snli_conf = {"batch_size":128,"max_len":50,"device":'cuda',"tokenizer":'spacy',"use_char_emb":False,"max_word_len":10}
dataset = snli_module(snli_conf)

In [3]:
dataset.prepare_data()

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class MwAN_Encoder(nn.Module):
    def __init__(self,conf):
        super().__init__()
        self.dropout = nn.Dropout(conf["dropout"])
        self.embedding = nn.Embedding(
            num_embeddings=conf["vocab_size"],
            embedding_dim=conf["embedding_dim"],
            padding_idx=conf["padding_idx"],
        )
        
        if conf["use_glove"]:
            self.embedding = nn.Embedding.from_pretrained(
                torch.load(".vector_cache/{}_vectors.pt".format(conf["dataset"]))
            )

        if conf["use_char_emb"]:
            self.char_embedding = nn.Embedding(
                num_embeddings=conf["char_vocab_size"],
                embedding_dim=conf["char_embedding_dim"],
                padding_idx=0,
            )
            self.char_cnn = nn.Conv2d(
                conf["max_word_len"],
                conf["char_embedding_dim"],
                (1, 6),
                stride=(1, 1),
                padding=0,
                bias=True,
            )

        self.projection = nn.Linear(
            (
                conf["embedding_dim"]
                + int(conf["use_char_emb"]) * conf["char_embedding_dim"]
            ),
            conf["hidden_size"],
        )

        self.gru = nn.GRU(
            input_size=conf["hidden_size"],
            hidden_size=conf["hidden_size"],
            batch_first=True,
            bidirectional=True,
        )

    def char_embedding_forward(self, x):
        # X - [batch_size, seq_len, char_emb_size])
        batch_size, seq_len, char_emb_size = x.shape
        x = x.view(-1, char_emb_size)
        x = self.char_embedding(x)  # (batch_size * seq_len, char_emb_size, emb_size)
        x = x.view(batch_size, -1, seq_len, char_emb_size)
        x = x.permute(0, 3, 2, 1)
        x = self.char_cnn(x)
        x = torch.max(F.relu(x), 3)[0]
        return x.view(batch_size, seq_len, -1)

    def forward(self, inp, char_vec):
        embedded = self.embedding(inp)
        if char_vec != None:
            char_emb = self.char_embedding_forward(char_vec)
            embedded = torch.cat([embedded, char_emb], dim=2)
        embedded = self.dropout(self.projection(embedded))
        all_, _ = self.gru(embedded)
        return all_

class MwAN(nn.Module):
    def __init__(self, conf):
        super().__init__()
        self.dropout = nn.Dropout(conf["dropout"])

        
        self.embedding = nn.Embedding(
            num_embeddings=conf["vocab_size"],
            embedding_dim=conf["embedding_dim"],
            padding_idx=conf["padding_idx"],
        )
        
        if conf["use_glove"]:
            self.embedding = nn.Embedding.from_pretrained(
                torch.load(".vector_cache/{}_vectors.pt".format(conf["dataset"]))
            )

        if conf["use_char_emb"]:
            self.char_embedding = nn.Embedding(
                num_embeddings=conf["char_vocab_size"],
                embedding_dim=conf["char_embedding_dim"],
                padding_idx=0,
            )
            self.char_cnn = nn.Conv2d(
                conf["max_word_len"],
                conf["char_embedding_dim"],
                (1, 6),
                stride=(1, 1),
                padding=0,
                bias=True,
            )

        

        self.prem_gru = nn.GRU(
            input_size=(
                conf["embedding_dim"]
                + int(conf["use_char_emb"]) * conf["char_embedding_dim"]
            ),
            hidden_size=conf["hidden_size"],
            batch_first=True,
            bidirectional=True,
        )

        self.hypo_gru = nn.GRU(
            input_size=(
                conf["embedding_dim"]
                + int(conf["use_char_emb"]) * conf["char_embedding_dim"]
            ),
            hidden_size=conf["hidden_size"],
            batch_first=True,
            bidirectional=True,
        )

        # Concat Attention
        self.Wc1 = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.Wc2 = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vc = nn.Linear(conf["hidden_size"], 1, bias=False)
        # Bilinear Attention
        self.Wb = nn.Linear(
            2 * conf["hidden_size"], 2 * conf["hidden_size"], bias=False
        )
        # Dot Attention :
        self.Wd = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vd = nn.Linear(conf["hidden_size"], 1, bias=False)
        # Minus Attention :
        self.Wm = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vm = nn.Linear(conf["hidden_size"], 1, bias=False)

        self.Ws = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vs = nn.Linear(conf["hidden_size"], 1, bias=False)

        self.gru_agg = nn.GRU(
            12 * conf["hidden_size"],
            conf["hidden_size"],
            batch_first=True,
            bidirectional=True,
        )

        self.Wq = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vq = nn.Linear(conf["hidden_size"], 1, bias=False)
        self.Wp1 = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.Wp2 = nn.Linear(2 * conf["hidden_size"], conf["hidden_size"], bias=False)
        self.vp = nn.Linear(conf["hidden_size"], 1, bias=False)

        self.prediction = nn.Linear(2 * conf["hidden_size"], 2, bias=False)


    def char_embedding_forward(self, x):
        # X - [batch_size, seq_len, char_emb_size])
        batch_size, seq_len, char_emb_size = x.shape
        x = x.view(-1, char_emb_size)
        x = self.char_embedding(x)  # (batch_size * seq_len, char_emb_size, emb_size)
        x = x.view(batch_size, -1, seq_len, char_emb_size)
        x = x.permute(0, 3, 2, 1)
        x = self.char_cnn(x)
        x = torch.max(F.relu(x), 3)[0]
        return x.view(batch_size, seq_len, -1)

    def forward(self, premise, hypothesis,**kwargs):
        char_vec_x0 = kwargs.get("char_premise", None)
        char_vec_x1 = kwargs.get("char_hypothesis", None)

        hp = self.embedding(premise)
        hh = self.embedding(hypothesis)

        if char_vec_x0 != None:
            char_emb_hp = self.char_embedding_forward(char_vec_x0)
            hp = torch.cat([hp, char_emb_hp], dim=2)

        if char_vec_x1 != None:
            char_emb_hh = self.char_embedding_forward(char_vec_x1)
            hh = torch.cat([hh, char_emb_hh], dim=2)

        hp,_ = self.prem_gru(hp)
        hh,_ = self.hypo_gru(hh)

        print(hp.shape,hh.shape)

        _s1 = self.Wc1(hp).unsqueeze(1)
        print(_s1.shape)
        _s2 = self.Wc2(hh).unsqueeze(2)
        print(_s1.shape)
        sjt = self.vc(torch.tanh(_s1 + _s2)).squeeze()
        print(sjt.shape)
        ait = F.softmax(sjt, 2)
        print(ait.shape)
        qtc = ait.bmm(hp)
        _s1 = self.Wb(hp).transpose(2, 1)
        sjt = hh.bmm(_s1)
        ait = F.softmax(sjt, 2)
        qtb = ait.bmm(hp)
        _s1 = hp.unsqueeze(1)
        _s2 = hh.unsqueeze(2)
        sjt = self.vd(torch.tanh(self.Wd(_s1 * _s2))).squeeze()
        ait = F.softmax(sjt, 2)
        qtd = ait.bmm(hp)
        sjt = self.vm(torch.tanh(self.Wm(_s1 - _s2))).squeeze()
        ait = F.softmax(sjt, 2)
        qtm = ait.bmm(hp)
        _s1 = hh.unsqueeze(1)
        _s2 = hh.unsqueeze(2)
        sjt = self.vs(torch.tanh(self.Ws(_s1 * _s2))).squeeze()
        ait = F.softmax(sjt, 2)
        qts = ait.bmm(hh)
        aggregation = torch.cat([hh, qts, qtc, qtd, qtb, qtm], 2)
        aggregation_representation, _ = self.gru_agg(aggregation)
        sj = self.vq(torch.tanh(self.Wq(hp))).transpose(2, 1)
        rq = F.softmax(sj, 2).bmm(hp)
        sj = F.softmax(
            self.vp(self.Wp1(aggregation_representation) + self.Wp2(rq)).transpose(
                2, 1
            ),
            2,
        )
        rp = sj.bmm(aggregation_representation)
        encoder_output = self.dropout(F.leaky_relu(self.prediction(rp)))
        encoder_output = encoder_output.squeeze(1)
        return encoder_output


In [29]:
model_conf = {
    "hidden_size":200,
    "embedding_dim":300,
    "char_embedding_dim":100,
    "dropout":0.3,
    "use_glove":True,
    "num_layers":1,
    "dataset":"snli",
    "fcs":1,
    "use_char_emb":False,
    "vocab_size":dataset.vocab_size(),
    # "char_vocab_size":dataset.char_vocab_size(),
    # "max_word_len": dataset.char_word_len(),
    "tokenizer":"spacy",
    "padding_idx":dataset.padding_idx(),
    "attention_layer_param":200,
    # "r":3,
    # "gated_embedding_dim":150,
    # "pool_strategy":'max',
    # "gated":True
}

hparams = {
    "optimizer_base": {
        "optim": "adamw",
        "lr": 0.0010039910781394373,
        "scheduler": "const",
    },
    "optimizer_tune": {
        "optim": "adam",
        "lr": 0.0010039910781394373,
        "weight_decay": 0.1,
        "scheduler": "lambda",
    },
    "switch_epoch": 5,
}

model = MwAN(model_conf)
# model = SNLI_model(attn_bilstm_snli,model_conf,hparams=hparams)

In [30]:
opt = model(i.premise.cpu(),i.hypothesis.cpu(),**{})

torch.Size([128, 50, 400]) torch.Size([128, 50, 400])
torch.Size([128, 1, 50, 200])
torch.Size([128, 50, 50])
torch.Size([128, 50, 50])


In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
count_parameters(model)

2525250

In [4]:
for i in dataset.train_dataloader():
    print(i)
    break


[torchtext.data.batch.Batch of size 128 from SNLI]
	[.premise]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.hypothesis]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 128 (GPU 0)]


In [5]:
i.premise

tensor([[   2,   17,  133,  ...,    1,    1,    1],
        [   2,    4,    9,  ...,    1,    1,    1],
        [   2,   16, 2119,  ...,    1,    1,    1],
        ...,
        [   2,    4,   30,  ...,    1,    1,    1],
        [   2,    4,   32,  ...,    1,    1,    1],
        [   2,   53,   86,  ...,    1,    1,    1]], device='cuda:0')

In [6]:
a = torch.rand((128,50,300))

In [7]:
gru = nn.GRU(300,150,batch_first=True,bidirectional=True)

In [9]:
a_,a__ = gru(a)

In [10]:
a_.shape

torch.Size([128, 50, 300])

In [11]:
a__.shape

torch.Size([2, 128, 150])

tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [