In [2]:
from src.datasets.nli import *
from src.model.nli_models import *
from src.utils.nli_utils import *

In [4]:
snli_conf = {"batch_size":128,"max_len":40,"device":'cuda',"tokenizer":'spacy'}
dataset = snli_module(snli_conf)

In [5]:
dataset.prepare_data()

downloading snli_1.0.zip
snli_1.0.zip: 100%|██████████| 94.6M/94.6M [00:07<00:00, 12.6MB/s]
extracting
.vector_cache/glove.840B.300d.zip: 2.18GB [17:08, 2.12MB/s]                            
100%|█████████▉| 2195697/2196017 [05:11<00:00, 7666.82it/s]

In [167]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class Struc_Attention(nn.Module):
    def __init__(self, conf):
        super(Struc_Attention, self).__init__()
        self.Ws = nn.Linear(
            2 * conf["hidden_size"],
            conf["attention_layer_param"],
            bias=False,
        )
        self.Wa = nn.Linear(conf["attention_layer_param"], conf["r"], bias=False)

    def forward(self, hid):
        opt = self.Ws(hid)
        opt = torch.tanh(opt)
        opt = self.Wa(opt)
        opt = F.softmax(opt)
        return opt


class Struc_Attn_Encoder(nn.Module):
    def __init__(self, conf):
        super(Struc_Attn_Encoder, self).__init__()
        self.conf = conf
        self.embedding = nn.Embedding(
            num_embeddings=conf["vocab_size"],
            embedding_dim=conf["embedding_dim"],
            padding_idx=conf["padding_idx"],
        )
        if conf["use_glove"]:
            self.embedding = nn.Embedding.from_pretrained(
                torch.load(".vector_cache/{}_vectors.pt".format(conf["dataset"]))
            )
        self.translate = nn.Linear(self.conf["embedding_dim"], self.conf["hidden_size"])
        self.relu = nn.ReLU()

        self.lstm_layer = nn.LSTM(
            input_size=self.conf["hidden_size"],
            hidden_size=self.conf["hidden_size"],
            num_layers=self.conf["num_layers"],
            bidirectional=True,
            batch_first=True,
        )
        self.attention = Struc_Attention(conf)

    def forward(self, inp):
        batch_size = inp.shape[0]
        embedded = self.embedding(inp)
        embedded = self.relu(self.translate(embedded))
        all_, (_, _) = self.lstm_layer(embedded)
        attn = self.attention(all_)
        cont = torch.bmm(attn.permute(0, 2, 1), all_)
        return cont, attn


class Struc_Attn_encoder_snli(nn.Module):
    def __init__(self, conf):
        super(Struc_Attn_encoder_snli, self).__init__()
        self.conf = conf
        self.encoder = Struc_Attn_Encoder(conf)
        self.gated = self.conf["gated"]
        self.template = nn.Parameter(torch.zeros((1)), requires_grad=True)
        self.pool_strategy = self.conf["pool_strategy"]
        # Gated parameters
        if self.gated:
            self.wt_p = torch.nn.Parameter(
                torch.rand(
                    (
                        self.conf["r"],
                        2 * self.conf["hidden_size"],
                        self.conf["gated_embedding_dim"],
                    )
                )
            )
            self.wt_h = torch.nn.Parameter(
                torch.rand(
                    (
                        self.conf["r"],
                        2 * self.conf["hidden_size"],
                        self.conf["gated_embedding_dim"],
                    )
                )
            )
            self.init_gated_encoder()
            self.fc_in = nn.Linear(
                self.conf["gated_embedding_dim"] * self.conf["r"],
                self.conf["hidden_size"],
            )
            self.fcs = nn.ModuleList(
                [
                    nn.Linear(self.conf["hidden_size"], self.conf["hidden_size"])
                    for i in range(self.conf["fcs"])
                ]
            )
            self.fc_out = nn.Linear(self.conf["hidden_size"], 3)

        # Non Gated Version (max pool avg pool)
        else:
            self.fc_in = nn.Linear(
                2 * 4 * self.conf["hidden_size"],
                self.conf["hidden_size"],
            )
            self.fcs = nn.ModuleList(
                [
                    nn.Linear(
                        self.conf["hidden_size"],
                        self.conf["hidden_size"],
                    )
                    for i in range(self.conf["fcs"])
                ]
            )
            self.fc_out = nn.Linear(self.conf["hidden_size"], 3)

        self.act = nn.ReLU()
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=self.conf["dropout"])

    def init_gated_encoder(self):
        nn.init.kaiming_uniform_(self.wt_p)
        nn.init.kaiming_uniform_(self.wt_h)

    def penalty_l2(self, att):
        att = att.permute(1, 0, 2)
        penalty = (
            torch.norm(
                torch.bmm(att, att.transpose(1, 2))
                - torch.eye(att.size(1)).to(self.template.device),
                p="fro",
            )
            / att.size(0)
        ) ** 2
        return penalty

    def forward(self, x0, x1):
        x0_enc, x0_attn = self.encoder(x0)
        x0_enc = self.dropout(x0_enc)
        x1_enc, x1_attn = self.encoder(x1)
        x1_enc = self.dropout(x1_enc)

        if self.gated:
            F0 = x0_enc @ self.wt_p
            F1 = x1_enc @ self.wt_h
            Fr = F0 * F1
            Fr = Fr.permute(1, 0, 2).flatten(start_dim=1)
        else:
            if self.pool_strategy == "avg":
                F0 = x0_enc.mean(1)
                F1 = x1_enc.mean(1)
                Fr = torch.cat([F0, F1, torch.abs(F0 - F1), F0 * F1], dim=1)
            elif self.pool_strategy == "max":
                F0 = x0_enc.max(1)
                F0 = F0.values
                F1 = x1_enc.max(1)
                F1 = F1.values
                Fr = torch.cat([F0, F1, torch.abs(F0 - F1), F0 * F1], dim=1)

        opt = self.fc_in(Fr)
        opt = self.dropout(opt)
        for fc in self.fcs:
            opt = fc(opt)
            opt = self.dropout(opt)
            opt = self.act(opt)
        opt = self.fc_out(opt)
        return opt


In [168]:
model_conf = {
    "hidden_size":300,
    "embedding_dim":300,
    "dropout":0.3,
    "use_glove":True,
    "num_layers":1,
    "dataset":"snli",
    "fcs":1,
    "vocab_size":dataset.vocab_size(),
    "tokenizer":"spacy",
    "padding_idx":dataset.padding_idx(),
    "attention_layer_param":200,
    "r":3,
    "gated_embedding_dim":150,
    "pool_strategy":'max',
    "gated":True
}

hparams = {
    "optimizer_base": {
        "optim": "adamw",
        "lr": 0.0010039910781394373,
        "scheduler": "const",
    },
    "optimizer_tune": {
        "optim": "adam",
        "lr": 0.0010039910781394373,
        "weight_decay": 0.1,
        "scheduler": "lambda",
    },
    "switch_epoch": 5,
}

model = Struc_Attn_encoder_snli(model_conf)
# model = SNLI_model(attn_bilstm_snli,model_conf,hparams=hparams)

In [169]:
for i in dataset.train_dataloader():
    print(i)
    break



[torchtext.data.batch.Batch of size 128 from SNLI]
	[.premise]:[torch.cuda.LongTensor of size 128x40 (GPU 0)]
	[.hypothesis]:[torch.cuda.LongTensor of size 128x40 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 128 (GPU 0)]


In [170]:
opt = model(i.premise.cpu(),i.hypothesis.cpu())

here
torch.Size([128, 40, 600])
torch.Size([128, 40, 3])
torch.Size([128, 3, 600]) torch.Size([128, 40, 3])
here
torch.Size([128, 40, 600])
torch.Size([128, 40, 3])
torch.Size([128, 3, 600]) torch.Size([128, 40, 3])
torch.Size([128, 3, 600])
torch.Size([3, 600, 150])


RuntimeError: The size of tensor a (128) must match the size of tensor b (3) at non-singleton dimension 0

In [6]:
EPOCHS = 20

tensorboard_logger = TensorBoardLogger("lightning_logs")
lr_logger = LearningRateLogger(logging_interval="step")

trainer = pl.Trainer(
    gpus=1,
    max_epochs=EPOCHS, 
    progress_bar_refresh_rate=10,
    profiler=False,
    auto_lr_find=False,
    callbacks=[lr_logger, SwitchOptim()],
    logger=[tensorboard_logger],
    row_log_interval=2,
) 
trainer.fit(model, dataset)
trainer.test(model, datamodule=dataset)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type        | Params
--------------------------------------
0 | model | BiLSTM_snli | 14 M  


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

In [157]:
a = torch.rand((128,600,3))
b = torch.rand((3,600,150))
#.unsqueeze(0).repeat(128,1,1,1)

In [158]:
a @ b

RuntimeError: The size of tensor a (128) must match the size of tensor b (3) at non-singleton dimension 0

In [153]:
a.matmul(b)

RuntimeError: The size of tensor a (128) must match the size of tensor b (3) at non-singleton dimension 0

In [156]:
a.permute(0,2,1) @ b

RuntimeError: The size of tensor a (128) must match the size of tensor b (3) at non-singleton dimension 0

In [11]:
opt = model(i.premise,i.hypothesis)

In [15]:
opt = opt.squeeze(0)

In [24]:
torch.rand([2, 128, 300])[-2:].transpose(0,1).contiguous().view(128,-1).shape

torch.Size([128, 600])

In [66]:
import time
time.strftime('%H:%M:%S', time.gmtime(time.time()))

'02:23:36'

In [171]:

all_ = torch.rand((128,40,400))
attn = torch.rand((128,40,3))

In [172]:
cont = torch.bmm(all_.permute(1, 2, 0), attn.permute(1, 0, 2)).permute(2, 0, 1)

In [173]:
cont.shape

torch.Size([3, 40, 400])