In [1]:
from src.datasets.nli import *
from src.model.nli_models import *
from src.utils.nli_utils import *

In [2]:
snli_conf = {"batch_size":128,"max_len":50,"device":'cuda',"tokenizer":'spacy',"use_char_emb":True,"max_word_len":10}
dataset = snli_module(snli_conf)

In [3]:
dataset.prepare_data()

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

"""
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
BiLSTM + Attention based SNLI model
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""


class Attention(nn.Module):
    def __init__(self, conf):
        super(Attention, self).__init__()
        self.Ws = nn.Linear(
            2 * conf["hidden_size"],
            conf["attention_layer_param"],
            bias=False,
        )
        self.Wa = nn.Linear(conf["attention_layer_param"], 1, bias=False)

    def forward(self, hid):
        opt = self.Ws(hid)
        opt = torch.tanh(opt)
        opt = self.Wa(opt)
        opt = F.softmax(opt, dim=1)
        return opt


class Attn_Encoder(nn.Module):
    def __init__(self, conf):
        super(Attn_Encoder, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=conf["vocab_size"],
            embedding_dim=conf["embedding_dim"],
            padding_idx=conf["padding_idx"],
        )
        self.translate = nn.Linear(
            (
                conf["embedding_dim"]
                + int(conf["use_char_emb"]) * conf["char_embedding_dim"]
            ),
            conf["hidden_size"],
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=conf["dropout"])

        if conf["use_glove"]:
            self.embedding = nn.Embedding.from_pretrained(
                torch.load(".vector_cache/{}_vectors.pt".format(conf["dataset"]))
            )

        if conf["use_char_emb"]:
            self.char_embedding = nn.Embedding(
                num_embeddings=conf["char_vocab_size"],
                embedding_dim=conf["char_embedding_dim"],
                padding_idx=0,
            )
            self.char_cnn = nn.Conv2d(
                conf["max_word_len"],
                conf["char_embedding_dim"],
                (1, 6),
                stride=(1, 1),
                padding=0,
                bias=True,
            )
        self.lstm_layer = nn.LSTM(
            input_size=conf["hidden_size"],
            hidden_size=conf["hidden_size"],
            num_layers=conf["num_layers"],
            dropout=conf["dropout"],
            bidirectional=True,
            batch_first=True,
        )
        self.attention = Attention(conf)

    def char_embedding_forward(self, x):
        # X - [batch_size, seq_len, char_emb_size])
        batch_size, seq_len, char_emb_size = x.shape
        x = x.view(-1, char_emb_size)
        x = self.char_embedding(x)  # (batch_size * seq_len, char_emb_size, emb_size)
        x = x.view(batch_size, -1, seq_len, char_emb_size)
        x = x.permute(0, 3, 2, 1)
        x = self.char_cnn(x)
        x = torch.max(F.relu(x), 3)[0]
        return x.view(batch_size, seq_len, -1)

    def forward(self, inp, char_vec):
        batch_size = inp.shape[0]
        embedded = self.embedding(inp)
        if char_vec != None:
            char_emb = self.char_embedding_forward(char_vec)
            embedded = torch.cat([embedded, char_emb], dim=2)
        embedded = self.relu(self.translate(embedded))
        all_, (_, _) = self.lstm_layer(embedded)
        attn = self.attention(all_)
        cont = torch.bmm(attn.permute(0, 2, 1), all_)
        cont = cont.squeeze(1)
        return cont


class AttnBiLSTM_snli(nn.Module):
    def __init__(self, conf):
        super(AttnBiLSTM_snli, self).__init__()
        self.conf = conf
        self.encoder = Attn_Encoder(conf)
        self.fc_in = nn.Linear(
            2 * 4 * self.conf["hidden_size"],
            self.conf["hidden_size"],
        )
        self.fcs = nn.ModuleList(
            [
                nn.Linear(self.conf["hidden_size"], self.conf["hidden_size"])
                for i in range(self.conf["fcs"])
            ]
        )
        self.fc_out = nn.Linear(self.conf["hidden_size"], 3)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=self.conf["dropout"])

    def forward(self, x0, x1,**kwargs):
        char_vec_x0 = kwargs.get("char_premise",None)
        char_vec_x1 = kwargs.get("char_hypothesis",None)
        x0_enc = self.encoder(x0,char_vec_x0)
        x1_enc = self.encoder(x1,char_vec_x1)
        cont = torch.cat(
            [x0_enc, x1_enc, torch.abs(x0_enc - x1_enc), x0_enc * x1_enc], dim=1
        )
        opt = self.fc_in(cont)
        opt = self.dropout(opt)
        for fc in self.fcs:
            opt = self.relu(self.dropout(fc(opt)))
        opt = self.fc_out(opt)
        return opt


def attn_bilstm_snli(options):
    return AttnBiLSTM_snli(options)


In [11]:
model_conf = {
    "hidden_size":300,
    "embedding_dim":300,
    "char_embedding_dim":100,
    "dropout":0.3,
    "use_glove":True,
    "num_layers":1,
    "dataset":"snli",
    "fcs":1,
    "use_char_emb":True,
    "vocab_size":dataset.vocab_size(),
    "char_vocab_size":dataset.char_vocab_size(),
    "max_word_len": dataset.char_word_len(),
    "tokenizer":"spacy",
    "padding_idx":dataset.padding_idx(),
    "attention_layer_param":200,
    # "r":3,
    # "gated_embedding_dim":150,
    # "pool_strategy":'max',
    # "gated":True
}

hparams = {
    "optimizer_base": {
        "optim": "adamw",
        "lr": 0.0010039910781394373,
        "scheduler": "const",
    },
    "optimizer_tune": {
        "optim": "adam",
        "lr": 0.0010039910781394373,
        "weight_decay": 0.1,
        "scheduler": "lambda",
    },
    "switch_epoch": 5,
}

model = AttnBiLSTM_snli(model_conf)
# model = SNLI_model(attn_bilstm_snli,model_conf,hparams=hparams)

In [7]:
for i in dataset.train_dataloader():
    print(i)
    break



[torchtext.data.batch.Batch of size 128 from SNLI]
	[.premise]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.hypothesis]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 128 (GPU 0)]


In [12]:
opt = model(i.premise.cpu(),i.hypothesis.cpu(),**kwargs)

torch.Size([128, 50])
torch.Size([128, 50, 10])
torch.Size([128, 50])
torch.Size([128, 50, 10])


In [8]:
from torch.autograd import Variable
kwargs={}

char_premise = Variable(
    torch.LongTensor(dataset.data.characterize(i.premise))
)
char_hypothesis = Variable(
    torch.LongTensor(dataset.data.characterize(i.hypothesis))
)

char_premise = char_premise.cpu()
char_hypothesis = char_hypothesis.cpu()

kwargs["char_premise"] = char_premise
kwargs["char_hypothesis"] = char_hypothesis

In [157]:
a = torch.rand((128,600,3))
b = torch.rand((3,600,150))
#.unsqueeze(0).repeat(128,1,1,1)

In [34]:
values = [len(w) for w in dataset.data.TEXT.vocab.itos]

sorted(zip(values, dataset.data.TEXT.vocab.itos), reverse=True)[:10]



[(43, 'https://www.youtube.com/watch?v=txrsvc25gh8'),
 (32, 'http://fresnosmilemakeovers.com/'),
 (18, 'telecommunications'),
 (17, 'underconstruction'),
 (17, 'telecommunication'),
 (17, 'sings,(presumably'),
 (17, 'extraterrestrials'),
 (17, 'environmentalists'),
 (17, 'anashthesiologist'),
 (16, 'unprofessionally')]

In [32]:
dataset.data.TEXT.vocab.itos[28118]

'https://www.youtube.com/watch?v=txrsvc25gh8'

In [156]:
a.permute(0,2,1) @ b

RuntimeError: The size of tensor a (128) must match the size of tensor b (3) at non-singleton dimension 0

In [11]:
opt = model(i.premise,i.hypothesis)

In [15]:
opt = opt.squeeze(0)

In [24]:
torch.rand([2, 128, 300])[-2:].transpose(0,1).contiguous().view(128,-1).shape

torch.Size([128, 600])

In [66]:
import time
time.strftime('%H:%M:%S', time.gmtime(time.time()))

'02:23:36'

In [171]:

all_ = torch.rand((128,40,400))
attn = torch.rand((128,40,3))

In [172]:
cont = torch.bmm(all_.permute(1, 2, 0), attn.permute(1, 0, 2)).permute(2, 0, 1)

In [173]:
cont.shape

torch.Size([3, 40, 400])

In [5]:
import torch

bilstm_model_data = torch.load("results/bilstm/snli/best-bilstm-snli-params.pt")

In [9]:
from src.model.nli_models import *

In [12]:
model = bilstm_snli(bilstm_model_data["options"])

In [15]:
model.load_state_dict(bilstm_model_data["model_dict"])

<All keys matched successfully>

In [17]:
model.encoder

BiLSTM_encoder(
  (embedding): Embedding(39927, 300, padding_idx=1)
  (projection): Linear(in_features=300, out_features=400, bias=True)
  (lstm): LSTM(400, 400, batch_first=True, dropout=0.3, bidirectional=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
)

In [24]:
dataset.data.TEXT.tokenize("Hello how are you")

['Hello', 'how', 'are', 'you']

In [41]:
dataset.data.TEXT.tokenize("Hello how are you doing hackobbs")

['hello', 'how', 'are', 'you', 'doing', 'hack', '##ob', '##bs']

In [30]:
from torchtext.data import Field, Iterator, TabularDataset, NestedField, LabelField, BucketIterator
from torchtext import datasets

In [31]:
TEXT = Field(lower=True, tokenize='spacy', batch_first = True)
LABEL = Field(sequential=False, unk_token = None, is_target = True)

In [48]:
train, dev, test = datasets.SNLI.splits(TEXT, LABEL)

[7592, 2129, 2024, 2017]

In [8]:
nesting_field = Field(pad_token='<c>', init_token='<w>', eos_token='</w>')

In [9]:
field = NestedField(nesting_field, init_token='<s>', eos_token='</s>')


In [10]:
minibatch = [[list('john'), list('loves'), list('mary')],
[list('mary'), list('cries')],]

In [11]:
padded = field.pad(minibatch)


In [13]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(padded)

[   [   ['<w>', '<s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
        ['<w>', 'j', 'o', 'h', 'n', '</w>', '<c>'],
        ['<w>', 'l', 'o', 'v', 'e', 's', '</w>'],
        ['<w>', 'm', 'a', 'r', 'y', '</w>', '<c>'],
        ['<w>', '</s>', '</w>', '<c>', '<c>', '<c>', '<c>']],
    [   ['<w>', '<s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
        ['<w>', 'm', 'a', 'r', 'y', '</w>', '<c>'],
        ['<w>', 'c', 'r', 'i', 'e', 's', '</w>'],
        ['<w>', '</s>', '</w>', '<c>', '<c>', '<c>', '<c>'],
        ['<c>', '<c>', '<c>', '<c>', '<c>', '<c>', '<c>']]]


In [49]:
TEXT = Field(sequential=True, tokenize=list, fix_length=1014)

In [50]:
train,dev,test = datasets.SNLI.splits(TEXT, LABEL)

In [37]:
TEXT.build_vocab(train, max_size=10000)
LABEL.build_vocab(train)



In [24]:

def onehot_encode(self, texts):
    # A helper function to do the one-hot encoding of characters.
    sen_len, batch_size = texts.shape
    out = torch.zeros(size=(sen_len, batch_size, self.voc_size), device=texts.device)
    out.scatter_(2, texts.view(sen_len, batch_size, 1), 1)
    return out.permute(1, 2, 0)

In [38]:
train_iterator = Iterator(
        train,
        device='cuda',
        batch_size=128,
        sort_key=lambda x: len(x.text),
        repeat=False,
        train=True)

In [39]:
for i in train_iterator:
    print(i)
    break


[torchtext.data.batch.Batch of size 128 from SNLI]
	[.premise]:[torch.cuda.LongTensor of size 1014x128 (GPU 0)]
	[.hypothesis]:[torch.cuda.LongTensor of size 1014x128 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 128 (GPU 0)]


In [45]:
"".join([TEXT.vocab.itos[i] for i in i.premise[0]])

'PAAAMAaATATtATTAAAAOATAAAACAAAMTATAAAAAAFFTSAAAAVAACSAAAATAAAAAYAAAAAAAATTTTTaAPAAWATTAAAAAGAAATTAAAAaASAAATYFWSFTTAAATAATAAAAYA'

In [46]:
"".join([TEXT.vocab.itos[i] for i in i.hypothesis[0]])

'PTTTTttTTNTtACTtAANTTTATTATSTtMTATAAATAATFTGTTTAOTACSAAAATTtAAaASTTTAAtaTTTTtNAtHASAATTAAaAaAAATTATTPAAOTATTAOATFATAtATBcTTACMYA'

In [47]:
i.hypothesis[0]

tensor([33, 26, 26, 26, 26,  8,  8, 26, 26, 47, 26,  8, 25, 36, 26,  8, 25, 25,
        47, 26, 26, 26, 25, 26, 26, 25, 26, 32, 26,  8, 34, 26, 25, 26, 25, 25,
        25, 26, 25, 25, 26, 37, 26, 43, 26, 26, 26, 25, 42, 26, 25, 36, 32, 25,
        25, 25, 25, 26, 26,  8, 25, 25,  4, 25, 32, 26, 26, 26, 25, 25,  8,  4,
        26, 26, 26, 26,  8, 47, 25,  8, 45, 25, 32, 25, 25, 26, 26, 25, 25,  4,
        25,  4, 25, 25, 25, 26, 26, 25, 26, 26, 33, 25, 25, 42, 26, 25, 26, 26,
        25, 42, 25, 26, 37, 25, 26, 25,  8, 25, 26, 40, 17, 26, 26, 25, 36, 34,
        48, 25], device='cuda:0')

In [48]:
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {' ': 2,
             '!': 70,
             '"': 44,
             '#': 73,
             '$': 79,
             '%': 80,
             '&': 72,
             "'": 35,
             '(': 68,
             ')': 69,
             '*': 87,
             '+': 84,
             ',': 28,
             '-': 31,
             '.': 20,
             '/': 76,
             '0': 58,
             '1': 59,
             '2': 54,
             '3': 57,
             '4': 62,
             '5': 61,
             '6': 66,
             '7': 71,
             '8': 67,
             '9': 64,
             ':': 77,
             ';': 63,
             '<': 86,
             '<pad>': 1,
             '<unk>': 0,
             '=': 85,
             '>': 88,
             '?': 75,
             '@': 83,
             'A': 25,
             'B': 40,
             'C': 36,
             'D': 50,
             'E': 51,
             'F': 37,
             'G': 43,
            

In [None]:
datasets.SNLI()