In [1]:
from src.datasets.nli import *
from src.model.nli_models import *
from src.utils.nli_utils import *

In [3]:
snli_conf = {"batch_size":128,"max_len":50,"device":'cuda',"tokenizer":'bert',"use_char_emb":False,"max_word_len":10}
dataset = snli_module(snli_conf)

In [4]:
dataset.prepare_data()

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

"""
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
BiLSTM + Attention based SNLI model
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""


class Attention(nn.Module):
    def __init__(self, conf):
        super(Attention, self).__init__()
        self.Ws = nn.Linear(
            2 * conf["hidden_size"],
            conf["attention_layer_param"],
            bias=False,
        )
        self.Wa = nn.Linear(conf["attention_layer_param"], 1, bias=False)

    def forward(self, hid):
        opt = self.Ws(hid)
        opt = torch.tanh(opt)
        opt = self.Wa(opt)
        opt = F.softmax(opt, dim=1)
        return opt


class Attn_Encoder(nn.Module):
    def __init__(self, conf):
        super(Attn_Encoder, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=conf["vocab_size"],
            embedding_dim=conf["embedding_dim"],
            padding_idx=conf["padding_idx"],
        )
        self.translate = nn.Linear(
            (
                conf["embedding_dim"]
                + int(conf["use_char_emb"]) * conf["char_embedding_dim"]
            ),
            conf["hidden_size"],
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=conf["dropout"])

        if conf["use_glove"]:
            self.embedding = nn.Embedding.from_pretrained(
                torch.load(".vector_cache/{}_vectors.pt".format(conf["dataset"]))
            )

        if conf["use_char_emb"]:
            self.char_embedding = nn.Embedding(
                num_embeddings=conf["char_vocab_size"],
                embedding_dim=conf["char_embedding_dim"],
                padding_idx=0,
            )
            self.char_cnn = nn.Conv2d(
                conf["max_word_len"],
                conf["char_embedding_dim"],
                (1, 6),
                stride=(1, 1),
                padding=0,
                bias=True,
            )
        self.lstm_layer = nn.LSTM(
            input_size=conf["hidden_size"],
            hidden_size=conf["hidden_size"],
            num_layers=conf["num_layers"],
            dropout=conf["dropout"],
            bidirectional=True,
            batch_first=True,
        )
        self.attention = Attention(conf)

    def char_embedding_forward(self, x):
        # X - [batch_size, seq_len, char_emb_size])
        batch_size, seq_len, char_emb_size = x.shape
        x = x.view(-1, char_emb_size)
        x = self.char_embedding(x)  # (batch_size * seq_len, char_emb_size, emb_size)
        x = x.view(batch_size, -1, seq_len, char_emb_size)
        x = x.permute(0, 3, 2, 1)
        x = self.char_cnn(x)
        x = torch.max(F.relu(x), 3)[0]
        return x.view(batch_size, seq_len, -1)

    def forward(self, inp, char_vec):
        batch_size = inp.shape[0]
        embedded = self.embedding(inp)
        if char_vec != None:
            char_emb = self.char_embedding_forward(char_vec)
            embedded = torch.cat([embedded, char_emb], dim=2)
        embedded = self.relu(self.translate(embedded))
        all_, (_, _) = self.lstm_layer(embedded)
        attn = self.attention(all_)
        cont = torch.bmm(attn.permute(0, 2, 1), all_)
        cont = cont.squeeze(1)
        return cont


class AttnBiLSTM_snli(nn.Module):
    def __init__(self, conf):
        super(AttnBiLSTM_snli, self).__init__()
        self.conf = conf
        self.encoder = Attn_Encoder(conf)
        self.fc_in = nn.Linear(
            2 * 4 * self.conf["hidden_size"],
            self.conf["hidden_size"],
        )
        self.fcs = nn.ModuleList(
            [
                nn.Linear(self.conf["hidden_size"], self.conf["hidden_size"])
                for i in range(self.conf["fcs"])
            ]
        )
        self.fc_out = nn.Linear(self.conf["hidden_size"], 3)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=self.conf["dropout"])

    def forward(self, x0, x1,**kwargs):
        char_vec_x0 = kwargs.get("char_premise",None)
        char_vec_x1 = kwargs.get("char_hypothesis",None)
        x0_enc = self.encoder(x0,char_vec_x0)
        x1_enc = self.encoder(x1,char_vec_x1)
        cont = torch.cat(
            [x0_enc, x1_enc, torch.abs(x0_enc - x1_enc), x0_enc * x1_enc], dim=1
        )
        opt = self.fc_in(cont)
        opt = self.dropout(opt)
        for fc in self.fcs:
            opt = self.relu(self.dropout(fc(opt)))
        opt = self.fc_out(opt)
        return opt


def attn_bilstm_snli(options):
    return AttnBiLSTM_snli(options)


In [11]:
model_conf = {
    "hidden_size":300,
    "embedding_dim":300,
    "char_embedding_dim":100,
    "dropout":0.3,
    "use_glove":True,
    "num_layers":1,
    "dataset":"snli",
    "fcs":1,
    "use_char_emb":True,
    "vocab_size":dataset.vocab_size(),
    "char_vocab_size":dataset.char_vocab_size(),
    "max_word_len": dataset.char_word_len(),
    "tokenizer":"spacy",
    "padding_idx":dataset.padding_idx(),
    "attention_layer_param":200,
    # "r":3,
    # "gated_embedding_dim":150,
    # "pool_strategy":'max',
    # "gated":True
}

hparams = {
    "optimizer_base": {
        "optim": "adamw",
        "lr": 0.0010039910781394373,
        "scheduler": "const",
    },
    "optimizer_tune": {
        "optim": "adam",
        "lr": 0.0010039910781394373,
        "weight_decay": 0.1,
        "scheduler": "lambda",
    },
    "switch_epoch": 5,
}

model = AttnBiLSTM_snli(model_conf)
# model = SNLI_model(attn_bilstm_snli,model_conf,hparams=hparams)

In [7]:
for i in dataset.train_dataloader():
    print(i)
    break



[torchtext.data.batch.Batch of size 128 from SNLI]
	[.premise]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.hypothesis]:[torch.cuda.LongTensor of size 128x50 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 128 (GPU 0)]


In [6]:
TEXT = dataset.data.TEXT

In [7]:
import dill

with open("./temp/TEXT.Field","wb")as f:
     dill.dump(TEXT,f)



In [1]:
import dill

with open("temp/TEXT.Field","rb")as f:
     TEXT=dill.load(f)

In [5]:
TEXT.preprocess(["Hello how are you"])

['hello how are you']

In [6]:
TEXT.process(TEXT.preprocess(["Hello how are you"]))

tensor([[    2, 13656,  6344,  9152,  9152,  3294,   517, 13656,  3294,  8023,
           517,     4,  5711,  6344,   517, 17802,  3294,  7047,     3,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]])

In [1]:
from src.utils.download_utils import download_from_url

In [2]:
download_from_url('https://drive.google.com/file/d/1q-P3ReGf-yWnKrhb6XQAuMGo39hXlhYG/view?usp=sharing','./.data/dlnd/dlnd.zip')

downloading from Google Drive; may take a few minutes


In [3]:
import shutil

In [5]:
shutil.unpack_archive('/content/NoveltyDetectionResearch/.data/dlnd/dlnd.zip','/content/NoveltyDetectionResearch/.data/dlnd','zip')

In [10]:
import re
a = "https://drive.google.com/file/d/1q-P3ReGf-yWnKrhb6XQAuMGo39hXlhYG/view?usp=sharing"

a.split('/')[-2]

'1q-P3ReGf-yWnKrhb6XQAuMGo39hXlhYG'

In [11]:
import glob
import re
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import joblib
import random
import torch
import json
from tqdm import tqdm


def create_dlnd_json(path):
    def get_sources(source):
        source_meta = [
            "/".join(i.split("/")[:-1]) + "/" + i.split("/")[-1].split(".")[0] + ".xml"
            for i in source
        ]
        sources = list(zip(source, source_meta))
        data = []
        for x in sources:
            with open(x[0], mode="r", errors="ignore") as f:
                source_text = f.read()
            root = ET.parse(x[1]).getroot()
            title = root.findall("feature")[0].get("title")
            eventname = root.findall("feature")[1].get("eventname")
            id = x[0].split("/")[-1].split(".")[0]
            data.append(
                {
                    "id": id,
                    "eventname": eventname,
                    "title": title,
                    "source_text": source_text,
                }
            )
        return data

    def get_targets(target):
        target_meta = [
            "/".join(i.split("/")[:-1]) + "/" + i.split("/")[-1].split(".")[0] + ".xml"
            for i in target
        ]
        targets = list(zip(target, target_meta))
        data = []
        for x in targets:
            with open(x[0], mode="r", errors="ignore") as f:
                target_text = f.read()
            # with open(x[1],mode='r',errors='ignore') as f:
            #     print(f.read())
            root = ET.parse(x[1]).getroot()
            novel = root.findall("feature")[2].get("DLA")
            src_id = root.findall("feature")[0].get("sourceid").split(",")
            id = x[0].split("/")[-1].split(".")[0]
            eventname = root.findall("feature")[1].get("eventname")
            data.append(
                {
                    "id": id,
                    "eventname": eventname,
                    "target_text": target_text,
                    "src_id": src_id,
                    "DLA": novel,
                }
            )
        return data

    categories = glob.glob(os.path.join(path, "TAP-DLND-1.0_LREC2018_modified/*"))
    sources = []
    targets = []
    for cat in categories:
        if os.path.isdir(cat):
            topics = glob.glob(cat + "/*")
            for topic in topics:
                source = topic + "/source/*.txt"
                target = topic + "/target/*.txt"
                event_id = topic + "/EventId.txt"
                sources += get_sources(glob.glob(source))
                targets += get_targets(glob.glob(target))

    source_set = {}
    for i in sources:
        source_set[i["id"]] = i
    target_set = {}
    for i in targets:
        target_set[i["id"]] = i

    dataset = {}
    i = 0
    for target in target_set.keys():
        source_text = []
        if len(target_set[target]["src_id"]) > 0 and target_set[target]["src_id"] != [
            ""
        ]:
            for src_id in target_set[target]["src_id"]:
                source_text.append(source_set[src_id]["source_text"])
            dataset[i] = {
                "target_text": target_set[target]["target_text"],
                "source": source_text,
                "DLA": target_set[target]["DLA"],
            }
            i += 1

    with open(os.path.join(path, "dlnd.jsonl"), "w") as f:
        json.dump(list(dataset.values()), f)


In [12]:
create_dlnd_json('.data/dlnd')