In [19]:
from src.model.nli_models import *
from src.model.novelty_models import *
from src.defaults import *
from torchtext.data import Example 
import pandas as pd
import numpy as np
import html
import random
from IPython.core.display import display, HTML
from IPython.display import IFrame
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
from transformers import BertTokenizer, DistilBertTokenizer
warnings.filterwarnings("ignore")

def encode_text(text, field):
    """Encode one raw text with ``field`` and return the result as a tensor.

    The text is wrapped in a single-attribute ``Example`` (attribute name
    ``text``), that attribute is handed to ``field.process`` as a batch
    containing one item, and the processed batch is passed to ``torch.tensor``.

    Args:
        text: the raw input handed to ``Example.fromlist``.
        field: the field object used both to build the example and to process it.

    Returns:
        ``torch.tensor`` of whatever ``field.process`` produced for the
        one-item batch.
    """
    example = Example.fromlist([text], [("text", field)])
    batch = [example.text]
    return torch.tensor(field.process(batch))

def load_novelty_model(_id):
    """Rebuild a trained novelty model and its field from ``./results/<_id>``.

    Steps, in order:
      1. ``check_model(_id)`` and ``load_field(_id)``.
      2. Load the checkpoint dict stored at ``./results/<_id>/model.pt``.
      3. Read the encoder id from ``options["load_nli"]``, check it, load its
         data with ``load_encoder_data`` and rebuild the matching NLI model.
      4. Build ``HAN`` around that model's ``.encoder`` and restore the
         novelty model's own weights.

    Args:
        _id: identifier of the novelty run (a sub-directory of ``./results``).

    Returns:
        ``(model, field)``: the restored ``HAN`` model and the loaded field.
    """
    check_model(_id)

    def load_model_data(_id):
        model_path = os.path.join("./results/", _id, "model.pt")
        # map_location="cpu" lets a checkpoint saved on a GPU be opened on a
        # CPU-only machine; the weights are copied into the model by
        # load_state_dict below, and callers move the model with .cuda().
        # NOTE(review): torch.load unpickles the file; only open trusted checkpoints.
        return torch.load(model_path, map_location="cpu")

    def load_encoder(enc_data):
        options = enc_data["options"]
        # Every branch disabled GloVe before constructing the model, so do it once.
        # NOTE(review): presumably avoids re-reading GloVe vectors because the
        # state dict already carries the embedding weights; confirm.
        options["use_glove"] = False
        if options.get("attention_layer_param", 0) == 0:
            model = bilstm_snli(options)
        elif options.get("r", 0) == 0:
            model = attn_bilstm_snli(options)
        else:
            model = struc_attn_snli(options)
        model.load_state_dict(enc_data["model_dict"])
        return model

    field = load_field(_id)
    model_data = load_model_data(_id)
    encoder_id = model_data["options"]["load_nli"]
    check_model(encoder_id)

    enc_data = load_encoder_data(encoder_id)
    # Only the sentence encoder of the NLI model is reused inside HAN.
    encoder = load_encoder(enc_data).encoder

    model = HAN(model_data["options"], encoder)
    model.load_state_dict(model_data["model_dict"])
    return model, field

def decode(inp, field):
    """Map nested token ids back to token strings.

    If ``field.nesting_field`` has a ``vocab`` attribute, each id is looked up
    in ``vocab.itos``. Otherwise the ids are converted with the
    ``distilbert-base-uncased`` tokenizer, which requires ``inp`` to provide
    ``.tolist()``.

    Args:
        inp: iterable of id sequences, one inner sequence per sentence.
        field: object exposing ``nesting_field``.

    Returns:
        A list with one list of token strings per inner sequence.
    """
    nested = field.nesting_field
    if not hasattr(nested, "vocab"):
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        return [tokenizer.convert_ids_to_tokens(ids) for ids in inp.tolist()]

    decoded = []
    for sentence in inp:
        decoded.append([nested.vocab.itos[token_id] for token_id in sentence])
    return decoded


def attention_combined(inp,field,s_att,w_att=None):
    """Pair every non-padding token with its word weight and every sentence with its weight.

    Args:
        inp: nested token ids for one document, decoded with ``decode``.
        field: field passed through to ``decode``.
        s_att: per-sentence weights; ``s_att.shape[0]`` must equal the number
            of decoded sentences and ``s_att[k].item()`` must be valid.
        w_att: optional per-word weights indexed ``w_att[sentence][word]``.
            When omitted, every word gets a weight of ``0.0``. Previously the
            declared ``None`` default crashed on ``w_att.shape``.

    Returns:
        A list of ``(words, sentence_weight)`` tuples, where ``words`` is a
        list of ``(token, word_weight)`` tuples. Padding tokens (``<pad>`` and
        ``[PAD]``) are dropped, and so are sentences left with no tokens.
    """
    pad_tokens = ("<pad>", "[PAD]")

    tok_str = decode(inp,field)
    assert len(tok_str) == s_att.shape[0]
    if w_att is not None:
        assert len(tok_str) == w_att.shape[0]
        # Skip the width check for an empty document instead of raising
        # IndexError on tok_str[0].
        assert not tok_str or len(tok_str[0]) == w_att.shape[1]

    opt = []
    for sent_idx, tokens in enumerate(tok_str):
        sent_with_att = []
        # Walk each sentence's own tokens rather than reusing the first row's length.
        for word_idx, word_str in enumerate(tokens):
            if word_str in pad_tokens:
                continue
            weight = w_att[sent_idx][word_idx].item() if w_att is not None else 0.0
            sent_with_att.append((word_str, weight))
        if sent_with_att:
            opt.append((sent_with_att, s_att[sent_idx].item()))
    return opt



def html_string(word,color,new_line = False):
    """Return ``word`` wrapped in a ``<span>`` with ``color`` as its background.

    The word is HTML-escaped first, so tokens such as ``<unk>`` or ``R&D`` are
    shown literally instead of being parsed as markup. It is padded on both
    sides with a non-breaking space.

    Args:
        word: text to show (converted with ``str``).
        color: CSS colour value used for ``background-color``.
        new_line: when true, ``<br>`` is appended after the span.

    Returns:
        The HTML fragment as a string.
    """
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    safe_word = html.escape(str(word))
    colored_string = template.format(color, '&nbsp;' + safe_word + '&nbsp;')
    if new_line:
        colored_string += "<br>"
    return colored_string


def colorize(attention_list, sent_scale=5):
    """Render an attention list as coloured HTML.

    Each sentence becomes one line: a ``---`` marker coloured by the sentence
    weight (``Blues`` colormap), followed by its words coloured by their word
    weights (``Reds`` colormap). Three ``<br>`` tags close the fragment.

    Args:
        attention_list: iterable of ``(words, sentence_weight)`` where
            ``words`` is an iterable of ``(token, word_weight)``.
        sent_scale: factor applied to the sentence weight before the colormap
            lookup. Defaults to 5, the value that used to be hard-coded.
            NOTE(review): presumably there to make small sentence weights
            visible; confirm.

    Returns:
        The HTML fragment as a string.
    """
    cmap_sent = matplotlib.cm.Blues
    cmap_word = matplotlib.cm.Reds

    pieces = []
    for sent, sent_att in attention_list:
        sent_color = matplotlib.colors.rgb2hex(cmap_sent(sent_att*sent_scale)[:3])
        pieces.append(html_string('\t---\t ',sent_color))
        for word,word_att in sent:
            word_color = matplotlib.colors.rgb2hex(cmap_word(word_att)[:3])
            pieces.append(html_string(word,word_color))
        pieces.append("<br>")
    pieces.append("<br><br><br>")
    # The unreachable seed_torch() call that followed the return was removed.
    return "".join(pieces)

def plot_attention(src,trg,model,field,true_cls = False,return_html=False,cuda=False):
    """Run the model on one source/target pair and render its attention as HTML.

    Both texts are encoded with ``encode_text``, passed to
    ``model.forward_with_attn`` in eval mode without gradients, and the
    returned attention values are turned into coloured HTML with
    ``attention_combined`` and ``colorize``. A prediction line follows them.

    Args:
        src: source text.
        trg: target text.
        model: model exposing ``eval()`` and ``forward_with_attn(src, trg)``,
            which returns ``(logits, source_attention, target_attention)``.
        field: field used for encoding and decoding.
        true_cls: optional gold label; when truthy it is appended to the
            output (HTML-escaped).
        return_html: when true, return the page instead of writing it to disk.
        cuda: when truthy, move the encoded inputs to the GPU before the
            forward pass.

    Returns:
        ``(html, pred[0])`` when ``return_html`` is true, where ``pred[0]`` is
        the softmax output for the single example. Otherwise ``None``, after
        writing the page to ``colorize.html`` in the working directory.
    """
    cmap_word = matplotlib.cm.inferno
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'

    s_enc = encode_text(src,field)
    t_enc = encode_text(trg,field)

    model.eval()
    with torch.no_grad():
        if cuda:
            s_enc = s_enc.cuda()
            t_enc = t_enc.cuda()
        opt,s_att,t_att = model.forward_with_attn(s_enc,t_enc)
        # Explicit dim: the implicit-dim form of softmax is deprecated. The
        # indexing below (pred[0][k]) treats the last axis as the class axis.
        pred = F.softmax(opt, dim=-1).cpu()
        s_att = [i.cpu() for i in s_att]
        t_att = [i.cpu() for i in t_att]

    # Element 0 of each attention list is used (transposed) as the sentence
    # weights, element 1 (first batch entry) as the word weights.
    src_att_map = attention_combined(s_enc[0],field,s_att[0].permute((1,0)),s_att[1][0])
    trg_att_map = attention_combined(t_enc[0],field,t_att[0].permute((1,0)),t_att[1][0])

    s_html = colorize(src_att_map)
    t_html = colorize(trg_att_map)

    # Class 0 is reported as "Non-Novel", class 1 as "Novel".
    non_novel_prob = pred[0][0].item()
    if non_novel_prob > 0.5:
        prob, label = non_novel_prob, "Non-Novel"
    else:
        prob, label = pred[0][1].item(), "Novel"
    pred_str = "Prediction :    " + str(prob) + "   " + label

    col = matplotlib.colors.rgb2hex(cmap_word(prob)[:3])
    pred_html = template.format(col,pred_str)

    if true_cls:
        # Escape the label: it comes from the dataset, not from this code.
        pred_html += "<br> " + template.format(col," True Class :   " + html.escape(str(true_cls)))

    page = s_html + t_html + "<br><br><br>" + pred_html
    if return_html:
        return page, pred[0]
    with open('colorize.html', 'w') as f:
        f.write(page)
    


def disp_attention(path='./colorize.html', width=1200, height=400):
    """Return an ``IFrame`` showing the rendered attention page.

    The frame is returned rather than discarded, so it is displayed when this
    call is the last expression of a cell or is passed to ``display``.

    Args:
        path: location of the HTML file to embed.
        width: frame width.
        height: frame height.

    Returns:
        The ``IFrame`` object.
    """
    return IFrame(path, width=width, height=height)




In [15]:
# Restore the novelty model and its field for run 'NOV-1146' (read from ./results/NOV-1146).
# NOTE(review): the meaning of the trailing "54,46" is not recorded in this notebook.
model,field = load_novelty_model('NOV-1146') # 54,46

In [16]:
# Hand-written source passage; passed as `src` to plot_attention in a later cell.
source = "We also experimented with the document encoder to find if document level pretraining has any impact on the novelty detection performance. We train our document encoder described in on the Reuters dataset with an objective of 10 class classification. The reuters dataset aligns with the dataset we use for novelty detection, the Reuters dataset contains news articles which are to be classified into categories like Investment, Shipping, Crop, Oil and so on"



In [17]:
# Hand-written target passage; passed as `trg` to plot_attention in a later cell.
target = "Identifing each of these classes requires the ability to extract features which tell which industry the news is related to. We hypothesise that this information is also essential while calculating the novelty of a document, since knowing if the target document is talking about the same thing or topic is also important. This can be seen as assisting the information filtering task. For this experiment we have 3 settings, we test the impact with and without pretraining for Reuters dataset and Reuters+NLI dataset combined. The settings used are listed below."


In [20]:
# Writes the attention page for the pair to ./colorize.html. With return_html left
# at its default the call returns nothing, so `a` is None after this cell.
a = plot_attention(source,target,model,field)

In [21]:
# Embed ./colorize.html (the file plot_attention writes) as this cell's output.
IFrame('./colorize.html',width=2200,height=1000)


In [68]:
import json

In [8]:
# Load the DLND examples: one JSON object per line.
# - The encoding is stated explicitly so the result does not depend on the
#   platform's default locale.
# - Blank lines (for example a trailing newline at end of file) are skipped
#   instead of crashing json.loads.
with open('.data/dlnd/TAP-DLND-1.0_LREC2018_modified/dlnd.jsonl','r',encoding='utf-8') as f:
    items = [line for line in f if line.strip()]
data = [json.loads(i) for i in items]

In [69]:
# Inspect one dataset example (index 120) with its gold label shown alongside.
example = data[120]
print("Prediction:")
# plot_attention is called without return_html, so it writes ./colorize.html and
# returns nothing: the prediction is in that file, not printed under this heading.
plot_attention(example["source"],example["target_text"],model,field,example["DLA"])
print("Actual:")
# Last expression of the cell: the gold label is shown as the cell's output.
example["DLA"]

Prediction:
Actual:


'Novel'

In [70]:
# Embed ./colorize.html (the file plot_attention writes) as this cell's output.
IFrame('./colorize.html',width=2200,height=2000)


In [14]:
# len() of every record's 'source' value, in dataset order.
lens = [len(record['source']) for record in data]
# Position of the smallest length (the first one if several tie).
print(lens.index(min(lens)))


4765


In [15]:
# (dataset index, len of 'source') pairs. Built from `data` rather than from the
# previous value of `lens`, so re-running this cell gives the same result
# instead of wrapping the pairs again.
lens = [(idx, len(record['source'])) for idx, record in enumerate(data)]

In [None]:
# Move the model to the GPU; the predict function defined in this cell calls
# plot_attention with cuda=True, which moves the encoded inputs there as well.
model.cuda()

from tqdm import tqdm
def predict(data,model,field):
    """Write one attention HTML page per example, sorted into correct/wrong folders.

    For every example the page returned by ``plot_attention`` (run with
    ``cuda=True``) is saved as ``<index>.html`` under
    ``./results/all_pred/correct_pred`` when the predicted label equals the
    example's ``DLA`` value, and under ``./results/all_pred/wrong_pred``
    otherwise.

    NOTE(review): a later cell defines another function named ``predict``;
    once that cell runs it replaces this one.

    Args:
        data: sequence of dicts with ``source``, ``target_text`` and ``DLA`` keys.
        model: model handed to ``plot_attention``; expected to be on the GPU already.
        field: field handed to ``plot_attention``.

    Returns:
        None.
    """
    wrong_pred_path = './results/all_pred/wrong_pred'
    correct_pred_path = './results/all_pred/correct_pred'
    # Create each directory independently. Previously only one path was checked,
    # so a half-created layout raised FileExistsError or left a directory missing.
    os.makedirs(wrong_pred_path, exist_ok=True)
    os.makedirs(correct_pred_path, exist_ok=True)

    for i in tqdm(range(len(data))):
        src = data[i]['source']
        trg = data[i]['target_text']
        true = data[i]['DLA']
        html_str,pred = plot_attention(src,trg,model,field,true_cls = true,return_html=True,cuda=True)
        # pred[0] is the value plot_attention reports as "Non-Novel".
        pred_lab = "Non-Novel" if pred[0]>0.5 else "Novel"

        out_dir = correct_pred_path if pred_lab == true else wrong_pred_path
        html_path = os.path.join(out_dir,str(i)+".html")
        with open(html_path,'w') as f:
            f.write(html_str)
        

In [16]:
# Move the model to the GPU before the bulk prediction pass defined in this cell.
model.cuda()

from tqdm import tqdm
def predict(data,model,field):
    """Return the indices of the examples the model labels differently from ``DLA``.

    Each example's ``source`` and ``target_text`` are encoded with
    ``encode_text`` and passed to ``model.forward_with_attn``. The example is
    predicted "Novel" when the softmax value at index 1 exceeds 0.5, and
    "Non-Novel" otherwise.

    NOTE(review): this reuses the name of the ``predict`` defined in an earlier
    cell and replaces it once this cell runs.

    Args:
        data: sequence of dicts with ``source``, ``target_text`` and ``DLA`` keys.
        model: model exposing ``eval()``, ``parameters()`` and
            ``forward_with_attn``.
        field: field used by ``encode_text``.

    Returns:
        List of integer positions in ``data`` whose prediction differs from ``DLA``.
    """
    # Send inputs to wherever the model lives instead of assuming a GPU.
    # This is the same as before when the model has been moved with .cuda().
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    wrong_id = []
    # eval mode and no_grad do not change between examples, so set them once.
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(len(data))):
            src = data[i]['source']
            trg = data[i]['target_text']
            true = data[i]['DLA']
            s_enc = encode_text(src,field).to(device)
            t_enc = encode_text(trg,field).to(device)

            opt,s_att,t_att = model.forward_with_attn(s_enc,t_enc)
            # Explicit dim: the implicit-dim form of softmax is deprecated.
            novel_prob = F.softmax(opt, dim=-1)[0][1].item()
            pred = "Novel" if novel_prob > 0.5 else "Non-Novel"
            if pred!=true:
                wrong_id.append(i)
    return wrong_id

In [17]:
# Indices of the examples whose predicted label differs from the gold 'DLA' label.
wrong_id = predict(data,model,field)

100%|██████████| 5435/5435 [02:49<00:00, 32.05it/s]


In [18]:
# Move the model back to the CPU; as the last expression of the cell, the
# returned module is shown as the cell's output.
model.cpu()

HAN(
  (encoder): HAN_DOC(
    (encoder): Attn_Encoder(
      (embedding): Embedding(33934, 300, padding_idx=1)
      (translate): Linear(in_features=300, out_features=400, bias=True)
      (relu): ReLU()
      (dropout): Dropout(p=0.3, inplace=False)
      (lstm_layer): LSTM(400, 400, batch_first=True, dropout=0.3, bidirectional=True)
      (attention): Attention(
        (Ws): Linear(in_features=800, out_features=200, bias=False)
        (Wa): Linear(in_features=200, out_features=1, bias=False)
      )
    )
    (translate): Linear(in_features=800, out_features=400, bias=True)
    (act): ReLU()
    (dropout): Dropout(p=0.3, inplace=False)
    (lstm_layer): LSTM(400, 400, bidirectional=True)
    (attention): StrucSelfAttention(
      (ut_dense): Linear(in_features=800, out_features=200, bias=False)
      (et_dense): Linear(in_features=200, out_features=10, bias=False)
    )
  )
  (act): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=32000, out_features=2,

In [19]:
# Walk the (index, length) pairs from shortest to longest source and print the
# first one that was misclassified. A set makes each membership test O(1)
# instead of scanning the whole wrong_id list for every candidate.
wrong_lookup = set(wrong_id)
c=0  # counts the pairs inspected, including the one printed
for i in sorted(lens,key = lambda x:x[1]):
    c+=1
    if i[0] in wrong_lookup:
        print(i)
        break

(5261, 2128)


In [22]:
# First entry of wrong_id, shown as the cell's output.
wrong_id[0]

191

In [11]:
# NOTE(review): scratch value. This rebinds `a`, which another cell also assigns
# from plot_attention; nothing visible in this notebook reads it afterwards.
a = 12.92931979