In [63]:
from src.model.nli_models import *
from src.model.novelty_models import *
from src.defaults import *
from torchtext.data import Example 
import pandas as pd
import numpy as np
import html
import random
from IPython.core.display import display, HTML
from IPython.display import IFrame
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
from transformers import BertTokenizer, DistilBertTokenizer
warnings.filterwarnings("ignore")

def encode_text(text,field):
    """Encode one raw text string into a tensor using ``field``.

    The text is wrapped in a single torchtext ``Example`` under the attribute
    name ``text``; that attribute is then handed to ``field.process`` as a
    batch containing exactly one item.

    Args:
        text: Raw document text.
        field: torchtext field providing ``process`` (presumably the nested
            field returned by ``load_field`` -- TODO confirm).

    Returns:
        The result of ``torch.tensor`` applied to the processed batch.
    """
    field_spec = [("text", field)]
    example = Example.fromlist([text], field_spec)
    single_batch = [example.text]
    encoded = field.process(single_batch)
    return torch.tensor(encoded)

def load_novelty_model(_id):
    """Rebuild a trained novelty model and its text field from ``./results``.

    Steps, in order: validate ``_id`` with ``check_model``, load the field,
    read ``./results/<_id>/model.pt``, look up the id of the NLI encoder the
    model was trained on (``options["load_nli"]``), rebuild that encoder,
    then construct the ``HAN`` model around it and restore its weights.

    Args:
        _id: Experiment id of the novelty model (e.g. ``'NOV-1146'``).

    Returns:
        Tuple ``(model, field)``: the restored ``HAN`` model and the field
        returned by ``load_field(_id)``.
    """
    check_model(_id)

    def load_model_data(_id):
        model_path = os.path.join("./results/", _id, "model.pt")
        # map_location lets a checkpoint saved from a GPU run be read on a
        # CPU-only machine; load_state_dict below copies the values into the
        # freshly constructed modules, so the returned model is unaffected.
        return torch.load(model_path, map_location="cpu")

    def load_encoder(enc_data):
        options = enc_data["options"]
        # Every encoder variant is built with use_glove disabled (presumably
        # because the embeddings come from the saved state dict -- TODO confirm).
        options["use_glove"] = False
        if options.get("attention_layer_param", 0) == 0:
            model = bilstm_snli(options)
        elif options.get("r", 0) == 0:
            model = attn_bilstm_snli(options)
        else:
            model = struc_attn_snli(options)
        model.load_state_dict(enc_data["model_dict"])
        return model

    field = load_field(_id)
    model_data = load_model_data(_id)
    encoder_id = model_data["options"]["load_nli"]
    check_model(encoder_id)

    enc_data = load_encoder_data(encoder_id)
    # Only the sentence encoder sub-module of the NLI model is reused.
    encoder = load_encoder(enc_data).encoder

    model = HAN(model_data["options"],encoder)
    model.load_state_dict(model_data["model_dict"])
    return model,field

def decode(inp,field):
    """Map a 2-D collection of token ids back to token strings.

    If ``field.nesting_field`` carries a ``vocab``, ids are looked up in
    ``vocab.itos``. Otherwise the ids are converted with the
    ``distilbert-base-uncased`` tokenizer.

    Args:
        inp: Rows of token ids (one row per sentence). The tokenizer branch
            additionally requires ``inp`` to provide ``tolist()``.
        field: Object exposing ``nesting_field``.

    Returns:
        A list of lists of token strings, one inner list per row of ``inp``.
    """
    nested = field.nesting_field
    if not hasattr(nested, "vocab"):
        # No vocabulary attached: fall back to the DistilBERT word pieces.
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        return [tokenizer.convert_ids_to_tokens(row) for row in inp.tolist()]

    vocab = nested.vocab
    decoded = []
    for sentence in inp:
        decoded.append([vocab.itos[token_id] for token_id in sentence])
    return decoded


def attention_combined(inp,field,s_att,w_att=None):
    """Pair decoded tokens with their word- and sentence-level attention.

    Args:
        inp: Token ids of one document, one row per sentence (passed to
            ``decode``).
        field: Field used by ``decode`` to turn ids into token strings.
        s_att: Sentence attention; ``s_att[k]`` must hold a single value
            (it is read with ``.item()``) and ``s_att.shape[0]`` must equal
            the number of sentences.
        w_att: Optional word attention indexed as ``w_att[sentence][word]``.
            When omitted, every word gets weight ``0.0`` so that only the
            sentence-level attention carries information.

    Returns:
        A list with one ``(words, sentence_weight)`` entry per sentence that
        contains at least one non-padding token, where ``words`` is a list of
        ``(token, word_weight)`` tuples. Padding tokens (``"<pad>"`` and
        ``"[PAD]"``) are dropped.
    """
    tok_str = decode(inp,field)
    assert len(tok_str) == s_att.shape[0]
    # The default w_att=None used to crash on `.shape`; only check the word
    # attention when it was actually supplied.
    if w_att is not None:
        assert len(tok_str) == w_att.shape[0]
        # An empty document has no first row to compare the width against.
        assert not tok_str or len(tok_str[0]) == w_att.shape[1]

    pad_tokens = ("<pad>", '[PAD]')
    opt = []
    for sent_idx, sent_tokens in enumerate(tok_str):
        sent_with_att = []
        for word_idx, word_str in enumerate(sent_tokens):
            if word_str in pad_tokens:
                continue
            if w_att is None:
                weight = 0.0
            else:
                weight = w_att[sent_idx][word_idx].item()
            sent_with_att.append((word_str, weight))
        # Sentences made up entirely of padding are left out of the result.
        if sent_with_att:
            opt.append((sent_with_att,s_att[sent_idx].item()))
    return opt
        


def html_string(word,color,new_line = False):
    """Wrap ``word`` in a ``<span>`` with the given background colour.

    Args:
        word: Text to show. It is HTML-escaped, so tokens such as ``<unk>``
            are rendered literally instead of being parsed as tags.
        color: CSS colour value used for ``background-color``.
        new_line: When true, a ``<br>`` is appended after the span.

    Returns:
        The HTML snippet as a string; the word is padded with one
        non-breaking space on each side.
    """
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    # Escape before padding so the &nbsp; entities themselves stay intact.
    padded_word = '&nbsp;' + html.escape(word) + '&nbsp;'
    colored_string = template.format(color, padded_word)
    if new_line:
        colored_string += "<br>"
    return colored_string


def colorize(attention_list, sent_scale=5):
    """Render an attention map as coloured HTML.

    Each sentence becomes one line: a ``---`` marker shaded with the
    ``Blues`` colormap according to the sentence attention, followed by the
    sentence's words shaded with the ``Reds`` colormap according to their
    word attention.

    Args:
        attention_list: Iterable of ``(words, sentence_weight)`` pairs where
            ``words`` is an iterable of ``(token, word_weight)`` pairs, i.e.
            the structure returned by ``attention_combined``.
        sent_scale: Factor applied to the sentence weight before it is fed
            to the colormap. Defaults to 5, the value that was previously
            hard-coded.

    Returns:
        The HTML string: one ``<br>``-terminated line per sentence followed
        by three trailing ``<br>`` tags.
    """
    cmap_sent = matplotlib.cm.Blues
    cmap_word = matplotlib.cm.Reds

    # Collect fragments and join once instead of growing a string with +=.
    pieces = []
    for sent, sent_att in attention_list:
        sent_color = matplotlib.colors.rgb2hex(cmap_sent(sent_att*sent_scale)[:3])
        pieces.append(html_string('\t---\t ',sent_color))
        for word,word_att in sent:
            word_color = matplotlib.colors.rgb2hex(cmap_word(word_att)[:3])
            pieces.append(html_string(word,word_color))
        pieces.append("<br>")
    pieces.append("<br><br><br>")
    return "".join(pieces)

# NOTE(review): an unreachable `seed_torch()` call used to sit after the
# return above. If seeding is wanted, call it explicitly at module level.

def plot_attention(src,trg,model,field,true_cls = False):
    """Run the model on a (source, target) pair and save an attention rendering.

    Both texts are encoded with ``encode_text`` and passed through
    ``model.forward_with_attn``. The returned attentions are turned into HTML
    with ``attention_combined`` and ``colorize``, and the result, followed by
    a prediction line, is written to ``colorize.html`` in the current working
    directory (overwriting any previous file).

    Args:
        src: Source document text.
        trg: Target document text.
        model: Model exposing ``forward_with_attn(source, target)`` which
            returns ``(logits, source_attention, target_attention)``.
        field: Field used for encoding and decoding the texts.
        true_cls: Optional gold label string. When truthy, a "True Class"
            line is appended to the rendering.

    Returns:
        None. The only output is the ``colorize.html`` file.
    """
    cmap_word = matplotlib.cm.inferno
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'

    s_enc = encode_text(src,field)
    t_enc = encode_text(trg,field)

    # Feed the model on whatever device it currently lives on, so the call
    # works both before and after model.cuda(). The CPU copies are kept for
    # decoding the tokens below.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else s_enc.device

    model.eval()
    with torch.no_grad():
        opt,s_att,t_att = model.forward_with_attn(s_enc.to(device),t_enc.to(device))
        # Explicit dim: normalise over the class axis of the (batch, class)
        # logits. This is what the deprecated implicit dim picked for 2-D input.
        pred = F.softmax(opt, dim=1)

    # Element 0 of each attention result is passed as sentence attention
    # (transposed), element 1 (first batch entry) as word attention.
    src_att_map = attention_combined(s_enc[0],field,s_att[0].permute((1,0)),s_att[1][0])
    trg_att_map = attention_combined(t_enc[0],field,t_att[0].permute((1,0)),t_att[1][0])

    s_html = colorize(src_att_map)
    t_html = colorize(trg_att_map)

    # Class index 0 is reported as "Non-Novel", index 1 as "Novel".
    non_novel_prob = pred[0][0].item()
    if non_novel_prob > 0.5:
        prob, label = non_novel_prob, "Non-Novel"
    else:
        prob, label = pred[0][1].item(), "Novel"
    pred_str = "Prediction :    " + str(prob) + "   " + label

    # The prediction line is shaded by the probability of the reported class.
    col = matplotlib.colors.rgb2hex(cmap_word(prob)[:3])
    pred_html = template.format(col,pred_str)

    if true_cls:
        pred_html += "<br> " +template.format(col," True Class :   "+true_cls)

    # Tokens may contain non-ASCII characters; do not depend on the platform
    # default encoding.
    with open('colorize.html', 'w', encoding='utf-8') as f:
        f.write(s_html+t_html+ "<br><br><br>"+pred_html )
    


def disp_attention(path='./colorize.html', width=1200, height=400):
    """Return an ``IFrame`` showing the saved attention rendering.

    The frame is returned rather than discarded, so that ending a notebook
    cell with ``disp_attention()`` actually displays it.

    Args:
        path: Location of the HTML file to embed.
        width: Frame width passed to ``IFrame``.
        height: Frame height passed to ``IFrame``.

    Returns:
        The ``IPython.display.IFrame`` instance.
    """
    return IFrame(path, width=width, height=height)




In [51]:
model,field = load_novelty_model('NOV-1146') # 54,46

In [64]:
source = "We also experimented with the document encoder to find if document level pretraining has any impact on the novelty detection performance. We train our document encoder described in on the Reuters dataset with an objective of 10 class classification. The reuters dataset aligns with the dataset we use for novelty detection, the Reuters dataset contains news articles which are to be classified into categories like Investment, Shipping, Crop, Oil and so on"



In [65]:
target = "Identifing each of these classes requires the ability to extract features which tell which industry the news is related to. We hypothesise that this information is also essential while calculating the novelty of a document, since knowing if the target document is talking about the same thing or topic is also important. This can be seen as assisting the information filtering task. For this experiment we have 3 settings, we test the impact with and without pretraining for Reuters dataset and Reuters+NLI dataset combined. The settings used are listed below."


In [66]:
a = plot_attention(source,target,model,field)

In [67]:
IFrame('./colorize.html',width=2200,height=1000)


In [68]:
import json

In [8]:
# Load the DLND corpus: the file is read line by line and every line is
# parsed as one JSON record.
with open('.data/dlnd/TAP-DLND-1.0_LREC2018_modified/dlnd.jsonl','r') as f:
    items = f.readlines()
data = list(map(json.loads, items))

In [69]:
# Render one dataset record and compare the prediction with its gold label.
example = data[120]
print("Prediction:")
# plot_attention returns nothing and prints nothing: the rendering, including
# the prediction line, is written to colorize.html and must be displayed
# separately.
plot_attention(example["source"],example["target_text"],model,field,example["DLA"])
print("Actual:")
# Last expression of the cell, so the notebook echoes the gold label.
example["DLA"]

Prediction:
Actual:


'Novel'

In [70]:
IFrame('./colorize.html',width=2200,height=2000)


In [14]:
# Length of every record's 'source' entry, in dataset order.
lens = [len(record['source']) for record in data]
# Index of the record with the shortest source (first one on ties).
shortest_len = min(lens)
print(lens.index(shortest_len))


4765


In [15]:
# Pair every length with its dataset index: [(index, length), ...].
lens = list(enumerate(lens))

In [16]:
model.cuda()

from tqdm import tqdm
def predict(data,model,field):
    """Classify every record and collect the indices of the wrong predictions.

    Args:
        data: Sequence of records with the keys ``'source'``,
            ``'target_text'`` and ``'DLA'`` (the gold label, compared against
            ``"Novel"`` / ``"Non-Novel"``).
        model: Model exposing ``forward_with_attn(source, target)``. Inputs
            are moved to the device of its parameters.
        field: Field used by ``encode_text``.

    Returns:
        List of indices into ``data`` whose predicted label differs from
        ``'DLA'``.
    """
    # Follow the model's device instead of hard-coding .cuda(), so the same
    # function also works with a CPU model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Switching to eval mode once is enough; it used to be redone per record.
    model.eval()

    wrong_id = []
    with torch.no_grad():
        for i, record in enumerate(tqdm(data)):
            s_enc = encode_text(record['source'],field).to(device)
            t_enc = encode_text(record['target_text'],field).to(device)

            # The attention outputs are not needed for scoring.
            opt, _, _ = model.forward_with_attn(s_enc,t_enc)
            # Explicit dim over the class axis; index 1 is the "Novel" class.
            novel_prob = F.softmax(opt, dim=1)[0][1].item()

            label = "Novel" if novel_prob > 0.5 else "Non-Novel"
            if label != record['DLA']:
                wrong_id.append(i)
    return wrong_id

In [17]:
wrong_id = predict(data,model,field)

100%|██████████| 5435/5435 [02:49<00:00, 32.05it/s]


In [18]:
model.cpu()

HAN(
  (encoder): HAN_DOC(
    (encoder): Attn_Encoder(
      (embedding): Embedding(33934, 300, padding_idx=1)
      (translate): Linear(in_features=300, out_features=400, bias=True)
      (relu): ReLU()
      (dropout): Dropout(p=0.3, inplace=False)
      (lstm_layer): LSTM(400, 400, batch_first=True, dropout=0.3, bidirectional=True)
      (attention): Attention(
        (Ws): Linear(in_features=800, out_features=200, bias=False)
        (Wa): Linear(in_features=200, out_features=1, bias=False)
      )
    )
    (translate): Linear(in_features=800, out_features=400, bias=True)
    (act): ReLU()
    (dropout): Dropout(p=0.3, inplace=False)
    (lstm_layer): LSTM(400, 400, bidirectional=True)
    (attention): StrucSelfAttention(
      (ut_dense): Linear(in_features=800, out_features=200, bias=False)
      (et_dense): Linear(in_features=200, out_features=10, bias=False)
    )
  )
  (act): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=32000, out_features=2,

In [19]:
# Walk the records from shortest to longest source and print the first one
# the model got wrong. `c` ends up as the 1-based rank of that record.
wrong_lookup = set(wrong_id)  # O(1) membership instead of scanning the list per record
c=0
for c, i in enumerate(sorted(lens,key = lambda x:x[1]), start=1):
    if i[0] in wrong_lookup:
        print(i)
        break

(5261, 2128)


In [22]:
wrong_id[0]

191

In [20]:
source = """ORANGE, Calif. — A gunman opened fire at a Southern California real estate office on Wednesday, killing four people, including a 9-year-old boy who the authorities said appeared to have died in his mother’s arms as she tried to shield him from the gunfire.

The shooting was likely related to a “business and personal relationship which existed between the suspect and all of the victims,” Lt. Jennifer Amat, a spokeswoman for the Orange Police Department, said at a news conference on Thursday morning.

“This appears to be an isolated incident, and we believe everyone knew each other,” Lieutenant Amat said.

The woman holding the boy was wounded in the shooting and remained in a hospital on Thursday after emergency treatment, with the Orange County district attorney, Todd Spitzer, cautioning that their relationship had not been formally determined. The police did not provide details about the other victims, a man and two women, because their next of kin had not all been notified.

The suspect, identified as Aminadab Gaxiola Gonzalez, 44, of Fullerton, was also hospitalized in critical condition with a gunshot wound, the authorities said.

Dig deeper into the moment.
Special offer: Subscribe for $1 a week.
Mr. Spitzer called the shooting a “horrific massacre,” and said that officials would learn more as the investigation unfolded. “It is a horrible, horrible tragedy that Mr. Gonzalez made a decision to use deadly force to deal with issues he was dealing with, apparently, in his life,” Mr. Spitzer said.

The attack in Orange, about 30 miles southeast of Los Angeles, occurred at the offices of Unified Homes, a real estate and mobile home dealer, according to the business’s website.

When officers arrived after reports of a shooting around 5:30 p.m., the gates to the complex were closed with bicycle cable locks. The suspect fired shots toward officers, who fired their weapons from outside the gates, Lieutenant Amat said.

Officers then forced their way in, using bolt cutters to enter, officials said. When they reached the courtyard, officers found a wounded Mr. Gonzalez and took him into custody. It was unclear whether the officers struck the suspect or whether his injuries were self-inflicted.

Editors’ Picks

Deals Designed to Lure Travelers Off Their Couches

When Ian Desmond Opted Out, His Work Was Just Starting
Continue reading the main story
At the scene, which covered two floors and a courtyard area of the building, officials recovered a semiautomatic handgun and a backpack containing pepper spray, handcuffs and ammunition, Lieutenant Amat said. The suspect had arrived at the location in a rental car, she said, and was believed to have been living out of a hotel room in Anaheim.

Wednesday’s shooting set people on edge both near and far from the scene because it came shortly after two nationally publicized mass shootings. On March 16, a gunman killed eight people at three spas in the Atlanta area. Six days later, a man stormed a grocery store in Boulder, Colo., and killed 10 people.

Until the shooting in Atlanta, it had been a year since a large-scale shooting in a public place in the United States, according to the Violence Project. But researchers say the kind of violence that unfolded in Orange never went away during the coronavirus pandemic — it simply went out of view. Data from the Violence Project shows that in 2020 there were more than 600 shootings in which at least four people were shot by one person, compared with 417 in 2019.

“Those numbers clearly indicate that it’s not that there were less during the pandemic, but actually more,” Ronnie Dunn, a professor of urban studies at Cleveland State University, said about mass shootings, adding that record gun sales may have added to the spike in shootings.

“It’s almost as if people have become desensitized to the human loss of the shootings in urban areas,” he said.



Emma Soto, 26, who lives in an apartment near the Orange real estate office, was doing laundry on Wednesday when she said she heard seven to 10 gunshots.

“It just sounded like a popping sound,” she said, adding: “We’re hearing of all these shootings going on, so I just thought, ‘Another shooting.’ But we never imagined it would be that close to us.”

Almost immediately after hearing the gunfire, Ms. Soto said, several police vehicles pulled up. She watched as officers emerged with their weapons drawn and ran toward the building.

The neighborhood is typically quiet and peaceful, and it is largely Hispanic, said Ms. Soto, a manager at a nearby big-box store.

Hope Orozco, 27, was with her 3-year-old son at a neighbor’s house when she said she heard the gunfire. She said her son liked to watch her neighbor’s children play Call of Duty, the popular video game. At first, she said, she mistook the commotion outside for gunfire from the game.

“I was like, ‘Wait a minute, is this from the TV?’” Ms. Orozco said. She realized it was real after noticing that all the players were wearing headsets.

Hector Gomez and Edgar Gonzalez work at a roofing business located on the first floor of the building where the shooting occurred. Mr. Gomez said the woman who ran the real estate office would often bring her son to the building.

“He’s a cute little boy,” Mr. Gomez said.

The two men said they were convinced the woman and her son were among the victims. The woman’s S.U.V. was still in the parking lot, they said, as the police conducted their investigation late into the evening.

Mr. Gomez and Mr. Gonzalez usually leave the office around 5:30 p.m., when the shooting happened. On Wednesday, they left early.

“It could have been us,” said Mr. Gomez, who came back with Mr. Gonzalez after hearing about the shooting from their boss. “I don’t want to say this, but it probably would have been us. Because we’re always the last ones here.”


ImageInvestigators at the scene of a shooting in Orange, Calif., where four people were killed.
Investigators at the scene of a shooting in Orange, Calif., where four people were killed.Credit...Allison Zaucha for The New York Times
The squat commercial building where the shooting took place is mostly surrounded by homes and apartment buildings in Orange, a city of 139,000 people less than six miles from Disneyland. Late Wednesday evening, about a dozen police and fire vehicles blocked the wide Lincoln Avenue.

The beige, low-rise building houses several businesses, including a property management company, an insurance agency and a consulting firm.

Lieutenant Amat said Orange had not seen “an incident like this” since a rampage in 1997 at a Caltrans maintenance yard, in which a gunman killed four people and was later killed by the police in a shootout.

Gov. Gavin Newsom of California said on Twitter that he was jolted by the shooting.

“Horrifying and heartbreaking,” he said. “Our hearts are with the families impacted by this terrible tragedy.”"""

In [30]:
target = """he man accused of carrying out a massacre at an office building in Orange Wednesday evening, leaving four people including a child dead, was charged in an assault case six years ago.

Records from the Orange County Superior Court show Aminadab Gonzalez-Gaxiola faced four misdemeanor charges of child abuse and endangerment, assault with a deadly weapon other than a gun, dissuading a witness, and battery for an incident that occurred on March 31, 2015.

Gonzalez-Gaxiola pleaded not guilty to all counts and prosecutors dismissed all but the battery charge at a hearing later that year. The battery charge was dismissed in 2017 after the records show Gonzalez-Gaxiola successfully completed a probationary sentence. 

Other court filings showed Gonzalez-Gaxiola was cited for traffic violations in 2014 and 2015, and one of the cases indicated that, at the time, he was working as a commercial truck driver.



"""

In [31]:
a = plot_attention(source,target,model,field)

0.9970517158508301 Non-Novel


In [32]:
IFrame('./colorize.html',width=2200,height=2000)
