In [265]:
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from nltk.tokenize import sent_tokenize
import nltk
import time
import pickle
from torch.utils.data import DataLoader
from torch.optim import AdamW
import random
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.stats import hmean, gmean
import numpy as np
from sklearn.metrics.pairwise import cosine_distances as cosine_distances, cosine_similarity

# Checkpoint location on the cluster scratch filesystem. In this notebook the value is
# only referenced by the commented-out `model.load_state_dict(torch.load(path))` line,
# and the name `path` is assigned again further down before `load_model` is defined.
path = "/global/cscratch1/sd/ajaybati/bertmodelDSe3.pickle"


In [222]:
# Pretrained uncased BERT tokenizer and masked-language-model head.
# output_hidden_states=True is requested because later cells read the per-layer
# hidden states from the model output (index 2 of the returned tuple).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased',output_hidden_states=True)
# model.load_state_dict(torch.load(path))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [223]:
def bleu(p,r):
    """Return the mean smoothed sentence-level BLEU score of predictions vs. references.

    Args:
        p: list of predicted sentences, each a list of token strings.
        r: list of reference sentences (lists of token strings); ``r[i]`` is the
            reference for ``p[i]``.

    Returns:
        The average of the per-sentence BLEU scores as a float, or ``0.0`` when
        ``p`` is empty (there is nothing to average).
    """
    if len(p) == 0:
        # Avoids dividing by zero when averaging an empty list of scores.
        return 0.0

    # Imported locally: the notebook's import cell brings in `nltk` but neither
    # `SmoothingFunction` nor the `nltk.translate` submodule explicitly.
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    smoothie = SmoothingFunction().method2
    bleu_list = [
        # sentence_bleu takes (list_of_references, hypothesis): the single reference
        # must be wrapped in a list and the prediction is the hypothesis.
        sentence_bleu([r[index]], p[index], smoothing_function=smoothie)
        for index in range(len(p))
    ]
    return sum(bleu_list) / len(bleu_list)
def getSent_pred(prediction,real_labels): #convert all ids to sentences for bscore
    """Turn predicted logits and label ids into token lists suitable for BLEU scoring.

    Args:
        prediction: iterable of sentences; each sentence is an iterable of per-position
            score vectors whose argmax is taken as the predicted token id.
        real_labels: iterable of token-id sequences; each must contain the id of
            '[SEP]' (``list.index`` raises ValueError otherwise).

    Returns:
        ``(predicted_tokens, reference_tokens)``: two lists of token lists. Both drop
        position 0 and everything from the reference's first '[SEP]' onwards.
    """
    reference_tokens = []
    sep_positions = []
    for label_ids in real_labels:
        label_tokens = tokenizer.convert_ids_to_tokens(label_ids)
        sep_at = label_tokens.index('[SEP]')
        sep_positions.append(sep_at)
        reference_tokens.append(label_tokens[1:sep_at])

    # Greedy decoding: highest-scoring vocabulary id at every position.
    predicted_ids = [
        [torch.argmax(position_scores) for position_scores in sentence]
        for sentence in prediction
    ]

    # Cut each prediction at the same [SEP] position as its reference.
    predicted_tokens = [
        tokenizer.convert_ids_to_tokens(ids)[1:sep_positions[row]]
        for row, ids in enumerate(predicted_ids)
    ]
    return predicted_tokens, reference_tokens
def calc_accuracy(prediction, real_labels, mask_indices):
    """Compute masked-token accuracy and a BLEU score for a batch of predictions.

    Args:
        prediction: indexable as ``prediction[sentence, position]`` yielding a score
            vector over the vocabulary (argmax is the predicted id).
        real_labels: indexable as ``real_labels[sentence][position]`` yielding the
            true token id.
        mask_indices: per-sentence iterables of masked positions. An entry equal to
            0 is treated as padding and ignored.

    Returns:
        ``(accuracy, bscore)``. ``accuracy`` is the fraction of masked positions
        predicted correctly, or ``nan`` when no position was masked (the ratio is
        undefined). ``bscore`` is the value returned by ``bleu`` or the string
        "Unfortunately, not possible" if BLEU computation fails.
    """
    score = 0
    total = 0
    for step, sent in enumerate(mask_indices):
        # Zero entries are skipped one by one, so a separate "row has no masks"
        # check tied to a fixed row width is not needed.
        for mask in sent:
            position = int(mask)
            if position == 0:
                continue
            predicted_index = int(torch.argmax(prediction[step, position]))
            actual = int(real_labels[step][position])
            if predicted_index == actual:
                score += 1
            total += 1

    p, r = getSent_pred(prediction, real_labels)

    # Without any masked position the ratio is undefined; report nan instead of
    # raising ZeroDivisionError (this happens whenever nothing was masked).
    accuracy = score / total if total else float("nan")
    try:
        bscore = bleu(p, r)
    except Exception:
        # Deliberate best effort: BLEU is auxiliary, so a failure is reported in-band.
        bscore = "Unfortunately, not possible"
    return accuracy, bscore


In [224]:
path = '/global/cscratch1/sd/ajaybati/model_ckptDS5.pickle' #path for saved model
def load_model(ckpt_path=None): #load trained model for inference
    """Load checkpoint weights into the global ``model`` and switch it to eval mode.

    Args:
        ckpt_path: checkpoint file to read. Defaults to the module-level ``path``
            as it is at call time, which keeps ``load_model()`` working as before.

    Returns:
        The global ``model`` object (modified in place).
    """
    if ckpt_path is None:
        ckpt_path = path
    # SECURITY: torch.load unpickles the file, which can execute arbitrary code.
    # Only load checkpoints that come from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    return model

def get_model_input(sents,n_percent_mask=0.0,max_length=128):
    """Tokenize text sentence by sentence and optionally mask random tokens.

    Args:
        sents: text that is split into sentences with ``sent_tokenize``.
        n_percent_mask: probability (0.0-1.0) with which each ordinary token is
            replaced by the tokenizer's mask token. [CLS], [SEP] and padding are
            never masked.
        max_length: padded/truncated length of every encoded sentence.

    Returns:
        ``(bert_input, att, compare, mask_indices, tokens)`` where ``bert_input``
        holds the (possibly masked) ids, ``att`` the attention masks and ``compare``
        the unmasked ids, each stacked along dimension 0 with one row per sentence;
        ``mask_indices`` is a list with the masked positions of each sentence;
        ``tokens`` is ``tokenizer.tokenize`` applied to the LAST sentence only
        (kept for backward compatibility, so it lines up with row 0 only for
        single-sentence input).

    Raises:
        ValueError: if ``sents`` contains no sentence.
    """
    sentences = sent_tokenize(sents)
    if not sentences:
        raise ValueError("get_model_input() needs at least one sentence; got empty text")

    # Look the special ids up on the tokenizer instead of hard-coding 101/102/0/103.
    special_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
    mask_token_id = tokenizer.mask_token_id

    input_ids_real = []
    att = []
    compare = []
    mask_indices = []
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,                      # Sentence to encode.
            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
            max_length = max_length,
            truncation = True,         # Truncate to max_length.
            pad_to_max_length = True,  # Pad to max_length.
            return_attention_mask = True,   # Construct attn. masks.
            return_tensors = 'pt',     # Return pytorch tensors.
        )
        input_ids = encoded_dict['input_ids']
        compare.append(input_ids)
        att.append(encoded_dict['attention_mask'])

        mask = []
        input_ids_part = []
        for step, word in enumerate(input_ids[0].tolist()):
            # No reseeding here: calling random.seed() per token would discard any
            # seed the caller set and make the masking irreproducible.
            if word not in special_ids and random.random() < n_percent_mask:
                mask.append(step)
                input_ids_part.append(mask_token_id)
            else:
                input_ids_part.append(word)
        input_ids_real.append(torch.tensor(input_ids_part).view(1, -1))
        mask_indices.append(mask)

    # torch.cat also handles a single row, so no special case is needed.
    bert_input = torch.cat(input_ids_real, 0)
    att = torch.cat(att, 0)
    compare = torch.cat(compare, 0)

    return bert_input, att, compare, mask_indices, tokenizer.tokenize(sent)

def bert_model_output(model, bert_input, att, compare, mask_indices):
    """Run the masked-LM model and summarise loss, predictions and quality metrics.

    Args:
        model: callable invoked as ``model(ids, attention_mask=..., masked_lm_labels=...)``
            whose first two outputs are the loss and the prediction scores.
        bert_input: input ids passed to the model.
        att: attention mask passed to the model.
        compare: unmasked ids, used both as labels and as the reference for metrics.
        mask_indices: per-sentence masked positions, forwarded to ``calc_accuracy``.

    Returns:
        dict with keys ``"loss"``, ``"predictions"`` and ``"performance"``
        (``[accuracy, bscore]`` from ``calc_accuracy``).
    """
    outputs = model(bert_input,attention_mask = att, masked_lm_labels = compare)
    # Index instead of unpacking into two names: the notebook's model is created with
    # output_hidden_states=True, so its output carries extra items after the logits.
    loss, predictions = outputs[0], outputs[1]
    accuracy, bscore = calc_accuracy(predictions, compare, mask_indices)

    return {"loss":loss,
            "predictions": predictions,
            "performance":[accuracy, bscore]}

In [None]:
#======ALIGNMENT FUNCTIONS (expensive search)========
def distance_matrix(s1, s2):
    '''
    Method calculates the pairwise cosine similarity matrix between two sequences
    of vectors, computed as 1 - cosine distance. Entry [i, j] compares s1[i] with
    s2[j]; values lie between -1.0 and 1.0.

    '''
    cosine_dist = cdist(s1, s2, 'cosine')
    return 1 - cosine_dist

def scoring_matrix(a, wi=1.0, wj=1.0, epsilon=0.01):
    '''
    Method generates the scoring matrix used to align sequences. This algorithm is
    inspired by the Smith-Waterman local sequence-alignment algorithm used
    in bioinformatics. Source: https://en.wikipedia.org/wiki/Smith–Waterman_algorithm

    The gap weights are adaptively assigned from wi, wj and epsilon: a gap of
    length g along the first axis costs wi * epsilon ** ((n - g) / n), where n is
    the size of that axis (and likewise with wj along the second axis). Short gaps
    therefore cost about wi * epsilon and the longest possible gap costs wi.

    Parameters: a is the 2-D similarity matrix; wi / wj are the maximum gap
    penalties along axis 0 / axis 1; epsilon sets how small the penalty of the
    shortest gap is relative to the maximum.

    Returns a float matrix of shape (a.shape[0] + 1, a.shape[1] + 1) whose first
    row and column are zero.

    '''
    # Force float so that integer input is not truncated when scores are written back.
    a = np.asarray(a, dtype=float)

    # Pad similarity matrix with a leading zero row and zero column
    sa = np.pad(a, ((1,0),(1,0)), 'constant', constant_values=0)

    dims = a.shape
    if dims[0] == 0 or dims[1] == 0:
        # Nothing to score; also avoids dividing by a zero dimension below.
        return sa

    # Calculate gap weight kernels (index = gap length)
    log_eps = np.log(epsilon)
    wi_ = [wi*np.exp((i*log_eps)/dims[0]) for i in reversed(range(dims[0]+1))]
    wj_ = [wj*np.exp((j*log_eps)/dims[1]) for j in reversed(range(dims[1]+1))]

    # Update scoring matrix according to policy
    for i in range(1,dims[0]+1):
        for j in range(1,dims[1]+1):
            column_above = sa[:i,j]
            row_left = sa[i,:j]

            inputs = [(sa[i,j]+sa[i-1,j-1]), # Own similarity + score of the diagonal predecessor
                      np.max(column_above)-wi_[i-np.argmax(column_above)], # Best previous value in column - column gap weight
                      np.max(row_left)-wj_[j-np.argmax(row_left)], # Best previous value in row - row gap weight
                      0] # Zero
            sa[i,j] = np.max(inputs)
    return sa

def traceback(sa, k=100):
    '''
    Method performs traceback path finding on the scoring matrix to find the first
    k alignments of length greater than 1.

    sa is expected to be a scoring matrix whose first row and first column are
    zero (as produced by scoring_matrix); routes stop at zero cells, which keeps
    the i-1 / j-1 look-ups inside the matrix.

    k is the maximum number of alignments to return; None returns all of them.

    Returns a list of alignments in the order they were discovered. Each alignment
    is a list of (i, j) coordinates shifted by -1 to undo the padding.
    NOTE(review): a route that ends in the zero padding row/column therefore
    contains a coordinate of -1, which callers will interpret as "last element"
    when indexing - confirm whether that cell should be dropped.

    '''
    n_cols = sa.shape[1]

    # Visit cells from the highest score to the lowest.
    sorted_args = np.argsort(sa.flatten())[::-1]

    # Perform traceback until all coords have been visited
    tracebacks = []
    seen = set()  # set gives O(1) membership tests
    route = []
    for ind in sorted_args:
        # Flat index -> (row, column) of the row-major matrix.
        i, j = divmod(int(ind), n_cols)

        score = sa[i,j]
        while True:

            # Route connects to other traceback
            if (i,j) in seen:
                tracebacks.append([route,(i,j)])
                route = []
                break

            route.append((i,j))
            seen.add((i,j))

            # Route terminates at zero
            if sa[i,j] == 0:
                tracebacks.append([route,[]])
                route = []
                break

            # Select path direction: up, left, diagonal, or stay (treated as diagonal)
            kernel = [sa[i-1,j],sa[i,j-1],sa[i-1,j-1],sa[i,j]]
            m = np.argmax(kernel)

            # Move to next gap
            if m == 0:
                # Terminate route if score is not greater than the gap value
                if score > sa[i-1,j]:
                    i -= 1
                    score += sa[i,j]
                else:
                    tracebacks.append([route,[]])
                    route = []
                    break
            elif m == 1:
                # Terminate route if score is not greater than the gap value
                if score > sa[i,j-1]:
                    j -= 1
                    score += sa[i,j]
                else:
                    tracebacks.append([route,[]])
                    route = []
                    break

            # Move to next hit (m == 2: diagonal is largest, m == 3: current cell is largest)
            else:
                i -= 1
                j -= 1
                score += sa[i,j]

    # Return alignments with length greater than 1 in order as discovered
    if k is None: k = len(tracebacks)
    alignments = []
    for found_route, _junction in tracebacks:
        # Checked before appending so that k = 0 yields an empty result.
        if len(alignments) >= k: break
        if len(found_route) > 1:
            alignments.append([(i-1,j-1) for i,j in found_route])

    return alignments

def score_alignment(alignment, s1, s2, k):
    '''
    Method calculates a global score for one alignment so that alignments from
    several search queries on the same topic can be ranked together.

    alignment is a sequence of (index into s1, index into s2) pairs; s1 and s2 are
    indexable sequences of feature vectors; k is an extra weight that enters the
    score (callers pass a rank-based value).

    A step is a "hit" when both indices differ from the previously seen ones;
    otherwise it is a gap. The score is the harmonic mean of k, the mean halved
    (2 - cosine distance) over the steps with gaps zeroed out, and the fraction
    of steps that are hits.

    '''
    row_is_new = []
    col_is_new = []
    vectors1 = []
    vectors2 = []
    last_row = -1
    last_col = -1
    for step in alignment:
        row, col = step[0], step[1]

        if row == last_row:
            row_is_new.append(0.0)
        else:
            row_is_new.append(1)
            last_row = row

        if col == last_col:
            col_is_new.append(0.0)
        else:
            col_is_new.append(1)
            last_col = col

        vectors1.append(s1[row])
        vectors2.append(s2[col])

    # A step counts only when it advances along both sequences.
    mask = np.array(row_is_new) * np.array(col_is_new)

    # Diagonal of the pairwise matrix = distance between the paired vectors.
    paired_distance = cdist(vectors1, vectors2, 'cosine').diagonal()
    similarity = 2 - paired_distance
    mean_similarity = ((similarity/2)*mask).mean()
    hit_fraction = mask.sum()/len(alignment)

    return hmean([k, mean_similarity, hit_fraction])

#===== RUN INFERENCE and ALIGN

# load_model() loads the checkpoint weights into the global model and returns it.
model = load_model()
# Filled by the forward hook below: maps a hook name to the captured tensor.
activation = {}
def get_activation(name):
    """Build a forward hook that stores the hooked module's first output under ``name``.

    The hook writes ``output[0].detach()`` into the module-level ``activation`` dict
    every time the hooked module runs, overwriting the previous entry.
    """
    def hook(model, input, output):
        activation[name] = output[0].detach()
    return hook
model.bert.encoder.register_forward_hook(get_activation("encoder")) # Adds hook to get output from the encoder (intermediate layer)

# Get intermediate representations for these two passages.
# Adjacent string literals are used instead of backslash continuations inside one
# literal, which joined the line ends without a space ("amoment", "herselffalling").
str1 = ("The rabbit-hole went straight on like a tunnel for some way, "
        "and then dipped suddenly down, so suddenly that Alice had not a "
        "moment to think about stopping herself before she found herself "
        "falling down a very deep well.")

model_input, att, compare, mask_indices, tokens1 = get_model_input(str1, n_percent_mask=0.0)
with torch.no_grad():  # inference only; no gradients needed
    out = model(model_input,attention_mask = att, masked_lm_labels = compare)
# Gets tensor values for the first sentence. Row 0 of the encoder output belongs to
# [CLS], which tokens1 does not contain, so skip it to keep row i aligned with tokens1[i].
out1 = activation["encoder"].numpy()[0][1:len(tokens1)+1]
# Will need to be changed to include more.
# The alignment takes in 2 numpy arrays of size N*F where N is the number of words
# and F is the number of embedding features.

str2="a bunny dug in the ground" # Search Criteria

model_input, att, compare, mask_indices, tokens2 = get_model_input(str2, n_percent_mask=0.0)
with torch.no_grad():
    out = model(model_input,attention_mask = att, masked_lm_labels = compare)
out2 = activation["encoder"].numpy()[0][1:len(tokens2)+1]

# Search for sequence alignments for each search str along text file
all_alignments = []
alignment_scores = []
k = 10 # Get Top K alignments
w = (0.35, 0.35) # Alignment weight parameters: likelihood of gaps.

# Calculate cosine similarity between text and search phrase
cos_dist = distance_matrix(out1, out2)

# Calculate scoring matrix for sequence alignment
score = scoring_matrix(cos_dist, wi=w[0], wj=w[1])

# Find first k alignments of len > 1
alignments = traceback(score, k=k)
for rank, found in enumerate(alignments):
    all_alignments.append(found)
    # Earlier discoveries get a larger rank weight (1.0 for the first one).
    alignment_scores.append(score_alignment(found, out1, out2, 1-(rank/len(alignments))))

k = len(alignments)
# Sort scored alignments, best first
sorted_scores = np.argsort(alignment_scores)[::-1]

# Display results
if k>1: print("Top ", k,':')
for i in range(k):
    alignment = all_alignments[sorted_scores[i]]
    ss1 = []
    ss2 = []
    l = -1
    j = -1
    # Alignments are stored end-to-start; reverse to read them left to right.
    for step in reversed(alignment):
        if step[0] != l:
            ss1.append(tokens1[step[0]])
            l = step[0]
        else: ss1.append('GAP')
        if step[1] != j:
            ss2.append(tokens2[step[1]])
            j = step[1]
        else: ss2.append('GAP')
    print('Match', i+1, ':', 'Score:',alignment_scores[sorted_scores[i]])
    print(ss1)
    # Show the query side as well; it was assembled but never displayed.
    print(ss2)


In [288]:
#partially updated search
# NOTE(review): these imports sit in the middle of the notebook; `nltk` is already
# imported in the first cell and `math` is not used in the code visible here.
import nltk
import math
# Fetches the NLTK stop-word corpus read by stopwords.words('english') in filter_sent.
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
# load_model() loads the checkpoint weights into the global model, switches it to
# eval mode and returns that same object.
model = load_model()
from nltk.corpus import stopwords


def filter_sent(example_sent):
    """Lower-case a sentence and drop English stop words and punctuation tokens.

    Args:
        example_sent: the sentence to filter.

    Returns:
        The remaining tokens as one string in which every kept token is followed
        by a single space (so the result ends with a space, or is '' if nothing
        is kept).
    """
    excluded_words = set(stopwords.words('english'))
    excluded_punct = ["!",'#','$','&','(',')','*','+','-','.',':',';','<','=','>','?','@','[',']','^','_','`','{','|','}','~',',']

    kept_tokens = []
    for token in word_tokenize(example_sent.lower()):
        if token in excluded_words or token in excluded_punct:
            continue
        kept_tokens.append(token)

    return ''.join(token + ' ' for token in kept_tokens)


# NOTE(review): `activation` and `get_activation` are also defined in the alignment
# cell of this notebook; this copy rebinds both names. If both cells are run against
# the same model object, two forward hooks end up registered on the encoder.
activation = {}
def get_activation(name):
    """Build a forward hook that stores the hooked module's first output under ``name``.

    The hook writes ``output[0].detach()`` into the module-level ``activation`` dict
    every time the hooked module runs, overwriting the previous entry.
    """
    def hook(model, input, output):
        activation[name] = output[0].detach()
        
    return hook
model.bert.encoder.register_forward_hook(get_activation("encoder")) 


def word_embedding(query_string):
    """Return one contextual embedding per word-piece token of ``query_string``.

    Each embedding is the sum of the model's last four hidden-state layers at that
    token's position.

    Args:
        query_string: text to embed. It is encoded with ``get_model_input``; if it
            contains several sentences, the embeddings of the last sentence are
            returned, because that is the sentence ``get_model_input`` returns
            tokens for.

    Returns:
        ``(embed, tokens)``: ``embed`` is a list of 1-D numpy arrays, one per entry
        of ``tokens`` (fewer if the sentence was truncated by the encoder).
    """
    model_input, att, compare, mask_indices, tokens1 = get_model_input(query_string, n_percent_mask=0.0)
    with torch.no_grad():  # inference only; also makes .numpy() below safe
        out = model(model_input,attention_mask = att, masked_lm_labels = compare)
    # With output_hidden_states=True the hidden states follow loss and logits.
    hidden_states = out[2]

    # Stack the last four layers and sum over the layer axis in one tensor operation
    # instead of looping over tokens: (4, batch, seq, hidden) -> (batch, seq, hidden).
    summed_layers = torch.stack(tuple(hidden_states[-4:]), dim=0).sum(dim=0)

    # Row of the sentence that tokens1 belongs to (the only row for one sentence).
    sentence_vectors = summed_layers[-1]

    # Position 0 belongs to [CLS], which tokens1 does not contain; skip it so that
    # embed[i] is the vector of tokens1[i].
    token_vectors = sentence_vectors[1:len(tokens1)+1]

    embed = [vector.detach().numpy() for vector in token_vectors]
    return embed, tokens1





query_string = 'she has been complaining of diffuse abdominal pain'
def is_SUICIDE(x, name=None, heatmap = False, export = False):
    """Compare the sentence ``x`` with the module-level ``query_string`` token by token.

    Both texts are embedded with ``word_embedding``; the pairwise cosine distances
    between their token embeddings are summarised by their harmonic mean.

    Args:
        x: sentence to compare against ``query_string``.
        name: file stem for the exported heatmap (``name + '.html'``). Only needed
            when ``export`` is true.
        heatmap: if true, show a plotly heatmap of the token-by-token distances.
        export: if true (and ``heatmap`` is true), also write the heatmap to HTML.

    Returns:
        ``(tokens1, tokens2, cos)``: the tokens of ``query_string``, the tokens of
        ``x`` and the harmonic mean of all pairwise cosine distances.

    Raises:
        ValueError: if ``export`` is requested without a ``name``.
    """
    with torch.no_grad():
        out1, tokens1 = word_embedding(query_string)
        out3, tokens2 = word_embedding(x)

    # Rows follow tokens1 (query), columns follow tokens2 (x).
    distances = cosine_distances(out1,out3)
    cos = hmean(distances.flatten())

    if heatmap:
        # Imported here so the function does not rely on a later cell having run.
        import plotly.express as px

        # Plot the full matrix (not the scalar summary). Token labels are cut to the
        # matrix size in case the encoder truncated a long sentence.
        fig = px.imshow(distances,
                        labels=dict(x='Sentence 2', y="Sentence 1", color="Distances"),
                        x=tokens2[:distances.shape[1]],
                        y=tokens1[:distances.shape[0]]
                       )

        fig.update_layout(font=dict(
                size=8,
            ))
        fig.update_layout(
            title={
                'text': "Cosine Distances",
                'y':0.985,
                'x':0.04,
                'yanchor': 'top'})

        fig.update_layout(
            autosize=False,
            width=400,
            height=400)

        fig.update_xaxes(side="top")
        fig.show()

        if export:
            if name is None:
                raise ValueError("export=True requires a file name in `name`")
            fig.write_html(name+'.html')

    # Returned in every case so callers can always unpack the result.
    return tokens1, tokens2, cos


tokens1, tokens2, cos = is_SUICIDE("Abdominal pain may be related to alcohol consumption")


In [9]:
#2D BERT embeddings vs Word2Vec visualization 
from bert_embedding import BertEmbedding
embed = BertEmbedding()


def get_visual_embs(sentence):
    """Project a sentence's token embeddings onto a 2D basis built from the sentence itself.

    The basis vectors are the first and the last embedding returned by ``embed`` for
    the sentence, so (when these two are linearly independent) the first token maps
    to (1,0) and the last one to (0,1).
    NOTE(review): whether those two rows are [CLS] and [SEP] depends on whether
    ``embed`` keeps special tokens in its output - confirm against the package.

    Returns:
        ``(Wp, tokens)``: ``Wp`` has one column of 2D coordinates per token, and
        ``tokens`` is the token list returned by ``embed``.
    """
    sentence_result = embed([sentence], False)[0]
    tokens = sentence_result[0]
    vectors = sentence_result[1]

    embedding_matrix = np.array(vectors)
    basis = np.array([vectors[0], vectors[-1]])

    # Pseudo-inverse of the (features x 2) basis gives the least-squares coordinates.
    basis_pinv = np.linalg.pinv(basis.T)
    return np.matmul(basis_pinv, embedding_matrix.T), tokens


from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# Three sentences using "lead" as a verb and as a noun. (Earlier exploratory calls
# whose results were overwritten before use have been dropped.)
Wp, tokens = get_visual_embs('They lead the basketball team.')
Wp2, tokens2 = get_visual_embs("Lead is a common element.")
Wp3, tokens3 = get_visual_embs("They lead the volleyball team to victory.")
print(tokens)

fig = go.Figure()

# Add one trace per sentence. Each trace is labelled from ITS OWN token list, so the
# 'lead' marker sits on the right point and the label list has the right length.
for projection, sentence_tokens, trace_name in (
    (Wp, tokens, 'Lead(verb)'),
    (Wp2, tokens2, 'Lead(noun)'),
    (Wp3, tokens3, 'Lead(verb)'),
):
    fig.add_trace(go.Scatter(x=projection[0,:], y=projection[1,:],
                        mode='markers+lines+text',
                        name=trace_name,
                        text = [None if tok!='lead' else 'lead' for tok in sentence_tokens],
                        textposition="bottom center"))
fig.show()
fig.write_html("embeddingsVis.html")