In [None]:
import altair as alt
import pandas as pd

# https://altair-viz.github.io/gallery/simple_bar_chart.html
# source = pd.DataFrame({
#     'a': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
#     'b': [28, 55, 43, 91, 81, 53, 19, 87, 52]
# })

# alt.Chart(source).mark_bar().encode(
#     x='a',
#     y='b'
# )

In [None]:
import pandas as pd
import numpy as np
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
# inheriting usage notes from my other code for now (not self-plagiarization)
# https://github.com/cephcyn/cse517project/blob/master/embed_w2v.py
# for word2vec code; see also word2vec documentation
import gensim

# Load Google's pre-trained Word2Vec vectors (binary word2vec format).
# model source: https://code.google.com/archive/p/word2vec/
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin', binary=True
)

In [None]:
# Get the word2vec embedding of a phrase
def get_phrase_embed_word2vec(word2vec, phrase):
    """Sum the word2vec vectors of the whitespace-separated words in ``phrase``.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``word2vec[word]`` lookup that raises ``KeyError``
        for unknown words.
    phrase : str
        Phrase to embed. Values without a ``split`` method (``None``, NaN
        floats coming from a CSV, ...) are treated as "no phrase".

    Returns
    -------
    pandas.DataFrame
        One row holding the element-wise sum of the known words' vectors plus
        a ``'word'`` column containing the original phrase. An empty
        DataFrame when ``phrase`` is not a string or none of its words are
        known to ``word2vec``.
    """
    try:
        words = phrase.split()
    except AttributeError:
        # Not a string (e.g. NaN): nothing to embed.
        return pd.DataFrame()
    vectors = []
    for word in words:
        try:
            vectors.append(word2vec[word])
        except KeyError:
            # Out-of-vocabulary words are skipped. Only KeyError is caught so
            # that KeyboardInterrupt and real errors still propagate.
            continue
    if not vectors:
        return pd.DataFrame()
    emb_sum = pd.DataFrame(vectors).sum()
    emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke test: embed a full sentence and a short phrase taken from it.
sent_v = get_phrase_embed_word2vec(word2vec, 'This is a test sentence !')
v = get_phrase_embed_word2vec(word2vec, 'test sentence')
v

In [None]:
def embed_subtract(v, sent_v, dim):
    """Subtract a phrase embedding from its sentence embedding.

    Parameters
    ----------
    v : pandas.DataFrame
        Phrase embedding: the first ``dim`` columns are the vector and a
        ``'word'`` column holds the phrase text.
    sent_v : pandas.DataFrame
        Sentence embedding with the same layout as ``v``.
    dim : int
        Number of leading columns that make up the embedding vector.

    Returns
    -------
    pandas.DataFrame
        ``sent_v - v`` over the first ``dim`` columns, with ``'word'`` (the
        phrase) and ``'sentence'`` (the sentence text) columns appended. An
        empty DataFrame when either input is empty or cannot be subtracted.
    """
    try:
        # The embedding helpers signal "no embedding" with an empty frame.
        if v.empty or sent_v.empty:
            return pd.DataFrame()
        inverse_v = sent_v.iloc[:, 0:dim].subtract(v.iloc[:, 0:dim])
        inverse_v['word'] = v['word']
        inverse_v['sentence'] = sent_v['word']
    except Exception:
        # Best-effort: malformed inputs yield an empty frame. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return pd.DataFrame()
    return inverse_v

# Sentence embedding minus phrase embedding, over the 300 word2vec columns.
embed_subtract(v, sent_v, dim=300)

In [None]:
import spacy
from spacy.lang.en import English
from spacy import displacy

# OPTIONAL - to disable outputs from Tensorflow
# (done before TensorFlow is imported below)
import logging
logging.getLogger('tensorflow').disabled = True

# This line only needs to be a one-time download
# !python -m spacy download en_core_web_md
# Load the spaCy English pipeline downloaded by the command above.
nlp = spacy.load('en_core_web_md')

# TF1-compatibility API: the ELMo cells below use tf.Session and hub.Module.
import tensorflow.compat.v1 as tf
# NOTE(review): `tf` is already the compat.v1 module here, so
# `tf.disable_eager_execution()` is presumably equivalent - confirm.
tf.compat.v1.disable_eager_execution()
import tensorflow_hub as hub

# Load ELMo model
url = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(url)

In [None]:
# Get the ELMo embedding of a phrase (with given span limits)
def get_phrase_embed_elmo(elmo, sentence, span0, span1, phrase, pregenerated=None):
    """Sum the ELMo token vectors of a phrase and of its whole sentence.

    Parameters
    ----------
    elmo : callable
        ELMo module, called as ``elmo([sentence], signature='default',
        as_dict=True)['elmo']``. Only used when ``pregenerated`` is None.
    sentence : str
        Space-separated tokens of the sentence.
    span0, span1 : int-like
        Character offsets into ``sentence`` for the start and end of the
        phrase. They are converted to token indices by counting the spaces
        that precede each offset.
    phrase : str
        Text of the phrase; stored in the ``'word'`` column of the result.
    pregenerated : mapping, optional
        ``sentence -> per-token vector array`` cache. When given, the vectors
        are looked up instead of running the model.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        One-row frames with the summed phrase vector and the summed sentence
        vector, each with a ``'word'`` column (the phrase, and the sentence,
        respectively). Two empty DataFrames when ``phrase`` is not a string
        or a span cannot be converted to ``int`` (e.g. NaN from a CSV).

    Raises
    ------
    KeyError
        If ``pregenerated`` is given but does not contain ``sentence``.
    """
    try:
        # Validation only: a non-string phrase or non-numeric span means
        # there is nothing to embed for this row.
        phrase.split()
        span0 = int(span0)
        span1 = int(span1)
    except (AttributeError, TypeError, ValueError, OverflowError):
        return pd.DataFrame(), pd.DataFrame()
    # Character offset -> token index: number of spaces before the offset.
    token_start = len(sentence[:span0].split(' ')) - 1
    token_end = len(sentence[:span1].split(' ')) - 1
    if pregenerated is None:
        # If we don't have a handy pre-generated dict of sentence:vector already...
        embeddings = elmo(
            [sentence],
            signature='default',
            as_dict=True)['elmo']
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            # Batch of one sentence: keep its per-token vectors.
            token_vectors = sess.run(embeddings)[0]
    else:
        # otherwise just grab the vector
        token_vectors = pregenerated[sentence]
    emb_sum = pd.DataFrame(token_vectors[token_start:token_end]).sum()
    emb_sum['word'] = phrase
    sentence_sum = pd.DataFrame(token_vectors).sum()
    sentence_sum['word'] = sentence
    return pd.DataFrame([emb_sum]), pd.DataFrame([sentence_sum])

# v, sent_v = get_phrase_embed_elmo(
#     elmo, 
#     'This is a test sentence !',
#     10, 24, 'test sentence')
# v

In [None]:
# embed_subtract(v, sent_v, 1024)

In [None]:
# Which extraction output to analyse: outputs/<search_word>/<anchor_type>.csv
search_word, anchor_type = 'BERT', 'coreference'

csv = pd.read_csv(f'outputs/{search_word}/{anchor_type}.csv', index_col='Unnamed: 0')
csv

In [None]:
# word2vec embedding of the 'd_averb' phrase, one per distinct CSV index value
# (the first row of each index group is used).
output_w2v = (
    csv
    .groupby(csv.index, group_keys=True, as_index=False, sort=True)
    .apply(lambda grp: get_phrase_embed_word2vec(word2vec, grp.iloc[0]['d_averb']))
    .reset_index(level=1, drop=True)
)
output_w2v

In [None]:
import ast

# Sentence embedding minus 'd_averb' phrase embedding (word2vec), one per
# distinct CSV index value (the first row of each index group is used).
# 'split_tokens' is read from the CSV as the string repr of a token list, so it
# must be parsed with ast.literal_eval before joining; joining the raw string
# would put a space between every character.
output_w2v_inv = csv.groupby(
    csv.index,
    group_keys=True,
    as_index=False,
    sort=True
).apply(
    lambda group: embed_subtract(
        get_phrase_embed_word2vec(
            word2vec,
            group.iloc[0]['d_averb']
        ),
        get_phrase_embed_word2vec(
            word2vec,
            ' '.join(ast.literal_eval(group.iloc[0]['split_tokens']))
        ), 300)
).reset_index(level=1, drop=True)
output_w2v_inv

In [None]:
# Centroid is the arithmetic mean position of all points in the figure.
# Use an explicit column-wise mean: np.mean(DataFrame) forwards axis=None to
# DataFrame.mean, which newer pandas versions reduce over both axes to a single
# scalar instead of one value per embedding dimension.
output_w2v_c = pd.DataFrame(
    [output_w2v.iloc[:, 0:300].mean(axis=0)])
output_w2v_c['word'] = "[CENTROID]"
output_w2v_c

In [None]:
import pickle
import ast

# save ELMo outputs so rerunning doesn't take forever...
# don't rerun this if we don't need to regenerate the ELMo outputs :|
all_sentences = [' '.join(ast.literal_eval(r)) for r in csv['split_tokens']]
# NOTE(review): all sentences are embedded as one batch; presumably shorter
# sentences are padded to the longest one - confirm whether the padded rows
# affect the per-sentence sums computed from this cache.
embeddings = elmo(
    all_sentences,
    signature='default',
    as_dict=True)['elmo']
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    x = sess.run(embeddings)

# Cache as {sentence: per-token vectors}; the context manager makes sure the
# file is flushed and closed before later cells read it back.
x = dict(zip(all_sentences, x))
with open("temp/BERT_ELMo.pkl", "wb") as cache_file:
    pickle.dump(x, cache_file)

In [None]:
import ast
import pickle

# Load the cached {sentence: per-token ELMo vectors} dict.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("temp/BERT_ELMo.pkl", "rb") as cache_file:
    x = pickle.load(cache_file)

# ELMo embedding of the 'averb' span, one per distinct CSV index value
# (the first row of each index group is used).
output_elmo = csv.groupby(
    csv.index,
    group_keys=True,
    as_index=False,
    sort=True
).apply(
    lambda group: get_phrase_embed_elmo(
        elmo,
        ' '.join(ast.literal_eval(group.iloc[0]['split_tokens'])),
        group.iloc[0]['averb_span0'],
        group.iloc[0]['averb_span1'],
        group.iloc[0]['averb'], pregenerated=x
    )[0]
).reset_index(level=1, drop=True)

# Context manager so the output file is flushed and closed deterministically.
with open('temp/BERT_anchorverb_elmo.pkl', "wb") as out_file:
    pickle.dump(output_elmo, out_file)
output_elmo

In [None]:
# Centroid is the arithmetic mean position of all points in the figure.
# The ELMo vectors in this notebook are 1024-dimensional (see the
# embed_subtract(..., 1024) call in elmo_inv), so average all 1024 columns
# rather than only the first 300. The explicit column-wise mean also avoids
# np.mean(DataFrame) collapsing to a single scalar on newer pandas versions.
output_elmo_c = pd.DataFrame(
    [output_elmo.iloc[:, 0:1024].mean(axis=0)])
output_elmo_c['word'] = "[CENTROID]"
output_elmo_c

In [None]:
import ast

def elmo_inv(row, pregenerated=None):
    """Return the ELMo sentence embedding of ``row`` minus its 'averb' span embedding.

    ``row`` must provide 'split_tokens' (string repr of a token list),
    'averb_span0', 'averb_span1' and 'averb'. ``pregenerated`` is passed
    through to ``get_phrase_embed_elmo``.
    """
    sentence = ' '.join(ast.literal_eval(row['split_tokens']))
    phrase_v, sentence_v = get_phrase_embed_elmo(
        elmo,
        sentence,
        row['averb_span0'],
        row['averb_span1'],
        row['averb'],
        pregenerated=pregenerated,
    )
    return embed_subtract(phrase_v, sentence_v, 1024)

# Load the cached {sentence: per-token ELMo vectors} dict; the context manager
# closes the file handle instead of leaking it.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("temp/BERT_ELMo.pkl", "rb") as cache_file:
    x = pickle.load(cache_file)

# Sentence-minus-phrase ELMo embedding, one per distinct CSV index value
# (the first row of each index group is used).
output_elmo_inv = csv.groupby(
    csv.index,
    group_keys=True,
    as_index=False,
    sort=True
).apply(
    lambda group: elmo_inv(group.iloc[0], pregenerated=x)
).reset_index(level=1, drop=True)

output_elmo_inv

In [None]:
# Visualize the vectors
# loaned from :
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import altair as alt
# Select altair's 'default' renderer for displaying charts in this notebook.
alt.renderers.enable('default')

def visualize_embeds(data, reference, color=None, tooltip=['word'], dim=None):
    """Scatter-plot embeddings projected onto their first two principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Embeddings: the leading columns are the vector components, followed
        by a ``'word'`` column (further columns may follow).
    reference : pandas.DataFrame
        Frame joined onto the projected points by index (inner join), so its
        columns can be used for ``color`` / ``tooltip``. Overlapping column
        names get the suffixes ``'_embed'`` / ``'_ref'``.
    color : optional
        Passed to the chart's ``color`` encoding; omitted when None.
    tooltip : list
        Passed to the chart's ``tooltip`` encoding. The default list is never
        mutated here.
    dim : int, optional
        Number of leading columns of ``data`` that make up the embedding.
        Defaults to every column before ``'word'``, so 300-d word2vec frames
        and 1024-d ELMo frames are each projected using their full vector
        (previously only the first 300 columns were ever used).

    Returns
    -------
    altair.Chart
        Interactive scatter chart with ``pc1`` on x and ``pc2`` on y.
    """
    if dim is None:
        # Assumes 'word' is a unique column label, so get_loc returns an int.
        dim = data.columns.get_loc('word')

    # Standardise each embedding dimension, then reduce to 2 components.
    scaled = StandardScaler().fit_transform(data.iloc[:, 0:dim])
    components = PCA(n_components=2).fit_transform(scaled)

    finalDf = pd.DataFrame(data=components, columns=['pc1', 'pc2'])
    # Restore the original index so 'word' and the reference join line up.
    finalDf = finalDf.set_index(data.index)
    finalDf['word'] = data['word']
    finalDf = finalDf.join(
        reference,
        how='inner',
        lsuffix='_embed',
        rsuffix='_ref'
    )

    # Only add the colour channel when one was requested.
    encodings = {'x': 'pc1', 'y': 'pc2', 'tooltip': tooltip}
    if color is not None:
        encodings['color'] = color
    chart = alt.Chart(finalDf).mark_circle(size=60).encode(**encodings)
    return chart.interactive()

# visualize_embeds(pd.concat([output_w2v, output_w2v_c]).reset_index())
# PCA scatter of the word2vec phrase embeddings, joined with the CSV rows.
visualize_embeds(data=output_w2v, reference=csv)

In [None]:
# PCA scatter of the sentence-minus-phrase word2vec embeddings.
visualize_embeds(data=output_w2v_inv, reference=csv, tooltip=['word', 'Text'])

In [None]:
# PCA scatter of the ELMo phrase embeddings, joined with the CSV rows.
visualize_embeds(data=output_elmo, reference=csv)

In [None]:
# PCA scatter of the sentence-minus-phrase ELMo embeddings.
visualize_embeds(data=output_elmo_inv, reference=csv, tooltip=['word', 'Text'])

In [None]:
# These are the automatically-identified "root words", as [group label, root word] pairs.
# NOTE: this assignment has no effect on the rest of the cell - output_manualgroups
# is reassigned further down (whole-sentence version) before it is used.
output_manualgroups = pd.DataFrame([['[introduce]', 'stands'], ['[introduce]', 'role'],
                                    ['[introduce]', 'propose'], ['[success]', 'Encoder'],
                                    ['[success]', 'Recently'], ['[success]', 'ground'],
                                    ['[success]', 'achieved'], ['[success]', 'successfully'],
                                    ['[success]', 'improved'], ['[success]', 'found'],
                                    ['[success]', 'good'], ['[success]', 'advanced'],
                                    ['[success]', 'models'], ['[success]', 'proven'],
                                    ['[success]', 'yielded'], ['[successbut]', 'explore'],
                                    ['[successbut]', 'success'], ['[successbut]', 'currently'],
                                    ['[successbut]', 'currently'], ['[successbut]', 'success'],
                                    ['[successbut]', 'success'], ['[successbut]', 'prevalent'],
                                    ['[successbut]', 'approach'], ['[successbut]', 'led'],
                                    ['[stateproof]', 'show'], ['[introtask]', 'analysis'],
                                    ['[introtask]', 'are'], ['[introtask]', 'is'],
                                    ['[introtask]', 'Question'], ['[assessment]', 'studies'],
                                    ['[assessment]', ')'], ['[known]', 'allows'],
                                    ['[known]', 'are']])
# this is my manually-identified "root words / main verbs"
# output_manualgroups = pd.DataFrame([['[introduce]', 'introduce'], ['[introduce]', 'present'],
#                                     ['[introduce]', 'propose'], ['[success]', 'has shown'],
#                                     ['[success]', 'have achieved'], ['[success]', 'has achieved'],
#                                     ['[success]', 'has achieved'], ['[success]', 'has been applied'],
#                                     ['[success]', 'has improved'], ['[success]', 'was found'],
#                                     ['[success]', 'is'], ['[success]', 'have advanced'],
#                                     ['[success]', 'have pushed forward'], ['[success]', 'has proven'],
#                                     ['[success]', 'has yielded'], ['[successbut]', 'explore'],
#                                     ['[successbut]', 'have had success'], ['[successbut]', 'give'],
#                                     ['[successbut]', 'give'], ['[successbut]', 'have had success'],
#                                     ['[successbut]', 'have had success'], ['[successbut]', 'remains'],
#                                     ['[successbut]', 'has become'], ['[successbut]', 'has led'],
#                                     ['[stateproof]', 'show'], ['[introtask]', 'is'],
#                                     ['[introtask]', 'are'], ['[introtask]', 'is'],
#                                     ['[introtask]', 'plays'], ['[assessment]', 'studies'],
#                                     ['[assessment]', 'assess'], ['[known]', 'allows'],
#                                     ['[known]', 'are applied']])
# these are the automatically-identified "anchor verbs" (excluding sentences where no anchor/anchor verb was identified)
# output_manualgroups = pd.DataFrame([['[successbut]', 'applying'], ['[assessment]', 'assess'],
#                                     ['[successbut]', 'contribute'], ['[success]', 'has achieved'],
#                                     ['[successbut]', 'have had'], ['[successbut]', 'have had'],
#                                     ['[successbut]', 'motivating'], ['[success]', 'pre'],
#                                     ['[success]', 'pre'], ['[stateproof]', 'show'],
#                                     ['[introduce]', 'stands'], ['[assessment]', 'studies'],
#                                     ['[success]', 'using']])
# Using the entire sentence: [group label, full sentence text] pairs.
# This is the definition of output_manualgroups that the rest of the cell uses;
# it replaces the root-word version assigned at the top of the cell.
output_manualgroups = pd.DataFrame([
     ['[successbut]', 'Following recent successes in applying BERT to question answering, we explore simple applications to ad hoc document retrieval.'],
     ['[assessment]', 'I assess the extent to which the recently introduced BERT model captures English syntactic phenomena, using (1) naturally-occurring subject-verb agreement stimuli; (2) "coloreless green ideas" subject-verb agreement stimuli, in which content words in natural sentences are randomly replaced with words sharing the same part-of-speech and inflection; and (3) manually crafted stimuli for subject-verb agreement and reflexive anaphora phenomena.'],
     ['[successbut]', 'BERT-based architectures currently give state-of-the-art performance on many NLP tasks, but little is known about the exact mechanisms that contribute to its success.'],
     ['[success]', 'BERT, a pre-trained Transformer model, has achieved ground-breaking performance on multiple NLP tasks.'],
     ['[successbut]', 'Large pre-trained neural networks such as BERT have had great recent success in NLP, motivating a growing body of research investigating what aspects of language they are able to learn from unlabeled data.'],
     ['[successbut]', 'Large pre-trained neural networks such as BERT have had great recent success in NLP, motivating a growing body of research investigating what aspects of language they are able to learn from unlabeled data.'],
     ['[successbut]', 'Large pre-trained neural networks such as BERT have had great recent success in NLP, motivating a growing body of research investigating what aspects of language they are able to learn from unlabeled data.'],
     ['[success]', 'Language model pre-training, such as BERT, has achieved remarkable results in many NLP tasks.'],
     ['[success]', 'Language model pre-training, such as BERT, has significantly improved the performances of many natural language processing tasks.'],
     ['[stateproof]', 'We show that BERT (Devlin et al., 2018) is a Markov random field language model.'],
     ['[introduce]', 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.'],
     ['[assessment]', 'This paper studies the performances and behaviors of BERT in ranking tasks.'],
     ['[success]', 'Recently, a simple combination of passage retrieval using off-the-shelf IR techniques and a BERT reader was found to be very effective for question answering directly on Wikipedia, yielding a large improvement over the previous state of the art on a standard benchmark dataset.'],
     ['[introduce]', 'We present simple BERT-based models for relation extraction and semantic role labeling.'],
     ['[introduce]', 'We propose a practical scheme to train a single multilingual sequence labeling model that yields state of the art results and is small and fast enough to run on a single CPU.'],
     ['[success]', 'Bidirectional Encoder Representations from Transformers (BERT) has shown marvelous improvements across various NLP tasks.'],
     ['[success]', 'Recently, neural models pretrained on a language modeling task, such as ELMo (Peters et al., 2017), OpenAI GPT (Radford et al., 2018), and BERT (Devlin et al., 2018), have achieved impressive results on various natural language processing tasks such as question-answering and natural language inference.'],
     ['[success]', 'BERT model has been successfully applied to open-domain QA tasks.'],
     ['[success]', 'The BERT language model (LM) (Devlin et al., 2019) is surprisingly good at answering cloze-style questions about relational facts.'],
     ['[success]', 'Pre-trained text encoders have rapidly advanced the state of the art on many NLP tasks.'],
     ['[success]', 'Pretrained contextual representation models (Peters et al., 2018; Devlin et al., 2018) have pushed forward the state-of-the-art on many NLP tasks.'],
     ['[success]', 'Language model pre-training has proven to be useful in learning universal language representations.'],
     ['[success]', 'Replacing static word embeddings with contextualized word representations has yielded significant improvements on many NLP tasks.'],
     ['[successbut]', 'BERT-based architectures currently give state-of-the-art performance on many NLP tasks, but little is known about the exact mechanisms that contribute to its success.'],
     ['[successbut]', 'As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging.'],
     ['[successbut]', 'Pre-training by language modeling has become a popular and successful approach to NLP tasks, but we have yet to understand exactly what linguistic capacities these pre-training processes confer upon models.'],
     ['[successbut]', 'Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging.'],
     ['[introtask]', 'Aspect-based sentiment analysis (ABSA), which aims to identify fine-grained opinion polarity towards a specific aspect, is a challenging subtask of sentiment analysis (SA).'],
     ['[introtask]', 'Intent classification and slot filling are two essential tasks for natural language understanding.'],
     ['[introtask]', 'Conversational search is an emerging topic in the information retrieval community.'],
     ['[introtask]', 'Question-answering plays an important role in e-commerce as it allows potential customers to actively seek crucial information about products or services to help their purchase decision making.'],
     ['[known]', 'Multi-task learning allows the sharing of useful information between multiple related tasks.'],
     ['[known]', 'Data augmentation methods are often applied to prevent overfitting and improve generalization of deep neural network models.']])
# Give the two positional columns their names.
output_manualgroups = output_manualgroups.rename(
    columns={0: "groupname", 1: "rootword"}
)

# word2vec embedding of each row's 'rootword' text.
output_mg_v = (
    output_manualgroups
    .groupby(output_manualgroups.index, group_keys=True, as_index=False, sort=True)
    .apply(lambda grp: get_phrase_embed_word2vec(word2vec, grp.iloc[0]['rootword']))
    .reset_index(level=1, drop=True)
)

# PCA scatter coloured by the manual group label.
visualize_embeds(output_mg_v, output_manualgroups, color='groupname', tooltip=['word'])

In [None]:
# Borrowed from
# https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html

from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
%matplotlib inline

data = output_mg_v.iloc[:, 0:300]
# data = output_elmo.iloc[:, 0:1024]
# Shuffle data for extra comparison
# data = data.sample(frac=1)
N = len(output_mg_v)

# Pairwise distances between embeddings, expanded to a square N x N matrix.
dist_mat = squareform(pdist(data))

# Heat map of the (unsorted) distance matrix.
plt.pcolormesh(dist_mat)
plt.colorbar()
plt.xlim(0, N)
plt.ylim(0, N)
plt.show()

In [None]:
# Borrowed from
# https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html

from fastcluster import linkage

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram), indexable as Z[row, col];
              columns 0 and 1 of row (i - N) hold the children of node i
            - N is the number of points given to the clustering process
            - cur_index is the position in the tree to start the traversal from
        output:
            - order implied by the hierarchical tree Z (list of leaf indices,
              left subtree before right subtree)

        seriation computes the order implied by a hierarchical tree (dendrogram)

        The tree is walked with an explicit stack instead of recursion, so
        deep, chain-like trees cannot exceed Python's recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Indices below N are original points (leaves).
            order.append(node)
        else:
            merge_row = node - N
            # Push the right child first so the left child is visited first.
            pending.append(int(Z[merge_row, 1]))
            pending.append(int(Z[merge_row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix
            - method is the linkage method,
              one of ["ward","single","average","complete"]
        output:
            - seriated_dist: dist_mat with rows and columns re-ordered
              according to the order implied by the hierarchical tree
            - res_order: that order, as a list of point indices
            - res_linkage: the hierarchical tree (dendrogram)

        Clusters the points hierarchically and re-arranges the distance
        matrix so that it follows the leaf order of the resulting tree.
    '''
    n_points = len(dist_mat)
    # linkage works on the condensed (flat) form of the distance matrix.
    condensed = squareform(dist_mat)
    res_linkage = linkage(condensed, method=method, preserve_input=True)
    # Start the traversal at node 2N-2, i.e. row N-2 of the linkage matrix.
    res_order = seriation(res_linkage, n_points, 2 * n_points - 2)

    order = np.asarray(res_order, dtype=int)
    rows, cols = np.triu_indices(n_points, k=1)
    seriated_dist = np.zeros((n_points, n_points))
    # Fill the upper triangle from the re-ordered points, then mirror it.
    seriated_dist[rows, cols] = dist_mat[order[rows], order[cols]]
    seriated_dist[cols, rows] = seriated_dist[rows, cols]

    return seriated_dist, res_order, res_linkage

In [None]:
# Notes on the different clustering linkage methods: 
# https://en.wikipedia.org/wiki/Complete-linkage_clustering
# https://en.wikipedia.org/wiki/Hierarchical_clustering
# (The main difference is in how distances between clusters are calculated)
methods = ["ward","single","average","complete"]
for method in methods:
    print("Method:\t",method)

    # Re-order the distance matrix by this method's dendrogram and plot it.
    ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(dist_mat, method)

    plt.pcolormesh(ordered_dist_mat)
    plt.xlim(0, N)
    plt.ylim(0, N)
    plt.show()

In [None]:
# ELMo embeddings joined with their CSV rows, keeping only averb_relation == -1.
data = output_elmo.join(
    csv,
    how='inner',
    lsuffix='_embed',
    rsuffix='_ref'
)
data = data.loc[csv['averb_relation'] == -1]
# Shuffle data for extra comparison
# data = data.sample(frac=1)
dist_mat = squareform(pdist(data.iloc[:, 0:1024]))

ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(
    dist_mat,
    'ward')

# Work on a copy: a plain `reordered_data = data` only aliases the frame, so
# adding 'temp_index' would also modify `data` (itself a slice of the join).
reordered_data = data.copy()
# Remember the original index, re-order rows by the clustering order, record
# each row's position in that order, then restore the original index.
reordered_data['temp_index'] = data.index
reordered_data = reordered_data.reset_index(drop=True)
reordered_data = reordered_data.iloc[res_order].reset_index(drop=True)
reordered_data['order_cluster'] = reordered_data.index
reordered_data = reordered_data.set_index('temp_index')

# reordered_data = csv.join(
#     reordered_data, 
#     how='inner',
#     lsuffix='_embed', 
#     rsuffix='_ref'
# )
# reordered_data.to_csv(f'temp/BERT_anchorverb_elmo_order_cluster.csv')
# reordered_data
data

In [None]:
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb
# for cosine similarity; see also sklearn documentation
from sklearn.metrics.pairwise import cosine_similarity

# Compute a distance metric column from some origin phrase and given some column name to calculate diff of
def distance_word2vec(row, origin, colname):
    """Cosine similarity between the word2vec embeddings of ``origin`` and ``row[colname]``.

    The original body called ``get_phrase_vector``, which is not defined in
    this notebook; the resulting NameError was hidden by a bare ``except`` and
    every row silently received -1. The embeddings are now built with
    ``get_phrase_embed_word2vec`` and the module-level ``word2vec`` model.
    NOTE(review): confirm this matches what ``get_phrase_vector`` was meant
    to compute.

    Parameters
    ----------
    row : mapping-like
        Row providing ``row[colname]``, the phrase to compare.
    origin : str
        Reference phrase.
    colname : str
        Name of the field in ``row`` holding the phrase.

    Returns
    -------
    dict
        ``{'distance': similarity}``; -1 is used as the sentinel when either
        phrase has no embedding (not a string, or no known words).
    """
    origin_emb = get_phrase_embed_word2vec(word2vec, origin)
    target_emb = get_phrase_embed_word2vec(word2vec, row[colname])
    if origin_emb.empty or target_emb.empty:
        sim = -1
    else:
        # Drop the text column and hand plain float matrices to sklearn.
        sim = cosine_similarity(
            origin_emb.drop(columns=['word']).to_numpy(dtype=float),
            target_emb.drop(columns=['word']).to_numpy(dtype=float),
        )[0][0]
    return {'distance': sim}

# Score every CSV row's 'averb' against the reference phrase.
base_averb = 'encodes'
output = csv.apply(
    lambda r: distance_word2vec(r, base_averb, 'averb'),
    axis=1,
    result_type='expand',
)

# Attach the scores to the CSV rows, highest first, and save.
output = csv.join(output).sort_values(by=['distance'], ascending=False)
output.to_csv(f'temp/BERT_similarity_encodes.csv')
output