In [None]:
import altair as alt
import pandas as pd

# Minimal Altair bar chart, following the gallery example at
# https://altair-viz.github.io/gallery/simple_bar_chart.html
source = pd.DataFrame(
    data=[
        ('A', 28), ('B', 55), ('C', 43),
        ('D', 91), ('E', 81), ('F', 53),
        ('G', 19), ('H', 87), ('I', 52),
    ],
    columns=['a', 'b'],
)

# The chart is the cell's last expression, so the notebook renders it.
alt.Chart(source).mark_bar().encode(x='a', y='b')

In [None]:
import pandas as pd
import numpy as np
# Lift the row cap so DataFrames are displayed in full.
pd.options.display.max_rows = None

In [None]:
# inheriting usage notes from my other code for now (not self-plagiarization)
# https://github.com/cephcyn/cse517project/blob/master/embed_w2v.py
# for word2vec code; see also word2vec documentation
import gensim

# Read the pre-trained Google News word2vec vectors (binary format) from the
# local model/ directory.
# Download location: https://code.google.com/archive/p/word2vec/
model = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Get the word2vec embedding of a phrase
def get_phrase_embed_word2vec(phrase, vectors=None):
    """Return the summed word embedding of ``phrase`` as a one-row DataFrame.

    The phrase is split on whitespace, each token is looked up in ``vectors``
    and the vectors that were found are summed element-wise. Tokens without a
    vector are skipped.

    Parameters
    ----------
    phrase : str
        Phrase to embed. A value without a ``split`` method (for example a
        NaN cell read from a CSV) produces an empty DataFrame.
    vectors : mapping, optional
        Anything supporting ``vectors[token]`` that raises ``KeyError`` for
        unknown tokens. Defaults to the notebook-level ``model``.

    Returns
    -------
    pandas.DataFrame
        One row with one column per embedding dimension (labelled 0..d-1)
        plus a ``'word'`` column holding the original phrase. Empty when the
        phrase cannot be split or none of its tokens has a vector.
    """
    try:
        tokens = phrase.split()
    except AttributeError:
        # Not a string (NaN / None): nothing to embed.
        return pd.DataFrame()
    if vectors is None:
        vectors = model
    emb = []
    for token in tokens:
        try:
            emb.append(vectors[token])
        except KeyError:
            # Out-of-vocabulary token: leave it out of the sum.
            continue
    if not emb:
        return pd.DataFrame()
    emb_sum = pd.DataFrame(emb).sum()
    emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke test: embed a sample two-word phrase and display the resulting frame.
v = get_phrase_embed_word2vec('test phrase')
v

In [None]:
import spacy
from spacy.lang.en import English
from spacy import displacy

# OPTIONAL: silence TensorFlow's log output by disabling its logger.
import logging
logging.getLogger('tensorflow').disabled = True

# Download the medium English spaCy pipeline through a shell command, then load it.
# NOTE(review): the download command is issued every time this cell runs.
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

# Load ELMo model
# `tf` is bound to the TF1-compatibility namespace (tensorflow.compat.v1).
import tensorflow.compat.v1 as tf
# Eager execution is switched off before the hub module is created below;
# presumably required by the TF1-style hub.Module / tf.Session usage — TODO confirm.
tf.compat.v1.disable_eager_execution()
import tensorflow_hub as hub

# TF Hub handle of the ELMo module (version 2).
url = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(url)

In [None]:
# Get the ELMo embedding of a phrase (with given span limits)
def get_phrase_embed_elmo(elmo, sentence, span0, span1, phrase):
    """Return the summed ELMo embedding of a phrase inside ``sentence``.

    Parameters
    ----------
    elmo : callable
        ELMo module; called as ``elmo([sentence], signature='default',
        as_dict=True)`` and indexed with ``'elmo'``.
    sentence : str
        Sentence whose tokens are separated by single spaces.
    span0, span1 : int-like
        Character offsets into ``sentence``. Each offset is mapped to a token
        index by counting the spaces before it; the tokens in
        ``[token(span0), token(span1))`` are summed.
    phrase : str
        Phrase text, stored in the ``'word'`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row with one column per embedding dimension plus ``'word'``.
        Empty when ``phrase`` is not string-like or a span cannot be
        converted to ``int`` (e.g. NaN / None cells).
    """
    try:
        # Called only to validate that `phrase` is string-like.
        phrase.split()
        span0 = int(span0)
        span1 = int(span1)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Missing phrase or span: nothing to embed.
        return pd.DataFrame()
    # Character offset -> token index: number of space-separated pieces
    # before the offset, minus one.
    token_start = len(sentence[:span0].split(' ')) - 1
    token_end = len(sentence[:span1].split(' ')) - 1
    embeddings = elmo([sentence],
                      signature='default',
                      as_dict=True)['elmo']
    # NOTE(review): a new session is opened and all variables are
    # re-initialised on every call, which is slow when applied row by row.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        token_vectors = sess.run(embeddings)
    # token_vectors[0] holds the per-token vectors of the single input sentence.
    emb_sum = pd.DataFrame(token_vectors[0][token_start:token_end]).sum()
    emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke test on a sample sentence. 10 and 24 are character offsets into the
# sentence, which get_phrase_embed_elmo converts to token indices.
v = get_phrase_embed_elmo(elmo, 
                          'This is a test sentence !',
                          10, 24, 'test sentence')
v

In [None]:
# Search term and anchor type; together they select the input CSV.
search_word = 'BERT'
anchor_type = 'coreference'

# Reads outputs/<search_word>/<anchor_type>.csv and displays it.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that were ever imported.
csv = pd.read_csv(f'outputs/{search_word}/{anchor_type}.csv')
csv

In [None]:
# Embed each row's 'averb' phrase with word2vec. Grouping by the index hands
# every distinct index label to the lambda as its own group, and only the
# first row of each group is used. The per-group frames are combined and the
# index is reset to 0..n-1.
# NOTE(review): rows whose embedding comes back empty presumably add no row
# to the result — confirm against the row counts.
output_w2v = csv.groupby(csv.index, group_keys=False).apply(
    lambda group: get_phrase_embed_word2vec(group.iloc[0]['averb'])
).reset_index(drop=True)
output_w2v

In [None]:
# Centroid is the arithmetic mean position of all points in the figure.
# The mean is taken per embedding dimension (axis=0) over the first 300
# columns. np.mean(DataFrame) is avoided because it forwards axis=None,
# which newer pandas versions reduce to one scalar over the whole frame.
output_w2v_c = pd.DataFrame([output_w2v.iloc[:, 0:300].mean(axis=0)])
output_w2v_c['word'] = "[CENTROID]"
output_w2v_c

In [None]:
import ast

# Embed each row's 'averb' phrase in context with ELMo. The 'split_tokens'
# cell is parsed as a Python literal and its items are joined with single
# spaces to form the sentence; 'averb_span0' / 'averb_span1' are passed on as
# the span limits. Grouping by the index hands each distinct index label to
# the lambda as its own group, and only the group's first row is used.
# NOTE(review): get_phrase_embed_elmo opens a new tf.Session per call, so
# this cell can be slow on large inputs.
output_elmo = csv.groupby(csv.index, group_keys=False).apply(
    lambda group: get_phrase_embed_elmo(elmo,
                                        ' '.join(ast.literal_eval(group.iloc[0]['split_tokens'])),
                                        group.iloc[0]['averb_span0'],
                                        group.iloc[0]['averb_span1'],
                                        group.iloc[0]['averb'])
).reset_index(drop=True)
output_elmo

In [None]:
# Centroid is the arithmetic mean position of all points in the figure.
# Every embedding column is used (all columns except 'word'): ELMo vectors
# are wider than the 300 dimensions of word2vec, so a fixed 0:300 slice
# would drop most of them. The mean is taken per dimension (axis=0);
# np.mean(DataFrame) is avoided because it forwards axis=None, which newer
# pandas versions reduce to one scalar over the whole frame.
output_elmo_c = pd.DataFrame([output_elmo.drop(columns='word').mean(axis=0)])
output_elmo_c['word'] = "[CENTROID]"
output_elmo_c

In [None]:
# Persist the ELMo embeddings and read them straight back. The file name is
# derived from search_word so that a different search term does not
# overwrite or reuse another term's file.
# NOTE(review): read_pickle can execute arbitrary code; only load pickle
# files produced by this notebook.
output_elmo.to_pickle(f'temp/{search_word}_anchorverb_elmo.pkl')
output_elmo = pd.read_pickle(f'temp/{search_word}_anchorverb_elmo.pkl')

In [None]:
# Visualize the vectors
# Adapted from:
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import altair as alt
# Select Altair's 'default' renderer for chart display.
alt.renderers.enable('default')

def visualize_embeds(data):
    """Project phrase embeddings to 2-D with PCA and return a scatter chart.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per phrase: numeric embedding columns plus a ``'word'``
        column with the phrase text. Every column other than ``'word'`` is
        treated as an embedding dimension, so embeddings of any width
        (300-d word2vec, wider ELMo vectors) are used in full.

    Returns
    -------
    altair.Chart
        Interactive scatter plot of the first two principal components,
        with the phrase shown as a tooltip.
    """
    # Use all embedding dimensions instead of a fixed 0:300 slice.
    features = data.drop(columns=['word'])
    scaled = StandardScaler().fit_transform(features)

    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(scaled)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=['pc1', 'pc2'])

    # concat(axis=1) aligns on index labels; resetting the label index keeps
    # each phrase paired with its own point even if `data` has a non-default
    # index (e.g. after concatenating several frames).
    labels = data['word'].reset_index(drop=True)
    finalDf = pd.concat([principalDf, labels], axis=1)

    return alt.Chart(finalDf).mark_circle(size=60).encode(
        x='pc1',
        y='pc2',
        tooltip=['word']
    ).interactive()

# Alternative call, kept for reference: plots the centroid row together with the phrases.
# visualize_embeds(pd.concat([output_w2v, output_w2v_c]).reset_index())
# Plot the word2vec phrase embeddings.
visualize_embeds(output_w2v)

In [None]:
# Plot the ELMo phrase embeddings.
visualize_embeds(output_elmo)

In [None]:
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb
# for cosine similarity; see also sklearn documentation
from sklearn.metrics.pairwise import cosine_similarity

# Compute a distance metric column from some origin phrase and given some column name to calculate diff of
def distance_word2vec(row, origin, colname):
    """Cosine similarity between ``origin`` and the phrase in ``row[colname]``.

    Both phrases are embedded with ``get_phrase_embed_word2vec``. The earlier
    version called ``get_phrase_vector``, which this notebook never defines;
    the resulting NameError was swallowed by a bare ``except`` and every row
    received -1.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record holding the phrase to compare under ``colname``.
    origin : str
        Reference phrase.
    colname : str
        Key of the phrase inside ``row``.

    Returns
    -------
    dict
        ``{'distance': sim}`` where ``sim`` is the cosine similarity, or -1
        when either phrase has no embedding (missing value or no known
        tokens).
    """
    origin_emb = get_phrase_embed_word2vec(origin)
    target_emb = get_phrase_embed_word2vec(row[colname])
    if origin_emb.empty or target_emb.empty:
        # Sentinel for "could not be embedded", as before.
        return {'distance': -1}
    # Drop the text column and force a float matrix of shape (1, d).
    origin_vec = origin_emb.drop(columns='word').to_numpy(dtype=float)
    target_vec = target_emb.drop(columns='word').to_numpy(dtype=float)
    sim = cosine_similarity(origin_vec, target_vec)[0][0]
    return {'distance': sim}

# Reference verb that every row's 'averb' phrase is compared against.
base_averb = 'encodes'
# One 'distance' value per row; result_type='expand' turns the returned
# dicts into DataFrame columns.
output = csv.apply(
    lambda row: distance_word2vec(row, base_averb, 'averb'),
    axis=1, result_type='expand')

# Attach the scores to the original rows, most similar first.
output = csv.join(output).sort_values(by=['distance'], ascending=False)
# The file name is derived from the search term and the reference verb so
# that runs with other settings do not overwrite each other.
output.to_csv(f'temp/{search_word}_similarity_{base_averb}.csv')
output