In [None]:
import altair as alt
import pandas as pd

# https://altair-viz.github.io/gallery/simple_bar_chart.html
# Nine labelled categories ('a') with one numeric value each ('b').
source = pd.DataFrame({
    'a': list('ABCDEFGHI'),
    'b': [28, 55, 43, 91, 81, 53, 19, 87, 52],
})

# Bar chart with the category on the x axis and its value on the y axis.
alt.Chart(source).mark_bar().encode(x='a', y='b')

In [None]:
import pandas as pd
import numpy as np
# Show every row when a DataFrame is displayed (None lifts pandas' row limit).
pd.set_option("display.max_rows", None)

In [None]:
# reusing usage notes from my own earlier code for now (not self-plagiarism)
# https://github.com/cephcyn/cse517project/blob/master/embed_w2v.py
# for word2vec code; see also word2vec documentation
import gensim

# Load Google's pre-trained Word2Vec vectors from the binary word2vec file.
# model source: https://code.google.com/archive/p/word2vec/
model = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
def get_phrase_vector(phrase, vectors=None):
    """Embed a phrase as the element-wise sum of its per-word vectors.

    Parameters
    ----------
    phrase : str
        Whitespace-separated words. A value without a ``split`` method
        (e.g. ``None`` or a float NaN) is treated as "no phrase".
    vectors : mapping, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    pandas.DataFrame
        A single row holding the summed vector in columns ``0..d-1`` plus a
        ``'word'`` column with the original phrase. An empty DataFrame is
        returned when ``phrase`` cannot be split or none of its words are
        found in the lookup.
    """
    try:
        words = phrase.split()
    except AttributeError:
        # Not a string-like value, so there is nothing to embed.
        return pd.DataFrame()

    # Resolved lazily so non-string input never touches the (large) model.
    lookup = model if vectors is None else vectors

    emb = []
    for w in words:
        try:
            emb.append(lookup[w])
        except KeyError:
            # Out-of-vocabulary word: skip it and sum the remaining words.
            continue
    if not emb:
        return pd.DataFrame()

    emb_sum = pd.DataFrame(emb).sum()
    emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke-test the helper on a two-word phrase and display the result.
v = get_phrase_vector('test phrase')
v

In [None]:
# Settings that select which outputs/<search_word>/<anchor_type>.csv file is loaded.
search_word = 'BERT'
anchor_type = 'coreference'

# NOTE: the loaded frame is bound to the name `csv`, which later cells read;
# the same name would shadow the standard-library `csv` module if it were imported.
csv = pd.read_csv(f'outputs/{search_word}/{anchor_type}.csv')
#csv

In [None]:
# Embed the 'averb' phrase of the loaded rows: the frame is grouped by its own
# index and get_phrase_vector is called on the first row of each group
# (presumably one row per group, since read_csv was given no index column — confirm).
# get_phrase_vector returns an empty frame for phrases it cannot embed, so
# `output` may end up with fewer rows than `csv`; reset_index(drop=True)
# renumbers the combined result from 0.
output = csv.groupby(csv.index, group_keys=False).apply(
    lambda group: get_phrase_vector(group.iloc[0]['averb'])
).reset_index(drop=True)
output

In [None]:
# Centroid is the arithmetic mean position of all points in the figure.
# Average each of the 300 embedding columns explicitly with axis=0:
# np.mean(DataFrame) forwards axis=None to DataFrame.mean, which pandas >= 2.0
# reduces over BOTH axes to a single scalar instead of one mean per dimension.
output_centroid = pd.DataFrame([output.iloc[:, 0:300].mean(axis=0)])
output_centroid['word'] = "[CENTROID]"
output_centroid

In [None]:
# Visualize the vectors
# borrowed from:
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import altair as alt
# Select Altair's 'default' renderer for displaying charts.
alt.renderers.enable('default')

def visualize_embeds(data, n_dims=300):
    """Scatter-plot phrase embeddings on their first two principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Embedding values in the first ``n_dims`` columns (selected by
        position) and a ``'word'`` column used as the tooltip label.
    n_dims : int, default 300
        Number of leading columns that hold embedding values.

    Returns
    -------
    An interactive Altair chart with one circle per row of ``data``,
    positioned by ``pc1``/``pc2`` and showing ``word`` on hover.
    """
    # Standardize each embedding dimension before PCA.
    features = StandardScaler().fit_transform(data.iloc[:, 0:n_dims])

    components = PCA(n_components=2).fit_transform(features)
    plot_df = pd.DataFrame(data=components, columns=['pc1', 'pc2'])

    # Attach labels by position rather than by index label: the PCA output is
    # numbered 0..n-1, so index-aligned concatenation would pair points with
    # NaN labels (and add stray rows) whenever `data` has any other index,
    # e.g. after pd.concat of several frames.
    plot_df['word'] = data['word'].to_numpy()

    return alt.Chart(plot_df).mark_circle(size=60).encode(
        x='pc1',
        y='pc2',
        tooltip=['word']
    ).interactive()

# To plot the centroid as well, renumber the combined frame with drop=True:
# a plain reset_index() would insert the old index as an 'index' column at
# position 0 and shift the embedding columns that visualize_embeds selects
# by position.
# visualize_embeds(pd.concat([output, output_centroid]).reset_index(drop=True))
visualize_embeds(output)

In [None]:
# https://github.com/cephcyn/ChatlogGrapher/blob/master/data_processing.ipynb
# for cosine similarity; see also sklearn documentation
from sklearn.metrics.pairwise import cosine_similarity

# Compute a distance metric column from some origin phrase and given some column name to calculate diff of
def distance_word2vec(row, origin, colname):
    """Score how close the phrase in ``row[colname]`` is to ``origin``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record containing the phrase to compare under ``colname``.
    origin : str
        Reference phrase.
    colname : str
        Key of the phrase inside ``row``.

    Returns
    -------
    dict
        ``{'distance': score}`` where ``score`` is the cosine similarity of
        the two phrase vectors (higher means more similar, despite the key
        name), or ``-1`` when either phrase has no vector.
    """
    origin_vec = get_phrase_vector(origin)
    target_vec = get_phrase_vector(row[colname])

    # get_phrase_vector returns an empty frame for phrases it cannot embed.
    if origin_vec.empty or target_vec.empty:
        return {'distance': -1}

    # The frames carry the phrase text in a 'word' column; it must be removed
    # before the numeric comparison, otherwise cosine_similarity raises on the
    # string and no real score is ever produced.
    sim = cosine_similarity(
        origin_vec.drop(columns='word').to_numpy(dtype=float),
        target_vec.drop(columns='word').to_numpy(dtype=float),
    )[0][0]
    return {'distance': sim}

# Reference phrase that every row's 'averb' is compared against.
base_averb = 'encodes'

# result_type='expand' turns the dict returned for each row into columns
# (here a single 'distance' column).
output = csv.apply(
    lambda row: distance_word2vec(row, base_averb, 'averb'),
    axis=1, result_type='expand')

# Attach the scores to the original rows, highest score first.
output = csv.join(output).sort_values(by=['distance'], ascending=False)

# Derive the file name from the active settings so that changing search_word
# or base_averb cannot overwrite another run's results; with 'BERT' and
# 'encodes' this is the same path as before.
output.to_csv(f'temp/{search_word}_similarity_{base_averb}.csv')
output