# Embeddings Visualization

In [24]:
# Imports
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import random
import torch.nn as nn
from utils.utils import nearest_word

In [2]:
# Paths of every input artefact this notebook reads, gathered in one place.
args = Namespace(**{
    "embeddings_path": 'data/models/embeddings_lemmatized.npy',
    "vocab": 'data/vocabs/Homer_word_frequencies_lemmatized.json',
    "word2index": "data/vocabs/Homer_word2index_lemmatized.json",
    "verbs": "data/vocabs/verbs_labelled.csv",
})

In [3]:
# Load the vocabulary file and the word -> index lookup dictionary.
with open(args.vocab, "r", encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

with open(args.word2index, "r", encoding="utf-8") as lookup_file:
    word2index = json.load(lookup_file)

# Invert the lookup so that an index maps back to its word.
index2word = dict((index, token) for token, index in word2index.items())

In [4]:
# Load the embedding matrix and wrap it in a torch tensor.
# allow_pickle=False: a plain numeric .npy file does not need pickle, and
# refusing it means a tampered file cannot execute code while being loaded.
# (An object-dtype array, the only kind that needs pickle, could not be passed
# to torch.tensor below anyway.)
embeddings = torch.tensor(np.load(args.embeddings_path, allow_pickle=False))

In [5]:
def get_vector(token):
    """Return the embedding row stored for ``token``.

    The token is resolved to a row number through the module-level
    ``word2index`` mapping, and that row is read from the module-level
    ``embeddings`` matrix. A token that is not a key of ``word2index``
    raises ``KeyError``.
    """
    row = word2index[token]
    return embeddings[row, :]

In [6]:
# Smoke test: display the first four components of the vector stored for 'θεά'.
get_vector('θεά')[:4]

tensor([-0.1405, -0.2163,  0.0107,  0.0216])

## Most similar lemmata

In [7]:
def context_of(word, embeddings=embeddings, n=20):
    """Print and return the lemmata that ``nearest_word`` ranks closest to ``word``.

    Parameters
    ----------
    word : str
        Lemma to look up; must be a key of ``word2index`` (``KeyError`` otherwise).
    embeddings : optional
        Matrix handed to ``nearest_word``. The default is the ``embeddings``
        object that exists when this function is defined.
    n : int, optional
        Forwarded unchanged to ``nearest_word`` as ``n``.

    Returns
    -------
    list of (int, str)
        One ``(position, lemma)`` pair per index returned by ``nearest_word``,
        in the order returned. The query word is kept in this list if
        ``nearest_word`` returns it; only the printed listing skips it.
    """
    target_vector = get_vector(word)
    query_index = word2index[word]
    idx = nearest_word(target=target_vector, embeddings=embeddings, n=n, metrics="cosine")

    neighbours = []
    for rank, index in enumerate(idx):
        # Use a plain loop variable here. The previous comprehension used
        # ``index2word[word]`` as its loop target, which wrote indices into the
        # reverse lookup table and returned indices instead of lemmata.
        lemma = index2word[index]
        neighbours.append((rank, lemma))
        if index != query_index:
            print(f"{rank}) {lemma}")
    return neighbours

In [9]:
# List the neighbours of 'ὀπάζω' using context_of's defaults (n=20, cosine metric).
context_of('ὀπάζω')

1) ἰότης
2) βούλομαι
3) ὅμοιος
4) κρείσσων
5) ὄλβος
6) ὑπερμενής
7) ἔξοχος
8) αἴτιος
9) καρτερός
10) οἶτος
11) ἀρείων
12) Ὀλύμπιος
13) ἐπιχθόνιος
14) Οὐρανίωνες
15) ἐπιτάρροθος
16) ὀφείλω
17) λίαν
18) ἀρετή
19) πάλαι


# Dimensionality Reduction and Normalization

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

In [11]:
# Scaler shared by every projection below; it is refit each time it is used.
scaler = StandardScaler()
# Project to two components. random_state pins the randomized solver so that
# a fresh "Restart & Run All" reproduces the same coordinates.
svd = TruncatedSVD(n_components=2, n_iter=100, random_state=0)

In [176]:
# Reduce the embeddings to two SVD components, then standardize both columns.
reduced_vec = scaler.fit_transform(svd.fit_transform(embeddings))

## Build dataframe

In [214]:
# Verb annotations that are merged onto the plotted words further down.
verbs = pd.read_csv(args.verbs)
# word2index values are used as row numbers of `embeddings` (see get_vector),
# so order the words by that index instead of trusting the JSON key order.
# When the file is already in index order this yields the same list as before.
words = sorted(word2index, key=word2index.get)

In [215]:
# One row per word: its two scaled SVD coordinates plus the word itself.
df = pd.DataFrame(
    dict(
        x=reduced_vec[:, 0],
        y=reduced_vec[:, 1],
        word=words,
    )
)

In [216]:
# Attach the verb annotations; the left join keeps every word, annotated or not.
df = pd.merge(df, verbs, how="left", on='word')

In [212]:
# Subsetting
# new_df = df[pd.notna(df['diathesis'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [217]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,x,y,word,diathesis
0,-0.628268,-1.232438,μῆνις,
1,-1.233841,-0.586984,ἀείδω,opp
2,-2.304892,-1.79582,θεά,
3,-0.296797,-0.802124,Πηληιάδης,
4,-0.455842,-0.683225,Πηληι,


### Visualization

In [12]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
# Configure Bokeh to render its plots inline in the notebook.
output_notebook()

In [13]:
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True, markers=['triangle', "asterisk", "circle"], colorstyle=["#fca486", "#91bfdb", "#3DDB5E"]):
    """Draw an interactive scatter plot whose glyphs are styled by ``diathesis``.

    Parameters
    ----------
    data :
        Passed to ``bm.ColumnDataSource``; the plot reads the fields ``x``,
        ``y``, ``diathesis`` and ``word`` from it.
    alpha : float
        Fill alpha of the glyphs.
    width, height : int
        Figure size.
    show : bool
        When true, the figure is displayed with ``pl.show`` before returning.
    markers : list of str
        Marker names, paired by position with the labels ``'mp'``, ``'opp'``, ``'NaN'``.
    colorstyle : list of str
        Colours, paired by position with the same three labels.

    Returns
    -------
    The Bokeh figure.
    """
    # Category labels, paired by position with `markers` and `colorstyle`.
    # NOTE(review): after a left merge, unlabelled rows presumably hold a real
    # NaN rather than the string 'NaN', so they may not match the third label.
    # Confirm, and consider data['diathesis'].fillna('NaN') before plotting.
    LABEL = ['mp', 'opp', 'NaN']

    src = bm.ColumnDataSource(data)

    # This function used to build an unused linear_cmap from `Category20_9`,
    # a palette that is never imported, so every call raised NameError.
    # Colours come from factor_cmap below, so the mapper was dropped.
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")

    fig.scatter('x', 'y',
                size=10,
                color=factor_cmap('diathesis', colorstyle, LABEL),
                legend_field="diathesis",
                marker=factor_mark('diathesis', markers, LABEL),
                fill_alpha=alpha,
                source=src)

    # Show the word under the cursor on hover.
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show:
        pl.show(fig)
    return fig

In [223]:
# Plot every word at its scaled SVD coordinates, styled by the `diathesis` column.
draw_test_words(data=df)

### t-SNE Visualization

In [14]:
from sklearn.manifold import TSNE
# t-SNE projection to two dimensions using the cosine metric.
# random_state fixes the otherwise random optimisation so that re-running the
# notebook reproduces the same layout.
tsne = TSNE(n_components=2,
            n_iter=4000,
            perplexity=45,
            metric='cosine',
            verbose=3,
            learning_rate=500,
            random_state=0)

In [15]:
# Fit t-SNE on the embedding matrix and keep the resulting low-dimensional coordinates.
tsne_embs = tsne.fit_transform(embeddings)

[t-SNE] Computing 136 nearest neighbors...
[t-SNE] Indexed 8439 samples in 0.016s...
[t-SNE] Computed neighbors for 8439 samples in 1.985s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8439
[t-SNE] Computed conditional probabilities for sample 2000 / 8439
[t-SNE] Computed conditional probabilities for sample 3000 / 8439
[t-SNE] Computed conditional probabilities for sample 4000 / 8439
[t-SNE] Computed conditional probabilities for sample 5000 / 8439
[t-SNE] Computed conditional probabilities for sample 6000 / 8439
[t-SNE] Computed conditional probabilities for sample 7000 / 8439
[t-SNE] Computed conditional probabilities for sample 8000 / 8439
[t-SNE] Computed conditional probabilities for sample 8439 / 8439
[t-SNE] Mean sigma: 0.016806
[t-SNE] Computed conditional probabilities in 0.558s
[t-SNE] Iteration 50: error = 97.8154602, gradient norm = 0.1192932 (50 iterations in 2.869s)
[t-SNE] Iteration 100: error = 98.3331223, gradient norm = 0.1135242 (50 iterations in 2

In [16]:
# Standardize the t-SNE coordinates; this refits the shared `scaler` and overwrites `tsne_embs`.
tsne_embs = scaler.fit_transform(tsne_embs)

array([1.462232  , 0.05573937], dtype=float32)

In [228]:
# Pair each word with its scaled t-SNE coordinates.
df = pd.DataFrame(dict(x=tsne_embs[:, 0], y=tsne_embs[:, 1], word=words))

# Attach the verb annotations (left join keeps every word).
df = pd.merge(df, verbs, how="left", on='word')
# Disabled: restrict the plot to annotated verbs only.
#new_df = df[pd.notna(df['diathesis'])]

In [229]:
# Plot every word at its scaled t-SNE coordinates, styled by the `diathesis` column.
draw_test_words(data=df)

## t-SNE with Euclidean distance

In [161]:
# Same t-SNE setup as the cosine run, but with the Euclidean metric and a lower
# learning rate. random_state fixes the otherwise random optimisation so that
# re-running the notebook reproduces the same layout.
tsne = TSNE(n_components=2,
            n_iter=4000,
            perplexity=45,
            metric='euclidean',
            verbose=3,
            learning_rate=150,
            random_state=0)

In [162]:
# Fit the Euclidean t-SNE on the embeddings, then standardize the resulting
# coordinates with the shared scaler (refit here).
tsne_embs = scaler.fit_transform(tsne.fit_transform(embeddings))

[t-SNE] Computing 136 nearest neighbors...
[t-SNE] Indexed 8439 samples in 0.485s...
[t-SNE] Computed neighbors for 8439 samples in 8.857s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8439
[t-SNE] Computed conditional probabilities for sample 2000 / 8439
[t-SNE] Computed conditional probabilities for sample 3000 / 8439
[t-SNE] Computed conditional probabilities for sample 4000 / 8439
[t-SNE] Computed conditional probabilities for sample 5000 / 8439
[t-SNE] Computed conditional probabilities for sample 6000 / 8439
[t-SNE] Computed conditional probabilities for sample 7000 / 8439
[t-SNE] Computed conditional probabilities for sample 8000 / 8439
[t-SNE] Computed conditional probabilities for sample 8439 / 8439
[t-SNE] Mean sigma: 0.017660
[t-SNE] Computed conditional probabilities in 0.516s
[t-SNE] Iteration 50: error = 87.3821869, gradient norm = 0.0219691 (50 iterations in 1.845s)
[t-SNE] Iteration 100: error = 67.1213150, gradient norm = 0.0137480 (50 iterations in 1

In [163]:
# Pair each word with its scaled t-SNE coordinates and attach the verb
# annotations (left join keeps every word).
df = pd.merge(
    pd.DataFrame(dict(x=tsne_embs[:, 0], y=tsne_embs[:, 1], word=words)),
    verbs,
    how="left",
    on='word',
)
# Keep only the rows that carry a diathesis annotation.
new_df = df[df['diathesis'].notna()]

In [120]:
# Plot only the words that carry a diathesis annotation.
draw_test_words(data=new_df)

In [164]:
# Same call as the previous cell: plot the annotated words in `new_df` again.
draw_test_words(data=new_df)

## Visualize only some words

In [60]:
from pprint import pprint

In [61]:
# context_of prints its own listing; pprint then displays whatever it returns.
pprint(context_of('ἐθέλω'))

1) οἶδα
2) παῖς
3) ἄν
4) πού
5) σός
6) μή
7) ἐμός
8) ἔργον
9) μάλα
10) κακός
11) ὅσος
12) γίγνομαι
13) ἀθάνατος
14) πατήρ
15) νῦν
16) ξένος
17) ἄνθρωπος
18) οὐ
19) Ἰθάκη
None


In [51]:
# Hand-picked lemmata to highlight in the plot.
test = [
    "ἰότης", "πεύθομαι", "βούλομαι", "ὀφείλω",
    "ἐθέλω", "γίγνομαι", "μάλα", "φημί",
    "βαίνω", "ἔρομαι", "ἱκνέομαι", "ἔρχομαι",
]

# Resolve each lemma to its index.
test_idx = [word2index[lemma] for lemma in test]

In [None]:
# Add some random words (to give an idea of the global geometry).
N_RANDOM_WORDS = 50
# A dedicated, seeded generator makes the sample reproducible on every run.
rng = random.Random(0)
random_idx = [rng.randint(0, len(word2index) - 1) for _ in range(N_RANDOM_WORDS)]

# Merge. The combined list is rebuilt from `test` rather than extended in
# place, so re-running this cell does not append another batch of indices.
test_idx = [word2index[w] for w in test] + random_idx

In [52]:
# Coordinates (from `tsne_embs`) and word for each selected index.
test_df = pd.DataFrame({
    "x": [tsne_embs[row][0] for row in test_idx],
    "y": [tsne_embs[row][1] for row in test_idx],
    "word": [index2word[row] for row in test_idx],
})

In [53]:
from bokeh.palettes import Category10_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """Draw an interactive scatter plot of ``data`` coloured by its ``y`` value.

    ``data`` is wrapped in a ``bm.ColumnDataSource`` and must provide the
    fields ``x``, ``y`` and ``word``; the word appears in a hover tooltip.
    Colours come from a linear map over the ``Category10_9`` palette spanning
    the minimum to the maximum of ``data['y']``. When ``show`` is true the
    figure is displayed with ``pl.show``. The figure is returned either way.
    """
    source = bm.ColumnDataSource(data)

    # Colour scale spanning the full range of the y column.
    y_values = data['y']
    y_colour = linear_cmap(
        field_name='y',
        palette=Category10_9,
        low=min(y_values),
        high=max(y_values),
    )

    fig = pl.figure(
        active_scroll='wheel_zoom',
        width=width,
        height=height,
        title="Visualize chosen vectors",
    )
    fig.scatter('x', 'y', size=10, color=y_colour, fill_alpha=alpha, source=source)

    # Show the word under the cursor on hover.
    hover = bm.HoverTool(tooltips=[("token", "@word")])
    fig.add_tools(hover)

    if show:
        pl.show(fig)
    return fig

In [54]:
# Plot the hand-picked and randomly drawn words collected in `test_df`.
draw_test_words(test_df)