# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.

Explorations:

* Get vector for a given word
* Predict context for the given word
* Draw the vector space for n random words in the vocabulary
* Build a dataframe, classify words (here I experimented with the greek diathesis) and draw the distribution plot

For the visualizations I used the wonderful Bokeh library.

In [1]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn

## Load saved files

In [2]:
args = Namespace(embeddings_path = 'data/models/embeddings.npy',
                vocab='data/vocabs/Homer_word_frequencies_accented.json',
                word2index="data/vocabs/Homer_word2index_accented.json",
                embeddings = 250,
                device = 'cuda'
                )

### Vocabs and Embeddings

In [3]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.word2index, "r", encoding="utf-8") as fp:
    word2index = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in word2index.items()}

In [17]:
embeddings = np.load(args.embeddings_path, allow_pickle=True)
embeddings = torch.tensor(embeddings)

In [18]:
#embds = model.emb_context.weight.data.cpu()

print('Embedding', embeddings)

Embedding tensor([[ 0.6250,  0.2046, -0.0251,  ..., -0.3560, -0.0485, -0.1652],
        [-0.0164, -0.3063,  0.3312,  ..., -0.0466,  0.1590,  0.0767],
        [ 0.5669, -0.1569, -0.2868,  ..., -0.5518, -0.0069,  0.1337],
        ...,
        [ 0.0403,  0.2383,  0.3196,  ..., -0.0159,  0.3583,  0.1623],
        [ 0.3061, -0.0333,  0.1572,  ..., -0.1225, -0.1025,  0.1806],
        [ 0.1168,  0.1387, -0.1751,  ..., -0.0056,  0.0176,  0.0378]])


In [19]:
print("Shape: ", embeddings.shape)

Shape:  torch.Size([30657, 250])


### Eval

In [20]:
def get_vector(token):
    return embeddings[word2index[token], :]

In [21]:
# test
get_vector('θεά')[:4]

tensor([0.0160, 0.1896, 0.1096, 0.0254])

In [22]:
words = list(word2index.keys())

## Most similar

In [23]:
from utils.utils import nearest_word

In [24]:
def most_similar(word, embeddings=embeddings, n=7):
    target_vector = get_vector(word)
    idx = nearest_word(target=target_vector, embeddings=embeddings,n=n, metrics="cosine")
    for i, index in enumerate(idx):
        if index != word2index[word]:
            print(f"{i}) {index2word[index]}")

In [25]:
most_similar('ὕπνος')

1) γλυκὺς
2) νήδυμος
3) λύων
4) τεχνηέντως
5) κεκμηῶτα
6) λυσιμελὴς


In [26]:
most_similar('Ἀχιλλεύς')

1) ὠκὺς
2) ποδάρκης
3) ἀπέκτανε
4) πέρσεν
5) τάπησί
6) δῖος


In [None]:
most_similar('Ἀχιλλεύς')

# Analogy finder

In [32]:
def analogy(x1, x2, y, n=20):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words = nearest_word(vec_unknown, embeddings, n=n)
    
    for i, idx in enumerate(idx_next_words):
        print(f"{i}) {index2word[idx]})")    

In [34]:
analogy('θεός', "ἀνήρ", "θεά")

0) θεά)
1) πότνα)
2) θεός)
3) παρθενικῇ)
4) ἐπαρήγοις)
5) νεήνιδι)
6) ἁμόθεν)
7) Ἄρτεμι)
8) ἐχούσῃ)
9) ἐπίρροθος)
10) κάλπιν)
11) μήδεαι)
12) πρόφρασσʼ)
13) χώεο)
14) ἀγαθή)
15) φήνῃ)
16) ἱρῶν)
17) ἐπεμάσσατʼ)
18) ἐᾷ)
19) ὀμόσσαι)


## Dimensionality reduction

In [35]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [None]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 20,
                    metric='euclidean',
                    n_iter=700,
                    verbose=3,
                    n_jobs=6).fit_transform(embeddings)

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 30657 samples in 2.167s...


In [None]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [None]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)-1))
    return rand_numbers

In [None]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=50)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[word2index[word],0])
    y.append(vectors_tsne[word2index[word], 1])

In [None]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})
test_df.head()

## Visualization

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [None]:
from bokeh.palettes import Category20_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_9 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [None]:
draw_test_words(data=test_df)

## Subset media vs. active

In [None]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [None]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [None]:
# add labels
df['label'] = classe
#sdf.to_csv("data/assets/tsne_df_pyTorch.csv")

In [None]:
df = df[df["label"] != "non_verb"]

In [None]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', "diamond"],
                colorstyle=["#fca486", "#91bfdb"], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="data/assets/"+name)
    if show: pl.show(fig)
    return fig

In [None]:
draw_groups(data=df,name="torch_skip_0502_euclidean_tsne_perpl20.png")

In [None]:
df.head()