# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.

Explorations:

* Get vector for a given word
* Predict context for the given word
* Draw the vector space for n random words in the vocabulary
* Build a dataframe, classify words (here I experimented with the greek diathesis) and draw the distribution plot

For the visualizations I used the wonderful Bokeh library.

In [3]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn

## Load saved files

In [22]:
args = Namespace(model = 'data/models/Skipgram_Pytorch_0602_delta.pth',
                vocab='data/vocabs/Homer_word_frequencies_accented.json',
                word2index="data/vocabs/Homer_word2index_accented.json",
                embeddings = 250,
                device = 'cuda'
                )

### Model

In [23]:
from utils.modules import SkipGram

In [24]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.word2index, "r", encoding="utf-8") as fp:
    word2index = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in word2index.items()}

In [25]:
# make noise distribution to sample negative examples from
word_freqs = np.array(list(vocab.values()))
unigram_dist = word_freqs / sum(word_freqs)
noise_dist = torch.from_numpy(unigram_dist ** (0.75) / np.sum(unigram_dist ** (0.75)))

In [26]:
# Initialize model
model = SkipGram(vocab_size=len(vocab),
             embeddings=args.embeddings,
             device=args.device,
             noise_dist=noise_dist,
             negs=15,
             batch_size=1000
                ).to(args.device)

In [27]:
saved_model = torch.load(os.path.join(args.model))
model.load_state_dict(saved_model['model_state_dict'])

<All keys matched successfully>

In [28]:
# Simple word embedding mdodel
print(model)

SkipGram(
  (emb_center): Embedding(30657, 250)
  (emb_context): Embedding(30657, 250)
)


### Embeddings

In [29]:
embds = model.emb_context.weight.data.cpu()

print('Embedding', embds)

Embedding tensor([[ 0.1291,  0.1247,  0.1023,  ..., -0.0867,  0.0868,  0.0866],
        [ 0.1060,  0.0795,  0.1406,  ..., -0.1145,  0.1098,  0.0912],
        [ 0.0383, -0.0205, -0.0616,  ..., -0.0833, -0.0368,  0.0152],
        ...,
        [ 0.0526,  0.0744,  0.0550,  ..., -0.0538,  0.1095,  0.0623],
        [ 0.0440,  0.0342,  0.0580,  ..., -0.0428,  0.0241,  0.0162],
        [ 0.0318,  0.0265,  0.0258,  ..., -0.0197,  0.0346,  0.0330]])


In [30]:
print("Shape: ", embds.shape)

Shape:  torch.Size([30657, 250])


### Eval

In [31]:
def get_vector(token):
    return embds[word2index[token], :]

In [33]:
# test
get_vector('θεά')

tensor([ 3.6242e-02, -1.2981e-02,  7.5585e-02, -2.8209e-02,  1.6054e-01,
        -1.4424e-01, -1.3937e-02, -5.3338e-02, -7.1206e-02, -6.6945e-02,
        -9.0600e-02, -9.8502e-02,  5.5253e-02,  7.6976e-02,  6.8120e-02,
        -7.7399e-03, -2.0034e-02,  3.8564e-02,  1.0810e-01, -9.4729e-02,
        -2.8232e-02,  1.0707e-02,  8.0790e-02, -4.1499e-02, -3.0716e-02,
         2.7264e-02, -3.3844e-02,  6.4839e-02, -7.1548e-02,  1.8352e-01,
        -4.6130e-02,  9.0210e-02,  7.3792e-02, -3.6996e-02,  3.0046e-02,
        -1.4242e-01, -2.6979e-02,  1.0963e-01,  8.5152e-02,  6.1406e-02,
         1.0599e-01,  9.4421e-03, -4.6734e-02, -4.3333e-04,  9.1265e-02,
        -1.2882e-01,  1.0117e-01, -3.1208e-02,  6.6553e-02,  1.5953e-01,
         6.1465e-02, -7.6150e-02,  1.8601e-01,  1.5618e-01,  7.1093e-02,
         7.3835e-02, -6.6054e-02, -9.1406e-02,  9.1673e-02, -2.6440e-02,
        -3.5373e-02, -7.5976e-02, -1.7462e-02, -4.6189e-02,  1.2927e-01,
         1.0055e-01, -6.0665e-02, -1.8801e-02, -8.6

In [90]:
words = list(word2index.keys())

## Most similar

In [50]:
from utils.utils import nearest_word

In [51]:
def most_similar(word, embeddings=embds, n=7):
    target_vector = get_vector(word)
    idx = nearest_word(target=target_vector, embeddings=embds,n=n, metrics="cosine")
    for i, index in enumerate(idx):
        if index != word2index[word]:
            print(f"{i}) {index2word[index]}")

In [52]:
most_similar('ὕπνος')

1) δολιχηρέτμοιο
2) κάμω
3) ὄγχνῃ
4) βλεφάροισιν
5) σταφυλή
6) ὀμοῦμαι


In [39]:
most_similar('Ἀχιλλεύς')

1) πόδας
2) Ὀδυσσεύς
3) ἰδὼν
4) Ζεύς
5) μέγʼ
6) προσέφη


In [53]:
most_similar('Ἀχιλλεύς')

1) προσέφη
2) πόδας
3) ὠκὺς
4) πολύμητις
5) ἀπαμειβόμενος
6) Ὀδυσσεύς


# Analogy finder

In [43]:
def analogy(x1, x2, y, n=20):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words, distances = find_next_words(vec_unknown, embds, n=n)
    
    for idx, dist in enumerate(distances):
        print(f"{idx}) {index2word[idx_next_words[idx]]} \t (distance from x1 : {dist})")    

In [44]:
analogy('θεός', "θεά", "ἀνήρ")

0) οἶον 	 (distance from x1 : 0.9924296736717224)
1) ἐνθάδε 	 (distance from x1 : 0.9934024214744568)
2) ἀνήρ 	 (distance from x1 : 1.0069150924682617)
3) ἀνδρὸς 	 (distance from x1 : 1.009009838104248)
4) ἐστι 	 (distance from x1 : 1.0133938789367676)
5) τάχιστα 	 (distance from x1 : 1.014754056930542)
6) αὐτῇ 	 (distance from x1 : 1.021307349205017)
7) τόδʼ 	 (distance from x1 : 1.0279895067214966)
8) ὄλεθρον 	 (distance from x1 : 1.031813144683838)
9) ὅσσοι 	 (distance from x1 : 1.0323268175125122)
10) οὐκέτι 	 (distance from x1 : 1.0336735248565674)
11) ὧδʼ 	 (distance from x1 : 1.0349684953689575)
12) τέκνα 	 (distance from x1 : 1.0354015827178955)
13) πολλοὶ 	 (distance from x1 : 1.036676287651062)
14) βροτῶν 	 (distance from x1 : 1.0403742790222168)
15) Ἰθάκην 	 (distance from x1 : 1.042536973953247)
16) Δαναοῖσι 	 (distance from x1 : 1.042832612991333)
17) εἵνεκα 	 (distance from x1 : 1.0436313152313232)
18) θεοῖσιν 	 (distance from x1 : 1.046190857887268)
19) ὑμεῖς 	 (distance

## Dimensionality reduction

In [54]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [105]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 20,
                    metric='euclidean',
                    n_iter=700,
                    verbose=3,
                    n_jobs=6).fit_transform(embds)

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 30657 samples in 2.174s...
[t-SNE] Computed neighbors for 30657 samples in 111.687s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30657
[t-SNE] Computed conditional probabilities for sample 2000 / 30657
[t-SNE] Computed conditional probabilities for sample 3000 / 30657
[t-SNE] Computed conditional probabilities for sample 4000 / 30657
[t-SNE] Computed conditional probabilities for sample 5000 / 30657
[t-SNE] Computed conditional probabilities for sample 6000 / 30657
[t-SNE] Computed conditional probabilities for sample 7000 / 30657
[t-SNE] Computed conditional probabilities for sample 8000 / 30657
[t-SNE] Computed conditional probabilities for sample 9000 / 30657
[t-SNE] Computed conditional probabilities for sample 10000 / 30657
[t-SNE] Computed conditional probabilities for sample 11000 / 30657
[t-SNE] Computed conditional probabilities for sample 12000 / 30657
[t-SNE] Computed conditional probabilities for s

In [106]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [107]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)-1))
    return rand_numbers

In [108]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=50)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[word2index[word],0])
    y.append(vectors_tsne[word2index[word], 1])

In [109]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})
test_df.head()

Unnamed: 0,word,x,y
0,ῥινὸν,1.211682,-0.499032
1,φοβέεσθαι,-1.725182,-0.185149
2,μεγάλοισι,-0.840059,0.741624
3,ἀτέλεστα,-0.65087,0.285174
4,ἡγησάσθω,0.148651,0.425239


## Visualization

In [110]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [111]:
from bokeh.palettes import Category20_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_9 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [112]:
draw_test_words(data=test_df)

## Subset media vs. active

In [113]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [114]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [115]:
# add labels
df['label'] = classe
#sdf.to_csv("data/assets/tsne_df_pyTorch.csv")

In [116]:
df = df[df["label"] != "non_verb"]

In [117]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', "diamond"],
                colorstyle=["#fca486", "#91bfdb"], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="data/assets/"+name)
    if show: pl.show(fig)
    return fig

In [118]:
draw_groups(data=df,name="torch_skip_0502_euclidean_tsne_perpl20.png")

In [None]:
df.head()