# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.

Explorations:

* Get vector for a given word
* Predict context for the given word
* Draw the vector space for n random words in the vocabulary
* Build a dataframe, classify words (here I experimented with the greek diathesis) and draw the distribution plot

For the visualizations I used the wonderful Bokeh library.

In [2]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init


## Load saved files

In [3]:
args = Namespace(model = 'data/models/Skipgram_Pytorch_0602_delta.pth',
                vocab='data/vocabs/Homer_word_frequencies.json',
                word2index="data/vocabs/Homer_word2index.json",
                embeddings = 250,
                device = 'cuda'
                )

### Model

In [5]:
from utils.modules import SkipGram

In [6]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.word2index, "r", encoding="utf-8") as fp:
    word2index = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in word2index.items()}

In [7]:
# make noise distribution to sample negative examples from
word_freqs = np.array(list(vocab.values()))
unigram_dist = word_freqs / sum(word_freqs)
noise_dist = torch.from_numpy(unigram_dist ** (0.75) / np.sum(unigram_dist ** (0.75)))

In [8]:
# Initialize model
model = SkipGram(vocab_size=len(vocab),
             embeddings=args.embeddings,
             device=args.device,
             noise_dist=noise_dist,
             negs=25,
             batch_size=1000
                ).to(args.device)

In [9]:
saved_model = torch.load(os.path.join(args.model))
model.load_state_dict(saved_model['model_state_dict'])

<All keys matched successfully>

In [10]:
# Simple word embedding mdodel
print(model)

SkipGram(
  (emb_center): Embedding(30927, 250)
  (emb_context): Embedding(30927, 250)
)


### Embeddings

In [11]:
embds = model.emb_context.weight.data.cpu()

print('Embedding', embds)

Embedding tensor([[-4.0921e-01,  2.7344e-02,  2.0457e-01,  ..., -4.5655e-01,
         -1.6768e-01, -2.4301e-01],
        [-2.4140e-01,  2.4443e-01, -6.9216e-02,  ..., -2.2814e-01,
         -9.2820e-02, -3.8600e-01],
        [-3.3475e-01,  1.0144e-01, -5.6988e-02,  ..., -9.4702e-02,
         -8.2052e-02,  2.2783e-02],
        ...,
        [-5.8142e-02,  6.3311e-02,  1.4960e-03,  ...,  1.8775e-02,
          4.7389e-02,  1.3993e-02],
        [ 5.0759e-02,  1.0557e-01, -8.6467e-02,  ...,  5.0770e-03,
          3.5998e-04, -2.2773e-02],
        [-8.9901e-02,  1.4097e-01,  9.8769e-02,  ..., -9.1229e-02,
         -7.8328e-02, -5.6019e-02]])


In [12]:
print("Shape: ", embds.shape)

Shape:  torch.Size([30927, 250])


### Eval

In [13]:
def get_vector(token):
    return embds[word2index[token], :]

In [14]:
# test
get_vector('θεα')

tensor([-3.3475e-01,  1.0144e-01, -5.6988e-02, -4.7168e-01, -1.2848e-01,
         1.8859e-01,  3.9837e-01, -2.5299e-01, -2.1700e-01,  1.6781e-01,
        -1.8983e-01, -6.2733e-02, -4.7149e-03,  7.2617e-02,  1.5741e-01,
        -2.4187e-01, -1.2589e-01, -1.1791e-01,  5.4039e-01, -1.7884e-01,
         6.0811e-02, -4.6877e-01,  6.2299e-02, -6.4316e-01, -8.6290e-02,
         2.7440e-02,  1.2256e-01, -2.5569e-01, -2.1359e-01, -8.5925e-02,
        -1.1781e-01, -5.2650e-02, -1.7901e-02, -4.7510e-01,  2.4812e-01,
        -4.4943e-01, -1.3736e-01,  6.3222e-02,  9.4790e-02, -1.7962e-01,
         2.6187e-02,  2.4673e-02,  1.9496e-01,  1.5310e-01,  2.0565e-01,
        -2.2085e-01,  2.5146e-01,  4.4478e-02,  2.2858e-01, -1.1581e-01,
         1.1589e-01,  1.2963e-01, -1.1935e-01,  1.2616e-01,  6.1135e-02,
         1.6455e-01, -6.1751e-02, -2.3654e-01,  2.0411e-01,  3.1909e-01,
         2.7179e-03, -4.7829e-01, -3.1081e-01,  2.5134e-01,  3.3746e-01,
         1.2997e-01, -2.8534e-01,  2.7498e-01, -6.2

In [15]:
words = list(word2index.keys())

# Analogy finder

In [16]:
def find_next_words(target, embeddings, n=5):
    '''
    A kind of Projection formula, finds the closest vector in a vector space to the one given in input
    
    '''
    # calculate distance between target and embeddings
    distance = np.linalg.norm(target - embeddings, axis = 1)   # calc the distance of all vectors from the target
    
    idx_next_words = np.argsort(distance)[:n] # select the indx of the n closest
    
    distances = distance[idx_next_words] # select only the vectors in the precedently found array
    
    return idx_next_words, distances

In [17]:
def most_similar(word, embeddings=embds, n=7):
    target_vector = get_vector(word)
    idx, dist = find_next_words(target=target_vector, embeddings=embds,n=n)
    for i, index in enumerate(idx):
        if index != word2index[word]:
            print(f"{i}) {index2word[index]}")

In [18]:
most_similar('υπνος')

1) γλυκυς
2) πιπτεν
3) νηδυμος
4) βλεφαροισιν
5) ακουουσʼ
6) ευτε


In [22]:
most_similar('καλον')

1) ρατων
2) καθηραντες
3) δατι
4) αρψεν
5) αερτην
6) ατθεμενοι


In [23]:
def analogy(x1, x2, y, n=20):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words, distances = find_next_words(vec_unknown, embds, n=n)
    
    for idx, dist in enumerate(distances):
        if idx != 0:
            print(f"{idx}) {index2word[idx_next_words[idx]]} \t (distance from x1 : {dist})")    

In [24]:
analogy('θεα', "θεος", "ανδρα")

1) ανδρα 	 (distance from x1 : 3.9938998222351074)
2) λευκωλενος 	 (distance from x1 : 4.416593074798584)
3) δρις 	 (distance from x1 : 4.459568500518799)
4) Ηρη 	 (distance from x1 : 4.466602325439453)
5) αρθενικῃ 	 (distance from x1 : 4.4850382804870605)
6) Θετις 	 (distance from x1 : 4.552821636199951)
7) στασʼ 	 (distance from x1 : 4.566750526428223)
8) αϊεις 	 (distance from x1 : 4.583210468292236)
9) νεωτερῳ 	 (distance from x1 : 4.5945210456848145)
10) υρων 	 (distance from x1 : 4.613779544830322)
11) υσμινης 	 (distance from x1 : 4.657886028289795)
12) ρμον 	 (distance from x1 : 4.660740852355957)
13) εταρον 	 (distance from x1 : 4.664992809295654)
14) σθμιον 	 (distance from x1 : 4.665258407592773)
15) μαχοιο 	 (distance from x1 : 4.666952133178711)
16) μφινομου 	 (distance from x1 : 4.674718856811523)
17) ιδοι 	 (distance from x1 : 4.679873943328857)
18) κπροκαλεσσαμενη 	 (distance from x1 : 4.683109283447266)
19) αις 	 (distance from x1 : 4.6886749267578125)


## Dimensionality reduction

In [25]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [26]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 5,
                    metric='cosine',
                    n_iter=700,
                    verbose=3,
                    n_jobs=6).fit_transform(embds)

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 30927 samples in 0.007s...
[t-SNE] Computed neighbors for 30927 samples in 27.991s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30927
[t-SNE] Computed conditional probabilities for sample 2000 / 30927
[t-SNE] Computed conditional probabilities for sample 3000 / 30927
[t-SNE] Computed conditional probabilities for sample 4000 / 30927
[t-SNE] Computed conditional probabilities for sample 5000 / 30927
[t-SNE] Computed conditional probabilities for sample 6000 / 30927
[t-SNE] Computed conditional probabilities for sample 7000 / 30927
[t-SNE] Computed conditional probabilities for sample 8000 / 30927
[t-SNE] Computed conditional probabilities for sample 9000 / 30927
[t-SNE] Computed conditional probabilities for sample 10000 / 30927
[t-SNE] Computed conditional probabilities for sample 11000 / 30927
[t-SNE] Computed conditional probabilities for sample 12000 / 30927
[t-SNE] Computed conditional probabilities for sa

In [27]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [46]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [47]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [48]:
# add labels
df['label'] = classe

In [49]:
df.head()

Unnamed: 0,x,y,token,label
0,-1.458551,0.386603,μηνιν,non_verb
1,0.88082,0.220249,αειδε,non_verb
2,0.702354,-2.004626,θεα,non_verb
3,0.065237,-0.315224,Πηληϊαδεω,active
4,0.066461,-0.315274,Αχιληος,non_verb


In [32]:
df.to_csv("data/assets/tsne_df_pyTorch.csv")

## Visualization

In [33]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [53]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)))
    return rand_numbers

In [59]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=15)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[word2index[word],0])
    y.append(vectors_tsne[word2index[word], 1])

In [60]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})

In [61]:
test_df.head()

Unnamed: 0,word,x,y
0,ηραμον,0.125431,-0.184202
1,θεαι,0.672803,0.893808
2,Οδυσσεος,0.305406,-1.981847
3,αισσοντα,0.902621,-0.007758
4,εισηλθʼ,0.050273,0.015602


In [62]:
from bokeh.palettes import Category20_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_9 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [63]:
draw_test_words(data=test_df)

## Subset

In [50]:
df = df[df["label"] != "non_verb"]

In [51]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', "diamond"],
                colorstyle=["#fca486", "#91bfdb"], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="data/assets/"+name)
    if show: pl.show(fig)
    return fig

In [52]:
draw_groups(data=df,name="torch_skip_0502.png")

In [None]:
df.head()