# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.

Explorations:

* Get vector for a given word
* Predict context for the given word
* Draw the vector space for n random words in the vocabulary
* Build a dataframe, classify words (here I experimented with the greek diathesis) and draw the distribution plot

For the visualizations I used the wonderful Bokeh library.

In [1]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init


## Load saved files

In [2]:
args = Namespace(model = 'data/models/Skipgram_Pytorch_0502_gamma.pth',
                vocab='data/vocabs/Homer_word_frequencies.json',
                word2index="data/vocabs/Homer_word2index.json",
                embeddings = 100,
                device = 'cuda'
                )

### Model

In [3]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embeddings, device='cpu', negs=15, noise_dist=None):
        super(CBOW, self).__init__()

        self.vocab_size = vocab_size
        self.embd_size = embeddings
        self.negs = negs
        self.device = device
        self.noise_dist = noise_dist
        self.embeddings_target = nn.Embedding(vocab_size, embeddings)
        self.embeddings_context = nn.Embedding(vocab_size, embeddings)

        self.initialize_embeddings(emb_size=embeddings)

    def initialize_embeddings(self, emb_size):
        custom_range = 0.5 / emb_size
        init.uniform_(self.embeddings_target.weight.data,  -
                      custom_range, custom_range)
        init.constant_(self.embeddings_context.weight.data, 0)

    def forward(self, target, context):
        # computing embeddings for target and context words
        emb_input = self.embeddings_target(target)  # bs, emb_dim (4096,100)
        # mask some terms in the input to prevent overfitting (https://github.com/keras-team/keras/issues/7290)
        emb_input = F.dropout(emb_input, 0.1)
        emb_context = self.embeddings_context(
            context)  # bs, emb_dim (4096,100)

        if self.noise_dist is None:
            self.noise_dist = torch.ones(self.vocab_size)
        # Find out how many negative examples we need (here batch size * negs).
        negs_number = context.shape[0] * self.negs
        # build negs example
        # coz bs*num_neg_samples > vocab_size
        negative_example = torch.multinomial(
            self.noise_dist, negs_number, replacement=True)
        # Move to cuda, without creating another tensor (viewes share the same underlying data with the copied tensors)
        negative_example = negative_example.view(
            context.shape[0], self.negs).to(self.device)  # bs, num_neg_samples
        # Calculate the embeddings
        emb_neg = self.embeddings_target(negative_example)

        score = torch.sum(torch.mul(emb_input, emb_context), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -1 * F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg, emb_input.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -1 * torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, fp):
        embedding = self.embeddings_target.weight.cpu().data.numpy()
        with open(fp, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.embd_size))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))


In [4]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.word2index, "r", encoding="utf-8") as fp:
    word2index = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in word2index.items()}

In [5]:
# make noise distribution to sample negative examples from
word_freqs = np.array(list(vocab.values()))
unigram_dist = word_freqs / sum(word_freqs)
noise_dist = torch.from_numpy(unigram_dist ** (0.75) / np.sum(unigram_dist ** (0.75)))

In [6]:
# Initialize model
model = CBOW(vocab_size=len(vocab),
             embeddings=args.embeddings,
             device=args.device,
             noise_dist=noise_dist,
             negs=15).to(args.device)

In [7]:
saved_model = torch.load(os.path.join(args.model))
model.load_state_dict(saved_model['model_state_dict'])

<All keys matched successfully>

In [8]:
# Simple word embedding mdodel
print(model)

CBOW(
  (embeddings_target): Embedding(30927, 100)
  (embeddings_context): Embedding(30927, 100)
)


### Embeddings

In [9]:
embds = model.embeddings_target.weight.data.cpu()

print('Embedding', embds)

Embedding tensor([[0.3844, 0.3912, 0.3835,  ..., 0.3924, 0.3838, 0.3852],
        [0.3033, 0.2995, 0.3021,  ..., 0.3016, 0.3020, 0.2970],
        [0.7528, 0.7546, 0.7564,  ..., 0.7525, 0.7503, 0.7501],
        ...,
        [1.0331, 0.9848, 0.9903,  ..., 1.0275, 1.0049, 0.9962],
        [1.0437, 1.0297, 1.0425,  ..., 1.0519, 1.0445, 1.0331],
        [1.0230, 1.0032, 0.9868,  ..., 1.0092, 1.0095, 0.9890]])


In [10]:
print("Shape: ", embds.shape)

Shape:  torch.Size([30927, 100])


### Eval

In [11]:
def get_vector(token):
    return embds[word2index[token], :]

In [12]:
# test
get_vector('θεα')

tensor([0.7528, 0.7546, 0.7564, 0.7592, 0.7596, 0.7539, 0.7574, 0.7603, 0.7563,
        0.7594, 0.7595, 0.7519, 0.7580, 0.7603, 0.7532, 0.7549, 0.7560, 0.7582,
        0.7494, 0.7508, 0.7508, 0.7515, 0.7501, 0.7600, 0.7509, 0.7572, 0.7563,
        0.7509, 0.7501, 0.7519, 0.7586, 0.7599, 0.7563, 0.7531, 0.7484, 0.7569,
        0.7501, 0.7538, 0.7595, 0.7503, 0.7579, 0.7567, 0.7503, 0.7514, 0.7576,
        0.7488, 0.7565, 0.7593, 0.7530, 0.7564, 0.7528, 0.7576, 0.7546, 0.7521,
        0.7486, 0.7578, 0.7606, 0.7545, 0.7522, 0.7561, 0.7489, 0.7512, 0.7578,
        0.7524, 0.7509, 0.7553, 0.7509, 0.7601, 0.7529, 0.7567, 0.7542, 0.7547,
        0.7568, 0.7555, 0.7627, 0.7599, 0.7554, 0.7543, 0.7511, 0.7559, 0.7543,
        0.7605, 0.7594, 0.7569, 0.7516, 0.7526, 0.7514, 0.7523, 0.7584, 0.7601,
        0.7563, 0.7589, 0.7574, 0.7584, 0.7490, 0.7511, 0.7526, 0.7525, 0.7503,
        0.7501])

In [13]:
words = list(word2index.keys())

# Analogy finder

In [14]:
def find_next_words(target, embeddings, n=5):
    '''
    A kind of Projection formula, finds the closest vector in a vector space to the one given in input
    
    '''
    # calculate distance between target and embeddings
    distance = np.linalg.norm(target - embeddings, axis = 1)   # calc the distance of all vectors from the target
    
    idx_next_words = np.argsort(distance)[:n] # select the indx of the n closest
    
    distances = distance[idx_next_words] # select only the vectors in the precedently found array
    
    return idx_next_words, distances

In [15]:
def most_similar(word, embeddings=embds, n=7):
    target_vector = get_vector(word)
    idx, dist = find_next_words(target=target_vector, embeddings=embds,n=n)
    for i, index in enumerate(idx):
        if index != word2index[word]:
            print(f"{i}) {index2word[index]}")

In [18]:
most_similar('υπνος')

1) ειν
2) μεγαροις
3) τοσσον
4) δωματα
5) μ
6) διαμπερες


In [17]:
most_similar('Αχιλλευς')

1) θαλασσης
2) ους
3) ελθων
4) γλαυκωπις
5) τετευχατο
6) θειης


In [19]:
def analogy(x1, x2, y, n=20):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words, distances = find_next_words(vec_unknown, embds, n=n)
    
    for idx, dist in enumerate(distances):
        if idx != 0:
            print(f"{idx}) {index2word[idx_next_words[idx]]} \t (distance from x1 : {dist})")    

In [20]:
analogy('θεα', "βροτον", "επος")

1) περ 	 (distance from x1 : 0.3016490936279297)
2) αρα 	 (distance from x1 : 0.30882787704467773)
3) νυν 	 (distance from x1 : 0.3921869397163391)
4) ες 	 (distance from x1 : 0.4051879346370697)
5) κεν 	 (distance from x1 : 0.4302546977996826)
6) εκ 	 (distance from x1 : 0.4433017373085022)
7) ηδε 	 (distance from x1 : 0.4542503356933594)
8) μη 	 (distance from x1 : 0.4733108878135681)
9) ος 	 (distance from x1 : 0.49100425839424133)
10) ενι 	 (distance from x1 : 0.516639232635498)
11) οτε 	 (distance from x1 : 0.6731225848197937)
12) μιν 	 (distance from x1 : 0.7186228036880493)
13) ουδε 	 (distance from x1 : 0.7198626399040222)
14) εγω 	 (distance from x1 : 0.7503724098205566)
15) επʼ 	 (distance from x1 : 0.8132417798042297)
16) κατα 	 (distance from x1 : 0.8635571599006653)
17) μοι 	 (distance from x1 : 0.8773649334907532)
18) η 	 (distance from x1 : 1.0969762802124023)
19) μαλα 	 (distance from x1 : 1.1022915840148926)


## Dimensionality reduction

In [21]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [22]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 7,
                    metric='cosine',
                    n_iter=1000,
                    verbose=3,
                    n_jobs=6).fit_transform(embds)

[t-SNE] Computing 22 nearest neighbors...
[t-SNE] Indexed 30927 samples in 0.003s...
[t-SNE] Computed neighbors for 30927 samples in 25.512s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30927
[t-SNE] Computed conditional probabilities for sample 2000 / 30927
[t-SNE] Computed conditional probabilities for sample 3000 / 30927
[t-SNE] Computed conditional probabilities for sample 4000 / 30927
[t-SNE] Computed conditional probabilities for sample 5000 / 30927
[t-SNE] Computed conditional probabilities for sample 6000 / 30927
[t-SNE] Computed conditional probabilities for sample 7000 / 30927
[t-SNE] Computed conditional probabilities for sample 8000 / 30927
[t-SNE] Computed conditional probabilities for sample 9000 / 30927
[t-SNE] Computed conditional probabilities for sample 10000 / 30927
[t-SNE] Computed conditional probabilities for sample 11000 / 30927
[t-SNE] Computed conditional probabilities for sample 12000 / 30927
[t-SNE] Computed conditional probabilities for sa

In [23]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [24]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [25]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [26]:
# add labels
df['label'] = classe

In [27]:
df.head()

Unnamed: 0,x,y,token,label
0,-1.638281,1.744004,μηνιν,non_verb
1,-2.363808,-0.764043,αειδε,non_verb
2,-1.018299,1.315216,θεα,non_verb
3,-0.01021,2.397311,Πηληϊαδεω,active
4,1.156012,1.596987,Αχιληος,non_verb


In [28]:
df.to_csv("data/assets/tsne_df_pyTorch.csv")

In [29]:
df = df[df['label'] != 'non_verb']

## Visualization

In [30]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [31]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)))
    return rand_numbers

In [32]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=250)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[word2index[word],0])
    y.append(vectors_tsne[word2index[word], 1])

In [33]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})

In [34]:
test_df.head()

Unnamed: 0,word,x,y
0,αγυριν,-1.864396,-0.977279
1,κορυθʼ,0.002994,-0.594817
2,ημυνε,0.408049,-1.062029
3,Αχαιικον,-0.649692,0.474874
4,τροπιος,1.659261,0.842284


In [35]:
from bokeh.palettes import Category20_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_9 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [36]:
draw_test_words(data=test_df)

## Subset

In [37]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', 'asterisk'],
                colorstyle=['#2ca02c', '#e34a33'], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="data/assets/"+name)
    if show: pl.show(fig)
    return fig

In [38]:
draw_groups(data=df,name="torch_skip_0502.png")