# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.

Explorations:

* Get vector for a given word
* Predict context for the given word
* Draw the vector space for n random words in the vocabulary
* Build a dataframe, classify words (here I experimented with the greek diathesis) and draw the distribution plot

For the visualizations I used the wonderful Bokeh library.

In [1]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


## Load saved files

In [2]:
args = Namespace(model = 'data/models/Skipgram_Pytorch_0502_gamma.pth',
                vocab='data/vocabs/Homer_word_frequencies.json',
                word2index="data/vocabs/Homer_word2index.json",
                embeddings = 100,
                device = 'cuda'
                )

### Model

In [3]:
# NN module
class CBOW(nn.Module):
    def __init__(self, vocab_size, embeddings,device='cpu',noise_dist=None,negs=15):
        super(CBOW, self).__init__()

        self.vocab_size = vocab_size
        self.negs = negs
        self.device = device
        self.noise_dist = noise_dist

        self.embeddings_target = nn.Embedding(vocab_size, embeddings,padding_idx=0)
        self.embeddings_context = nn.Embedding(vocab_size, embeddings,padding_idx=0)

        self.embeddings_target.weight.data.uniform_(-1,1)
        self.embeddings_context.weight.data.uniform_(-1, 1)

    def forward(self,target, context,debug=False):
        # FIXME: Check if everything is implemented correctly
        # or if we need a softmax here

        # computing out loss
        emb_input = self.embeddings_target(target)  # bs, emb_dim
        emb_context = self.embeddings_context(context)  # bs, emb_dim

        emb_product = torch.mul(emb_input, emb_context)  # bs, emb_dim
        emb_product = torch.sum(emb_product, dim=1)  # bs

        out_loss = F.logsigmoid(emb_product)  # bs

        if self.negs > 0:
            # computing negative loss
            if self.noise_dist is None:
                self.noise_dist = torch.ones(self.vocab_size)

            num_neg_samples_for_this_batch = context.shape[0] * self.negs
            # coz bs*num_neg_samples > vocab_size
            negative_example = torch.multinomial(
                self.noise_dist, num_neg_samples_for_this_batch, replacement=True)



            negative_example = negative_example.view(context.shape[0], self.negs).to(self.device)  # bs, num_neg_samples
            emb_negative = self.embeddings_context(negative_example)  # bs, neg_samples, emb_dim
            emb_product_neg_samples = torch.bmm(emb_negative.neg(), emb_input.unsqueeze(2))  # bs, neg_samples, 1

            noise_loss = F.logsigmoid(emb_product_neg_samples).squeeze(2).sum(1)  # bs


            total_loss = -(out_loss + noise_loss).mean()

            return total_loss

        else:
            return -(out_loss).mean()


In [4]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.word2index, "r", encoding="utf-8") as fp:
    word2index = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in word2index.items()}

In [5]:
# Initialize model
model = CBOW(vocab_size=len(vocab),
             embeddings=args.embeddings,
             device=args.device,
             noise_dist=None,  # TODO: See later if this works
             negs=15).to(args.device)

In [6]:
saved_model = torch.load(os.path.join(args.model))
model.load_state_dict(saved_model['model_state_dict'])

<All keys matched successfully>

In [7]:
# Simple word embedding mdodel
print(model)

CBOW(
  (embeddings_target): Embedding(30927, 100, padding_idx=0)
  (embeddings_context): Embedding(30927, 100, padding_idx=0)
)


### Embeddings

In [8]:
embds = model.embeddings_target.weight.data.cpu()

print('Embedding', embds)

Embedding tensor([[-1.1602e-01,  1.1290e-01,  1.1748e-01,  ...,  1.0396e-01,
         -1.1185e-01, -1.1392e-01],
        [-1.0117e-01,  1.0125e-01,  1.0162e-01,  ...,  1.0197e-01,
         -1.0117e-01, -1.1047e-01],
        [-2.6160e-01,  2.3634e-01,  2.1379e-01,  ...,  2.5945e-01,
         -2.4065e-01, -2.3100e-01],
        ...,
        [ 6.0985e-04, -2.0532e-04,  1.8179e-03,  ...,  3.3687e-03,
         -4.6987e-03, -7.3201e-04],
        [ 1.1247e-03,  2.4306e-03, -4.4913e-03,  ...,  4.4538e-03,
          4.7148e-03, -2.0060e-03],
        [-4.1620e-03,  2.8161e-03,  4.6508e-03,  ..., -2.1850e-03,
         -3.6790e-03, -4.6172e-03]])


In [9]:
print("Shape: ", embds.shape)

Shape:  torch.Size([30927, 100])


### Eval

In [10]:
def get_vector(token):
    return embds[word2index[token], :]

In [11]:
# test
get_vector('θεα')

tensor([-0.2616,  0.2363,  0.2138,  0.2439,  0.1649,  0.2417, -0.2362, -0.2545,
        -0.2338,  0.1866,  0.2611,  0.2645, -0.2786, -0.2576,  0.2462,  0.2472,
         0.2430, -0.2324,  0.2680,  0.2483, -0.2507,  0.2459, -0.2549,  0.2630,
         0.2557,  0.1939, -0.2009, -0.2533, -0.1801,  0.1649,  0.2400, -0.2457,
        -0.2437,  0.2675,  0.1966, -0.2749, -0.2348,  0.2318, -0.2883, -0.2483,
        -0.2782,  0.2316, -0.2530,  0.2769,  0.2425, -0.2400, -0.2671, -0.2550,
        -0.2609, -0.2850,  0.2523, -0.2377,  0.2323,  0.2397, -0.2071, -0.2401,
        -0.2237, -0.2337,  0.2281, -0.2758,  0.2397, -0.2828,  0.2326, -0.2341,
        -0.2296, -0.2629,  0.2472, -0.2204, -0.2446, -0.1437, -0.2711, -0.2475,
         0.2588, -0.2373,  0.2361,  0.2425,  0.2026, -0.2650,  0.2617,  0.2529,
        -0.2133,  0.2638, -0.2515, -0.2628, -0.2687,  0.2487, -0.1861,  0.2448,
        -0.2714, -0.2603,  0.2584, -0.2581,  0.2425, -0.2420, -0.2505, -0.2438,
        -0.2613,  0.2594, -0.2407, -0.23

In [12]:
words = list(word2index.keys())

# Analogy finder

In [13]:
def find_next_words(target, embeddings, n=5):
    '''
    A kind of Projection formula, finds the closest vector in a vector space to the one given in input
    
    '''
    # calculate distance between target and embeddings
    distance = np.linalg.norm(target - embeddings, axis = 1)   # calc the distance of all vectors from the target
    
    idx_next_words = np.argsort(distance)[:n] # select the indx of the n closest
    
    distances = distance[idx_next_words] # select only the vectors in the precedently found array
    
    return idx_next_words, distances

In [16]:
def analogy(x1, x2, y, n=20):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words, distances = find_next_words(vec_unknown, embds, n=n)
    
    for idx, dist in enumerate(distances):
        print(f"{idx+1}) {index2word[idx]} \t (distance from x1 : {dist})")    

In [17]:
analogy('θεα', "βροτον", "επος")

1) μηνιν 	 (distance from x1 : 0.7414302825927734)
2) αειδε 	 (distance from x1 : 0.8177877068519592)
3) θεα 	 (distance from x1 : 0.846998929977417)
4) Πηληϊαδεω 	 (distance from x1 : 0.8520642518997192)
5) Αχιληος 	 (distance from x1 : 0.9112642407417297)
6) ουλομενην 	 (distance from x1 : 0.9175994396209717)
7) μυριʼ 	 (distance from x1 : 0.9310554265975952)
8) Αχαιοις 	 (distance from x1 : 0.9912585616111755)
9) αλγεʼ 	 (distance from x1 : 0.9931339621543884)
10) εθηκε 	 (distance from x1 : 1.025689721107483)
11) πολλας 	 (distance from x1 : 1.0319736003875732)
12) ιφθιμους 	 (distance from x1 : 1.033264398574829)
13) ψυχας 	 (distance from x1 : 1.0433059930801392)
14) Αϊδι 	 (distance from x1 : 1.0468624830245972)
15) προϊαψεν 	 (distance from x1 : 1.0565787553787231)
16) ηρωων 	 (distance from x1 : 1.0723127126693726)
17) αυτους 	 (distance from x1 : 1.0872044563293457)
18) ελωρια 	 (distance from x1 : 1.104622721672058)
19) τευχε 	 (distance from x1 : 1.1085482835769653)
20) κυν

## Dimensionality reduction

In [18]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [19]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 7,
                    metric='cosine',
                    n_iter=1000,
                    verbose=3,
                    n_jobs=6).fit_transform(embds)

[t-SNE] Computing 22 nearest neighbors...
[t-SNE] Indexed 30927 samples in 0.000s...
[t-SNE] Computed neighbors for 30927 samples in 23.882s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30927
[t-SNE] Computed conditional probabilities for sample 2000 / 30927
[t-SNE] Computed conditional probabilities for sample 3000 / 30927
[t-SNE] Computed conditional probabilities for sample 4000 / 30927
[t-SNE] Computed conditional probabilities for sample 5000 / 30927
[t-SNE] Computed conditional probabilities for sample 6000 / 30927
[t-SNE] Computed conditional probabilities for sample 7000 / 30927
[t-SNE] Computed conditional probabilities for sample 8000 / 30927
[t-SNE] Computed conditional probabilities for sample 9000 / 30927
[t-SNE] Computed conditional probabilities for sample 10000 / 30927
[t-SNE] Computed conditional probabilities for sample 11000 / 30927
[t-SNE] Computed conditional probabilities for sample 12000 / 30927
[t-SNE] Computed conditional probabilities for sa

In [21]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [22]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [23]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [24]:
# add labels
df['label'] = classe

In [25]:
df.head()

Unnamed: 0,x,y,token,label
0,0.920225,-1.391794,μηνιν,non_verb
1,0.802272,-1.816971,αειδε,non_verb
2,-2.106863,0.177849,θεα,non_verb
3,1.889725,-1.670327,Πηληϊαδεω,active
4,-2.062306,0.558681,Αχιληος,non_verb


In [26]:
df.to_csv("data/assets/tsne_df_pyTorch.csv")

In [27]:
df = df[df['label'] != 'non_verb']

## Visualization

In [28]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [29]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)))
    return rand_numbers

In [30]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=250)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[word2index[word],0])
    y.append(vectors_tsne[word2index[word], 1])

In [31]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})

In [32]:
test_df.head()

Unnamed: 0,word,x,y
0,παννυχοι,0.566474,-0.700381
1,ποτιδεγμενοι,0.515328,-1.316175
2,νεικειων,-1.334799,-0.948954
3,ρνις,0.991941,0.209749
4,τισετε,0.437115,0.851202


In [33]:
from bokeh.palettes import Category20_9
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_9 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [34]:
draw_test_words(data=test_df)

## Subset

In [36]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', 'asterisk'],
                colorstyle=['#2ca02c', '#e34a33'], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="data/assets/"+name)
    if show: pl.show(fig)
    return fig

In [38]:
draw_groups(data=df,name="torch_skip_0502.png")