# Evaluation and Visualization (Skipgram Negative)

Once the model has been trained, we can visualize the vector space and query for specific features/classes/word.
In this notebook I shall explore the distribution of active and middle-passive verbs in the homeric texts.

In [189]:
from argparse import Namespace
import os
import re
import pandas as pd 
import json
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


## Load saved files

In [21]:
args = Namespace(model = '../../data/models/Skipgram_Pytorch_0502_beta.pth',
                vocab='../../data/vocabularies/Homer_word_frequencies.json',
                 w2i="../../data/vocabularies/Homer_word2index.json",
                embeddings = 100,
                device = 'cuda'
                )

### Model

In [22]:
# NN module
class CBOW(nn.Module):
    def __init__(self, vocab_size, embeddings,device='cpu',noise_dist=None,negs=15):
        super(CBOW, self).__init__()

        self.vocab_size = vocab_size
        self.negs = negs
        self.device = device
        self.noise_dist = noise_dist

        self.embeddings_target = nn.Embedding(vocab_size, embeddings,padding_idx=0)
        self.embeddings_context = nn.Embedding(vocab_size, embeddings,padding_idx=0)

        self.embeddings_target.weight.data.uniform_(-1,1)
        self.embeddings_context.weight.data.uniform_(-1, 1)

    def forward(self,target, context,debug=False):
        # FIXME: Check if everything is implemented correctly
        # or if we need a softmax here

        # computing out loss
        emb_input = self.embeddings_target(target)  # bs, emb_dim
        emb_context = self.embeddings_context(context)  # bs, emb_dim

        emb_product = torch.mul(emb_input, emb_context)  # bs, emb_dim
        emb_product = torch.sum(emb_product, dim=1)  # bs

        out_loss = F.logsigmoid(emb_product)  # bs

        if self.negs > 0:
            # computing negative loss
            if self.noise_dist is None:
                self.noise_dist = torch.ones(self.vocab_size)

            num_neg_samples_for_this_batch = context.shape[0] * self.negs
            # coz bs*num_neg_samples > vocab_size
            negative_example = torch.multinomial(
                self.noise_dist, num_neg_samples_for_this_batch, replacement=True)



            negative_example = negative_example.view(context.shape[0], self.negs).to(self.device)  # bs, num_neg_samples
            emb_negative = self.embeddings_context(negative_example)  # bs, neg_samples, emb_dim
            emb_product_neg_samples = torch.bmm(emb_negative.neg(), emb_input.unsqueeze(2))  # bs, neg_samples, 1

            noise_loss = F.logsigmoid(emb_product_neg_samples).squeeze(2).sum(1)  # bs


            total_loss = -(out_loss + noise_loss).mean()

            return total_loss

        else:
            return -(out_loss).mean()


In [23]:
# import vocab and lookup dictioanries
with open(args.vocab, "r", encoding="utf-8") as fp:
    vocab = json.load(fp)
    
with open(args.w2i, "r", encoding="utf-8") as fp:
    w2i = json.load(fp)

# Create a reverse lookup table
index2word = {i: w for w, i in w2i.items()}

In [16]:
# Initialize model
model = CBOW(vocab_size=len(vocab),
             embeddings=args.embeddings,
             device=args.device,
             noise_dist=None,  # TODO: See later if this works
             negs=15).to(args.device)

In [18]:
saved_model = torch.load(os.path.join(args.model))
model.load_state_dict(saved_model['model_state_dict'])

<All keys matched successfully>

In [19]:
# Simple word embedding mdodel
print(model)

CBOW(
  (embeddings_target): Embedding(30927, 100, padding_idx=0)
  (embeddings_context): Embedding(30927, 100, padding_idx=0)
)


### Embeddings

In [27]:
embds = model.embeddings_target.weight.data.cpu()

print('Embedding', embds)

Embedding tensor([[ 0.9948,  0.8098, -0.1325,  ...,  0.3365, -0.1501, -1.7007],
        [ 0.0713,  0.0431, -0.0376,  ...,  1.0334, -2.7057,  1.0363],
        [-0.1440,  1.0779,  0.0621,  ..., -0.0483, -0.9865,  0.2156],
        ...,
        [ 0.3249, -1.2106,  0.0466,  ...,  0.1353,  0.6111, -0.0626],
        [ 0.6348,  1.2222,  1.2360,  ..., -1.1479, -0.0362,  0.8170],
        [-0.2909, -1.3451, -0.3663,  ...,  0.9110, -0.1364,  1.2354]])


In [28]:
print("Shape: ", embds.shape)

Shape:  torch.Size([30927, 100])


### Eval

In [29]:
def get_vector(token):
    return embds[w2i[token], :]

In [108]:
# test
get_vector('θεα')

tensor([-0.1440,  1.0779,  0.0621, -0.7734, -0.6512, -0.8144, -0.3457,  0.1685,
         0.6755, -0.1850, -1.4117, -0.0287,  1.2630, -0.0361, -0.7264,  0.4768,
         1.1049,  0.4229,  0.0808, -0.3495, -0.8283, -0.1402,  0.4246, -0.4569,
        -0.2953, -1.2036,  0.7600, -0.3579, -0.0433, -0.0833,  0.4617,  0.0479,
        -0.1828,  0.9852, -0.2491,  0.5730, -0.4196,  0.5567,  0.3531, -0.3749,
        -1.3887, -0.5284,  0.4059, -0.1466, -1.7987, -1.1307, -0.0101,  0.4466,
         0.0992, -0.2791, -0.0156,  0.5217, -0.6546, -0.4434, -0.2115,  0.0284,
        -1.4619,  0.5747, -0.0052, -0.7531, -0.6496, -0.5352,  1.1632,  1.0489,
        -0.3385,  0.6922,  0.1404,  0.4606, -0.6115,  0.4626, -0.6609,  1.6730,
        -0.1165,  0.6315,  1.1245,  0.1469, -0.7996,  0.3502, -0.8791, -0.9832,
        -0.0785,  1.3905, -0.3799,  1.6719, -0.0421,  0.7194,  0.4723,  1.1263,
        -1.0692, -0.0041, -0.1501, -0.2352,  0.1635, -0.2810, -0.1009,  0.1636,
        -0.3384, -0.0483, -0.9865,  0.21

In [47]:
words = list(w2i.keys())

# Analogy finder

In [224]:
def find_next_words(target, embeddings, n=5):
    '''
    A kind of Projection formula, finds the closest vector in a vector space to the one given in input
    
    '''
    # calculate distance between target and embeddings
    distance = np.linalg.norm(target - embeddings, axis = 1)   # calc the distance of all vectors from the target
    
    idx_next_words = np.argsort(distance)[:n] # select the indx of the n closest
    
    distances = distance[idx_next_words] # select only the vectors in the precedently found array
    
    return idx_next_words, distances

In [233]:
def analogy(x1, x2, y, n=5):
    '''
    Analogy formula :
            x1 : x2 = y : ?
    
    n (default 5) how many possible answers do you want?
    '''
    vec_x1 = get_vector(x1)
    vec_x2 = get_vector(x2)
    vec_y = get_vector(y)
    vec_unknown = (vec_x1 - vec_x2) + vec_y
    
    idx_next_words, distances = find_next_words(vec_unknown, embds, n=n)
    
    for idx, dist in enumerate(distances):
        print(f"{idx+1}) {index2word[idx]} \t (distance from x1 : {dist})")    

In [234]:
analogy('θεα', "βροτον", "επος")

1) μηνιν 	 (distance from x1 : 11.604100227355957)
2) αειδε 	 (distance from x1 : 11.89823055267334)
3) θεα 	 (distance from x1 : 13.545354843139648)
4) Πηληϊαδεω 	 (distance from x1 : 13.612858772277832)
5) Αχιληος 	 (distance from x1 : 13.741549491882324)


## Dimensionality reduction

In [50]:
# import scaler and dimensionality reducer
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
scaler = StandardScaler()

In [51]:
vectors_tsne = TSNE(n_components=2,
                    perplexity = 7,
                    metric='cosine',
                    n_iter=500,
                    verbose=3,
                    n_jobs=6).fit_transform(embds)

[t-SNE] Computing 22 nearest neighbors...
[t-SNE] Indexed 30927 samples in 0.003s...
[t-SNE] Computed neighbors for 30927 samples in 25.405s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30927
[t-SNE] Computed conditional probabilities for sample 2000 / 30927
[t-SNE] Computed conditional probabilities for sample 3000 / 30927
[t-SNE] Computed conditional probabilities for sample 4000 / 30927
[t-SNE] Computed conditional probabilities for sample 5000 / 30927
[t-SNE] Computed conditional probabilities for sample 6000 / 30927
[t-SNE] Computed conditional probabilities for sample 7000 / 30927
[t-SNE] Computed conditional probabilities for sample 8000 / 30927
[t-SNE] Computed conditional probabilities for sample 9000 / 30927
[t-SNE] Computed conditional probabilities for sample 10000 / 30927
[t-SNE] Computed conditional probabilities for sample 11000 / 30927
[t-SNE] Computed conditional probabilities for sample 12000 / 30927
[t-SNE] Computed conditional probabilities for sa

In [52]:
vectors_tsne = scaler.fit_transform(vectors_tsne)

## Build datframe

In [84]:
df = pd.DataFrame(data=vectors_tsne, columns=["x", "y"])
df['token'] = words

In [85]:
# find and insert labels for diathesis
classe = []
faslse_positives = ["που", "αυτου","και","ειναι","κατʼ"]
for tok in df['token']:
    if tok in faslse_positives:
        classe.append('non_verb')
        continue
        # just a selection of middle endings
    regex = r"\w+(μαι|σαι|σο|ται|το|μεθα|μεθʼ|σθε|σθʼ|σθ|νται|ντʼ|ντο|σθαι|μην|ου|ιο|μην|σθω|σθων|μεν\w{1,3})\b"
    match = re.match(regex, tok)
    if match:
        classe.append('medium')
    else:
        # just a selection of active endings
        regex = r"\w+(μι|σι|τι|μεν|τε|ντι|ω|εις|ει|μεν|τε|ουσι|ον|οιμι)\b"
        match = re.match(regex, tok)
        if match:
            classe.append('active')
        else:
            classe.append('non_verb')

In [86]:
# add labels
df['label'] = classe

In [87]:
df.head()

Unnamed: 0,x,y,token,label
0,0.388673,1.164357,μηνιν,non_verb
1,0.072181,0.86487,αειδε,non_verb
2,-1.546246,-0.931389,θεα,non_verb
3,-0.097304,-0.734222,Πηληϊαδεω,active
4,-0.09672,-0.735453,Αχιληος,non_verb


In [89]:
df.to_csv("../../data/tsne_df_pyTorch.csv")

In [98]:
df = df[df['label'] != 'non_verb']

## Visualization

In [174]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook, export_png
from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap, factor_mark, linear_cmap
output_notebook()

In [246]:
import random
def generate_random_idx(n=250):
    rand_numbers = []
    for _ in range(n):
        rand_numbers.append(random.randint(0, len(words)))
    return rand_numbers

In [253]:
#test_words = ["εθηκε", 'επος', 'εφατʼ', 'ελαφρον', 'Αθηνη']

# extract n random words from the vocabulary
test_idx = generate_random_idx(n=250)
test_words = [index2word[i] for i in test_idx]

# extract the vector coordinates
x= []
y = []
for word in test_words:
    x.append(vectors_tsne[w2i[word],0])
    y.append(vectors_tsne[w2i[word], 1])

In [254]:
test_df = pd.DataFrame({'word' : test_words, 'x' : x, 'y' : y})

In [255]:
test_df.head()

Unnamed: 0,word,x,y
0,σχειν,-0.071386,-0.129349
1,εθοντες,-0.274086,2.266622
2,απατερθε,-1.564744,-0.910425
3,μαργε,1.802003,0.608254
4,ξ,-0.63711,0.542897


In [256]:
from bokeh.palettes import Category20_5
def draw_test_words(data, alpha=0.69,width=600, height=400, show=True):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    src = bm.ColumnDataSource(data)
    
    mapper = linear_cmap(field_name='y', palette=Category20_5 ,low=min(y) ,high=max(y))
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Visualize chosen vectors")
    
    fig.scatter('x','y',
                size=10,
                line_color=mapper,
                color=mapper,
                fill_alpha=alpha,
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@word")]))
    if show: pl.show(fig)
    return fig

In [257]:
draw_test_words(data=test_df)

## Subset

In [102]:
def draw_groups(data, radius=10, alpha=0.25,width=600, height=400, show=True, markers=['triangle', 'asterisk'],
                colorstyle=['#2ca02c', '#e34a33'], labels=['medium', 'active'], name="plot.png"):
    """ draws an interactive plot for data points with auxilirary info on hover """
    
    MARKERS = markers
    LABEL = labels
    
    src = bm.ColumnDataSource(data)
    
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height, title="Skip Gram")
    
    fig.scatter('x','y',
                size=10,
                legend_field="label",
                marker=factor_mark('label', MARKERS, LABEL),
                color=factor_cmap('label', colorstyle, LABEL),
                source=src)

    
    fig.add_tools(bm.HoverTool(tooltips=[("token", "@token")]))
    export_png(fig, filename="../../data/plots/"+name)
    if show: pl.show(fig)
    return fig

In [103]:
draw_groups(data=df,name="torch_skip_0502.png")