In [None]:
# %load_ext autoreload
# %autoreload 2
# %load_ext lab_black

In [1]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

sys.path.append("../")

## Loading Model and Vocabulary

In [2]:
folder = "weights/skipgram_MalaysiaKini_200k"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = torch.load(f"{folder}/model.pt", map_location=device)
vocab = torch.load(f"{folder}/vocab.pt")



NVIDIA GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/




NVIDIA RTX A5000 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA RTX A5000 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/




ImportError: /usr/local/lib/python3.6/dist-packages/torchtext/_torchtext.so: undefined symbol: _ZN3c106ivalue6Future15extractDataPtrsERKNS_6IValueE

## Getting Embeddings

In [6]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(1715, 300)

# Visualization with t-SNE

In [7]:
# get embeddings
embeddings_df = pd.DataFrame(embeddings)

# t-SNE transform
tsne = TSNE(n_components=2)
embeddings_df_trans = tsne.fit_transform(embeddings_df)
embeddings_df_trans = pd.DataFrame(embeddings_df_trans)

# get token order
embeddings_df_trans.index = vocab.get_itos()

# if token is a number
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [9]:
color = np.where(is_numeric, "green", "black")
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=embeddings_df_trans[0],
        y=embeddings_df_trans[1],
        mode="text",
        text=embeddings_df_trans.index,
        textposition="middle center",
        textfont=dict(color=color),
    )
)
fig.write_html("../word2vec_visualization.html")

# Find Similar Words

In [8]:
def get_top_similar(word: str, topN: int = 10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

In [9]:
vocab.get_itos()

['<unk>',
 '.',
 ',',
 'yang',
 'dan',
 'di',
 'ini',
 'untuk',
 'tidak',
 'dengan',
 'itu',
 'dalam',
 'pada',
 ')',
 '(',
 'kepada',
 'mereka',
 'negara',
 'kita',
 'akan',
 'saya',
 'kerajaan',
 'adalah',
 'rakyat',
 'malaysia',
 'oleh',
 'juga',
 'atau',
 'ada',
 'lebih',
 'sebagai',
 'daripada',
 'orang',
 'telah',
 'ia',
 'menteri',
 'dari',
 'politik',
 'menjadi',
 'ke',
 '-',
 'kerana',
 'boleh',
 'seperti',
 'lagi',
 'tahun',
 '?',
 'bahawa',
 'bagi',
 'satu',
 'covid-19',
 'hari',
 'parlimen',
 'perlu',
 'tetapi',
 'umno',
 'parti',
 'secara',
 'lain',
 'bukan',
 'beliau',
 'dapat',
 'sudah',
 'ialah',
 'semua',
 'hanya',
 'kes',
 'jika',
 'ketika',
 'perdana',
 'masa',
 'sama',
 'tersebut',
 'ahli',
 'pihak',
 'apabila',
 'serta',
 'masih',
 'seorang',
 'melayu',
 'dia',
 'apa',
 'antara',
 "'",
 'dunia',
 'besar',
 'baru',
 'ekonomi',
 'terhadap',
 'namun',
 'banyak',
 'mempunyai',
 'selepas',
 'atas',
 'dua',
 'kuasa',
 'beberapa',
 'sebelum',
 'islam',
 'ph',
 'pula',
 'b

In [11]:
get_top_similar("yang")

{'betapa': 0.21499883,
 '<unk>': 0.19486588,
 'sepenuhnya': 0.18659918,
 'dan': 0.17052737,
 'mana-mana': 0.17009011,
 'melalui': 0.17002897,
 'ia': 0.16934702,
 'tokoh': 0.1640118,
 'sifat': 0.16323656,
 'keyakinan': 0.1630928}

# Vector Equations

In [22]:
emb1 = embeddings[vocab["king"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["woman"]]

emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** (1 / 2)
emb4 = emb4 / emb4_norm

emb4 = np.reshape(emb4, (len(emb4), 1))
dists = np.matmul(embeddings_norm, emb4).flatten()

top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

king: 0.554
woman: 0.462
æthelfrith: 0.334
jacobites: 0.300
copeland: 0.296
