# Hrusso Aka Word Embeddings

In [None]:
import csv
import numpy as np
from numpy.linalg import norm
import pandas as pd
import pickle as pkl
import gensim.downloader
from gensim.models import Word2Vec
from sklearn.decomposition import TruncatedSVD
from scipy.stats import spearmanr
from keybert import KeyBERT
from collections import Counter

## Data Import

In [None]:
# Bilingual Hrusso dictionary digitalised from a NEILAC resource
# The file contains three columns: the Hrusso Aka words, the part-of-speech, the English translation
bidict_df = pd.read_csv("../data/hrusso_dict.csv", names=['word', 'pos', 'eng'])

with open('../data/train_data.txt', 'r') as file:
    sentences = file.readlines()

# Strip newline characters from the end of each line
sentences = [sentence.strip().split() for sentence in sentences]

In [3]:
# Get a list of unique words
unique_words = list(set([word for sentence in sentences for word in sentence]))
with open('unique_words.pkl', 'wb') as f:
    pkl.dump(unique_words, f)

## Model Training

### Hyperparameters

In [4]:
methods = ["w2v", "svd", "clwe-reg", "clwe-orth"]
clwe_base = ["w2v", "svd"]
w2v_algs = ["cbow", "sg"]
dims = [50, 100, 200]
win_sizes = [2, 5, 10]
seed = 42

### Helper Functions

SVD Helpers

In [5]:
def get_co_mat(sentences, win_size, unique_words):
    # Create and initialize co-occurrence matrix
    matrix_size = len(unique_words)
    co_mat = np.zeros((matrix_size, matrix_size))

    # Fill co-occurrence matrix
    for sentence in sentences:
        for target_id, target in enumerate(sentence):
            context_win = sentence[max(target_id - win_size, 0) : min(target_id + win_size, len(sentence) + 1)]
            # Iterate through the context window and update the co-occurrence matrix
            for context in context_win:
                if context != target:
                    context_id = unique_words.index(context)
                    co_mat[target_id, context_id] += 1
    
    return co_mat

CLWE Helpers

In [None]:
def get_seed_lexicon(sentences, no_seed_words = 100):
    kw_model = KeyBERT()
    seed_lexicon = []
    flattened_sent = [word for sentence in sentences for word in sentence]
    word_freq = Counter(flattened_sent)

    # Get the most common words that are present in the dictionary
    for word, _ in word_freq.most_common():
        if len(seed_lexicon) >= no_seed_words:
            break
        if word in bidict_df['word'].values:
            eng_word = bidict_df.loc[bidict_df['word'] == word, 'eng'].values[0]
            if len(eng_word.split()) > 1:
                eng_word = kw_model.extract_keywords(eng_word, keyphrase_ngram_range=(1, 1), stop_words=None)[0][0]
            seed_lexicon.append((word, eng_word))
    
    return seed_lexicon

def get_embedding_matrices(source_we_model, target_we_model, seed_lexicon, is_hru_svd = False):
    source_we = []
    target_we = []

    for word, eng_word in seed_lexicon:
        if is_hru_svd:
            if word in unique_words and eng_word in target_we_model.index_to_key:
                source_we.append(source_we_model.components_[:, unique_words.index(word)])
                target_we.append(target_we_model.get_vector(eng_word))
        else:
            if word in source_we_model.wv and eng_word in target_we_model.index_to_key:
                source_we.append(source_we_model.wv[word])
                target_we.append(target_we_model.get_vector(eng_word))
            else:
                print(f"Word {word} or {eng_word} not in vocabulary")
    
    # Normalize the embedding matrices
    source_we, target_we = np.array(source_we), np.array(target_we)
    normalized_source_we = source_we / np.linalg.norm(source_we, axis=1, keepdims=True)
    normalized_target_we = target_we / np.linalg.norm(target_we, axis=1, keepdims=True)

    return normalized_source_we, normalized_target_we

def learn_transformation_matrix(Xs, Xt, learning_rate=0.01, epochs=100):
    # Initialize W
    W = np.random.randn(Xs.shape[1], Xt.shape[1])

    def mse_loss(W, Xs, Xt):
        transformed_Xs = np.dot(Xs, W)
        mse = np.mean(np.linalg.norm(transformed_Xs - Xt, axis=1) ** 2)
        return mse

    # Gradient descent
    for epoch in range(epochs):
        # Shuffle indices
        indices = np.random.permutation(len(Xs))

        for idx in indices:
            xs_sample = Xs[idx:idx+1]
            xt_sample = Xt[idx:idx+1]

            gradient = np.dot(xs_sample.T, np.dot(xs_sample, W) - xt_sample)

            # Update W
            W -= learning_rate * gradient

        # Calculate and print MSE loss
        loss = mse_loss(W, Xs, Xt)
        # if (epoch+1) % 10 == 0:
        #     print(f'Epoch {epoch+1}/{epochs}, Loss: {loss}')

    return W

def learn_orthogonal_transformation_matrix(Xs, Xt):
    product = np.dot(Xt.T, Xs)

    # Compute SVD of the product
    U, _, Vt = np.linalg.svd(product)

    # Construct orthogonal matrix W using U and V
    W = np.dot(Vt.T, U.T)

    return W

### Training

In [7]:
for method in methods:
    if method == "w2v":
        for dim in dims:
            for win_size in win_sizes:
                for idx, algorithm in enumerate(w2v_algs):
                    model = Word2Vec(sentences, vector_size=dim, window=win_size, min_count=1,\
                                     sample=1e-3, sg=idx, hs=0, negative=5, seed=seed)
                    model.save(f"emb_models/{method}_{dim}_{win_size}_{algorithm}.model")
    
    elif method == "svd":
        for win_size in win_sizes:
            co_mat = get_co_mat(sentences, win_size, unique_words)
            for dim in dims:
                model = TruncatedSVD(n_components=dim)
                model.fit_transform(co_mat)
                with open(f"emb_models/{method}_{dim}_{win_size}.pkl", 'wb') as f:
                    pkl.dump(model, f)

    elif method == "clwe-reg":
        eng_model = gensim.downloader.load("glove-wiki-gigaword-100")
        for base in clwe_base:
            if base == "w2v":
                hrusso_model = Word2Vec.load(f"emb_models/w2v_100_10_sg.model")
                Xs, Xt = get_embedding_matrices(hrusso_model, eng_model, get_seed_lexicon(sentences))
                W = learn_transformation_matrix(Xs, Xt, epochs=500)
                hrusso_model.wv.vectors = np.dot(hrusso_model.wv.vectors, W)
                hrusso_model.save(f"emb_models/{method}_{base}_100_10_sg.model")
            else:
                hrusso_model = []
                with open(f"emb_models/svd_100_5.pkl", 'rb') as f:
                    hrusso_model = pkl.load(f)
                # eng_model = gensim.downloader.load("glove-wiki-gigaword-100")
                Xs, Xt = get_embedding_matrices(hrusso_model, eng_model, get_seed_lexicon(sentences), is_hru_svd=True)
                W = learn_transformation_matrix(Xs, Xt, epochs=500)
                hrusso_model.components_ = np.dot(hrusso_model.components_.T, W).T
                with open(f"emb_models/{method}_{base}_100_5.pkl", 'wb') as f:
                    pkl.dump(hrusso_model, f)

    if method == "clwe-orth":
        # eng_model = gensim.downloader.load("glove-wiki-gigaword-100")
        for base in clwe_base:
            if base == "w2v":
                hrusso_model = Word2Vec.load(f"emb_models/w2v_100_10_sg.model")
                Xs, Xt = get_embedding_matrices(hrusso_model, eng_model, get_seed_lexicon(sentences))
                W = learn_orthogonal_transformation_matrix(Xs, Xt)
                hrusso_model.wv.vectors = np.dot(hrusso_model.wv.vectors, W)
                hrusso_model.save(f"emb_models/{method}_{base}_100_10_sg.model")
            else:
                hrusso_model = []
                with open(f"emb_models/svd_100_5.pkl", 'rb') as f:
                    hrusso_model = pkl.load(f)
                eng_model = gensim.downloader.load("glove-wiki-gigaword-100")
                Xs, Xt = get_embedding_matrices(hrusso_model, eng_model, get_seed_lexicon(sentences), is_hru_svd=True)
                W = learn_orthogonal_transformation_matrix(Xs, Xt)
                hrusso_model.components_ = np.dot(hrusso_model.components_.T, W).T
                with open(f"emb_models/{method}_{base}_100_5.pkl", 'wb') as f:
                    pkl.dump(hrusso_model, f)

## Model Evaluation

### Hypernym-Hyponym Data Import

In [9]:
emb_eval_data_file = "valid_emb_eval_pairs.csv"

# Load the evaluation data, where the hypernym is the key and the hyponyms are the value
emb_eval_data = {}
with open(emb_eval_data_file, 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    for row in csv_reader:
        key = row[0]
        values = row[1:]
        emb_eval_data[key] = values

# Check if all words in the evaluation data exist in the sentences
for key in emb_eval_data.keys():
    for value in emb_eval_data[key]:
        found = False
        for sentence in sentences:
            if value in sentence:
                found = True
                break
        # If the word is not found in any sentence, print it
        if not found:
            print(value)
    found = False
    for sentence in sentences:
        if key in sentence:
            found = True
            break
    # If the word is not found in any sentence, print it
    if not found:
        print(key)

### Build Gold Standard
The gold standard vectors have similarity scores of 1 for the hyponyms of their corresponding hypernym and 0 otherwise.

In [10]:
hyponyms = [value for values in emb_eval_data.values() for value in values]

gold_standard = []
for key in emb_eval_data.keys():
    gold_standard.append([0 for _ in range(len(hyponyms))])
    for value in emb_eval_data[key]:
        gold_standard[-1][hyponyms.index(value)] = 1

### Compare with the Similarity Scores from the Models

In [11]:
def evaluate_w2v(filename, emb_eval_data, hyponyms, gold_standard):
    model = Word2Vec.load(filename)
    model_sim = []
    for key in emb_eval_data.keys():
        similarities = []
        for value in hyponyms:
            similarities.append(model.wv.similarity(key, value))
        model_sim.append(similarities)
    correlation = spearmanr(np.array(model_sim).flatten(), np.array(gold_standard).flatten())
    new_eval_value = (correlation[0], correlation[1])

    return new_eval_value

def evaluate_svd(filename, emb_eval_data, unique_words, hyponyms, gold_standard):
    model = pkl.load(filename)
    model_sim = []
    for key in emb_eval_data.keys():
        similarities = []
        for value in hyponyms:
            key_vec = model.components_[:, unique_words.index(key)]
            value_vec = model.components_[:, unique_words.index(value)]
            similarities.append(np.dot(key_vec,value_vec)/(norm(key_vec)*norm(value_vec)))
        model_sim.append(similarities)
    correlation = spearmanr(np.array(model_sim).flatten(), np.array(gold_standard).flatten())
    new_eval_value = (correlation[0], correlation[1])

    return new_eval_value

In [12]:
eval_df = pd.DataFrame(columns=["method", "dim", "win_size = 2", "win_size = 5", "win_size = 10"])

for method in methods:
    if method not in ["clwe-reg", "clwe-orth"]:
        for dim in dims:
            if method == "w2v":
                for alg in w2v_algs:
                    eval_df.loc[len(eval_df)] = [method+'-'+alg, dim, (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)]
            else:
                eval_df.loc[len(eval_df)] = [method, dim, (0.0, 0.0), (0.0, 0.0), (0.0, 0.0)]

            for win_size in win_sizes:
                if method == "w2v":
                    for alg in w2v_algs:
                        filename = f"emb_models/{method}_{dim}_{win_size}_{alg}.model"
                        new_eval_value = evaluate_w2v(filename, emb_eval_data, hyponyms, gold_standard)
                        if alg == "cbow":
                            eval_df.at[len(eval_df) - 2, f"win_size = {win_size}"] = new_eval_value
                        else:
                            eval_df.at[len(eval_df) - 1, f"win_size = {win_size}"] = new_eval_value

                if method == "svd":
                    with open(f"emb_models/{method}_{dim}_{win_size}.pkl", 'rb') as f:
                        new_eval_value = evaluate_svd(f, emb_eval_data, unique_words, hyponyms, gold_standard)
                        eval_df.at[len(eval_df) - 1, f"win_size = {win_size}"] = new_eval_value
    
    elif method == "clwe-reg" or method == "clwe-orth":
        for base in clwe_base:
            if base == "w2v":
                eval_df.loc[len(eval_df)] = [method+'-'+base, 100, ('NA', 'NA'), ('NA', 'NA'), (0.0, 0.0)]
                filename = f"emb_models/{method}_{base}_100_10_sg.model"
                new_eval_value = evaluate_w2v(filename, emb_eval_data, hyponyms, gold_standard)
                eval_df.at[len(eval_df) - 1, f"win_size = {10}"] = new_eval_value

            else:
                eval_df.loc[len(eval_df)] = [method+'-'+base, 100, ('NA', 'NA'), (0.0, 0.0), ('NA', 'NA')]
                with open(f"emb_models/{method}_{base}_100_5.pkl", 'rb') as f:
                        new_eval_value = evaluate_svd(f, emb_eval_data, unique_words, hyponyms, gold_standard)
                        eval_df.at[len(eval_df) - 1, f"win_size = {5}"] = new_eval_value

In [13]:
eval_df

Unnamed: 0,method,dim,win_size = 2,win_size = 5,win_size = 10
0,w2v-cbow,50,"(0.010152260353641482, 0.7546531507406037)","(0.002238703565161967, 0.9450606871145031)","(0.007097905101947633, 0.8270498048380326)"
1,w2v-sg,50,"(0.004043549850253786, 0.9009455285831092)","(0.014907336195371934, 0.646309195959095)","(0.030127049528071128, 0.35363272848306226)"
2,w2v-cbow,100,"(0.012998364110901658, 0.6890637744038759)","(0.007375573761192528, 0.8203971187212318)","(0.004616241508403861, 0.8870045336497803)"
3,w2v-sg,100,"(0.007670596738480087, 0.8133427588175175)","(0.020668960894863767, 0.5245878217296382)","(0.025562870941733165, 0.43128663050119087)"
4,w2v-cbow,200,"(0.01084643200175372, 0.738470746092208)","(0.007826785332465483, 0.8096141716256643)","(0.009822528889527348, 0.7623781244596826)"
5,w2v-sg,200,"(0.013119844149321295, 0.6863114549863787)","(0.027992471710125998, 0.3887871995343488)","(0.023601836035816087, 0.46747047984395595)"
6,svd,50,"(0.02727227492340868, 0.40111170399743046)","(0.03361526205983512, 0.3006593994280169)","(0.018204651471743438, 0.5751979445275196)"
7,svd,100,"(0.02015701374601437, 0.5349115665452605)","(0.03425737083433895, 0.29151564983428385)","(0.016729536719504935, 0.6065557391892258)"
8,svd,200,"(0.009076296712712378, 0.7799473156443644)","(0.02648264837548188, 0.4148900950430032)","(0.0061260647945905, 0.8504289287301849)"
9,clwe-reg-w2v,100,"(NA, NA)","(NA, NA)","(0.04060904113037922, 0.2111086873581284)"
