In [1]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# NOTE: datapath() resolves the name inside gensim's own test-data directory,
# so glove.6B.100d.txt has to be copied there first. If it is missing, the load
# below fails and the error message shows the path that was expected.
glove_file = datapath('glove.6B.100d.txt')
# Text (non-binary) vectors; no_header=True because the file is loaded as a
# GloVe text file, which presumably lacks the word2vec "<count> <dim>" first line.
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [None]:
from scipy import spatial
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine
import numpy as np

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    scipy only exposes the cosine *distance*, so the similarity is obtained
    as ``1 - distance``.
    """
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

def find_closest_word(vec, embeddings, exclude_ids):
    """Return the row index of ``embeddings`` most cosine-similar to ``vec``.

    The similarity against every row is computed in one vectorised numpy
    pass instead of a Python loop with one scipy call per row, which is far
    too slow when this is called once per analogy on a large vocabulary.

    Args:
        vec: 1-D query vector.
        embeddings: 2-D array-like, one embedding per row.
        exclude_ids: collection of row indices that must not be returned.

    Returns:
        The index (``int``) of the best non-excluded row; the first one wins
        on ties. ``-1`` when there is no eligible row (no rows at all, every
        row excluded, or no row with a defined similarity).
    """
    matrix = np.asarray(embeddings)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        return -1
    query = np.asarray(vec)

    # cosine similarity = (row . query) / (|row| * |query|), for all rows at once
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarities = (matrix @ query) / denominators

    # Zero-norm rows (or a zero query) have no defined cosine; never pick them.
    similarities[~np.isfinite(similarities)] = -np.inf

    # Indices outside the matrix are ignored rather than wrapped around.
    num_rows = matrix.shape[0]
    excluded = [idx for idx in exclude_ids if 0 <= idx < num_rows]
    if excluded:
        similarities[excluded] = -np.inf

    best_idx = int(np.argmax(similarities))
    if similarities[best_idx] == -np.inf:
        return -1
    return best_idx

def load_specific_categories(file_path, semantic_category, syntactic_category):
    """Read 4-word analogy lines belonging to two kinds of sections.

    Section headers are lines starting with ``:``. A header selects the
    semantic list when it contains ``semantic_category`` and otherwise the
    syntactic list when it contains ``syntactic_category``; any other header
    causes the following lines to be ignored until the next header.
    Matching is case-insensitive on BOTH sides (previously only the header
    was lower-cased, so a category argument containing uppercase letters
    could never match).

    Args:
        file_path: path of the analogy text file.
        semantic_category: substring identifying semantic section headers.
        syntactic_category: substring identifying syntactic section headers.

    Returns:
        ``(semantic, syntactic)``: two lists of 4-element word lists. Both
        are empty when no header contains either substring.
    """
    semantic = []
    syntactic = []
    current_group = None

    semantic_key = semantic_category.lower()
    syntactic_key = syntactic_category.lower()

    with open(file_path, "r") as f:
        for line in f:
            if line.startswith(":"):
                header = line.lower()
                if semantic_key in header:
                    current_group = semantic
                elif syntactic_key in header:
                    current_group = syntactic
                else:
                    # unrelated section: drop its lines
                    current_group = None
            elif current_group is not None:
                words = line.strip().split()
                # only well-formed "a b c d" analogies are kept
                if len(words) == 4:
                    current_group.append(words)

    return semantic, syntactic

def evaluate_word_analogies_with_custom_functions(model, file_path):
    """Print syntactic and semantic analogy accuracy for ``model``.

    Analogies are read from ``file_path`` via ``load_specific_categories``
    using the header substrings ``'semantic'`` and ``'syntactic'``, then
    scored with ``evaluate_analogies``. Nothing is returned; the two
    accuracies are printed (syntactic first).
    """
    categories = load_specific_categories(file_path, 'semantic', 'syntactic')
    semantic_analogies, syntactic_analogies = categories

    vocabulary = model.key_to_index
    vectors = model.vectors

    syntactic_accuracy = evaluate_analogies(syntactic_analogies, vocabulary, vectors)
    print(f"Syntactic Accuracy: {syntactic_accuracy}")

    semantic_accuracy = evaluate_analogies(semantic_analogies, vocabulary, vectors)
    print(f"Semantic Accuracy: {semantic_accuracy}")


def load_analogies(file_path):
    """Read every 4-word analogy line from ``file_path``.

    Blank lines, section headers (starting with ``:``) and comment lines
    (starting with ``//``) are skipped, as are lines that do not contain
    exactly four whitespace-separated words.

    Returns:
        A list of 4-tuples of words, in file order.
    """
    analogies = []

    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # headers carry no analogy; the category name itself is not used
            if not line or line.startswith((":", "//")):
                continue
            words = line.split()
            if len(words) == 4:
                analogies.append(tuple(words))

    return analogies

def evaluate_analogies(analogy_data, word_to_idx, embeddings):
    """Score ``word1 : word2 :: word3 : ?`` analogies against ``word4``.

    For every analogy whose four words are all present in ``word_to_idx``,
    the vector ``emb[word2] - emb[word1] + emb[word3]`` is formed and the
    closest embedding (excluding the three input words) is looked up with
    ``find_closest_word``. Analogies with a missing word are reported on
    stdout and left out of the score.

    Returns:
        ``correct / scored`` as a float, or the integer ``0`` when no
        analogy could be scored.
    """
    hits = 0
    scored = 0

    for word1, word2, word3, word4 in analogy_data:
        if any(word not in word_to_idx for word in [word1, word2, word3, word4]):
            print(f"Skipping analogy {word1}, {word2}, {word3}, {word4} due to missing words")
            continue

        idx1, idx2, idx3, idx4 = (
            word_to_idx[word] for word in (word1, word2, word3, word4)
        )

        # offset vector that should land near word4
        target = embeddings[idx2] - embeddings[idx1] + embeddings[idx3]
        predicted_idx = find_closest_word(target, embeddings, {idx1, idx2, idx3})

        if predicted_idx == idx4:
            hits += 1
        scored += 1

    if scored > 0:
        return hits / scored
    return 0


def load_wordsim353(file_path):
    """Load word pairs and human similarity scores from a WordSim-style file.

    Each data line must hold three whitespace-separated fields:
    ``word1 word2 score``. The first line of the file is always treated as a
    header and discarded. Blank lines are skipped, and an empty file yields
    two empty lists instead of raising ``StopIteration``.

    Returns:
        ``(word_pairs, human_scores)``: a list of ``(word1, word2)`` tuples
        and a parallel list of float scores.

    Raises:
        ValueError: if a non-blank data line does not have exactly three
            fields or its score is not a number.
    """
    word_pairs = []
    human_scores = []

    with open(file_path, "r") as f:
        # NOTE(review): the first line is dropped unconditionally; confirm the
        # data file really starts with a header row.
        next(f, None)
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            word1, word2, score = fields
            word_pairs.append((word1, word2))
            human_scores.append(float(score))

    return word_pairs, human_scores


def calculate_model_similarity(word_pairs, model, word_to_idx):
    """Return the model's cosine similarity for each word pair.

    Vectors are read from ``model.vectors`` using the row indices in
    ``word_to_idx``. The similarity is ``1 - cosine distance`` as computed by
    scipy. A pair containing an out-of-vocabulary word yields ``None`` so the
    result stays aligned with ``word_pairs``.
    """
    vectors = model.vectors

    def pair_similarity(first, second):
        # OOV on either side: no score for this pair
        if first not in word_to_idx or second not in word_to_idx:
            return None
        first_vec = vectors[word_to_idx[first]]
        second_vec = vectors[word_to_idx[second]]
        return 1 - cosine(first_vec, second_vec)

    return [pair_similarity(word1, word2) for word1, word2 in word_pairs]

def compute_spearman_correlation(human_scores, model_scores):
    """Spearman rank correlation between human and model scores.

    Pairs whose model score is ``None`` are dropped before the correlation
    is computed.

    Returns:
        The correlation coefficient reported by ``scipy.stats.spearmanr``.

    Raises:
        ValueError: if no pair has a model score. This used to surface as an
            opaque tuple-unpacking ValueError; the exception type is kept,
            only the message is made explicit.
    """
    valid_scores = [(h, m) for h, m in zip(human_scores, model_scores) if m is not None]
    if not valid_scores:
        raise ValueError("no valid word pairs to compute spearman correlation.")
    filtered_human_scores, filtered_model_scores = zip(*valid_scores)

    correlation, _ = spearmanr(filtered_human_scores, filtered_model_scores)
    return correlation

def compute_mse(human_scores, model_scores):
    """Mean squared error between model scores and human scores.

    Pairs whose model score is ``None`` are dropped first. The two series
    are subtracted as given; no rescaling is applied here, so the caller is
    responsible for putting them on a comparable scale.

    Returns:
        The mean of the squared differences (numpy float).

    Raises:
        ValueError: if no pair has a model score. This used to surface as an
            opaque tuple-unpacking ValueError; the exception type is kept,
            only the message is made explicit.
    """
    # remove null values from model_scores
    valid_scores = [(h, m) for h, m in zip(human_scores, model_scores) if m is not None]
    if not valid_scores:
        raise ValueError("no valid word pairs to compute mse.")
    filtered_human_scores, filtered_model_scores = zip(*valid_scores)

    differences = np.array(filtered_model_scores) - np.array(filtered_human_scores)
    return np.mean(differences ** 2)

def compute_average_human_score(human_scores):
    """Return the arithmetic mean of ``human_scores``."""
    total = sum(human_scores)
    count = len(human_scores)
    return total / count

# NOTE(review): load_wordsim353 is also defined earlier in this cell; this later
# definition is the one in effect after the cell runs. Keep only one copy.
def load_wordsim353(file_path):
    """Load word pairs and human similarity scores from a WordSim-style file.

    Each data line must hold three whitespace-separated fields:
    ``word1 word2 score``. The first line of the file is always treated as a
    header and discarded. Blank lines are skipped, and an empty file yields
    two empty lists instead of raising ``StopIteration``.

    Returns:
        ``(word_pairs, human_scores)``: a list of ``(word1, word2)`` tuples
        and a parallel list of float scores.

    Raises:
        ValueError: if a non-blank data line does not have exactly three
            fields or its score is not a number.
    """
    word_pairs = []
    human_scores = []

    with open(file_path, "r") as f:
        # NOTE(review): the first line is dropped unconditionally; confirm the
        # data file really starts with a header row.
        next(f, None)
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            word1, word2, score = fields
            word_pairs.append((word1, word2))
            human_scores.append(float(score))

    return word_pairs, human_scores

In [17]:
# Split the analogy file into the sections whose ":" header contains
# "semantic" / "syntactic"; sections with any other header are ignored.
semantic_analogies, syntactic_analogies = load_specific_categories(
        'word-analogies.txt', 'semantic', 'syntactic'
    )

word_to_idx = model.key_to_index
embeddings = model.vectors  

# NOTE(review): evaluate_analogies returns the integer 0 (rather than a float
# such as 0.0) only when no analogy was scored at all, i.e. the list passed in
# was empty or every analogy was skipped. If the results display as a bare 0,
# check that the file's section headers actually contain the two keywords.
syntactic_accuracy = evaluate_analogies(syntactic_analogies, word_to_idx, embeddings)
# print(f"Syntactic Accuracy: {syntactic_accuracy}")

semantic_accuracy = evaluate_analogies(semantic_analogies, word_to_idx, embeddings)
# print(f"Semantic Accuracy: {semantic_accuracy}")

In [20]:
syntactic_accuracy

0

In [18]:
semantic_accuracy

0

In [4]:
# Load the word pairs and their human similarity ratings (first line of the file is skipped as a header).
word_pairs, human_scores = load_wordsim353('wordsim353/wordsim_similarity_goldstandard.txt') 

In [5]:
# One cosine similarity per pair, aligned with word_pairs; None marks a pair with an out-of-vocabulary word.
model_scores = calculate_model_similarity(word_pairs, model, model.key_to_index)

In [6]:
# keep only the (human, model) pairs for which the model produced a score
valid_scores = [
    (human, predicted)
    for human, predicted in zip(human_scores, model_scores)
    if predicted is not None
]
if valid_scores:
    # un-zip back into two parallel tuples
    filtered_human_scores, filtered_model_scores = zip(*valid_scores)
else:
    filtered_human_scores, filtered_model_scores = [], []

In [7]:
# Mean squared error between the model's cosine similarities and the human
# ratings. compute_mse subtracts the two series as-is (no rescaling), so the
# value is only meaningful if both are on the same scale.
mse = compute_mse(human_scores, model_scores)
mse

27.76131250197865

In [8]:
# Rank correlation needs at least one scored pair on each side.
if not (filtered_human_scores and filtered_model_scores):
    print("no valid word pairs to compute spearman correlation.")
else:
    correlation, _ = spearmanr(filtered_human_scores, filtered_model_scores)
    print(f"spearman correlation between model similarity and human judgment: {correlation}")

spearman correlation between model similarity and human judgment: 0.6008007400861295


With a Spearman correlation of about 0.60, the model's similarity scores are moderately (not highly) correlated with human judgement.

## Get top 10 similar words

In [9]:
def compute_top_k_dot_product(query_word, model, k=10):
    """Return the ``k`` vocabulary words with the largest dot product to ``query_word``.

    Scores are raw (un-normalised) dot products between the query word's
    vector and every row of ``model.vectors``. The query word itself is not
    excluded from the results.

    Args:
        query_word: word to look up; must be in ``model.key_to_index``.
        model: object exposing ``key_to_index``, ``index_to_key``,
            ``vectors`` and ``model[word]`` lookup.
        k: number of results. Values larger than the vocabulary return the
            whole vocabulary; ``k <= 0`` returns empty results (previously
            ``k=0`` returned every word because ``[-0:]`` is the full slice).

    Returns:
        ``(top_k_words, top_k_scores)``: a list of words and a numpy array
        of their scores, both in descending score order.

    Raises:
        ValueError: if ``query_word`` is not in the vocabulary.
    """
    if query_word not in model.key_to_index:
        raise ValueError(f"Word '{query_word}' not in vocabulary.")

    query_vector = model[query_word]

    # dot product between the query vector and all word vectors in the corpus
    dot_products = np.dot(model.vectors, query_vector)

    # Reverse first, then take the head: unlike [-k:], this is empty for k == 0.
    k = max(k, 0)
    top_k_indices = np.argsort(dot_products)[::-1][:k]

    top_k_words = [model.index_to_key[i] for i in top_k_indices]
    top_k_scores = dot_products[top_k_indices]

    return top_k_words, top_k_scores


In [10]:
# Example query: the 10 words whose vectors have the largest raw dot product
# with "king". The query word is not excluded by compute_top_k_dot_product, so
# it can appear in the list.
query_word = "king"
top_k_words, top_k_scores = compute_top_k_dot_product(query_word, model, k=10)

# Print the words only, best match first.
for word in top_k_words:
    print(word)

king
emperor
prince
queen
son
ii
throne
father
lord
kingdom
