In [1]:
import numpy as np

In [2]:
# Toy inputs for the pairwise cosine-similarity computation below:
# x has 2 rows and y has 3 rows; every row is a 4-dimensional vector.
x = np.array([[2, 5, 9, 10],
              [0, -8, 3, -3]])
y = np.array([[-6, 7, 1, -3],
              [-3, 7, 9, 0],
              [-6, 10, 10, -5]])

In [3]:
# Pairwise cosine similarity: array[i, j] compares row i of x with row j of y.
array = np.zeros((len(x), len(y)))

# The norm of each y row does not depend on the x row, so compute it once.
y_norms = [np.linalg.norm(y_row) for y_row in y]

for i, x_row in enumerate(x):
    x_norm = np.linalg.norm(x_row)
    for j, y_row in enumerate(y):
        # cos(x_i, y_j) = (x_i . y_j) / (|x_i| * |y_j|)
        array[i, j] = (x_row @ y_row) / (x_norm * y_norms[j])

-------------------

In [1]:
from embeddings import *
from test_analogies import *

In [2]:
# Load embeddings from a file
embeddings = Embeddings.from_file("data/glove_50d.txt")

In [6]:
word = embeddings[['king']]
word

array([[ 0.50451 ,  0.68607 , -0.59517 , -0.022801,  0.60046 , -0.13498 ,
        -0.08813 ,  0.47377 , -0.61798 , -0.31012 , -0.076666,  1.493   ,
        -0.034189, -0.98173 ,  0.68229 ,  0.81722 , -0.51874 , -0.31503 ,
        -0.55809 ,  0.66421 ,  0.1961  , -0.13495 , -0.11476 , -0.30344 ,
         0.41177 , -2.223   , -1.0756  , -1.0783  , -0.34354 ,  0.33505 ,
         1.9927  , -0.04234 , -0.64319 ,  0.71125 ,  0.49159 ,  0.16754 ,
         0.34344 , -0.25663 , -0.8523  ,  0.1661  ,  0.40102 ,  1.1685  ,
        -1.0137  , -0.21585 , -0.15155 ,  0.78321 , -0.91241 , -1.6106  ,
        -0.64426 , -0.51042 ]], dtype=float32)

In [7]:
# Score every vocabulary word against the query vector `word` with cosine_sim.
vecs = []
for e in embeddings.words:
    cos_sim = cosine_sim(word, embeddings[[e]])
    vecs.append((e, cos_sim))

# Sort by similarity (highest first) and keep the top 4 (word, score) pairs.
top_4 = sorted(vecs, key=lambda pair: pair[1], reverse=True)[:4]

# Deliberately NOT named `get_closest_words`: that name is called as a function
# by run_analogy_test / run_analogy_test_answers, so rebinding it to a list in
# the same kernel would break those calls.
closest_to_king = [w for w, _ in top_4]
print("Top 4 closest words to 'king':", top_4)
print(closest_to_king)


Top 4 closest words to 'king': [('king', array([[1.00000012]])), ('prince', array([[0.82361799]])), ('queen', array([[0.78390437]])), ('ii', array([[0.77462304]]))]
['king', 'prince', 'queen', 'ii']


In [8]:
vectors = embeddings["king", "man", "woman"]

In [9]:
for vec in vectors:
    # NOTE(review): assuming `vec` is a NumPy array, reshape() returns a new
    # array instead of modifying `vec` in place. The result is discarded here,
    # so the shape printed on the next line is the original one.
    vec.reshape(1,-1)
    print(vec.shape)

(50,)
(50,)
(50,)


In [10]:
closest_words = []
for vector in vectors:  # one query vector per requested word
    vecs = []  # (word, similarity) pairs for this query vector
    # Reshape to a 2D array with a single row before passing it to cosine_sim.
    vector = vector.reshape(1, -1)
    for e in embeddings.words:
        cos_sim_val = cosine_sim(vector, embeddings[[e]])
        vecs.append((e, cos_sim_val))
    # Keep the 4 highest-scoring (word, score) pairs.
    top_4 = sorted(vecs, key=lambda pair: pair[1], reverse=True)[:4]
    # Deliberately NOT named `get_closest_words`: that name is called as a
    # function elsewhere in this notebook, so it must not be rebound to a list.
    top_words = [w for w, _ in top_4]
    closest_words.append(top_words)
print(closest_words)

[['king', 'prince', 'queen', 'ii'], ['man', 'woman', 'boy', 'another'], ['woman', 'girl', 'man', 'mother']]


-----------

In [11]:
# Vector arithmetic "king - man + woman" (assuming the rows of `vecs` come back
# in the order the words were requested -- TODO confirm against Embeddings).
# One-element slices (0:1, 1:2, 2:3) keep the leading axis, so the result
# stays a 2D array with a single row.
vecs = embeddings["king", "man", "woman"]
queen = vecs[0:1] - vecs[1:2] + vecs[2:3]
queen

array([[ 0.41736597,  0.90427005, -1.0050299 , -0.06202102,  0.49725997,
         0.80667007, -0.14855   ,  0.80365   , -0.15653998, -0.66973996,
         0.23435399,  0.62476   ,  0.925871  , -0.97099996,  0.92566   ,
         0.89915   , -1.54596   , -0.52625   ,  0.13695401,  0.66199005,
         0.4871601 ,  0.37035   , -0.214214  ,  0.10100996,  0.71358   ,
        -2.0874999 , -1.1362001 , -1.1496099 , -0.53599   ,  0.27389997,
         1.6723    ,  0.02930999, -0.77656007,  0.46056286,  0.34866   ,
        -0.05741701,  0.19444   , -0.207748  , -0.73038995, -0.10751998,
         0.235544  ,  0.96423995, -0.46993998, -0.48727497, -0.25399995,
         0.4621299 , -0.66081   , -1.9451499 , -0.68797004, -0.49784005]],
      dtype=float32)

-----------

In [12]:
test_data = load_analogies("data/analogies_test.txt")
test_data.keys()

dict_keys(['capital-common-countries'])

In [None]:
# Walk every category in the test data and print each analogy's four words.
for category, analogies in test_data.items():
    for w1, w2, w3, w4 in analogies:
        # Look up all four words, as the original cell did; the vectors are not
        # used further here. (Presumably a missing word would surface as an
        # error at this point -- depends on Embeddings.__getitem__.)
        v1, v2, v3, v4 = (embeddings[[w]] for w in (w1, w2, w3, w4))
        print(w1, w2, w3, w4)

athens greece baghdad iraq
athens greece bangkok thailand
athens greece beijing china
athens greece berlin germany
athens greece bern switzerland
athens greece cairo egypt
athens greece canberra australia
athens greece hanoi vietnam
athens greece havana cuba
athens greece helsinki finland
athens greece islamabad pakistan
athens greece kabul afghanistan
athens greece london england
athens greece madrid spain
athens greece moscow russia
athens greece oslo norway
athens greece ottawa canada
athens greece paris france
athens greece rome italy
athens greece stockholm sweden
athens greece tehran iran
athens greece tokyo japan
baghdad iraq bangkok thailand
baghdad iraq beijing china
baghdad iraq berlin germany
baghdad iraq bern switzerland
baghdad iraq cairo egypt
baghdad iraq canberra australia
baghdad iraq hanoi vietnam
baghdad iraq havana cuba
baghdad iraq helsinki finland
baghdad iraq islamabad pakistan
baghdad iraq kabul afghanistan
baghdad iraq london england
baghdad iraq madrid spain
b

In [None]:
def run_analogy_test(embeddings: Embeddings, test_data: AnalogiesDataset, k: int = 1) -> Dict[str, float]:
    """
    Runs the 3CosAdd test for a set of embeddings and analogy questions.

    For every analogy (w1, w2, w3, w4) the query vector w2 - w1 + w3 is built
    and passed to get_closest_words; the analogy counts as correct when w4
    appears in the first element of the returned result.

    :param embeddings: The embeddings to be evaluated using 3CosAdd.
    :param test_data: The set of analogies to compute accuracy. Iterated with
                      .items(), yielding (category, analogies) pairs.
    :param k: The "lenience" for accuracy. A question is correct if the target word
              is in the top k closest words.
    :return: A dictionary mapping each category to its analogy accuracy. A
             category with no analogies gets an accuracy of 0.0 instead of
             raising ZeroDivisionError.
    """
    analogy_dict = {}

    for category, analogies in test_data.items():
        correct = 0
        total = 0
        for w1, w2, w3, w4 in analogies:
            vecs = embeddings[[w1, w2, w3]]

            # 3CosAdd query: w2 - w1 + w3. One-element slices keep the vectors 2D.
            predicted_vector = vecs[1:2] - vecs[0:1] + vecs[2:3]

            # NOTE(review): get_closest_words is not visible here. If it does
            # not exclude w1/w2/w3 from the candidates, the query words
            # themselves can be returned as the answer -- confirm.
            closest_words = get_closest_words(embeddings, predicted_vector, k=k)

            # Index 0 holds the neighbours of the single query vector.
            if w4 in closest_words[0]:
                correct += 1
            total += 1

        # Guard against an empty category, which would otherwise divide by zero.
        analogy_dict[category] = correct / total if total else 0.0

    return analogy_dict


---------------

In [15]:
from test_analogies import run_analogy_test

In [16]:
test_data = load_analogies("data/analogies.txt")
test_data.keys()

dict_keys(['capital-common-countries', 'capital-world', 'currency', 'city-in-state', 'family', 'gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative', 'gram4-superlative', 'gram5-present-participle', 'gram6-nationality-adjective', 'gram7-past-tense', 'gram8-plural', 'gram9-plural-verbs'])

In [17]:
# Run the analogy test
run_analogy_test(embeddings, test_data, k=1)

{'capital-common-countries': 0.6837944664031621,
 'capital-world': 0.5899646330680813,
 'currency': 0.08429561200923788,
 'city-in-state': 0.08755573571139036,
 'family': 0.4762845849802372,
 'gram1-adjective-to-adverb': 0.09173387096774194,
 'gram2-opposite': 0.08866995073891626,
 'gram3-comparative': 0.26876876876876876,
 'gram4-superlative': 0.2531194295900178,
 'gram5-present-participle': 0.0928030303030303,
 'gram6-nationality-adjective': 0.8667917448405253,
 'gram7-past-tense': 0.09358974358974359,
 'gram8-plural': 0.25900900900900903,
 'gram9-plural-verbs': 0.1896551724137931}

--------

In [3]:
embeddings_50 = Embeddings.from_file("data/glove_50d.txt")
embeddings_100 = Embeddings.from_file("data/glove_100d.txt")
embeddings_200 = Embeddings.from_file("data/glove_200d.txt")

In [21]:
results_50 = run_analogy_test(embeddings_50, test_data, k=1)
results_100 = run_analogy_test(embeddings_100, test_data, k=1)
results_200 = run_analogy_test(embeddings_200, test_data, k=1)

In [23]:
results_50

{'capital-common-countries': 0.6837944664031621,
 'capital-world': 0.5899646330680813,
 'currency': 0.08429561200923788,
 'city-in-state': 0.08755573571139036,
 'family': 0.4762845849802372,
 'gram1-adjective-to-adverb': 0.09173387096774194,
 'gram2-opposite': 0.08866995073891626,
 'gram3-comparative': 0.26876876876876876,
 'gram4-superlative': 0.2531194295900178,
 'gram5-present-participle': 0.0928030303030303,
 'gram6-nationality-adjective': 0.8667917448405253,
 'gram7-past-tense': 0.09358974358974359,
 'gram8-plural': 0.25900900900900903,
 'gram9-plural-verbs': 0.1896551724137931}

In [24]:
results_100

{'capital-common-countries': 0.7905138339920948,
 'capital-world': 0.654288240495137,
 'currency': 0.10046189376443418,
 'city-in-state': 0.11876773408998784,
 'family': 0.4051383399209486,
 'gram1-adjective-to-adverb': 0.0625,
 'gram2-opposite': 0.07266009852216748,
 'gram3-comparative': 0.30930930930930933,
 'gram4-superlative': 0.2638146167557932,
 'gram5-present-participle': 0.13446969696969696,
 'gram6-nationality-adjective': 0.8792995622263915,
 'gram7-past-tense': 0.1141025641025641,
 'gram8-plural': 0.15615615615615616,
 'gram9-plural-verbs': 0.23218390804597702}

In [25]:
results_200

{'capital-common-countries': 0.5988142292490118,
 'capital-world': 0.46816976127320953,
 'currency': 0.08545034642032333,
 'city-in-state': 0.053911633563032024,
 'family': 0.3577075098814229,
 'gram1-adjective-to-adverb': 0.014112903225806451,
 'gram2-opposite': 0.020935960591133004,
 'gram3-comparative': 0.2822822822822823,
 'gram4-superlative': 0.16399286987522282,
 'gram5-present-participle': 0.043560606060606064,
 'gram6-nationality-adjective': 0.8674171357098186,
 'gram7-past-tense': 0.05961538461538462,
 'gram8-plural': 0.048798798798798795,
 'gram9-plural-verbs': 0.15747126436781608}

In [None]:
def compute_average_accuracy(results):
    """
    Average per-category analogy accuracies into semantic, syntactic and
    overall scores.

    :param results: Mapping from category name to accuracy (as returned by
                    run_analogy_test).
    :return: Tuple (semantic_accuracy, syntactic_accuracy, overall_accuracy).
             Each group is the unweighted mean over the categories of that
             group that are present in ``results``; a group with no categories
             present yields 0.0. The overall score is the unweighted mean over
             every entry in ``results``.
    """
    semantic_categories = ['capital-common-countries', 'capital-world', 'currency', 'city-in-state', 'family']
    syntactic_categories = ['gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative',
                            'gram4-superlative', 'gram5-present-participle', 'gram6-nationality-adjective',
                            'gram7-past-tense', 'gram8-plural', 'gram9-plural-verbs']

    def _mean(values):
        # Unweighted mean that tolerates an empty group instead of dividing by zero.
        values = list(values)
        return sum(values) / len(values) if values else 0.0

    # Skip categories missing from `results` so partial datasets (for example a
    # file containing a single category) do not raise KeyError.
    semantic_accuracy = _mean(results[cat] for cat in semantic_categories if cat in results)
    syntactic_accuracy = _mean(results[cat] for cat in syntactic_categories if cat in results)
    overall_accuracy = _mean(results.values())

    return semantic_accuracy, syntactic_accuracy, overall_accuracy

# Collapse the per-category accuracies of each embedding size into
# (semantic, syntactic, overall) averages.
semantic_50, syntactic_50, overall_50 = compute_average_accuracy(results_50)
semantic_100, syntactic_100, overall_100 = compute_average_accuracy(results_100)
semantic_200, syntactic_200, overall_200 = compute_average_accuracy(results_200)

# One table row per embedding size.
table_rows = [
    ("GloVe 50", semantic_50, syntactic_50, overall_50),
    ("GloVe 100", semantic_100, syntactic_100, overall_100),
    ("GloVe 200", semantic_200, syntactic_200, overall_200),
]

print(f"{'Embedding Space':<15} {'Semantic':<10} {'Syntactic':<10} {'Overall':<10}")
print("-" * 45)
for label, semantic, syntactic, overall in table_rows:
    print(f"{label:<15} {semantic:.3f} {syntactic:.3f} {overall:.3f}")

Embedding Space Semantic   Syntactic  Overall   
---------------------------------------------
GloVe 50        0.384 0.245 0.295
GloVe 100       0.414 0.247 0.307
GloVe 200       0.313 0.184 0.230


In [22]:
results_50_k2 = run_analogy_test(embeddings_50, test_data, k=2)

In [27]:
results_100_k2 = run_analogy_test(embeddings_100, test_data, k=2)

In [28]:
results_200_k2 = run_analogy_test(embeddings_200, test_data, k=2)

In [29]:
# Collapse the k=2 per-category accuracies of each embedding size into
# (semantic, syntactic, overall) averages.
semantic_50_k2, syntactic_50_k2, overall_50_k2 = compute_average_accuracy(results_50_k2)
semantic_100_k2, syntactic_100_k2, overall_100_k2 = compute_average_accuracy(results_100_k2)
semantic_200_k2, syntactic_200_k2, overall_200_k2 = compute_average_accuracy(results_200_k2)

# One table row per embedding size.
table_rows_k2 = [
    ("GloVe 50", semantic_50_k2, syntactic_50_k2, overall_50_k2),
    ("GloVe 100", semantic_100_k2, syntactic_100_k2, overall_100_k2),
    ("GloVe 200", semantic_200_k2, syntactic_200_k2, overall_200_k2),
]

print(f"{'Embedding Space':<15} {'Semantic':<10} {'Syntactic':<10} {'Overall':<10}")
print("-" * 45)
for label, semantic, syntactic, overall in table_rows_k2:
    print(f"{label:<15} {semantic:.3f} {syntactic:.3f} {overall:.3f}")

Embedding Space Semantic   Syntactic  Overall   
---------------------------------------------
GloVe 50        0.560 0.504 0.524
GloVe 100       0.634 0.628 0.630
GloVe 200       0.658 0.634 0.642


In [8]:
custom_test_data = load_analogies("custom_analogies.txt")

In [9]:
# Accuracy of each embedding size on the custom analogy set (k=1).
# NOTE(review): results_50/100/200 are also used elsewhere in this notebook for
# the full-dataset accuracies; rebinding them here means cells that read those
# names see whichever assignment ran last -- consider distinct names.
results_50 = run_analogy_test(embeddings_50, custom_test_data, k=1)
results_100 = run_analogy_test(embeddings_100, custom_test_data, k=1)
results_200 = run_analogy_test(embeddings_200, custom_test_data, k=1)

In [10]:
# Two-column table: embedding size and its accuracy on 'custom-analogies'.
print(f"{'Embedding':<12} {'Accuracy':<10}")
print("-" * 25)
for label, accuracies in (
    ("GloVe 50", results_50),
    ("GloVe 100", results_100),
    ("GloVe 200", results_200),
):
    print(f"{label:<12} {accuracies['custom-analogies']:<10.3f}")

Embedding    Accuracy  
-------------------------
GloVe 50     0.333     
GloVe 100    0.333     
GloVe 200    0.500     


In [None]:
def run_analogy_test_answers(embeddings: Embeddings, test_data: AnalogiesDataset, k: int = 1) -> Dict[str, List[tuple]]:
    """
    Runs the 3CosAdd test and returns the predicted words for each analogy.

    :param embeddings: The embeddings to be evaluated using 3CosAdd.
    :param test_data: The set of analogies. Iterated with .items(), yielding
                      (category, analogies) pairs.
    :param k: How many closest words to request for each analogy.
    :return: A dictionary mapping each category to a list of
             (question, gold, prediction) tuples, where ``question`` is the
             string "w1 : w2 :: w3 : _x_", ``gold`` is the expected word and
             ``prediction`` is the first element returned by
             get_closest_words (the top-k words for the query), or the string
             "N/A" when looking up w1, w2 or w3 raises KeyError.
    """
    analogy_predictions = {}

    for category, analogies in test_data.items():
        predictions = []

        for w1, w2, w3, gold in analogies:
            question = f"{w1} : {w2} :: {w3} : _x_"

            # Only the embedding lookup is guarded: a KeyError raised by
            # get_closest_words would signal a different problem and should
            # not be reported as a missing word.
            try:
                vecs = embeddings[[w1.lower(), w2.lower(), w3.lower()]]
            except KeyError:
                predictions.append((question, gold, "N/A"))
                continue

            # 3CosAdd query: w2 - w1 + w3. One-element slices keep the vectors 2D.
            predicted_vector = vecs[1:2] - vecs[0:1] + vecs[2:3]

            # Index 0 holds the neighbours of the single query vector.
            closest_words = get_closest_words(embeddings, predicted_vector, k=k)[0]
            predictions.append((question, gold, closest_words))

        analogy_predictions[category] = predictions

    return analogy_predictions

In [13]:
test_data = load_analogies("custom_analogies.txt")

In [14]:
# Per-analogy predictions of each embedding size on the custom analogy set (k=1).
# NOTE(review): results_50/100/200 here hold the prediction lists returned by
# run_analogy_test_answers, whereas the same names hold accuracy dicts in other
# cells -- consider distinct names to avoid mixing the two.
results_50 = run_analogy_test_answers(embeddings_50, test_data, k=1)
results_100 = run_analogy_test_answers(embeddings_100, test_data, k=1)
results_200 = run_analogy_test_answers(embeddings_200, test_data, k=1)

In [16]:
results_50

{'custom-analogies': [('france : paris :: italy : _x_', 'rome', ['rome']),
  ('france : paris :: japan : _x_', 'tokyo', ['tokyo']),
  ('france : paris :: florida : _x_', 'tallahassee', ['miami']),
  ('big : bigger :: small : _x_', 'smaller', ['larger']),
  ('big : bigger :: cold : _x_', 'colder', ['cold']),
  ('big : bigger :: quick : _x_', 'quicker', ['quick'])]}

In [17]:
results_100

{'custom-analogies': [('france : paris :: italy : _x_', 'rome', ['rome']),
  ('france : paris :: japan : _x_', 'tokyo', ['tokyo']),
  ('france : paris :: florida : _x_', 'tallahassee', ['florida']),
  ('big : bigger :: small : _x_', 'smaller', ['larger']),
  ('big : bigger :: cold : _x_', 'colder', ['cold']),
  ('big : bigger :: quick : _x_', 'quicker', ['quick'])]}

In [19]:
results_200

{'custom-analogies': [('france : paris :: italy : _x_', 'rome', ['rome']),
  ('france : paris :: japan : _x_', 'tokyo', ['tokyo']),
  ('france : paris :: florida : _x_', 'tallahassee', ['florida']),
  ('big : bigger :: small : _x_', 'smaller', ['smaller']),
  ('big : bigger :: cold : _x_', 'colder', ['cold']),
  ('big : bigger :: quick : _x_', 'quicker', ['quick'])]}