In [1]:
import sys
sys.path.append('..')

from src.data import make_dataset
# from src.simulations import run_simulations
from tqdm import tqdm

In [2]:
import numpy as np
import pandas as pd

In [13]:
def _euclidean_distance(a, b):
    """
    Compute the euclidean distance between two vectors.

    Parameters:
    - a (array-like): First vector.
    - b (array-like): Second vector; it is subtracted element-wise from `a`.

    Returns:
    - float: Square root of the sum of the squared element-wise differences.
    """
    difference = a - b
    sum_of_squares = np.sum(np.square(difference))
    return np.sqrt(sum_of_squares)

def _get_distances_series(glove_df, vector):
    """
    Compute the euclidean distance from every row of a DataFrame to one vector.

    Parameters:
    - glove_df (pandas.DataFrame): DataFrame containing GloVe embeddings with words as index.
    - vector (array-like): Vector that is subtracted from every row of `glove_df`.

    Returns:
    - pandas.Series: One distance per row, labelled with the index of `glove_df`.
    """
    offsets = glove_df - vector
    # Reduce along axis=1 so each row collapses to a single squared norm.
    squared_norms = np.square(offsets).sum(axis=1)
    return np.sqrt(squared_norms)


def _random_similar_word(word, glove_df, similarity_threshold=200):
    """
    Get a random word similar to the given word from a GloVe embeddings DataFrame.

    The candidates are the `similarity_threshold` words whose embeddings have the
    smallest euclidean distance to the embedding of `word` (the word itself is
    excluded); one candidate is then drawn with `np.random.choice`.

    Parameters:
    - word (str): The word to find similar words to.
    - glove_df (pandas.DataFrame): DataFrame containing GloVe embeddings with words as index.
    - similarity_threshold (int): The maximum number of similar words to consider.

    Returns:
    - str: A random word similar to the given word.

    Raises:
    - KeyError: If `word` is not in the index of `glove_df`.
    - ValueError: If there is no candidate left to draw from (raised by `np.random.choice`).
    """
    distances = _get_distances_series(glove_df, glove_df.loc[word])
    # Remove the query word by label instead of assuming it sorts first: a word with
    # an identical embedding ties at distance 0 and could be ordered ahead of it,
    # which would let the query word be returned as its own neighbour.
    # nsmallest also avoids sorting the whole vocabulary to keep only a few entries.
    candidates = distances.drop(word).nsmallest(similarity_threshold)
    return np.random.choice(candidates.index)


def _pick_final_word(
    result_vector, starting_word, intermediate_words, glove_df, num_operations
):
    """
    Choose the word that closes an equation generated using word vectors.

    The `num_operations + 2` words nearest to `result_vector` (euclidean distance)
    are scanned from nearest to farthest, and the first one that does not already
    appear in the equation is returned.

    Parameters:
    - result_vector (numpy.array): Resultant vector of the equation.
    - starting_word (str): The starting word of the equation.
    - intermediate_words (list of str): List of intermediate words used in the equation.
    - glove_df (pandas.DataFrame): DataFrame containing GloVe embeddings with words as index.
    - num_operations (int): Number of operations in the equation.

    Returns:
    - str: The final word selected for the equation, or the literal string
      "ERROR: WORD NOT FOUND" when every scanned candidate is already part of
      the equation.
    """
    used_words = [starting_word, *intermediate_words]
    ranked_words = _get_distances_series(glove_df, result_vector).sort_values().index
    candidates = ranked_words[: num_operations + 2]

    # Lazily walk the candidates, nearest first, and stop at the first unused word.
    unused_candidates = (word for word in candidates if word not in used_words)
    return next(unused_candidates, "ERROR: WORD NOT FOUND")


def _equation_to_string(starting_word, intermediate_words, final_word, operations):
    """
    Render the components of an equation as a single human-readable string.

    Parameters:
    - starting_word (str): The starting word of the equation.
    - intermediate_words (list of str): List of intermediate words used in the equation.
    - final_word (str): The final word of the equation.
    - operations (list of int): List of operations (+1 for addition, -1 for subtraction).
      Any value other than 1 is rendered as a subtraction.

    Returns:
    - str: The equation string in the format "starting_word + intermediate_word1 - intermediate_word2 = final_word".
    """
    # One " + word" / " - word" fragment per operation; the word is looked up by
    # the position of its operation.
    terms = [
        (" + " if sign == 1 else " - ") + intermediate_words[position]
        for position, sign in enumerate(operations)
    ]
    return starting_word + "".join(terms) + " = " + final_word


def simulate_game(glove_df, num_operations, similarity_threshold=100):
    """
    Simulate a word-based game using GloVe embeddings.

    A random starting word is chosen, `num_operations` intermediate words are drawn
    (with replacement) from its nearest neighbours, each one is randomly added to or
    subtracted from the starting vector, and the closest word not already used in
    the equation is picked as the result.

    Parameters:
    - glove_df (pandas.DataFrame): DataFrame containing GloVe embeddings with words as index.
    - num_operations (int): Number of operations to perform in the game.
    - similarity_threshold (int): How many of the starting word's nearest neighbours
      the intermediate words are drawn from. Defaults to 100.

    Returns:
    - tuple: A tuple containing the euclidean distance between the final word and the result vector,
             and the equation string representing the game.
    """
    starting_word = np.random.choice(glove_df.index)
    starting_vector = glove_df.loc[starting_word]

    # Every intermediate word comes from the neighbourhood of the same starting
    # word, so rank the vocabulary once instead of once per operation.
    distances = _get_distances_series(glove_df, starting_vector)
    neighbours = distances.drop(starting_word).nsmallest(similarity_threshold).index
    intermediate_words = np.random.choice(neighbours, size=num_operations)

    operations = np.random.choice([-1, 1], size=num_operations)

    # Signed sum of the intermediate vectors: each row is multiplied by its +1/-1
    # operation and the rows are added together.
    factor_matrix = np.array([glove_df.loc[word] for word in intermediate_words])
    factors_x_operations = np.dot(operations, factor_matrix)

    result_vector = starting_vector + factors_x_operations

    final_word = _pick_final_word(
        result_vector, starting_word, intermediate_words, glove_df, num_operations
    )
    # Smaller is better: how far the chosen word is from the computed vector.
    distance = _euclidean_distance(glove_df.loc[final_word], result_vector)

    return distance, _equation_to_string(
        starting_word, intermediate_words, final_word, operations
    )


def batch_simulations(
    threshold, num_results, output_filepath, glove_df, num_operations
):
    """
    Run simulated word games until enough of them pass the distance threshold.

    Games are simulated one at a time; a game is kept only when its distance is
    strictly below `threshold`. The loop continues until `num_results` games have
    been kept, with a progress bar tracking the kept games. The kept games are then
    sorted by ascending distance and written to `output_filepath` as CSV.

    Parameters:
    - threshold (float): Maximum euclidean distance for considering a game result.
    - num_results (int): Number of simulation results to generate.
    - output_filepath (str): Filepath to save the results.
    - glove_df (pandas.DataFrame): DataFrame containing GloVe embeddings with words as index.
    - num_operations (int): Number of operations to perform in each game.

    Returns:
    - None
    """
    accepted_games = []

    with tqdm(total=num_results, desc="Equations found") as progress_bar:
        while len(accepted_games) < num_results:
            game = simulate_game(glove_df, num_operations)
            distance = game[0]
            # Written as "not <" so a NaN distance is rejected, never accepted.
            if not distance < threshold:
                continue
            accepted_games.append(game)
            progress_bar.update(1)

    results_df = pd.DataFrame(accepted_games, columns=["distance", "equation"])
    results_df.sort_values(by="distance").to_csv(output_filepath)

In [4]:
# Parse the raw GloVe file. The relative path uses forward slashes so the cell
# runs on Windows as well as on POSIX systems (backslashes only work on Windows).
embeddings_index = make_dataset.parse_glove_data("../data/raw/glove.6B.300d.txt", True)

Reading file: 100%|██████████| 400000/400000 [04:07<00:00, 1614.58line/s]


In [5]:
# Narrow the vocabulary step by step, printing the remaining word count after
# each filter.
words_array = make_dataset.get_words_array(embeddings_index)
print("len(words_array) (raw):\t\t\t" + "{:,}".format(len(words_array)))
words_array = make_dataset.filter_alphabetic(words_array)
print("len(words_array) (filter_alphabetic):\t" + "{:,}".format(len(words_array)))
# Forward slashes keep this relative path valid on Windows and POSIX alike.
words_array = make_dataset.filter_20k(words_array, "../data/external/20k.txt")
print("len(words_array) (filter_20k):\t\t" + "{:,}".format(len(words_array)))


len(words_array) (raw):			400,000
len(words_array) (filter_alphabetic):	317,756
len(words_array) (filter_20k):		19,737


In [6]:
# Build glove_df from the parsed embeddings and the filtered word list; the later
# cells look up vectors in it by word (glove_df.loc[word]).
glove_df = make_dataset.embeddings_to_dataframe(embeddings_index, words_array)

In [21]:
# Collect 100 two-operation equations whose distance is below 7 and save them as
# CSV. Forward slashes keep the output path valid on Windows and POSIX alike.
batch_simulations(7, 100, "../data/processed/results.csv", glove_df, 2)

Equations found: 100%|██████████| 100/100 [04:00<00:00,  2.40s/it]


In [17]:
# Combined vector for the equation "deadly - fighting + dozens".
a = glove_df.loc['deadly'] - glove_df.loc['fighting'] + glove_df.loc['dozens']
# Embedding of 'dozen'; assigned here but not used within this cell.
b = glove_df.loc['dozen']
# Rank every word by its euclidean distance to the combined vector, nearest first.
_get_distances_series(glove_df, a).sort_values()

deadly          6.724125
dozens          7.125394
dozen           7.969513
hundreds        7.993612
fatal           8.251380
                 ...    
polynomial     14.367004
eh             14.506359
attn           14.885276
householder    15.817514
herein         16.107616
Length: 19737, dtype: float32

In [20]:
# Combined vector for the equation "king - man + woman".
a = glove_df.loc['king'] - glove_df.loc['man'] + glove_df.loc['woman']
# Embedding of 'queen'; assigned here but not used within this cell.
b = glove_df.loc['queen']
# Rank every word by its euclidean distance to the combined vector, nearest first.
_get_distances_series(glove_df, a).sort_values()

king            4.753941
queen           5.955311
monarch         6.899858
mother          7.178615
princess        7.252288
                 ...    
attn           13.700110
teaspoon       13.716257
householder    14.167259
frontpage      14.233803
herein         15.916162
Length: 19737, dtype: float32