Notes:
Want both context-dependent and static embeddings.
See how well our dictionary lines up with the datasets - we skip pretty much all of them in MEN: 2087
different hash codes

RW, RG-65, and MTurk come from: https://github.com/vecto-ai/word-benchmarks/tree/master/word-similarity/monolingual/en
That repository has other benchmarks too.

In [15]:
import torch


class FlyVecGPU:
    """Single weight matrix ``W`` with a winner-take-all update rule.

    ``W`` has shape ``(kc_size, 2 * N_size)``. Inputs are vectors of length
    ``2 * N_size``; elsewhere in this file the first ``N_size`` entries carry
    the context words and the last ``N_size`` entries carry the target word.
    """

    def __init__(self, N_size, kc_size, debug, device):
        """Create a model with uniformly random weights on ``device``.

        Args:
            N_size: Vocabulary size; ``W`` gets ``2 * N_size`` columns.
            kc_size: Number of rows (neurons) in ``W``.
            debug: When true, ``learning`` prints the winning neuron.
            device: Torch device the weights live on.
        """
        self.W = torch.rand(kc_size, 2 * N_size, device=device)
        self.debug = debug
        self.device = device

    def predict(self, i):
        """Return the activation of every neuron, ``W @ i``, for input ``i``."""
        i = i.to(self.device)
        return torch.matmul(self.W, i)

    def save_checkpoint(self, path):
        """Write the bare weight tensor ``W`` to ``path``."""
        torch.save(self.W, path)

    def load_checkpoint(self, path):
        """Replace ``W`` with the weights stored at ``path`` and print them.

        Two on-disk layouts are accepted:

        * a bare tensor, which is what ``save_checkpoint`` writes;
        * a dictionary holding the tensor under the key ``'W'``.

        Raises:
            KeyError: The file holds a dictionary without a ``'W'`` entry.
            TypeError: The file holds neither a tensor nor a dictionary.
        """
        loaded_data = torch.load(path, map_location=self.device)

        if torch.is_tensor(loaded_data):
            weights = loaded_data
        elif isinstance(loaded_data, dict):
            if 'W' not in loaded_data:
                raise KeyError(f"checkpoint {path!r} has no 'W' entry")
            weights = loaded_data['W']
        else:
            raise TypeError(
                f"checkpoint {path!r} holds {type(loaded_data).__name__}; "
                "expected a tensor or a dict with key 'W'"
            )

        self.W = weights
        print(self.W)

    def learning(self, context_target_pair, probability_vector, learning_rate):
        """Apply one in-place update to the row of ``W`` that responds most.

        Args:
            context_target_pair: Input vector of length ``2 * N_size``.
            probability_vector: Per-entry weights multiplied element-wise
                into the input before it is presented to the network.
            learning_rate: Step size (a number or a 0-dim tensor).
        """
        # Ensure tensors are on the correct device
        context_target_pair = context_target_pair.to(self.device)
        probability_vector = probability_vector.to(self.device)
        learning_rate = torch.as_tensor(learning_rate, device=self.device)

        normalized_input = context_target_pair * probability_vector
        activations = torch.matmul(self.W, normalized_input)
        # Winner-take-all: only the most active neuron is updated.
        max_neuron = torch.argmax(activations)
        if self.debug:
            print(f"Max neuron: {max_neuron.item()}")
            print(f"Activations: {activations[max_neuron].item()}")

        max_neuron_weights = self.W[max_neuron]

        # Vectorized weight update; every product here is element-wise.
        update = learning_rate * (
            normalized_input - max_neuron_weights * normalized_input * max_neuron_weights
        )
        self.W[max_neuron] += update


In [16]:
import json
import string

import numpy as np
import torch


class StreamedTokenizer:
    """Stream (context, target) training vectors from a text file.

    Iterating yields one integer tensor of length ``2 * vocab_size`` per
    token: the first half is a multi-hot encoding of the surrounding context
    words, the second half is a one-hot encoding of the token itself.

    The input file stays open for the lifetime of the object. Call
    ``close()`` when done, or use the tokenizer as a context manager.
    """

    def __init__(self, vocab_dict_file, vocab_freq_file, input_file, device, window_size=5):
        """Load the vocabulary files and open the corpus.

        Args:
            vocab_dict_file: JSON file mapping each word to its index.
            vocab_freq_file: JSON file mapping each word to its frequency.
            input_file: Whitespace-tokenized text file to stream from.
            device: Torch device on which the tensors are created.
            window_size: Total window width; ``window_size // 2`` words are
                taken on each side of the target.

        Raises:
            KeyError: A word in the frequency file is missing from the
                vocabulary file.
        """
        with open(vocab_dict_file, 'r') as f:
            self.vocab_dict = json.load(f)
        with open(vocab_freq_file, 'r') as f:
            self.vocab_freq = json.load(f)
        self.window_size = window_size
        self.device = device
        self.N_size = len(self.vocab_dict)

        # Relative frequency of every word, stored once for the context half
        # and once for the target half of the input vector.
        vocab_size = self.N_size
        self.probability_vector = torch.zeros(vocab_size * 2, dtype=torch.float, device=self.device)
        total_freq = sum(self.vocab_freq.values())
        for word, freq in self.vocab_freq.items():
            word_idx = self.vocab_dict[word]
            relative_freq = freq / total_freq
            self.probability_vector[word_idx] = relative_freq
            self.probability_vector[vocab_size + word_idx] = relative_freq

        # Opened last so that a failure above cannot leak the handle.
        self.input_file_handle = open(input_file, 'r')

    def close(self):
        """Close the corpus file; later iteration raises ``ValueError``."""
        self.input_file_handle.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __iter__(self):
        """Yield one input vector per token of the remaining lines.

        Reading continues from the current position of the shared file
        handle, so a second pass over an exhausted tokenizer yields nothing.

        Raises:
            KeyError: A token in the text is missing from the vocabulary.
        """
        half_window = self.window_size // 2
        vocab_size = len(self.vocab_dict)

        for line in self.input_file_handle:
            # Skip lines that begin with '= ' or ' = '.
            if line.startswith((' = ', '= ')):
                continue

            tokens = [self.vocab_dict[word] for word in line.split()]

            for i, target_word_idx in enumerate(tokens):
                # Up to half_window tokens on each side, target excluded.
                context_indices = tokens[max(0, i - half_window):i] + tokens[i + 1:i + 1 + half_window]

                context_vector = torch.zeros(vocab_size, dtype=torch.int, device=self.device)
                for idx in context_indices:
                    if idx < vocab_size:
                        context_vector[idx] = 1

                target_vector = torch.zeros(vocab_size, dtype=torch.int, device=self.device)
                if target_word_idx < vocab_size:
                    target_vector[target_word_idx] = 1

                # Context half first, target half second.
                yield torch.cat([context_vector, target_vector])



In [17]:
import torch
import json
import csv
from scipy.stats import spearmanr

# Word -> index vocabulary used by every evaluation cell below.
with open('dict.json') as vocab_file:
    vocab_dict = json.load(vocab_file)

# Model dimensions: kc_size rows in W, N_size words in the vocabulary.
kc_size = 400
N_size = len(vocab_dict)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
def get_binary_embedding(activation, k):
    """Binarize ``activation``: 1 at its ``k`` largest entries, 0 elsewhere.

    Returns an integer tensor with the shape of ``activation``, moved to
    the CPU.
    """
    winners = activation.topk(k).indices
    code = activation.new_zeros(activation.shape, dtype=torch.int)
    code[winners] = 1
    return code.cpu()

In [19]:
def binary_similarity(v1, v2):
    """Return the fraction of positions where both vectors are 1 or both are 0.

    Positions holding any value other than 0 or 1 never count as agreeing.
    """
    both_on = (v1 == 1) & (v2 == 1)
    both_off = (v1 == 0) & (v2 == 0)
    # The two masks cannot overlap, so counting their union equals n11 + n00.
    agreements = torch.count_nonzero(both_on | both_off).item()
    return agreements / len(v1)

In [20]:
def evaluate_dataset(dataset_path, embeddings):
    """Score ``embeddings`` against a word-similarity benchmark file.

    Each row of the file holds ``word1``, ``word2`` and a human similarity
    score in its first three columns. The delimiter (comma, tab or space)
    is guessed from the first line. Rows with fewer than three columns are
    ignored, and a first row whose score column is not numeric is treated
    as a header.

    Args:
        dataset_path: Path to the benchmark file.
        embeddings: Mapping from word to binary embedding tensor.

    Returns:
        Spearman correlation between model and human scores, times 100, or
        ``None`` when fewer than two pairs could be scored.

    Raises:
        ValueError: A row other than the first has a non-numeric score.
    """
    word_pairs = []
    human_scores = []
    with open(dataset_path, 'r') as f:
        sample_line = f.readline()
        if ',' in sample_line:
            delimiter = ','
        elif '\t' in sample_line:
            delimiter = '\t'
        else:
            delimiter = ' '
        # Rewind so the sniffed line is parsed too; otherwise the first pair
        # of every headerless dataset would be silently dropped.
        f.seek(0)
        reader = csv.reader(f, delimiter=delimiter)
        for row_number, row in enumerate(reader):
            if len(row) < 3:
                continue
            try:
                score = float(row[2])
            except ValueError:
                if row_number == 0:
                    continue  # header row
                raise
            word_pairs.append((row[0], row[1]))
            human_scores.append(score)

    model_scores = []
    filtered_human_scores = []
    skips = 0
    for (word1, word2), human_score in zip(word_pairs, human_scores):
        if word1 in embeddings and word2 in embeddings:
            sim = binary_similarity(embeddings[word1], embeddings[word2])
            model_scores.append(sim)
            filtered_human_scores.append(human_score)
        else:
            # At least one word of the pair is not in the vocabulary.
            skips += 1
    print("Skips: ", skips)

    # Compute Spearman correlation
    if len(model_scores) > 1:
        spearman_corr, _ = spearmanr(model_scores, filtered_human_scores)
        return spearman_corr * 100  # Convert to percentage
    return None


In [None]:
import os

# Word-similarity benchmark files, keyed by the short name used in the output.
datasets = {
    'MEN': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/MEN/MEN_dataset_natural_form_full.txt',
    'WS353': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/WordSim353/combined_processed.tab',
    'SimLex': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/SimLex-999/SimLex-999_processed.txt',
    'RW': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RW/rw_processed.txt',
    'RG': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RG65/rg-65_processed.txt',
    'Mturk': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/Mturk/mturk-771_processed.txt',
}

# Directory holding the checkpoints to evaluate.
folder_path = '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/paths'

# Hash lengths to evaluate
hash_lengths = [51, 4, 8, 16, 32, 64, 128]

# Sorted so the order of the report does not depend on the file system.
for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
        continue  # sub-directories are not checkpoints

    print("File: ", file_path)

    model = FlyVecGPU(N_size, kc_size, debug=False, device=device)
    model.load_checkpoint(file_path)

    # The activations do not depend on k, so compute them once per checkpoint
    # instead of once per hash length.
    activations = {}
    for word, idx in vocab_dict.items():
        # One-hot input with only the target half set for this word.
        input_vector = torch.zeros(2 * N_size, dtype=torch.float, device=device)
        input_vector[N_size + idx] = 1.0
        activations[word] = model.predict(input_vector)

    # Evaluate the model for each hash length
    for k in hash_lengths:
        print(f"\nHash Length (k): {k}")
        # Binarize the activations using top-k
        embeddings = {
            word: get_binary_embedding(activation, k)
            for word, activation in activations.items()
        }

        # Evaluate the model on each dataset
        results = {}
        for name, path in datasets.items():
            score = evaluate_dataset(path, embeddings)
            if score is not None:
                results[name] = score
            else:
                print(f"{name}: Not enough data to compute Spearman correlation.")

        # Print results in a table format
        print("\nEvaluation Results:")
        print("Dataset\tSpearman Correlation (%)")
        for dataset_name, score in results.items():
            print(f"{dataset_name}\t{score:.1f}")

With plots for .pth

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Assuming the following are defined elsewhere:
# N_size, kc_size, device, vocab_dict, FlyVecGPU, get_binary_embedding, evaluate_dataset

# Word-similarity benchmark files, keyed by the short name used in the plots.
datasets = {
    'MEN': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/MEN/MEN_dataset_natural_form_full.txt',
    'WS353': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/WordSim353/combined_processed.tab',
    'SimLex': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/SimLex-999/SimLex-999_processed.txt',
    'RW': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RW/rw_processed.txt',
    'RG': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RG65/rg-65_processed.txt',
    'Mturk': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/Mturk/mturk-771_processed.txt',
}

folder_path = '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/paths'

hash_lengths = [51, 4, 8, 16, 32, 64, 128]

# One record per (checkpoint, hash length, dataset) with a finite score.
results = []


def extract_numeric_part(file_name):
    """Return the integer a checkpoint file is named after, or None.

    '12.pth' gives 12; a base name that is not a plain integer gives None.
    """
    base_name = os.path.splitext(os.path.basename(file_name))[0]  # Get '0' from '0.pth'
    try:
        return int(base_name)
    except ValueError:
        return None  # Non-numeric files will be skipped


# Numerically named .pth checkpoints, ordered by their number.
file_names = sorted(
    (
        f for f in os.listdir(folder_path)
        if f.endswith('.pth') and extract_numeric_part(f) is not None
    ),
    key=extract_numeric_part,
)

for file_name in file_names:
    iteration = extract_numeric_part(file_name)
    file_path = os.path.join(folder_path, file_name)
    file_label = os.path.basename(file_name)
    print("File: ", file_path)

    model = FlyVecGPU(N_size, kc_size, debug=False, device=device)
    model.load_checkpoint(file_path)

    # The activations do not depend on k, so compute them once per checkpoint
    # instead of once per hash length.
    activations = {}
    for word, idx in vocab_dict.items():
        # One-hot input with only the target half set for this word.
        input_vector = torch.zeros(2 * N_size, dtype=torch.float, device=device)
        input_vector[N_size + idx] = 1.0
        activations[word] = model.predict(input_vector)

    for k in hash_lengths:
        print(f"\nHash Length (k): {k}")
        # Binarize the activations using top-k
        embeddings = {
            word: get_binary_embedding(activation, k)
            for word, activation in activations.items()
        }

        # Evaluate the model on each dataset
        for dataset_name, dataset_path in datasets.items():
            score = evaluate_dataset(dataset_path, embeddings)
            if score is not None and np.isfinite(score):
                results.append({
                    'iteration': iteration,
                    'file_name': file_path,
                    'file_label': file_label,
                    'hash_length': k,
                    'dataset': dataset_name,
                    'score': score
                })
            else:
                print(f"{dataset_name}: Not enough data to compute Spearman correlation or score is not finite.")

# Convert results to DataFrame
df = pd.DataFrame(results)

if df.empty:
    # Without any record the frame has no columns, so the filtering and
    # plotting below would fail with a KeyError.
    print("No finite scores were collected; nothing to plot.")
else:
    # Remove any rows with non-finite values
    df = df[np.isfinite(df['iteration']) & np.isfinite(df['score'])]

    # Create 'plots' directory if it doesn't exist
    os.makedirs('plots', exist_ok=True)

    # One figure per (dataset, hash length): score against checkpoint number.
    for dataset_name in df['dataset'].unique():
        # Create subfolder for the dataset
        dataset_folder = os.path.join('plots', dataset_name)
        os.makedirs(dataset_folder, exist_ok=True)
        for k in df['hash_length'].unique():
            df_subset = df[(df['dataset'] == dataset_name) & (df['hash_length'] == k)]
            df_subset = df_subset.sort_values('iteration')
            plt.figure(figsize=(10, 6))
            plt.plot(df_subset['iteration'], df_subset['score'], marker='o')
            plt.title(f'Dataset: {dataset_name}, Hash Length: {k}')
            plt.xlabel('Iteration (File Name)')
            plt.ylabel('Spearman Correlation (%)')
            plt.xticks(df_subset['iteration'], df_subset['file_label'], rotation=45, ha='right')
            plt.tight_layout()
            plt.grid(True)
            # Save the figure in the dataset's folder
            output_file = os.path.join(dataset_folder, f'{dataset_name}_k{k}.png')
            plt.savefig(output_file)
            plt.close()


With plots for .pt FOR CAMS MODEL

Note: the vocab size needs to be even, because I didn't change the model class above (it always allocates 2 * N_size input columns, so the cell below passes N_size / 2).

Skips:  546 of 3000 for MEN
Skips:  52 of 353 for WS353
Skips:  92 of 1000 for SimLex
Skips:  1718 of 2034 for RW
Skips:  24 of 65 for RG
Skips:  40 of 771 for Mturk

In [36]:
# For open_web
import json

# Build the vocabulary from the keys of the pre-computed embeddings file.
flyvec_embeddings_path = 'simple-flyvec-embeddings.json'
with open(flyvec_embeddings_path) as embeddings_file:
    embeddings = json.load(embeddings_file)

# Each word gets the position at which it appears in the file.
vocab_dict = dict(zip(embeddings.keys(), range(len(embeddings))))

# Model dimensions and device for the cells below.
kc_size = 400
N_size = len(vocab_dict)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(N_size)

19570


In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

# Assuming the following are defined elsewhere:
# N_size, kc_size, device, vocab_dict, FlyVecGPU, get_binary_embedding, evaluate_dataset

# Word-similarity benchmark files, keyed by the short name used in the plots.
datasets = {
    'MEN': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/MEN/MEN_dataset_natural_form_full.txt',
    'WS353': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/WordSim353/combined_processed.tab',
    'SimLex': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/SimLex-999/SimLex-999_processed.txt',
    'RW': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RW/rw_processed.txt',
    'RG': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/RG65/rg-65_processed.txt',
    'Mturk': '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/Mturk/mturk-771_processed.txt',
}

folder_path = '/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/498_project/flyvec_extensions/trained_models/context_openwebtext_checkpoints'

hash_lengths = [51, 4, 8, 16, 32, 64, 128]

# One record per (checkpoint, hash length, dataset) with a finite score.
results = []


def extract_numeric_part(file_name):
    """Return the first run of digits in the file's base name as an int.

    'model_checkpoint_1pct.pt' gives 1. When the base name holds no digit,
    a message is printed and None is returned.
    """
    base_name = os.path.splitext(os.path.basename(file_name))[0]  # Remove extension
    match = re.search(r'\d+', base_name)  # Find the first numeric part
    if match is None:
        print(f"No numeric part in checkpoint name: {file_name}", flush=True)
        return None
    return int(match.group())


# .pt checkpoints whose name contains a number, ordered by that number.
file_names = sorted(
    (
        f for f in os.listdir(folder_path)
        if f.endswith('.pt') and extract_numeric_part(f) is not None
    ),
    key=extract_numeric_part,
)

for file_name in file_names:
    iteration = extract_numeric_part(file_name)
    file_path = os.path.join(folder_path, file_name)
    file_label = os.path.basename(file_name)
    print("File: ", file_path)

    # FlyVecGPU allocates 2 * N_size columns, so half the vocabulary size is
    # passed; load_checkpoint then replaces W with the stored matrix anyway.
    model = FlyVecGPU(N_size // 2, kc_size, debug=False, device=device)
    model.load_checkpoint(file_path)

    # The activations do not depend on k, so compute them once per checkpoint
    # instead of once per hash length.
    activations = {}
    for word, idx in vocab_dict.items():
        # One-hot input of length N_size with this word's index set.
        input_vector = torch.zeros(N_size, dtype=torch.float, device=device)
        input_vector[idx] = 1.0
        activations[word] = model.predict(input_vector)

    for k in hash_lengths:
        print(f"\nHash Length (k): {k}")
        # Binarize the activations using top-k
        embeddings = {
            word: get_binary_embedding(activation, k)
            for word, activation in activations.items()
        }

        # Evaluate the model on each dataset
        for dataset_name, dataset_path in datasets.items():
            score = evaluate_dataset(dataset_path, embeddings)
            if score is not None and np.isfinite(score):
                record = {
                    'iteration': iteration,
                    'file_name': file_path,
                    'file_label': file_label,
                    'hash_length': k,
                    'dataset': dataset_name,
                    'score': score
                }
                results.append(record)
                # Print only the new record; dumping the whole list every
                # time makes the output grow quadratically.
                print(record)
            else:
                print(f"{dataset_name}: Not enough data to compute Spearman correlation or score is not finite.")

# Convert results to DataFrame
df = pd.DataFrame(results)
print(results)

if df.empty:
    # Without any record the frame has no columns, so the filtering and
    # plotting below would fail with a KeyError.
    print("No finite scores were collected; nothing to plot.")
else:
    # Remove any rows with non-finite values
    df = df[np.isfinite(df['iteration']) & np.isfinite(df['score'])]

    # Create 'plots' directory if it doesn't exist
    os.makedirs('plots', exist_ok=True)

    # One figure per (dataset, hash length): score against checkpoint number.
    for dataset_name in df['dataset'].unique():
        # Create subfolder for the dataset
        dataset_folder = os.path.join('plots', dataset_name)
        os.makedirs(dataset_folder, exist_ok=True)
        for k in df['hash_length'].unique():
            df_subset = df[(df['dataset'] == dataset_name) & (df['hash_length'] == k)]
            df_subset = df_subset.sort_values('iteration')
            plt.figure(figsize=(10, 6))
            plt.plot(df_subset['iteration'], df_subset['score'], marker='o')
            plt.title(f'Dataset: {dataset_name}, Hash Length: {k}')
            plt.xlabel('Iteration (File Name)')
            plt.ylabel('Spearman Correlation (%)')
            plt.xticks(df_subset['iteration'], df_subset['file_label'], rotation=45, ha='right')
            plt.tight_layout()
            plt.grid(True)
            # NOTE(review): figures are written next to the checkpoints, not
            # into dataset_folder created above - confirm this is intended.
            output_file = os.path.join(folder_path, f'{dataset_name}_k{k}.png')
            plt.savefig(output_file)
            plt.close()


In [None]:
# Load the checkpoint file model_checkpoint_1pct.pt into `model`.
# load_checkpoint replaces W with the stored matrix, so the N_size passed to
# the constructor only sizes the random W that is discarded here.
model = FlyVecGPU(N_size, kc_size, debug=False, device=device)
model.load_checkpoint('/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/498_project/flyvec_extensions/trained_models/context_openwebtext_checkpoints/model_checkpoint_1pct.pt')

In [None]:
# Load the checkpoint file paths/30.pth into `model`.
# NOTE(review): this cell and the previous one both assign `model`; whichever
# ran last is the model the following cell queries. That cell builds inputs of
# length 2 * N_size - confirm N_size comes from the matching vocabulary.
model = FlyVecGPU(N_size, kc_size, debug=False, device=device)
model.load_checkpoint('/mnt/c/Users/kobeh/OneDrive/eecs445/Project_2/flyvec/flyvec/table_7_recreation/paths/30.pth')

In [None]:
# Query word and hash length for the nearest-neighbour listing.
word = 'number'
hash_len = 32
k = hash_len

# Index directly so an unknown query word fails with a KeyError naming it,
# rather than a TypeError from adding None to N_size.
input_vector = torch.zeros(2 * N_size, dtype=torch.float, device=device)
input_vector[N_size + vocab_dict[word]] = 1.0  # Set target word index
# Compute activations
activation = model.predict(input_vector)

enc_target_word = get_binary_embedding(activation, k)
print(enc_target_word)


def calc_sim(v1, v2):
    """Return the fraction of positions where both vectors are 1 or both are 0."""
    n = len(v1)
    n11 = torch.sum((v1 == 1) & (v2 == 1)).item()
    n00 = torch.sum((v1 == 0) & (v2 == 0)).item()
    sim = (n11 + n00) / n
    return sim


# Similarity between the query word and every vocabulary word (the query
# itself included). Separate loop names keep `word` bound to the query.
sims = []
for other_word, other_idx in vocab_dict.items():
    input_vector = torch.zeros(2 * N_size, dtype=torch.float, device=device)
    input_vector[N_size + other_idx] = 1.0  # Set target word index
    # Compute activations
    activation = model.predict(input_vector)
    enc_word = get_binary_embedding(activation, k)

    sims.append((other_word, calc_sim(enc_target_word, enc_word)))


# Sort by similarity score and get the N most and N least similar words
N = 15
top_N = sorted(sims, key=lambda x: x[1], reverse=True)[:N]
bottom_N = sorted(sims, key=lambda x: x[1], reverse=False)[:N]

# Most similar words. The header lists only the two columns actually printed.
print(f"{'Word':<15} {'Similarity':<10}")
print("-" * 26)
for entry, score in top_N:
    print(f"{entry:<15} {score:>9.3f}")


# Least similar words.
print(f"{'Word':<15} {'Similarity':<10}")
print("-" * 26)
for entry, score in bottom_N:
    print(f"{entry:<15} {score:>9.3f}")