In [52]:
## Simple code to extract and save the word similarities based on the Binder Brain Ratings data
## James Fodor 2022

import numpy as np
import itertools

# Configure how numpy renders arrays as text: fixed 4-decimal floats, no scientific
# notation, and limits large enough that arrays are never summarised or line-wrapped.
np.set_printoptions(
    precision=4,
    threshold=100000,
    linewidth=100000,
    suppress=True,
    floatmode='fixed',
)

# Root folder holding all data files; later cells build paths by appending to this string.
# NOTE(review): machine-specific absolute path - edit before running elsewhere.
data_root = 'D:/Study and Projects/School Work/Year 25 - PhD 1/Data//'

In [None]:
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.

    When the dot product is exactly zero (orthogonal vectors, or an all-zero vector)
    the result is 0.0 and no division is performed, so a zero norm never causes 0/0.
    """
    dot_product = np.dot(embed_1, embed_2) # computed once, reused for the test and the ratio
    if dot_product == 0:
        return 0.0 # a float, so both branches return the documented type
    return dot_product/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))

In [3]:
# Load Binder ratings and save them to a dictionary (word -> numpy vector of ratings)
folder_loc = data_root+'Word Embeddings/Binder Brain Ratings//'
filename = folder_loc+'binder_ratings_simple.csv'

# 'utf-8-sig' decodes the file as UTF-8 and strips a leading byte-order mark.
# Without an explicit encoding the BOM reached the first word as 'ï»¿' (cp1252 default)
# or as '\ufeff' (UTF-8 default), so the result depended on the platform.
with open(filename, encoding='utf-8-sig') as file:
    lines = [line.rstrip('\n') for line in file]

model_dict = {} # word dictionary for model
for line in lines:
    word_list = line.split() # fields are whitespace-separated; the first one is the word
    if not word_list:
        continue # skip blank lines instead of failing on word_list[0]
    word = word_list[0]
    # NOTE(review): [1:-1] leaves out the last field of every row as well as the word -
    # confirm the final column is not one of the ratings.
    embedding_list = [float(x) for x in word_list[1:-1]]
    embedding_np = np.array(embedding_list)
    model_dict[word] = embedding_np

In [47]:
# Enumerate every unordered pair of distinct words in the vocabulary
list_of_words = [*model_dict]
print(f'{len(list_of_words)} words')
word_pairs = [*itertools.combinations(list_of_words, 2)]
print(f'{len(word_pairs)} word pairs')

534 words
142311 word pairs


In [48]:
# Cosine similarity for each word pair, kept in the same order as word_pairs
word_sim_storage = [
    cosine_sim(model_dict[first_word], model_dict[second_word])
    for first_word, second_word in word_pairs
]

In [53]:
# Save similarities to file, one "word_1 word_2 similarity" line per word pair
save_path = folder_loc+'binder_similarities.txt'
vocab_size = len(word_pairs) # NOTE: despite the name, this counts word pairs, not words

# Guard against stale kernel state: the two lists are matched up by position below.
assert len(word_sim_storage) == len(word_pairs), 'similarities are out of sync with word pairs'

# Mode "w" replaces any existing file, so re-running the cell cannot append a second
# copy of every row; the with-block closes the file even if a write fails.
with open(save_path, "w", encoding='utf-8') as save_file:
    for word_pair, pair_similarity in zip(word_pairs, word_sim_storage):
        save_file.write(word_pair[0]+' '+word_pair[1]+' '+str(pair_similarity)+'\n')

print('file saved with '+str(vocab_size)+' words')

file saved with 142311 words
