In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import spearmanr

np.set_printoptions(precision=4, threshold=1000, linewidth=10000, suppress=True, floatmode='fixed')

In [205]:
## Load lists of files

# Get list of embedding models
folder_loc= 'D:/Study and Projects/School Work/Year 25 - PhD 1/Data//'
model_loc = 'Word Embeddings//'
model_files = {
                  'CW_vectors':'Collobert and Weston Vectors/embeddings.txt',
                  'word2vec_skip':'Word2vec Skipgram CoNLL17/model_mini.txt',
                  'gensim_skip':'Gensim Skipgram wiki+giga/model_mini.txt',
                  'gensim_BNC':'Gensim Skipgram BNC/model_mini.txt',
                  'gensim_cbow':'Gensim CBoW giga/2010_mini.txt',
                  'glove':'Glove Word Embeddings/glove.840B.300d.mini.txt',
                  'fasttext':'FastText Skipgram wiki+giga/model_mini.txt',
                  'elmo':'Elmo Embeddings/elmo_mini.txt',
                  'conceptnet':'ConceptNet Embeddings/numberbatch-en.txt',
                  'wordnet':'WordNet Word Embeddings/wn2vec_mini.txt',
                  'bert_large':'bert_large_uncased_mini.txt',
                  'gpt2_large':'gpt2_large_mini.txt',
                  'electra_large':'electra_large_mini.txt',
                  'albert_xxlarge':'albert-xxlarge-v2_mini.txt',
                  'sembert':'sembert_mini.txt',
                  'ernie_base_0':'Ernie Base Embeddings/ernie-2.0-en-layer-0.txt',
                  'ernie_context_5':'Ernie Wikipedia Embeddings/Generic Embeddings/contextual_embeddings_layer_normalised_5.txt',
                  'ernie_context_5_v':'Ernie Wikipedia Embeddings/Verb Embeddings/contextual_embeddings_layer_normalised_5.txt',
                  'ernie_context_5_n':'Ernie Wikipedia Embeddings/Noun Embeddings/contextual_embeddings_layer_normalised_5.txt'
               }

# Get list of empirical datasets
directory = folder_loc+'Word Similarity Data//Leuven Natural Concept Database//pairwise similarities' # LNCD data
# directory = folder_loc+'Word Similarity Data//Lee Behavioral Data Repository//pairwise similarities' # Lee data
# directory = folder_loc+'Word Similarity Data//Spatial Arrangement Method Data//study1_data//pairwise similarities' # SpAM data
dataset_files = {}
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    dataset_name,file_type = filename.split('.')
    if os.path.isfile(f) and file_type=='txt': # get only .txt files
        dataset_files[dataset_name] = f

In [49]:
## Define key functions
def import_model(model_name):
    """ string -> None
    Imports an embedding model, storing it in the model_embed_storage dictionary.
    """
        
    # open relevant file
    file_loc = model_files[model_name]
    filename = folder_loc+model_loc+file_loc
    with open(filename, encoding='utf-8') as file:
        lines = [line.rstrip('\n') for line in file]

    model_dict = {} # create word dictionary for specific model
    for line in lines:
        word_list = line.split()
        word = word_list[0]
        embedding_list = [float(x) for x in word_list[1:-1]] # store embeddings
        embedding_np = np.array(embedding_list)
        model_dict[word] = embedding_np

    model_embed_storage[model_name] = model_dict # store model dictionary in the models dictionary
    print(model_name+' loaded')
    

# Function to load word similarity data for specified dataset
def import_dataset(dataset_name):
    """ string -> None
    Imports a dataset, storing a value of the form (list, numpy_array) in the dataset_storage dictionary.
    """
    file_loc = dataset_files[dataset_name]
    with open(file_loc, encoding='utf-8') as file:
        lines = file.readlines()

    wordpairs = [None]*len(lines) # initialise storage
    ratings = [None]*len(lines)
    i=0
    for line in lines:
        line = line.strip() # remove new line chars
        wordpairs[i] = line.split() # split at any whitespace chars
        ratings[i] = float(wordpairs[i][2])
        wordpair_str = wordpairs[i][0]+' '+wordpairs[i][1]
        i=i+1
    ratings = np.array(ratings)

    dataset_storage[dataset_name] = (wordpairs, ratings)
    
    
# Function to calculate cosine similarity between two embeddings
def cosine_sim(embed_1, embed_2):
    """ numpy_array, numpy_array -> float
    Returns the cosine similarity (-1 to 1) between two embeddings, inputted as vectors.
    """
    if np.dot(embed_1,embed_2) == 0:
        similarity = 0 # don't normalise if similarity is zero
    else:
        similarity = np.dot(embed_1,embed_2)/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))
        #similarity, _ = spearmanr(embed_1, embed_2)
    return(similarity)


# Function to compute the correlation between a given set of model and dataset embedding similarities
def compute_embed_correls(dataset_similarities, model_similarities,printing=False):
    """ list_flt, list_flt, int, boolean -> (list_flt, list_flt, list_flt)
    Computes the pearson_r and spearman_r between word similarities for a dataset and model.
    """
    pearson_r = np.corrcoef(dataset_similarities, model_similarities)[0,1]
    spearman_r, p = spearmanr(dataset_similarities, model_similarities)
    # differences = np.array(model_similarities)-np.array(dataset_similarities)# model minus dataset
            
    if printing==True: # printing results
        print('pearson: {:.3f}'.format(pearson_r), '\nspearman: {:.3f}\n'.format(spearman_r))
    return(pearson_r, spearman_r)

In [206]:
## Load dataset pairs and sims
dataset_storage = {}
for dataset in dataset_files:
    import_dataset(dataset)

In [207]:
## Load word embeddings
model_embed_storage = {}
# embedding_model = 'ernie_base_0'
embedding_model = 'conceptnet'
import_model(embedding_model)
word_embeds = model_embed_storage[embedding_model]

conceptnet loaded


In [212]:
## Store word embeddings for all vocab by category

# Define dictionary of words needing replacement
replace_words = {'trolleycar':'streetcar', 't-shirt':'t_shirt', 'shot-put':'shot_put', 'go-cart':'go_cart'}

# Loop over all categories in the dataset
model_word_embeds_by_cat = {}
vocab_set_by_cat = {}
for category in dataset_files:
    word_embeds_store = []

    # Loop over all vocab in catagory
    vocab_set = set(np.array(dataset_storage[category][0])[:,0])
    vocab_set_by_cat[category] = vocab_set
    for word in vocab_set:
        if word in replace_words.keys(): # words absent from conceptnet
            word=replace_words[word]

        try:
            word_embed = word_embeds[word]
            word_embeds_store.append(word_embed)
        except KeyError: # for missing words
            print('missing '+word+' in '+category)
            continue
    
    # Store word embeddings
    model_word_embeds_by_cat[category] = np.array(word_embeds_store)
    
# Check for any missing words
for category in model_word_embeds_by_cat.keys():
    print(category+': words='+str(len(vocab_set_by_cat[category]))+', found='+str(model_word_embeds_by_cat[category].shape[0]))
    

LNCD_all: words=380, found=380
LNCD_appliances: words=30, found=30
LNCD_birds: words=29, found=29
LNCD_clothes: words=28, found=28
LNCD_fish: words=21, found=21
LNCD_fruits: words=29, found=29
LNCD_insects: words=23, found=23
LNCD_mammals: words=29, found=29
LNCD_music: words=22, found=22
LNCD_occupations: words=29, found=29
LNCD_reptiles: words=18, found=18
LNCD_sports: words=27, found=27
LNCD_tools: words=25, found=25
LNCD_vegetables: words=27, found=27
LNCD_vehicles: words=27, found=27
LNCD_weapons: words=19, found=19


In [213]:
## Compute model similarities and correlate with dataset similarities

# Loop over all categories in the dataset
model_dataset_sims_dict = {}
for category in dataset_files:
    model_dataset_sims = {}
    
    # This word will be used to modify the other words in the dataset
    category_name = category.split('_')[1]
    if category_name=='all': # replace with generic noun
        category_name='man'
    cat_word_embed = word_embeds[category_name]
    mean_cat_embed = model_word_embeds_by_cat[category].mean(axis=0)
    var_cat_embed = model_word_embeds_by_cat[category].var(axis=0)
    modifications = {"None": 0, "cat_embed": cat_word_embed, "mean_cat": mean_cat_embed, "var_cat": var_cat_embed} # addition version
    # modifications = {"None": 2, "cat_embed": cat_word_embed, "mean_cat": mean_cat_embed, "var_cat": var_cat_embed} # multiplication version
    
    # Loop over all modifiers for embeddings
    for mod in modifications.keys():
        model_dataset_sims_single_mod = {}
        
        # Loop over all wordpairs in catagory
        for word_pair in dataset_storage[category][0]:
            word_1 = word_pair[0]
            word_2 = word_pair[1]
            dataset_sim = float(word_pair[2])
            
            # Calculate cosine similarities
            try:
                embed_1 = word_embeds[word_1]+0.9*modifications[mod]
                embed_2 = word_embeds[word_2]+0.9*modifications[mod]
                model_sim = cosine_sim(embed_1,embed_2)
                model_dataset_sims_single_mod[word_1+' '+word_2] = [dataset_sim,model_sim]
            except KeyError: # for missing words
                continue
        
        # Store results for a single modifier
        model_dataset_sims[mod] = model_dataset_sims_single_mod
    
    # Store category sim data in dictionary
    model_dataset_sims_dict[category] = model_dataset_sims
    
# Compute model-dataset correlations
correl_storage = {}
for dataset_cat in model_dataset_sims_dict.keys():
    results_single_cat = []
    for mod in modifications.keys():
        model_dataset_sims_np = np.array(list(model_dataset_sims_dict[dataset_cat][mod].values())).transpose()
        dataset_sims = model_dataset_sims_np[0]
        model_sims = model_dataset_sims_np[1]
        results_single_cat.append(compute_embed_correls(dataset_sims,model_sims)[1]) # get spearman correl
    correl_storage[dataset_cat] = np.array(results_single_cat)

In [199]:
correl_storage

{'SpAM_birds': array([0.4869, 0.4583, 0.4680, 0.4865]),
 'SpAM_clothing': array([0.6521, 0.6128, 0.6163, 0.6509]),
 'SpAM_fruit': array([0.5250, 0.5244, 0.5206, 0.5257]),
 'SpAM_furniture': array([0.7038, 0.6039, 0.6467, 0.7061]),
 'SpAM_professions': array([0.6211, 0.5707, 0.6021, 0.6217]),
 'SpAM_sports': array([0.5065, 0.4480, 0.4494, 0.5067]),
 'SpAM_vegetables': array([0.4841, 0.4321, 0.4687, 0.4843]),
 'SpAM_vehicles': array([0.7994, 0.7735, 0.7812, 0.8050])}

In [204]:
correl_storage

{'LBDR_all': array([0.2967, 0.3023, 0.2874, 0.2963]),
 'lee_animals': array([0.5259, 0.4993, 0.4813, 0.5291]),
 'lee_clothing': array([-0.0625, -0.0423, -0.0503, -0.0650]),
 'lee_fish': array([0.3502, 0.3341, 0.3368, 0.3497]),
 'lee_fruit': array([0.3487, 0.3470, 0.3490, 0.3487]),
 'lee_furniture': array([0.4379, 0.3245, 0.3611, 0.4451]),
 'lee_kinship': array([0.6738, 0.6729, 0.6662, 0.6726]),
 'lee_mixture': array([0.1669, 0.0694, 0.1718, 0.1708]),
 'lee_sport': array([0.6104, 0.5844, 0.5744, 0.6116]),
 'lee_tools': array([0.3379, 0.2891, 0.3017, 0.3373]),
 'lee_vegetables': array([0.3371, 0.3072, 0.3180, 0.3353]),
 'lee_vehicles': array([0.4636, 0.4608, 0.4405, 0.4641]),
 'lee_weapons': array([0.5561, 0.4437, 0.5014, 0.5502])}

In [214]:
correl_storage

{'LNCD_all': array([0.3797, 0.3850, 0.3696, 0.3854]),
 'LNCD_appliances': array([0.5871, 0.4914, 0.5668, 0.5878]),
 'LNCD_birds': array([0.4762, 0.4552, 0.4522, 0.4775]),
 'LNCD_clothes': array([0.5231, 0.4987, 0.5093, 0.5235]),
 'LNCD_fish': array([0.3332, 0.3426, 0.3168, 0.3344]),
 'LNCD_fruits': array([0.2944, 0.2668, 0.2741, 0.2947]),
 'LNCD_insects': array([0.3135, 0.2763, 0.2766, 0.3110]),
 'LNCD_mammals': array([0.5814, 0.5318, 0.5479, 0.5781]),
 'LNCD_music': array([0.4405, 0.4320, 0.4249, 0.4413]),
 'LNCD_occupations': array([0.5367, 0.4087, 0.5430, 0.5315]),
 'LNCD_reptiles': array([0.3565, 0.3341, 0.3412, 0.3529]),
 'LNCD_sports': array([0.4420, 0.4076, 0.4081, 0.4428]),
 'LNCD_tools': array([0.5139, 0.4969, 0.5139, 0.5150]),
 'LNCD_vegetables': array([0.2374, 0.2110, 0.2185, 0.2362]),
 'LNCD_vehicles': array([0.7026, 0.6881, 0.6822, 0.7119]),
 'LNCD_weapons': array([0.5932, 0.4591, 0.5101, 0.5928])}