# To do

Models.py => Load a model (download it if needed), from the list, or custom gensim model

Analogy.py => Analogy test results, parameters = model, questions (bats normaly, do we allow mikolov's questions? any kind?), vanilla or not
-> Good code already ready, just need it to be cleaner to output well the metrics (.txt or .json?)

analogy_decompo.py => Output the decomposition scores, same for diff_sim. Parameters = model, questions

metrics.py => OCS and PCS, parameters = model, questions, number of permutations.
           => test option; create the random sets, new parameter = nb random sets, limit random words
-> Need to adapt the code here, but should be relatively easy

-> Option to output results (analogy/metrics) for single model or all in our list.

plot.py => Plot results from analogy or metrics or analogy_decompo like in the paper.

# Introduction

In [1]:
# Using chrisjmccormick's github for the basic word2vec import

import gensim
from gensim import utils, matutils
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec, Word2VecKeyedVectors

import logging
import wget
from itertools import chain
import logging
from six import string_types
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
import os
import sklearn

import scipy
from scipy import sparse
from scipy.stats import ttest_ind
from scipy.sparse.linalg import norm
from scipy.stats import iqr

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.preprocessing import normalize
from sklearn.cluster import SpectralClustering

from svd2vec import svd2vec

import plotly.graph_objects as go
import plotly

import tensorflow
import transformers

from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

from pytorch_pretrained_bert import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel

unable to import 'smart_open.gcs', disabling that module


# Download models

In [None]:
# word2vec
#wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')

# Glove
#!wget http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip
#!unzip glove.840B.300d.zip

# dict2vec
#!wget https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2
#!tar -x dict2vec300.tar.bz2

# ConceptNet Numberbatch
#!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz

# BERT and GPT-2 ==> Loading will download them
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').transformer.wte.weight.data.numpy()

In [None]:
# Download question-words
#wget.download('https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt')

# Load models

In [9]:
# word2vec
pretrained_embeddings_path = "word-embeddings-geometry/GoogleNews-vectors-negative300.bin.gz"
# "./GoogleNews-vectors-negative300.bin.gz"
model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_embeddings_path, 
binary=True)

# Glove
tmp_file = get_tmpfile("./glove_gensim.txt")
_ = glove2word2vec('./glove.840B.300d.txt', tmp_file)

model_glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

# dict2vec
model_dict2vec = gensim.models.KeyedVectors.load_word2vec_format("dict2vec-vectors-dim300.vec", binary=False, unicode_errors="ignore")

# ConceptNet Numberbatch
pretrained_embeddings_path = "numberbatch-en-19.08.txt.gz"
model_conceptnet = gensim.models.KeyedVectors.load_word2vec_format(pretrained_embeddings_path)

# BERT and GPT-2
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').transformer.wte.weight.data.numpy()

# Useful functions

In [3]:
def token_embedding(tokenizer, model, word):
    tokenized_text = tokenizer.tokenize(word)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    embeds = np.array([model[i] for i in indexed_tokens])
    embed = np.mean(embeds, axis=0)
    return(embed)

def permutation_onecycle(n):
    if type(n) == tuple:
        n1, n2 = n[0], n[1]
    else:
        n1, n2 = 0, n
    l=np.random.permutation(range(n1, n2))
    for i in range(n1, n2):
        if i==l[i-n1]: 
            j=np.random.randint(n1, n2)
            while j==l[j-n1]:
                j=np.random.randint(n1, n2)
            l[i-n1], l[j-n1] = l[j-n1], l[i-n1]
    return(l)

def permutation_onecycle_avoidtrue(n, real): #May be a more optimal way
    test=False
    perm = permutation_onecycle(n)
    for i_r in range(len(real)):
        if real[i_r][1] == real[perm[i_r]][1]:
            test=True
    while test:
        test=False
        perm = permutation_onecycle(n)
        for i_r in range(len(real)):
            if real[i_r][1] == real[perm[i_r]][1]:
                test=True
    return(perm)

def shuffled_directions(model, idx_start, idx_end):
    perm_list = permutation_onecycle(len(idx_start))
    dirs = np.array([[model.wv.get_vector(idx_end[perm_list[i]]) - model.wv.get_vector(idx_start[i])
                                          for i in range(len(idx_start))]])
    return(dirs)

def similarite_offsets(directions_tuples, list_offsets):
    sim_offsets = []
    for i in range(len(directions_tuples)):
        sim_offsets.append([])
        list_tuples = list(list_offsets[i])
        for j in range(len(list_tuples)):
            for k in range(j+1,len(list_tuples)):
                sim_offsets[-1].append(cos_sim([list_tuples[j]], [list_tuples[k]])[0][0]) 
    return(np.array(sim_offsets))

def similarite_shuffled_offsets(directions_tuples, list_offsets, alpha=0.1):
    sim_offsets = []
    idxs = []
    for i in range(len(directions_tuples)):
        sim_offsets.append([])
        list_tuples = list(list_offsets[i])
        idx_shuffled_offsets = np.random.choice(len(list_tuples), size=int(alpha*(len(list_tuples))), replace=False) 
        idx_shuffled_offsets.sort()
        idxs.append(idx_shuffled_offsets)
        for j in range(len(idx_shuffled_offsets)):
            for k in range(j+1, len(idx_shuffled_offsets)):
                sim_offsets[-1].append(cos_sim([list_tuples[idx_shuffled_offsets[j]]], [list_tuples[idx_shuffled_offsets[k]]])[0][0]) 
    return(idxs, np.array(sim_offsets))

def mean_direction(offsets):
    mean = []
    mean_ofunnormalized = []
    for d in offsets:
        da = np.array(d)
        norma_d = (1/np.linalg.norm(da, axis=1))[:,None] * da
        mean.append(np.mean(norma_d, axis=0))
        mean_ofunnormalized.append(np.mean(da, axis=0))

def similarity_to_mean(len_categ, offsets, mean_direction):
    similarity_tomean = []
    for i in range(len_categ):
        similarity_tomean.append([])
        list_offsets = list(offsets[i])
        for j in range(len(list_offsets)):
            similarity_tomean[-1].append(cos_sim([list_offsets[j]], [mean_direction[i]])[0][0])
    return(np.array(similarity_tomean))

def OCS_PCS(len_categs, nb_perm, similarities, similarities_shuffle):
    ocs, pcs = [], []
    for i in range(len_categs):
        pcs_list = []
        for perm in range(nb_perm):
            y_true = [1 for j in range(len(similarities[i]))]+[0 for j in range(len(similarities_shuffle[perm][i]))]
            y_scores = list(similarities[i])+list(similarities_shuffle[perm][i])
            #print(y_true)
            #print(y_scores)
            auc_temp = sklearn.metrics.roc_auc_score(y_true,y_scores)
            pcs_list.append(auc_temp)
        pcs.append(np.mean(pcs_list))
        ocs.append(np.mean(similarities[i]))
    return(ocs, pcs)

# Analogy pairs loading

In [4]:
#BATS

directions_names = []
directions_tuples_bats = []

for d in os.listdir('BATS_3.0'):
    if d != 'metadata.json':
        for f in os.listdir('BATS_3.0/'+str(d)):
            directions_names.append(str(f)[:-4])
            directions_tuples_bats.append(set())
            with utils.open_file('BATS_3.0/'+str(d)+'/'+str(f)) as fin:
                for line_no, line in enumerate(fin):
                    line = utils.to_unicode(line)
                    a, b = [word.lower() for word in line.split()]
                    list_b = b.split('/')
                    if list_b[0] != a:
                        directions_tuples_bats[-1].add((a,list_b[0]))
            
used_voc = np.hstack([np.hstack([[i[0] for i in directions_tuples_bats[k]] for k in range(len(directions_tuples_bats))]),
                      np.hstack([[i[1] for i in directions_tuples_bats[k]] for k in range(len(directions_tuples_bats))])])

NameError: name 'directions_tuples' is not defined

In [None]:
vocabulary_keys = model.wv.vocab.keys()
vocabulary = set(vocabulary_keys)
vocabulary_list = np.array(list(vocabulary_keys))

directions_tuples = [[d for d in list(directions_tuples_bats[i]) if d[0] in vocabulary and d[1] in vocabulary] for i in range(len(directions_tuples_bats))]


In [8]:
for i in range(len(directions_tuples)):
    print(directions_names[i], ": ",len([d for d in directions_tuples[i] if d[0] in vocabulary and d[1] in vocabulary]), " pairs of out ", len(directions_tuples[i]))

L03 [hyponyms - misc] 50 50
L02 [hypernyms - misc] 50 50
L01 [hypernyms - animals] 50 45
L09 [antonyms - gradable] 50 50
L06 [meronyms - part] 50 46
L07 [synonyms - intensity] 50 50
L08 [synonyms - exact] 50 49
L10 [antonyms - binary] 50 50
L04 [meronyms - substance] 50 49
L05 [meronyms - member] 50 49
E02 [country - language] 50 36
E05 [name - occupation] 50 27
E08 [animal - shelter] 50 50
E09 [things - color] 50 50
E03 [UK_city - county] 50 25
E04 [name - nationality] 50 24
E10 [male - female] 50 49
E07 [animal - sound] 50 50
E06 [animal - young] 50 50
E01 [country - capital] 50 37
D04 [over+adj_reg] 50 50
D09 [verb+tion_irreg] 50 49
D10 [verb+ment_irreg] 50 48
D02 [un+adj_reg] 50 50
D03 [adj+ly_reg] 50 50
D07 [verb+able_reg] 50 49
D05 [adj+ness_reg] 50 46
D06 [re+verb_reg] 50 49
D08 [verb+er_irreg] 50 49
D01 [noun+less_reg] 50 49
I09 [verb_Ving - Ved] 50 50
I08 [verb_Ving - 3pSg] 50 50
I02 [noun - plural_irreg] 48 46
I06 [verb_inf - Ving] 50 50
I10 [verb_3pSg - Ved] 50 50
I07 [verb_

# Constructing random offsets sets

In [19]:
k_random = 10
size_random_categ = 50
limit_word = 10000



# a* - a, pour chaque categorie
direction_w2vs = np.array([[model.wv.get_vector(i[1]) - model.wv.get_vector(i[0]) 
                            for i in directions_tuples[k] if i[1] in vocabulary and i[0] in vocabulary] 
                           for k in range(len(directions_tuples))])


# a* - a, a et a* de la même catégorie mais permuté
perm_lists_nnp = []
direction_w2vs_normal_normal_permutation = []
for k_r in range(k_random):
    perm_lists_nnp.append([])
    direction_w2vs_normal_normal_permutation.append([])
    for i in range(len(directions_tuples)):
        perm_list = permutation_onecycle(len(directions_tuples[i]))
        direction_w2vs_normal_normal_permutation[-1].append([])
        ds = list(directions_tuples[i])
        for k in range(len(ds)):
            di = ds[k]
            dj = ds[perm_list[k]]
            if di[0] in vocabulary and dj[1] in vocabulary and dj[1] != di[0]:
                direction_w2vs_normal_normal_permutation[-1][-1].append(model.wv.get_vector(dj[1]) - model.wv.get_vector(di[0]))
                
        perm_lists_nnp[-1].append(perm_list)
        
direction_w2vs_categ_categ_intra = []
perm_lists_intra = []
for k_r in range(k_random):
    perm_list_intra = np.hstack([permutation_onecycle(10),
                             permutation_onecycle((10,20)),
                             permutation_onecycle((20,30)),
                             permutation_onecycle((30,40)),
                            ])
    perm_lists_intra.append(perm_list_intra)
    direction_w2vs_categ_categ_intra.append([])
    
    for i in range(len(directions_tuples)):
        direction_w2vs_categ_categ_intra[-1].append([])
        j = perm_list_intra[i]
        len_max = min(len(directions_tuples[i]), len(directions_tuples[j]))
        for k in range(len_max):
            di = list(directions_tuples[i])[k]
            dj = list(directions_tuples[j])[k]
            if dj[1] != di[0]:
                direction_w2vs_categ_categ_intra[-1][-1].append(model.wv.get_vector(dj[1]) - model.wv.get_vector(di[0]))

                
# a* - a, a et a* de categories différentes, probablement très très grand pour bats!!
direction_w2vs_categ_categ = []
perm_lists_inter = []
for k_r in range(k_random):
    perm_list_categ_categ = permutation_onecycle(len(directions_tuples))
    perm_lists_inter.append(perm_list_categ_categ)
    direction_w2vs_categ_categ.append([])
    
    for i in range(len(directions_tuples)):
        direction_w2vs_categ_categ[-1].append([])
        j = perm_list_categ_categ[i]
        len_max = min(len(directions_tuples[i]), len(directions_tuples[j]))
        for k in range(len_max):
            di = list(directions_tuples[i])[k]
            dj = list(directions_tuples[j])[k]
            if dj[1] != di[0]:
                    direction_w2vs_categ_categ[-1][-1].append(model.wv.get_vector(dj[1]) - model.wv.get_vector(di[0]))

                    
# For half random categories
idx_random_categ = [[np.random.choice(limit_word, size=len(directions_tuples[k_d]), replace=False) 
                     for k in range(k_random)] 
                    for k_d in range(len(directions_tuples))]
idx_random_categ = []
for k_d in range(len(directions_tuples)):
    idx_random_categ.append([])
    for k in range(k_random):
        rand_ints = np.random.choice(limit_word, size=len(directions_tuples[k_d]), replace=False)
        rand_vos = [vocabulary_list[r] for r in rand_ints if not vocabulary_list[r] in used_voc] #i?
        while len(rand_vos) < len(directions_tuples[k_d]):
            rand_int = int(np.random.choice(limit_word, size=1, replace=False))
            if not vocabulary_list[rand_int] in used_voc and not vocabulary_list[rand_int] in rand_vos:
                rand_vos.append(vocabulary_list[rand_int])
        idx_random_categ[-1].append(rand_vos)
idx_random_categ = np.array(idx_random_categ)

# a* - a, a et a* de categories différentes
# a* - a, a* d'un ensemble random
direction_w2vs_random_normal = np.array([[[model.wv.get_vector(idx_random_categ[k_d][k_r][i]) -\
                                           model.wv.get_vector(list_directions_tuples[k_d][i][0])
                                          for i in range(len(list_directions_tuples[k_d])) 
                                           if list_directions_tuples[k_d][i][0] in vocabulary 
                                          ] 
                           for k_d in range(len(list_directions_tuples))] for k_r in range(k_random)])

# a* - a, a d'un ensemble random
direction_w2vs_normal_random = np.array([[[model.wv.get_vector(list_directions_tuples[k_d][i][1]) -\
                                           model.wv.get_vector(idx_random_categ[k_d][k_r][i])
                                          for i in range(len(list_directions_tuples[k_d])) 
                                           if list_directions_tuples[k_d][i][1] in vocabulary 
                                          ] 
                           for k_d in range(len(list_directions_tuples))] for k_r in range(k_random)])


# For random->random categories
idx_random = [np.random.choice(limit_word, size=size_random_categ, replace=False) for k in range(k_random)]
idx_random = np.array([[vocabulary_list[i] for i in idx_random[k] if not vocabulary_list[i] in used_voc] for k in range(k_random)])
### 2nd definition may be not needed

### May be a better way to change it
idx_random2 = []
for k in range(k_random):
    rand_ints = np.random.choice(limit_word, size=size_random_categ, replace=False) 
    rand_vos = [vocabulary_list[r] for r in rand_ints if not vocabulary_list[r] in used_voc and not vocabulary_list[r] in idx_random[k]]
    while len(rand_vos) < 50:
        rand_int = int(np.random.choice(limit_word, size=1, replace=False))
        if not vocabulary_list[rand_int] in used_voc and not vocabulary_list[rand_int] in rand_vos and not vocabulary_list[rand_int] in idx_random[k]:
            rand_vos.append(vocabulary_list[rand_int])
    idx_random2.append(rand_vos)
idx_random2 = np.array(idx_random2)

# a* - a, a et a* d'ensembles random
direction_w2vs_random_random = [np.array([[model.wv.get_vector(idx_random2[k_r][i]) - model.wv.get_vector(idx_random[k_r][i]) 
                                          for i in range(len(idx_random[k_r]))] 
                           ]) for k_r in range(k_random)]

# Shuffled sets for normal and random sets

In [238]:
nb_perms = 50

# Pour la courbe ROC, on a ici changé la version précédente avec des i!=j mtnt
direction_w2vs_shuffle = []
for k in range(len(directions_tuples)):
    direction_w2vs_shuffle.append([])
    for perm in range(nb_perms):
        perm_list = permutation_onecycle(len(directions_tuples[k]))
        dirs = [model.wv.get_vector(directions_tuples[k][perm_list[i]][1]) - model.wv.get_vector(directions_tuples[k][i][0])
                                              for i in range(len(directions_tuples[k]))]
        direction_w2vs_shuffle[-1].append(dirs)
# Fonctionne aussi pour le normal normal permutation

# a* - a, a et a* de categories différentes, même grande catégorie pour bats probablement très très grand pour bats!!
direction_w2vs_categ_categ_intra_shuffle  = []
for k_r in range(k_random):
    print(k_r)
    direction_w2vs_categ_categ_intra_shuffle.append([])
    perm_list_intra = perm_lists_intra[k_r]
    for k in range(len(directions_tuples)):
        direction_w2vs_categ_categ_intra_shuffle[-1].append([])
        kj = perm_list_intra[k]
        len_max = min(len(directions_tuples[k]), len(directions_tuples[kj]))
        for perm in range(nb_perms):
            perm_list = permutation_onecycle(len_max)
            #perm_list = permutation_onecycle_avoidtrue(len_max, directions_tuples[kj])
            dirs = [model.wv.get_vector(directions_tuples[kj][perm_list[i]][1]) - model.wv.get_vector(directions_tuples[k][i][0])
                                                  for i in range(len_max)]
            direction_w2vs_categ_categ_intra_shuffle[-1][-1].append(dirs)

        
direction_w2vs_categ_categ_shuffle = []
for k_r in range(k_random):
    print(k_r)
    direction_w2vs_categ_categ_shuffle.append([])
    perm_list_categ_categ = perm_lists_inter[k_r]
    for k in range(len(directions_tuples)):
        direction_w2vs_categ_categ_shuffle[-1].append([])
        kj = perm_list_categ_categ[k]
        len_max = min(len(directions_tuples[k]), len(directions_tuples[kj]))
        for perm in range(nb_perms):
            perm_list = permutation_onecycle(len_max)
            #perm_list = permutation_onecycle_avoidtrue(len_max, directions_tuples[kj])
            dirs = [model.wv.get_vector(directions_tuples[kj][perm_list[i]][1]) - model.wv.get_vector(directions_tuples[k][i][0])
                                                  for i in range(len_max)]
            direction_w2vs_categ_categ_shuffle[-1][-1].append(dirs)
        
        
# a* - a, a d'un ensemble random, shuffle
direction_w2vs_random_normal_shuffle = []
for k_r in range(k_random):
    print(k_r)
    direction_w2vs_random_normal_shuffle.append([])
    for k in range(len(directions_tuples)):
        direction_w2vs_random_normal_shuffle[-1].append([])
        len_max = min(len(directions_tuples[k]), len(idx_random_categ[k][k_r]))
        for perm in range(nb_perms):
            perm_list = permutation_onecycle(len_max)
            dirs = [model.wv.get_vector(idx_random_categ[k][k_r][perm_list[i]]) - model.wv.get_vector(directions_tuples[k][i][0])
                                                  for i in range(len_max)]
            direction_w2vs_random_normal_shuffle[-1][-1].append(dirs)

direction_w2vs_normal_random_shuffle = []
for k_r in range(k_random):
    print(k_r)
    direction_w2vs_normal_random_shuffle.append([])
    for k in range(len(directions_tuples)):
        direction_w2vs_normal_random_shuffle[-1].append([])
        len_max = min(len(directions_tuples[k]), len(idx_random_categ[k][k_r]))
        for perm in range(nb_perms):
            perm_list = permutation_onecycle(len_max)
            dirs = [model.wv.get_vector(directions_tuples[k][perm_list[i]][1]) - model.wv.get_vector(idx_random_categ[k][k_r][i])
                                                  for i in range(len_max)]
            direction_w2vs_normal_random_shuffle[-1][-1].append(dirs)

direction_w2vs_random_random_shuffle = [[shuffled_directions(model, idx_random[k_r], idx_random2[k_r]) 
                                         for perm in range(nb_perm)] 
                                        for k_r in range(k_random)]



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



0



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



1
2
3
4
5
6
7
8
9
0



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



1
2
3
4
5
6
7
8
9



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



# Similarity of offsets and shuffled offsets

In [241]:
# Similarite offsets normaux
directions_similarity_w2vs = similarite_offsets(directions_tuples, direction_w2vs)
print("NNP")
directions_similarity_w2vs_normal_normal_permutation = [similarite_offsets(directions_tuples, direction_w2vs_normal_normal_permutation[k]) for k in range(k_random)]
print("intra")
directions_similarity_w2vs_categ_categ_intra = [similarite_offsets(directions_tuples, direction_w2vs_categ_categ_intra[k]) for k in range(k_random)]
print("inter")
directions_similarity_w2vs_categ_categ = [similarite_offsets(directions_tuples, direction_w2vs_categ_categ[k]) for k in range(k_random)]
print("N -> R")
directions_similarity_w2vs_random_normal = [similarite_offsets(directions_tuples, direction_w2vs_random_normal[k]) for k in range(k_random)]
print("R -> N")
directions_similarity_w2vs_normal_random = [similarite_offsets(directions_tuples, direction_w2vs_normal_random[k]) for k in range(k_random)]
print("R R")
directions_similarity_w2vs_random_random = [similarite_offsets(['random'], direction_w2vs_random_random[k]) for k in range(k_random)]

NNP
intra
inter
N -> R


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
# Similarite offsets normaux
directions_similarity_w2vs_shuffle = [similarite_offsets(directions_tuples, np.array(direction_w2vs_shuffle)[:,perm]) for perm in range(nb_perms)]
#directions_similarity_w2vs_normal_normal_permutation_shuffle = similarite_shuffle_offsets(directions_tuples, direction_w2vs_normal_normal_permutation)

print('intra')
directions_similarity_w2vs_categ_categ_intra_shuffle = [[similarite_offsets(directions_tuples, np.array(direction_w2vs_categ_categ_intra_shuffle[k_r])[:,perm]) for perm in range(nb_perms)] for k_r in range(k_random)]
print('inter')
directions_similarity_w2vs_categ_categ_shuffle = [[similarite_offsets(directions_tuples, np.array(direction_w2vs_categ_categ_shuffle[k_r])[:,perm]) for perm in range(nb_perms)] for k_r in range(k_random)]
print('N>R')
directions_similarity_w2vs_random_normal_shuffle = [[similarite_offsets(directions_tuples, np.array(direction_w2vs_random_normal_shuffle[k_r])[:,perm]) for perm in range(nb_perms)] for k_r in range(k_random)]
print('R>N')
directions_similarity_w2vs_normal_random_shuffle = [[similarite_offsets(directions_tuples, np.array(direction_w2vs_normal_random_shuffle[k_r])[:,perm]) for perm in range(nb_perms)] for k_r in range(k_random)]
print('R>R')
directions_similarity_w2vs_random_random_shuffle = [[similarite_offsets(['random'], direction_w2vs_random_random_shuffle[k_r][perm]) for perm in range(nb_perms)] for k_r in range(k_random)]



intra


# OCS and PCS

In [None]:
len_categs = len(directions_names)

ocs, pcs = OCS_PCS(len_categs, 
                   nb_perms,
                   directions_similarity_w2vs, 
                   directions_similarity_w2vs_shuffle)
print('nnp')
metrics_res = np.array([OCS_PCS(len_categs, 
                                nb_perms,
                                directions_similarity_w2vs_normal_normal_permutation[kr], 
                                directions_similarity_w2vs_shuffle) for kr in range(k_random)])
ocs_nnp, pcs_nnp = metrics_res[:,0], metrics_res[:,1]

print('intra')
metrics_res = np.array([OCS_PCS(len_categs, 
                                nb_perms,
                                directions_similarity_w2vs_categ_categ_intra[kr], 
                                directions_similarity_w2vs_categ_categ_intra_shuffle[kr]) for kr in range(k_random)])
ocs_categ_categ_intra, pcs_categ_categ_intra = metrics_res[:,0], metrics_res[:,1]

print('inter')
metrics_res = np.array([OCS_PCS(len_categs, 
                                nb_perms,
                                directions_similarity_w2vs_categ_categ[kr], 
                                directions_similarity_w2vs_categ_categ_shuffle[kr]) for kr in range(k_random)])
ocs_categ_categ, pcs_categ_categ = metrics_res[:,0], metrics_res[:,1]

print('N>R')
metrics_res = np.array([OCS_PCS(len_categs, 
                                nb_perms,
                                directions_similarity_w2vs_random_normal[kr], 
                                directions_similarity_w2vs_random_normal_shuffle[kr]) for kr in range(k_random)])
ocs_random_normal, pcs_random_normal = metrics_res[:,0], metrics_res[:,1]

print('R>N')
metrics_res = np.array([OCS_PCS(len_categs, 
                                nb_perms,
                                directions_similarity_w2vs_normal_random[kr], 
                                directions_similarity_w2vs_normal_random_shuffle[kr]) for kr in range(k_random)])
ocs_normal_random, pcs_normal_random = metrics_res[:,0], metrics_res[:,1]

print('R>R')
metrics_res = np.array([OCS_PCS(1, 
                                nb_perms,
                                directions_similarity_w2vs_random_random[kr], 
                                directions_similarity_w2vs_random_random_shuffle[kr]) for kr in range(k_random)])
ocs_random_random, pcs_random_random = metrics_res[:,0], metrics_res[:,1]

# Analogy test

In [None]:
## Version modifiée de gensim, peut être à verifier quels sont les changements exactement (le but est principalement d'autorizer la version vanilla)

def most_similar(model, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None, ignore=True):
    if positive is None:
        positive = []
    if negative is None:
        negative = []
        
    model.init_sims()
    
    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        positive = [positive]

    # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,)) else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,)) else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)
    if not mean:
        raise ValueError("cannot compute similarity with no input")
    mean = matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = model.vectors_norm if restrict_vocab is None else model.vectors_norm[:restrict_vocab]
    dists = np.dot(limited, mean)
    if not topn:
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    if ignore:
        result = [(model.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    else:
        result = [(model.index2word[sim], float(dists[sim])) for sim in best]
    return result[:topn]

def evaluate_word_analogies_bats(model, directory, method='add', restrict_vocab=300000, case_insensitive=True, dummy4unknown=False, rank=False):
    logger = logging.getLogger(__name__)

    print("Method used: ", method)

    method1 = 'add'
    method2 = 'vanilla'
    
    ok_vocab = [(w, model.vocab[w]) for w in model.index2word[:restrict_vocab]]
    ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
    oov = 0
    #logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies)
    sections, section = [], None
    quadruplets_no = 0
    
    directions_names_bats = []
    directions_tuples_bats = []
    
    for f in os.listdir('../BATS_3.0/'+str(directory)):
        directions_names_bats.append(str(f)[:-4])
        directions_tuples_bats.append(set())
        with utils.open_file('../BATS_3.0/'+str(directory)+'/'+str(f)) as fin:
            for line_no, line in enumerate(fin):
                line = utils.to_unicode(line)
                a, b = [word.lower() for word in line.split()]
                list_b = b.split('/')
                if list_b[0] != a:
                    directions_tuples_bats[-1].add((a.upper(),list_b[0].upper()))
    
    for i in range(len(directions_names_bats)):
        if section:
            # store the last section, too
            sections.append(section)
            #model._log_evaluate_word_analogies(section)
            correct, incorrect = len(section['correct']), len(section['incorrect'])
            if correct + incorrect > 0:
                score = correct / (correct + incorrect)
                logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
            else:
                print('No score for ', section['section'])
            correct, incorrect = len(section['correct_vanilla']), len(section['incorrect_vanilla'])
            if correct + incorrect > 0:
                score = correct / (correct + incorrect)
                logger.info("%s: %.1f%% (%i/%i) VANILLA", section['section'], 100.0 * score, correct, correct + incorrect)
            if method2 == 'vanilla':
                total_section = len(section['correct_vanilla']) + len(section['incorrect_vanilla'])
                if total_section > 0:
                    logger.info('Number of predictions equal to a: %i (%d), a*: %i (%d), b: %i (%d)', 
                                section['n_a'], section['n_a']/total_section, 
                                section['n_a*'], section['n_a*']/total_section, 
                                section['n_b'], section['n_b']/total_section)

        
        section = {'section': directions_names_bats[i], 'correct': [], 'incorrect': [], 
                   'correct_vanilla': [], 'incorrect_vanilla': [],'n_a':0, 'n_a*':0, 'n_b':0,
                  'cd': [], 'badc': [], 'bac': [], 'n/cba': [], 'n/c': [], 'n/d': []}
        
        tuples = directions_tuples_bats[i]
        for t1 in tuples:
            for t2 in tuples:
                a,b = t1
                c,expected = t2
                if a != c:
                    quadruplets_no += 1
                    if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                        oov += 1
                        #if dummy4unknown:
                        #    logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
                        #    section['incorrect'].append((a, b, c, expected))
                        #else:
                        #    logger.debug("Skipping line with OOV words")
                        continue
                    original_vocab = model.vocab
                    model.vocab = ok_vocab

                    predicted = None
                    ignore = {a, b, c} # input words to be ignored
                    ignore_bool = False
                    positive = [b, c]
                    negative=[a]
                    # find the most likely prediction using 3CosAdd (vector offset) method
                    # TODO: implement 3CosMul and set-based methods for solving analogies
                    sims = most_similar(model, positive=positive, negative=negative, topn=5, restrict_vocab=restrict_vocab, ignore=ignore_bool)

                    model.vocab = original_vocab

                    #predicted = sims[0][0].upper() if case_insensitive else sims[0][0]
                    for element in sims:
                        predicted = element[0].upper() if case_insensitive else element[0]

                        if predicted in ok_vocab  and predicted not in ignore:
                            break
                    for element in sims:
                        predicted_ignore = element[0].upper() if case_insensitive else element[0]

                        if predicted_ignore in ok_vocab:
                            break
                            #if predicted != expected:
                                #logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)

                    if predicted == expected:
                        section['correct'].append((a, b, c, expected))
                    else:
                        section['incorrect'].append((a, b, c, expected))
                        
                    if predicted_ignore == expected:
                        section['correct_vanilla'].append((a, b, c, expected))
                    else:
                        section['incorrect_vanilla'].append((a, b, c, expected))
                    if predicted_ignore == a:
                        section['n_a'] +=1
                    if predicted_ignore == b:
                        section['n_a*'] +=1
                    if predicted_ignore == c:
                        section['n_b'] +=1
                    
    if section:
        # store the last section, too
        sections.append(section)
        #model._log_evaluate_word_analogies(section)
        correct, incorrect = len(section['correct']), len(section['incorrect'])
        if correct + incorrect > 0:
            score = correct / (correct + incorrect)
            logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
        else:
            print('No score for ', section['section'])
        correct, incorrect = len(section['correct_vanilla']), len(section['incorrect_vanilla'])
        if correct + incorrect > 0:
            score = correct / (correct + incorrect)
            logger.info("%s: %.1f%% (%i/%i) VANILLA", section['section'], 100.0 * score, correct, correct + incorrect)
        if method2 == 'vanilla':
            total_section = len(section['correct_vanilla']) + len(section['incorrect_vanilla'])
            if total_section > 0:
                logger.info('Number of predictions equal to a: %i (%d), a*: %i (%d), b: %i (%d)', 
                            section['n_a'], section['n_a']/total_section, 
                            section['n_a*'], section['n_a*']/total_section, 
                            section['n_b'], section['n_b']/total_section)

    total = {
        'section': 'Total accuracy',
        'correct': list(chain.from_iterable(s['correct'] for s in sections)),
        'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
        'correct_vanilla': list(chain.from_iterable(s['correct_vanilla'] for s in sections)),
        'incorrect_vanilla': list(chain.from_iterable(s['incorrect_vanilla'] for s in sections)),
    }

    oov_ratio = float(oov) / quadruplets_no * 100
    logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
    if not dummy4unknown:
        logger.info(
            'NB: analogies containing OOV words were skipped from evaluation! '
            'To change this behavior, use "dummy4unknown=True"'
        )
    #analogies_score = model._log_evaluate_word_analogies(total)
    correct, incorrect = len(total['correct']), len(total['incorrect'])
    #print(total)
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i)", total['section'], 100.0 * score, correct, correct + incorrect)
        analogies_score = score
    correct_vanilla, incorrect_vanilla = len(total['correct_vanilla']), len(total['incorrect_vanilla'])
    #print(total)
    if correct_vanilla + incorrect_vanilla > 0:
        score = correct_vanilla / (correct_vanilla + incorrect_vanilla)
        logger.info("%s: %.1f%% (%i/%i) VANILLA", total['section'], 100.0 * score, correct_vanilla, correct_vanilla + incorrect_vanilla)
        analogies_score = score
    
    sections.append(total)
    # Return the overall score and the full lists of correct and incorrect analogies
    return analogies_score, sections

def bats_test(model):
    results = []
    for d in os.listdir('../BATS_3.0'):
        if d != 'metadata.json':
            results.append(evaluate_word_analogies_bats(model.wv, directory=d))
    return(results)

w2v_results = bats_test(model.wv)

# Décompo

In [None]:
c_w2vs = np.array([[model.wv.get_vector(i[0]) for i in directions_tuples[k] if i[0] in vocabulary and i[1] in vocabulary] 
                           for k in range(len(directions_tuples))])

# a* d'une catégorie
d_w2vs = np.array([[model.wv.get_vector(i[1]) for i in directions_tuples[k] if i[0] in vocabulary and i[1] in vocabulary] 
                           for k in range(len(directions_tuples))])

am1_dm1_c_d = []
am1_dm1_bma_d = []
am1_dm1_bma_c = []
am2_ana_bma = []
am2_ana_c = []
am2_c_bma = []
am2_bma_bma = []
am1_cd_f_dm1mcm1 = []
am1_dm1_bma_dmc = []
am1_dm1_c_dmc = []

for i in range(len(directions_tuples)):
    am1_dm1_c_d.append([])
    am1_dm1_bma_d.append([])
    am1_dm1_bma_c.append([])
    am2_ana_bma.append([])
    am2_ana_c.append([])
    am2_c_bma.append([])
    am2_bma_bma.append([])
    am1_cd_f_dm1mcm1.append([])
    am1_dm1_bma_dmc.append([])
    am1_dm1_c_dmc.append([])
    
    list_c_w2vs = list(c_w2vs[i])
    list_d_w2vs = list(d_w2vs[i])
    
    for j in range(len(list_c_w2vs)):
        for k in range(len(list_c_w2vs)):
            if j!=k:
                a,b,c,d = list_c_w2vs[j], list_d_w2vs[j], list_c_w2vs[k], list_d_w2vs[k]
                norme_a_m1 = 1/(np.linalg.norm(c+b-a))
                norme_d_m1 = 1/(np.linalg.norm(d))
                norme_c_m1 = 1/(np.linalg.norm(c))
                norme_d_m1 = 1/(np.linalg.norm(d))
                
                am1_dm1_c_d[-1].append(c@d*norme_a_m1*norme_d_m1)
                am1_dm1_bma_d[-1].append((b-a)@d*norme_a_m1*norme_d_m1)
                am1_dm1_bma_c[-1].append((b-a)@c*norme_a_m1*norme_d_m1)
                am2_ana_bma[-1].append((norme_a_m1**2)*(c+b-a)@(b-a))
                am2_ana_c[-1].append((norme_a_m1**2)*(c+b-a)@c)
                am2_c_bma[-1].append((norme_a_m1**2)*(b-a)@c)
                am2_bma_bma[-1].append((norme_a_m1**2)*(b-a)@(b-a))
                am1_cd_f_dm1mcm1[-1].append(norme_a_m1*(norme_d_m1 - norme_c_m1)*(c@(c+b-a)))
                am1_dm1_bma_dmc[-1].append(norme_a_m1*norme_d_m1*(b-a)@(d-c))
                am1_dm1_c_dmc[-1].append(norme_a_m1*norme_d_m1*c@(d-c))
                

In [None]:
c1='#FFBBB3'
c2='#5F2EFF'
c3='#00610F'
idx_idel = [37, 32, 39, 38, 36, 33, 35, 31, 30, 34, 29, 23, 24, 20, 26, 27, 25, 28, 21, 22,  19, 10, 14, 15, 11, 18, 17,
       12, 13, 16, 2,  1,  0,  8,
        9,  4,  5,  6,  3,  7]
x=np.array([d[5:-1] for d in directions_names])[idx_idel]

y1=[np.mean(am1_dm1_c_d[i]) for i in idx_idel]
y2=[np.mean(am1_dm1_bma_d[i]) for i in idx_idel]
y3=[np.mean(am1_dm1_bma_c[i]) for i in idx_idel]
y4=np.array(y2) - np.array(y3)
#   y4=[np.mean(am1_dm1_c_dmc[i]) for i in idx_idel] même chose sauf erreur
fig = go.Figure(go.Bar(x=x, y=y1,name='$b\cdot b^*$', marker_color= c2))
fig.add_trace(go.Bar(x=x, y=y3, name='$b\cdot o_a$',marker_color= c1))
fig.add_trace(go.Bar(x=x, y=y4,name='$o_b\cdot o_a$', marker_color= c3))
fig.update_layout(barmode='relative', 
                  xaxis={'tickangle':-45, 'ticklen':0.5}, 
                  font=dict(family="Times New Roman",size=16), 
                  yaxis_title_text='Value in the analogy score')
fig.show()


z1= [np.mean(am2_ana_bma[i]) for i in idx_idel]
z2= [np.mean(am2_ana_c[i]) for i in idx_idel]
z3= [np.mean(am2_c_bma[i]) for i in idx_idel]
z4= [np.mean(am2_bma_bma[i]) for i in idx_idel]
fig = go.Figure(go.Bar(x=x, y=z2, name='$b\cdot (b+o_a)$', marker_color= c2))
fig.add_trace(go.Bar(x=x, y=z3, name='$b\cdot o_a$', marker_color= c1))
fig.add_trace(go.Bar(x=x, y=z4, name='$off_a\cdot o_a$', marker_color= c3))
fig.update_layout(barmode='relative', 
                  xaxis={'tickangle':-45, 'ticklen':0.5}, 
                  font=dict(family="Times New Roman",size=16), 
                  yaxis_title_text='Value in the reference analogy score')
fig.show()


y1=[np.mean(am1_cd_f_dm1mcm1[i]) for i in idx_idel]#range(len(x))]
y2=[np.mean(am1_dm1_bma_dmc[i]) for i in idx_idel]#range(len(x))]
y3=[np.mean(am1_dm1_c_dmc[i]) for i in idx_idel]#range(len(x))]
fig = go.Figure(go.Bar(x=x, y=y1, name='$(1\!-\!\|b^*\|/\|b\|)\!\cdot\!(b\!+\!o_a)\!\cdot\!b$', marker_color=c1))
fig.add_trace(go.Bar(x=x, y=y2, name='$o_a\!\cdot\!o_b$', marker_color=c3))
fig.add_trace(go.Bar(x=x, y=y3, name='$b\!\cdot\!o_b$', marker_color=c2))
fig.update_layout(barmode='relative', 
                  xaxis={'tickangle':-45, 'tickwidth':0.5}, 
                  font=dict(family="Times New Roman",size=16),
                  yaxis_title_text='$\Delta_{sim}$')
fig.show()

# Plot results

In [300]:
#word2vec roc

labels=["Derivationnal","Encyclopedic","Inflectional","Lexicographic"]
intervals=[(20,30),(10,20),(30,40),(0,10)]
x = (ocs, ocs_nnp, ocs_categ_categ_intra, ocs_categ_categ, ocs_random_normal, ocs_normal_random, ocs_random_random)
y = (pcs, pcs_nnp, pcs_categ_categ_intra, pcs_categ_categ, pcs_random_normal, pcs_normal_random, pcs_random_random)


x_n, x_nnp, x_categ_categ_intra, x_categ_categ, x_random_normal, x_normal_random, x_random_random = x
x_nnp, x_categ_categ_intra, x_categ_categ = np.mean(x_nnp,axis=0), np.mean(x_categ_categ_intra,axis=0), np.mean(x_categ_categ,axis=0)
x_random_normal, x_normal_random, x_random_random = np.mean(x_random_normal,axis=0), np.mean(x_normal_random,axis=0), np.mean(x_random_random,axis=0)
#x_nnp, x_categ_categ_intra, x_categ_categ = iqr(x_nnp,axis=0), iqr(x_categ_categ_intra,axis=0), iqr(x_categ_categ,axis=0)
#x_random_normal, x_normal_random, x_random_random = iqr(x_random_normal,axis=0), iqr(x_normal_random,axis=0), iqr(x_random_random,axis=0)

y_n, y_nnp, y_categ_categ_intra, y_categ_categ, y_random_normal, y_normal_random, y_random_random = y
y_nnp, y_categ_categ_intra, y_categ_categ = np.mean(y_nnp,axis=0), np.mean(y_categ_categ_intra,axis=0), np.mean(y_categ_categ,axis=0)
y_random_normal, y_normal_random, y_random_random = np.mean(y_random_normal,axis=0), np.mean(y_normal_random,axis=0), np.mean(y_random_random,axis=0)
#y_nnp, y_categ_categ_intra, y_categ_categ = iqr(y_nnp,axis=0), iqr(y_categ_categ_intra,axis=0), iqr(y_categ_categ,axis=0)
#y_random_normal, y_normal_random, y_random_random = iqr(y_random_normal,axis=0), iqr(y_normal_random,axis=0), iqr(y_random_random,axis=0)


def l_m(x):
    return([np.mean(x)])
    
for i in range(len(labels)):
    i1,i2 = intervals[i]
    #x_cc = x_categ_categ[:,i1:i2]
    #y_cc = x_categ_categ[:,i1:i2]
    #for l in perm_lists_inter:
        
    #for icc in perm_list_categ_categ:
    #    if perm_list_categ_categ[icc] in range(i1,i2):
    #        x_cc = np.hstack((x_cc, x_categ_categ[icc]))
    #        y_cc = np.hstack((y_cc, y_categ_categ[icc]))
    print(labels[i])
    print("Normal                ", l_m(x_n[i1:i2])[0], l_m(y_n[i1:i2])[0])
    print("Permuted within categ ", l_m(x_nnp[i1:i2])[0], l_m(y_nnp[i1:i2])[0])
    print("Mismatched within type", l_m(x_categ_categ_intra[i1:i2])[0], l_m(y_categ_categ_intra[i1:i2])[0])
    print("Mismatched across type", l_m(x_categ_categ[i1:i2])[0], l_m(y_categ_categ[i1:i2])[0])
    print("Random start          ", l_m(x_normal_random[i1:i2])[0], l_m(y_normal_random[i1:i2])[0])
    print("Random end            ", l_m(x_random_normal[i1:i2])[0], l_m(y_random_normal[i1:i2])[0])
print("Random start and end  ", l_m(x_random_random)[0], l_m(y_random_random)[0])

Derivationnal
Normal                 0.15630814 0.6790253152647141
Permuted within categ  0.0013571794144809246 0.005040067772658974
Mismatched within type 0.03354356996715069 0.007551503972635143
Mismatched across type 0.10050375675782561 0.006171112820321556
Random start           0.004677556827664376 0.0060845519141245865
Random end             0.005111376661807299 0.005753678529607292
Encyclopedic
Normal                 0.19812895 0.5566167896026564
Permuted within categ  0.004359106719493866 0.009409635662129967
Mismatched within type 0.09686008542776108 0.013313107158677251
Mismatched across type 0.07516164667904376 0.006679870954383838
Random start           0.009957562014460564 0.008362772750721354
Random end             0.007240977883338928 0.008493071611089142
Inflectional
Normal                 0.2951204 0.8509698953960825
Permuted within categ  0.0027671211399137976 0.0074141229504471
Mismatched within type 0.08488833475857974 0.006653726813466787
Mismatched across type 0.0

In [175]:
#word2vec roc

labels=["Lexicographic","Encyclopedic","Derivationnal","Inflectional"]
intervals=[(0,10),(10,20),(20,30),(30,40)]
x = (ocs, ocs_nnp, ocs_categ_categ_intra, ocs_categ_categ, ocs_random_normal, ocs_normal_random, ocs_random_random)
y = (pcs, pcs_nnp, pcs_categ_categ_intra, pcs_categ_categ, pcs_random_normal, pcs_normal_random, pcs_random_random)


x_n, x_nnp, x_categ_categ_intra, x_categ_categ, x_random_normal, x_normal_random, x_random_random = x
x_random_normal, x_normal_random, x_random_random = np.mean(x_random_normal,axis=0), np.mean(x_normal_random,axis=0), np.mean(x_random_random,axis=0)
y_n, y_nnp, y_categ_categ_intra, y_categ_categ, y_random_normal, y_normal_random, y_random_random = y
y_random_normal, y_normal_random, y_random_random = np.mean(y_random_normal,axis=0), np.mean(y_normal_random,axis=0), np.mean(y_random_random,axis=0)

def l_m(x):
    return([np.mean(x)])
    
for i in range(len(labels)):
    i1,i2 = intervals[i]
    x_cc = x_categ_categ[i1:i2]
    y_cc = x_categ_categ[i1:i2]
    for icc in perm_list_categ_categ:
        if perm_list_categ_categ[icc] in range(i1,i2):
            x_cc = np.hstack((x_cc, x_categ_categ[icc]))
            y_cc = np.hstack((y_cc, y_categ_categ[icc]))
    print(labels[i])
    print("Normal")
    print(l_m(x_n[i1:i2]))
    print(l_m(y_n[i1:i2]))
    print("Normal NNP")
    print(l_m(x_nnp[i1:i2]))
    print(l_m(y_nnp[i1:i2]))
    print("CC intra")
    print(l_m(x_categ_categ_intra[i1:i2]))
    print(l_m(y_categ_categ_intra[i1:i2]))
    print("CC inter")
    print(l_m(x_categ_categ[i1:i2]))
    print(l_m(y_categ_categ[i1:i2]))
    print("Random->Normal")
    print(l_m(x_normal_random[i1:i2]))
    print(l_m(y_normal_random[i1:i2]))
    print("Normal->Random")
    print(l_m(x_random_normal[i1:i2]))
    print(l_m(y_random_normal[i1:i2]))
print("Random->Random")
print(l_m(x_random_random))
print(l_m(y_random_random))

Lexicographic
Normal
[0.031050116]
[0.5374205051995696]
Normal NNP
[0.014700961]
[0.5000064274469275]
CC intra
[0.09704797]
[0.500972445498127]
CC inter
[0.16666335]
[0.5020916179998232]
Random->Normal
[0.06861267398111523]
[0.5000440911490568]
Normal->Random
[0.058894394487142565]
[0.4994701837967577]
Encyclopedic
Normal
[0.19812895]
[0.5565923947940875]
Normal NNP
[0.16895457]
[0.49930119761268843]
CC intra
[0.2589875]
[0.49648172922058953]
CC inter
[0.20307705]
[0.49914245428597975]
Random->Normal
[0.20400208070874215]
[0.4996925414543981]
Normal->Random
[0.1374869143217802]
[0.5007649277207766]
Derivationnal
Normal
[0.15630814]
[0.6790398803003883]
Normal NNP
[0.0874205]
[0.4984289860111325]
CC intra
[0.12175258]
[0.5003503237238648]
CC inter
[0.15171589]
[0.5005767901574866]
Random->Normal
[0.07464414473623036]
[0.500759367018961]
Normal->Random
[0.05997217817232013]
[0.50049051736413]
Inflectional
Normal
[0.2951204]
[0.8506725440174288]
Normal NNP
[0.111265816]
[0.499701942319633

In [283]:
for i in range(len(labels)):
    i1,i2 = intervals[i]
    y_cc = y_categ_categ[i1:i2]
    for icc in perm_list_categ_categ:
        if perm_list_categ_categ[icc] in range(i1,i2):
            y_cc = np.hstack((y_cc, y_categ_categ[icc]))
    print(labels[i])
    print("Normal", l_m(y_n[i1:i2])[0])
    print("Normal NNP", l_m(y_nnp[i1:i2])[0])
    print("CC intra", l_m(y_categ_categ_intra[i1:i2])[0])
    print("CC inter", l_m(y_cc[i1:i2])[0])
    print("Random->Normal", l_m(y_normal_random[i1:i2])[0])
    print("Normal->Random", l_m(y_random_normal[i1:i2])[0])
print("Random->Random", l_m(y_random_random)[0])

Lexicographic
Normal 0.5379290215368527
Normal NNP 0.5000666601083112
CC intra 0.500158897322328
CC inter 0.48943239868336674
Random->Normal 0.5000518325593741
Normal->Random 0.4993168933866512
Encyclopedic
Normal 0.5566167896026564
Normal NNP 0.5001332847163477
CC intra 0.501470630889484
CC inter 0.3226533744580055
Random->Normal 0.4993094037794368
Normal->Random 0.5006965934595644
Derivationnal
Normal 0.6790253152647141
Normal NNP 0.4997700469288904
CC intra 0.4996664562434659
CC inter nan
Random->Normal 0.5004644784300808
Normal->Random 0.5003988574826186
Inflectional
Normal 0.8509698953960825
Normal NNP 0.4991496151082746
CC intra 0.500518839172449
CC inter nan
Random->Normal 0.5003427146784183
Normal->Random 0.49954779690872
Random->Random 0.50066296079548



Mean of empty slice.


invalid value encountered in double_scalars



In [213]:
for i in range(len(labels)):
    i1,i2 = intervals[i]
    y_cc = x_categ_categ[i1:i2]
    for icc in perm_list_categ_categ:
        if perm_list_categ_categ[icc] in range(i1,i2):
            y_cc = np.hstack((y_cc, y_categ_categ[icc]))
    print(labels[i])
    print("Normal", l_m(y_n[i1:i2])[0])
    print("Normal NNP", l_m(y_nnp[i1:i2])[0])
    print("CC intra", l_m(y_categ_categ_intra[i1:i2])[0])
    print("CC inter", l_m(y_categ_categ[i1:i2])[0])
    print("Random->Normal", l_m(y_normal_random[i1:i2])[0])
    print("Normal->Random", l_m(y_random_normal[i1:i2])[0])
print("Random->Random", l_m(y_random_random)[0])

Lexicographic
Normal 0.5374205051995696
Normal NNP 0.5000064274469275
CC intra 0.500972445498127
CC inter 0.5020916179998232
Random->Normal 0.5000440911490568
Normal->Random 0.4994701837967577
Encyclopedic
Normal 0.5565923947940875
Normal NNP 0.49930119761268843
CC intra 0.49648172922058953
CC inter 0.49914245428597975
Random->Normal 0.4996925414543981
Normal->Random 0.5007649277207766
Derivationnal
Normal 0.6790398803003883
Normal NNP 0.4984289860111325
CC intra 0.5003503237238648
CC inter 0.5005767901574866
Random->Normal 0.500759367018961
Normal->Random 0.50049051736413
Inflectional
Normal 0.8506725440174288
Normal NNP 0.49970194231963355
CC intra 0.5041745169262756
CC inter 0.4992979616192089
Random->Normal 0.5003718090045324
Normal->Random 0.4996029461753017
Random->Random 0.5004407848280471


In [307]:
def metrics_ofmodel(model, directions_names, directions_tuples, nb_perms=50, token=False, tokenizer=None):
    

    if token:
        directions = [list(d) for d in directions_tuples]
        #directions = [[d for d in list(directions_tuples[i]) if d[0] in vocabulary and d[1] in vocabulary] for i in range(len(directions_tuples))]
        direction_w2vs = np.array([[token_embedding(tokenizer, model, i[1]) - token_embedding(tokenizer, model, i[0]) 
                                    for i in directions[k]] 
                                    for k in range(len(directions))])
        #direction_w2vs_shuffle = np.array([[token_embedding(tokenizer, model, i[1]) - token_embedding(tokenizer, model, j[0])
        #                                    for i in directions_tuples[k]
        #                                    for j in directions_tuples[k] if i!=j  and i[1]!=j[1]] 
        #                           for k in range(len(directions_tuples))])
        direction_w2vs_shuffle = []
        for k in range(len(directions)):
            direction_w2vs_shuffle.append([])
            for perm in range(nb_perms):
                #perm_list = permutation_onecycle(len(directions[k]))
                perm_list = permutation_onecycle_avoidtrue(len(directions[k]),  directions[k])
                dirs = [token_embedding(tokenizer, model, directions[k][perm_list[i]][1]) - token_embedding(tokenizer, model, directions[k][i][0])
                                                      for i in range(len(directions[k]))]
                direction_w2vs_shuffle[-1].append(dirs)
    else:
        vocabulary_keys = model.wv.vocab.keys()
        vocabulary = set(vocabulary_keys)

        #list_directions_tuples = [list(d) for d in directions_tuples]
        directions = [[d for d in list(directions_tuples[i]) if d[0] in vocabulary and d[1] in vocabulary] for i in range(len(directions_tuples))]
        
        
        direction_w2vs = np.array([[model.wv.get_vector(i[1]) - model.wv.get_vector(i[0]) 
                                    for i in directions[k] if i[1] in vocabulary and i[0] in vocabulary] 
                                   for k in range(len(directions))])
        #direction_w2vs_shuffle = np.array([[model.wv.get_vector(i[1]) - model.wv.get_vector(j[0]) 
        #                                for i in directions_tuples[k] if i[1] in vocabulary 
        #                                for j in directions_tuples[k] if j[0] in vocabulary and i!=j]  
        #                       for k in range(len(directions_tuples))])
        direction_w2vs_shuffle = []
        for k in range(len(directions)):
            direction_w2vs_shuffle.append([])
            for perm in range(nb_perms):
                #perm_list = permutation_onecycle(len(directions_tuples[k]))
                perm_list = permutation_onecycle_avoidtrue(len(directions[k]),  directions[k])
                dirs = [model.wv.get_vector(directions[k][perm_list[i]][1]) - model.wv.get_vector(directions[k][i][0])
                                                      for i in range(len(directions[k]))]
                direction_w2vs_shuffle[-1].append(dirs)
    
    directions_similarity_w2vs = similarite_offsets(directions, direction_w2vs)
    directions_similarity_w2vs_shuffle = [similarite_offsets(directions, np.array(direction_w2vs_shuffle)[:,perm]) for perm in range(nb_perms)]
    
    ocs, pcs = OCS_PCS(len(directions_names), 
                       nb_perms,
                       directions_similarity_w2vs, 
                       directions_similarity_w2vs_shuffle)
    
    return(ocs, pcs)

In [195]:
directions = [list(d) for d in directions_tuples]
directions[3][3]

('introvert', 'extravert')

In [306]:
scores_models2 = []
print("w2v")
scores_models2.append((metrics_ofmodel(model, directions_names, directions_tuples_bats)))
print("glove")
scores_models2.append((metrics_ofmodel(model_glove, directions_names, directions_tuples_bats)))
print("dict2vec")
scores_models2.append((metrics_ofmodel(model_dict2vec, directions_names, directions_tuples_bats)))
print("numberbatch")
scores_models2.append((metrics_ofmodel(model_conceptnet, directions_names, directions_tuples_bats)))
print("bert")
scores_models2.append((metrics_ofmodel(bert_model, directions_names, directions_tuples_bats, token=True, tokenizer=bert_tokenizer)))
print("gpt2")
scores_models2.append((metrics_ofmodel(gpt2_model, directions_names, directions_tuples_bats, token=True, tokenizer=gpt2_tokenizer)))

w2v



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



glove



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



dict2vec



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



numberbatch



Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).


Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).



bert


NameError: free variable 'vocabulary' referenced before assignment in enclosing scope

In [203]:
pcs_models = [i[1] for i in scores_models]

In [311]:
ocs_models = [i[0] for i in scores_models2]
pcs_models = [i[1] for i in scores_models2]

#intervals = [[0,10],[10,20],[20,30],[30,40]]
intervals=[(20,30),(10,20),(30,40),(0,10)]
for i_m in range(len(models)):
    print(models[i_m])
    for i_l in range(len(labels)):
        #print(labels[i_l])
        i,j = intervals[i_l]
        print(labels[i_l], l_m(ocs_models[i_m][i:j])[0], l_m(pcs_models[i_m][i:j])[0])

word2vec
Derivationnal 0.15630814 0.6790723348103488
Encyclopedic 0.19812895 0.5591797316771174
Inflectional 0.2951204 0.850800472832011
Lexicographic 0.031050116 0.539257017823826
Glove
Derivationnal 0.23739421 0.7100789384423158
Encyclopedic 0.2552645 0.6231332902173314
Inflectional 0.3447476 0.8595064435991357
Lexicographic 0.043298054 0.5496225486047479
dict2vec
Derivationnal 0.07907895 0.6291216193241242
Encyclopedic 0.21264341 0.6279168959600167
Inflectional 0.09856267 0.7001493223209936
Lexicographic 0.024130624 0.5340288299875052
Numberbatch
Derivationnal 0.2242213 0.8052135369981617
Encyclopedic 0.25081062 0.673627129555972
Inflectional 0.35681745 0.9239420923399727
Lexicographic 0.03379402 0.5520509241149522
BERT
Derivationnal 0.17828079 0.6318017206164098
Encyclopedic 0.1511009 0.5687281932528113
Inflectional 0.21723476 0.8206673927611924
Lexicographic 0.016187701 0.5166289879216993
GPT-2
Derivationnal 0.27039167 0.7320427188671387
Encyclopedic 0.07054619 0.5131203005414411


In [225]:
import plotly.graph_objects as go
models=['word2vec', 'Glove', 'dict2vec', 'Numberbatch', 'BERT', 'GPT-2']
labels=["Lexicographic","Encyclopedic","Derivationnal","Inflectional"]
c = ['#FF7070','#FF4136','#CAE000','#00610F', '#39CCCC', '#5F2EFF']
intevals = [[0,10],[10,20],[20,30],[30,40]]

data = [go.Bar(name=models[i], x=labels, y=[l_m(pcs_models[i][0:10])[0], 
                                            l_m(pcs_models[i][10:20])[0], 
                                            l_m(pcs_models[i][20:30])[0], 
                                            l_m(pcs_models[i][30:40])[0]],
              marker_color=c[i]
              ) for i in range(len(models))]

fig = go.Figure(data=data)
fig.update_layout(
    barmode='group',
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title_text="Category type", # xaxis label
    yaxis_title_text='Pairwise Consistency Score')
fig.show()

In [7]:
# IDEL
y_analogie = np.array([[.686,.645,.503,.360,.568,.119],
    [.158,.167,.127,.060,.288,.359],
    [.198, .281, .162, .056, .041, .014],
    [.063, .080, .030, .004, .002, .003]]).T

y_pcs = np.array([[.851, .860, .700, .924, .821, .651],
    [.679, .710, .630, .805, .632, .732],
    [.559, .623, .628, .674, .569, .513],
    [.539, .550, .534, .552, .517, .508]]).T

In [26]:
import plotly.graph_objects as go
models=['word2vec', 'Glove', 'dict2vec', 'Numberbatch', 'BERT', 'GPT-2']
labels=["Inflectional", "Derivational","Encyclopedic", "Lexicographic"]
c = ['#FF7070','#FF4136','#CAE000','#00610F', '#39CCCC', '#5F2EFF']
intevals = [[30,40],[20,30],[10,20], [0,10]]

#data = [go.Bar(name=models[i], x=labels, y=[l_m(pcs_models[i][0:10])[0], 
#                                            l_m(pcs_models[i][10:20])[0], 
#                                            l_m(pcs_models[i][20:30])[0], 
#                                            l_m(pcs_models[i][30:40])[0]],
data = [go.Bar(name=models[i], x=labels, y=y_pcs[i],
marker_color=c[i]
              ) for i in range(len(models))]

fig = go.Figure(data=data)
#fig.add_trace(go.Scatter(x=labels, y=[0.5,0.5,0.5,0.5],
#                         line = dict(color='royalblue', width=4, dash='dash')))
fig.update_layout(
    barmode='group',
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(family="Times New Roman",size=26),
    #uniformtext_minsize=24,
        #family="Courier New, monospace",
        #size=18,
        #color="RebeccaPurple"
    #)
    xaxis_title_text="Category type", # xaxis label
    yaxis_title_text='Pairwise Consistency Score'
)
fig.update_yaxes(range=[0.5, 1])
fig.show()

In [22]:
import plotly.graph_objects as go
models=['word2vec', 'Glove', 'dict2vec', 'Numberbatch', 'BERT', 'GPT-2']
labels=["Inflectional", "Derivational","Encyclopedic", "Lexicographic"]
c = ['#FF7070','#FF4136','#CAE000','#00610F', '#39CCCC', '#5F2EFF']
intevals = [[30,40],[20,30],[10,20], [0,10]]

#data = [go.Bar(name=models[i], x=labels, y=[l_m(pcs_models[i][0:10])[0], 
#                                            l_m(pcs_models[i][10:20])[0], 
#                                            l_m(pcs_models[i][20:30])[0], 
#                                            l_m(pcs_models[i][30:40])[0]],
data = [go.Bar(name="models[i]", x=labels, y=y_analogie[i],
marker_color=c[i]
              ) for i in range(len(models))]

fig = go.Figure(data=data)
#fig.add_trace(go.Scatter(x=labels, y=[0.5,0.5,0.5,0.5],
#                         line = dict(color='royalblue', width=4, dash='dash')))
fig.update_layout(
    barmode='group',
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(family="Times New Roman",size=26),
    #uniformtext_minsize=24,
        #family="Courier New, monospace",
        #size=18,
        #color="RebeccaPurple"
    #)
    xaxis_title_text="Category type", # xaxis label
    yaxis_title_text='Analogy test accuracy'
)
fig.update_yaxes(range=[0, 1])
fig.show()

In [222]:
c = ['#FF7070','#00610F','#CAE000','#5F2EFF']

data = [go.Bar(name=labels[i], x=models, y=[l_m(pcs_models[k][intervals[i][0]:intervals[i][1]])[0] for k in range(len(models))],
              marker_color=c[i]
              ) for i in range(len(labels))]

fig = go.Figure(data=data)
fig.update_layout(
    barmode='group',
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title_text="Word embeddings", # xaxis label
    yaxis_title_text='Pairwise Consistency Score')
fig.show()