This notebook can only be run after running the "Models Preparation" notebook, which pre-computes the models.

After loading the models, preparing the data and pre-computing what is needed, this notebook runs the different experiments of the article. It is possible to choose the BATS categories used for computing the norms of the error terms as well as the paraphrase error rankings.

In [None]:
import gensim
from gensim import utils, matutils
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

import math
import time
import logging
from itertools import chain
import logging
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os

import scipy
from scipy import sparse

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

from scipy.stats import gaussian_kde


# Loading models and data

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'`` using the highest protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``."""
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# text8 corpus, loaded through gensim.downloader.
text8 = api.load("text8")

# Make sure to have run the Models preparation.ipynb file before to prepare the different models and count matrices.

# Open the pickle through a context manager so the file handle is closed
# as soon as the dictionary has been read.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced locally by the preparation notebook.
with open("./Models/count_dict_ij.pkl", "rb") as count_dict_file:
    count_dict_ij = pickle.load(count_dict_file)

# Sparse matrices and the word2vec model saved under ./Models.
count_matrix_tri = scipy.sparse.load_npz('./Models/count_matrix_tri.npz')
pmi_matrix = scipy.sparse.load_npz('./Models/pmi_matrix_nominctxt.npz')
para_matrix = scipy.sparse.load_npz('./Models/paraphrase_matrix.npz')
model = Word2Vec.load("./Models/word2vec_clean.model")

print("Succesfully loaded all models/arrays/dicts")

In [None]:
# Loading the BATS dataset
#
# Builds two parallel lists:
#   names[k]      -> file name (without extension) of the k-th category file
#   pairs_sets[k] -> list of (a, b) word pairs read from that file, lower-cased,
#                    keeping only the first alternative of b and dropping a == b.

directory = './BATS_3.0'
names = []
pairs_sets = []

# NOTE(review): os.listdir returns entries in arbitrary order, while later cells
# pick categories by their position in `names` (16 and 19). The listing order is
# intentionally left untouched here so those positions keep their current
# meaning; check them against `names` on your machine.
for d in os.listdir(directory):
    category_dir = os.path.join(directory, str(d))
    # Only sub-directories hold category files; skip metadata.json and any
    # other stray file instead of crashing on os.listdir(<file>).
    if d == 'metadata.json' or not os.path.isdir(category_dir):
        continue
    for f in os.listdir(category_dir):
        names.append(os.path.splitext(str(f))[0])
        category_pairs = set()
        pairs_sets.append(category_pairs)
        with utils.open_file(os.path.join(category_dir, str(f))) as fin:
            for line in fin:
                tokens = [word.lower() for word in utils.to_unicode(line).split()]
                if not tokens:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                a, b = tokens
                first_b = b.split('/')[0]
                if first_b != a:  # Keeping only the first analogy pair
                    category_pairs.add((a, first_b))

pairs_sets = [list(pairs) for pairs in pairs_sets]

print("Succesfully loaded the BATS dataset")

In [None]:
# Matrices W,C and Ct (pseudo-inverse) of the w2v model
W = model.wv.vectors
C = model.syn1neg
Ct = np.linalg.pinv(C)
print("Extracted the W, C and Ct matrices of the w2v model.")

# Vocabulary definition, inversed vocabulary etc...

# For memory purposes, we recommend to limit the vocabulary for tests.
# Replace by None if you wish to use the entirety of the vocabulary.
vocab_limit = 2000

vocabulary_keys = model.wv.index_to_key
vocabulary = set(vocabulary_keys)
vocabulary_list = np.array(list(vocabulary_keys[:vocab_limit]))
len_vocabulary = len(vocabulary_list)

# word -> position, for the limited vocabulary and for the whole vocabulary
invdict_vocabulary = {wi: i for i, wi in enumerate(vocabulary_list)}
full_invdict_vocabulary = {wi: i for i, wi in enumerate(vocabulary_keys)}

# Top indexes vocabulary
# Total co-occurrence count of every word. Words of the limited vocabulary
# that never appear in count_dict_ij get 0 (not None) so that the sort below
# never has to compare None with an int.
count_dict_i = dict.fromkeys(vocabulary_list, 0)
for w, context_counts in count_dict_ij.items():
    count_dict_i[w] = sum(context_counts.values())

n_max = 10000
# Ascending stable sort followed by a reversal: this exact tie-breaking defines
# the word ranks used by idx_oftuple; presumably it must match the ordering used
# when the matrices were pre-computed, so do not replace it by reverse=True.
sorted_count_dict = dict(sorted(count_dict_i.items(), key=lambda item: item[1]))
sorted_vocabulary = [[k, v] for k, v in sorted_count_dict.items()][::-1]
top_voc_list = [entry[0] for entry in sorted_vocabulary[:n_max]]
top_voc = set(top_voc_list)
invdict_top_voc = {w: i for i, w in enumerate(top_voc_list)}

# Same structures without the n_max truncation. They are built from the full
# list (they previously reused the truncated top_voc_list by mistake); ranks of
# the first n_max words are identical in both versions.
sorted_count_dict_full = dict(sorted_count_dict)
sorted_vocabulary_full = [list(entry) for entry in sorted_vocabulary]
top_voc_list_full = [entry[0] for entry in sorted_vocabulary_full]
top_voc_full = set(top_voc_list_full)
invdict_top_voc_full = {w: i for i, w in enumerate(top_voc_list_full)}

# Relation between vocabulary indexes and top_vocabulary indexes (switching between w2v and pmi indexes)
#     n_i[i] = index for word v in pmi for v = to w2v_dict[i]
#     pmi[n_i[i]] -> pmi[i] = pmi of word w2v_dict[i]
# np.int was removed from NumPy; the builtin int is the same dtype.
new_idxs_array = np.array([full_invdict_vocabulary[key]
                           for key in model.wv.index_to_key]).astype(int)

# Inverse mapping: new_idxs_inv_array[new_idxs_array[i]] == i
new_idxs_inv_array = np.zeros(new_idxs_array.shape, dtype=int)
new_idxs_inv_array[new_idxs_array] = np.arange(new_idxs_array.shape[0])

print("Prepared the vocabulary and other important data")

In [None]:
# idx <-> word tuple relation of the paraphrase error matrix
def idx_oftuple(w1, w2, invdict_top_voc, n=10000):
    """Return the index encoding the unordered word pair (w1, w2).

    The two ranks are read from ``invdict_top_voc``; the result is
    ``n * smaller_rank + larger_rank``, so the argument order is irrelevant.
    """
    rank_second = invdict_top_voc[w2]
    rank_first = invdict_top_voc[w1]
    if rank_second < rank_first:
        rank_first, rank_second = rank_second, rank_first
    return n * rank_first + rank_second

def tuple_ofidx(idx, top_voc_list, n=10000):
    """Decode a pair index: return the words at ranks ``idx // n`` and ``idx % n``."""
    first_rank, second_rank = divmod(idx, n)
    return top_voc_list[first_rank], top_voc_list[second_rank]

def ww_sim_analogy(positive, negative, mat, tok2indx, indx2tok, topn=10, numpy=False):
    """Return the ``topn`` rows of ``mat`` most cosine-similar to an analogy query.

    The query vector is the sum of the rows of the ``positive`` tokens minus
    the rows of the ``negative`` tokens.

    Args:
        positive: non-empty sequence of tokens whose rows are added.
        negative: sequence of tokens whose rows are subtracted.
        mat: row matrix; indexed with ``mat[i]`` when ``numpy`` is true and
            with ``mat.getrow(i)`` otherwise.
        tok2indx: mapping token -> row index in ``mat``.
        indx2tok: mapping (or sequence) row index -> token.
        topn: number of results to return.
        numpy: select dense (``True``) or sparse (``False``) row access.

    Returns:
        List of ``(token, similarity)`` tuples, most similar first.
    """
    if numpy:
        positives = [mat[tok2indx[tok]] for tok in positive]
        negatives = [mat[tok2indx[tok]] for tok in negative]
        query = np.copy(positives[0])
    else:
        positives = [mat.getrow(tok2indx[tok]) for tok in positive]
        negatives = [mat.getrow(tok2indx[tok]) for tok in negative]
        query = positives[0].copy()
    for vec in positives[1:]:
        query += vec
    for vec in negatives:
        query -= vec
    if numpy:
        # A dense row is 1-D; cosine_similarity only accepts 2-D input, so
        # present the query as a single-row matrix.
        query = np.atleast_2d(query)
    sims = cos_sim(mat, query).flatten()
    sindxs = np.argsort(-sims)
    sim_word_scores = [(indx2tok[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return sim_word_scores

def ww_sim_analogy_para_rank(positive1, positive2, mat, top_voc_list, invdict_top_voc, topn=10, negative=None, metric='cossim'):
    """Rank a target word pair by its distance to a query word pair.

    The row of ``mat`` encoding (positive1, positive2) is compared with every
    row of ``mat``; rows are ordered from closest to farthest and the position
    of the row encoding the ``negative`` pair in that ordering is returned.

    Args:
        positive1, positive2: words of the query pair.
        mat: sparse matrix whose rows are indexed with ``idx_oftuple``.
        top_voc_list: unused, kept for backward compatibility.
        invdict_top_voc: mapping word -> rank, passed to ``idx_oftuple``.
        topn: unused, kept for backward compatibility.
        negative: list or tuple with the two words of the target pair (required).
        metric: metric name forwarded to ``pairwise_distances``; the historical
            default ``'cossim'`` is treated as ``'cosine'``.

    Returns:
        Zero-based rank of the target pair (0 = closest row to the query).

    Raises:
        ValueError: if ``negative`` is not a pair of words.
    """
    # The target pair is mandatory: without it there is nothing to rank
    # (this used to fail later with an unbound-variable error).
    if not isinstance(negative, (list, tuple)) or len(negative) != 2:
        raise ValueError("negative must be a list or tuple of two words")
    n1, n2 = negative

    # 'cossim' is not a metric name known to pairwise_distances.
    if metric == 'cossim':
        metric = 'cosine'

    v1 = mat.getrow(idx_oftuple(positive1, positive2, invdict_top_voc))

    dists = pairwise_distances(mat, v1, metric=metric).flatten()
    # pairwise_distances returns distances (smaller = more similar), so the
    # most similar rows come first in ascending order.
    order = np.argsort(dists)

    target = idx_oftuple(n1, n2, invdict_top_voc)
    positions, = np.where(order == target)

    return(positions[0])

print('Defined all useful functions')

In [None]:
# Error terms vectors definitions

# Small positive constant: the error-vector functions below use
# np.log(epsilon) or -np.log(epsilon) as a finite value for a coordinate
# whose log-ratio would otherwise involve a zero count.
epsilon = 1e-10

# Create the errors terms and their norms
def paraphrase_error_vector(w1,w2,w3,w4, invdict_top_voc, n_ww, count_matrix_tri):
    """Build the paraphrase error vector between the pairs (w1, w2) and (w3, w4).

    For every word wk of the global ``vocabulary_list`` the coordinate is
    log((N(w1w2, wk) * N(w3w4)) / (N(w3w4, wk) * N(w1w2))), where the counts
    with a context word are read from ``count_matrix_tri`` and the pair totals
    from ``n_ww``. When both context counts are zero the coordinate is 0; when
    only one is zero it is +/- log(epsilon).
    """
    row_12 = idx_oftuple(w1, w2, invdict_top_voc)
    row_34 = idx_oftuple(w3, w4, invdict_top_voc)

    total_12 = n_ww[row_12]
    total_34 = n_ww[row_34]

    p_e = np.zeros(len(vocabulary_list))
    for k, wk in enumerate(vocabulary_list):
        col = invdict_vocabulary[wk]
        count_12 = count_matrix_tri[row_12, col]
        count_34 = count_matrix_tri[row_34, col]

        if count_12 == 0 and count_34 == 0:
            # Context word seen with neither pair.
            p_e[k] = 0
        elif count_12 == 0:
            p_e[k] = np.log(epsilon)
        elif count_34 == 0:
            p_e[k] = -np.log(epsilon)
        else:
            p_e[k] = np.log((count_12 * total_34)/(count_34 * total_12))

    return(p_e)

def indep_error_vector(w1,w2,w3,w4, invdict_top_voc, n_ww, n_w, n_w_c, count_matrix_tri):
    """Build the independence error vector between the pairs (w1, w2) and (w3, w4).

    Every coordinate (one per word wk of the global ``vocabulary_list``) is the
    sum of three terms:
      * log(N(w1w2, wk) / (N(w1, wk) * N(w2, wk))) for the first pair,
      * minus the same quantity for the second pair,
      * the constant tau = log((N(w3w4) N(w1) N(w2)) / (N(w1w2) N(w3) N(w4))).
    Zero counts are replaced by 0 or +/- log(epsilon) as detailed inline.
    """
    i_e = np.zeros(len(vocabulary_list))

    row_12 = idx_oftuple(w1, w2, invdict_top_voc)
    row_34 = idx_oftuple(w3, w4, invdict_top_voc)

    total_12, total_34 = n_ww[row_12], n_ww[row_34]
    total_1, total_2 = n_w[w1], n_w[w2]
    total_3, total_4 = n_w[w3], n_w[w4]

    tau = np.log((total_34*total_1*total_2)/(total_12*total_3*total_4))

    for k, wk in enumerate(vocabulary_list):
        col = invdict_vocabulary[wk]
        count_12 = count_matrix_tri[row_12, col]
        count_34 = count_matrix_tri[row_34, col]

        count_1 = n_w_c[w1].get(wk, 0)
        count_2 = n_w_c[w2].get(wk, 0)
        count_3 = n_w_c[w3].get(wk, 0)
        count_4 = n_w_c[w4].get(wk, 0)

        single_12_missing = count_1 == 0 or count_2 == 0
        single_34_missing = count_3 == 0 or count_4 == 0

        # Term of the first pair.
        if count_12 == 0:
            term_12 = 0 if single_12_missing else np.log(epsilon)
        elif single_12_missing:
            # Normally impossible: the pair was seen with wk but one word was not.
            term_12 = -np.log(epsilon)
        else:
            term_12 = np.log((count_12)/(count_1*count_2))

        # Term of the second pair (opposite sign).
        if count_34 == 0:
            term_34 = 0 if single_34_missing else -np.log(epsilon)
        elif single_34_missing:
            # Same "should not happen" case as above; reported on stdout.
            print("erreur?")
            term_34 = np.log(epsilon)
        else:
            term_34 = -np.log((count_34)/(count_3*count_4))

        i_e[k] = term_12 + term_34 + tau

    return(i_e)

def normes(p_e, i_e):
    n_p, n_i, n_s = np.linalg.norm(p_e), np.linalg.norm(i_e), np.linalg.norm(p_e + i_e)
    n_p_1, n_i_1, n_s_1 = np.linalg.norm(p_e, ord=1), np.linalg.norm(i_e, ord=1), np.linalg.norm(p_e + i_e, ord=1)
    return([n_p, n_i, n_s, n_p_1, n_i_1, n_s_1])

def errors_norms(w1,w2,w3,w4, invdict_top_voc, n_w1w2,  n_w, n_w_c, count_matrix_tri):
    """Compute both error vectors for the quadruple and return their norms (see ``normes``)."""
    quadruple = (w1, w2, w3, w4)
    paraphrase_term = paraphrase_error_vector(*quadruple, invdict_top_voc, n_w1w2, count_matrix_tri)
    independence_term = indep_error_vector(*quadruple, invdict_top_voc, n_w1w2, n_w, n_w_c, count_matrix_tri)
    return normes(paraphrase_term, independence_term)

print('Defined the error terms functions')

# Link between the PMI and word2vec embeddings

Reconstruction of the PMI vector and word2vec embedding for a given word.
Here the chosen word is "king". We find a relatively high correlation, but not high enough to conclude that the link is close to linear, especially since this is done with the word2vec model that is as close as possible to the PMI factorization.

In [None]:
font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 28}
# Set the font before any figure is created: rc settings only apply to text
# created afterwards, so calling this after xlabel/ylabel left the first
# figure with the default font.
plt.rc('font', **font)

iking = model.wv.index_to_key.index('king')
n_pmi = pmi_matrix.shape[0]

# True PMI row of 'king' (dense 1-D array).
true_pmi_king = pmi_matrix[invdict_vocabulary['king']].toarray()[0]

# PMI vector reconstructed from the embeddings: W[king] . C^T
false_pmi_king = np.dot(W[new_idxs_inv_array[iking]][:n_pmi],
                        C[new_idxs_inv_array[:n_pmi]].T)
# Embedding reconstructed from the PMI row: pinv(C) . PMI[king]
false_w2v_emb_king = np.dot(Ct[:,new_idxs_inv_array[:n_pmi]],
                            true_pmi_king)

x,y = false_pmi_king, true_pmi_king

plt.figure(figsize=(10,10))
plt.scatter(x, y, s=1)
plt.xlabel('Values of the reconstructed PMI vector')
plt.ylabel('Values of the true PMI vector')
plt.show()

print(scipy.stats.pearsonr(x, y))

x,y = model.wv.get_vector('king'), false_w2v_emb_king
plt.figure(figsize=(10,10))
plt.scatter(x, y, s=20)
plt.xlabel('Values of the true w2v vector')
plt.ylabel('Values of the reconstructed w2v vector')
plt.show()

print(scipy.stats.pearsonr(x, y))

In [None]:
# Compute the correlations for the top words in the vocabulary (10k by default)
# For each word, the embedding reconstructed from its PMI row (pinv(C) . PMI[w])
# is compared with its true w2v vector through the Pearson correlation.

correlations = []

Ct_rearranged = Ct[:,new_idxs_inv_array]

for i, wi in enumerate(top_voc_list):
    if i % 1000 == 0:
        print(i)  # progress indicator
    # Use the full word -> index mapping: invdict_vocabulary only covers the
    # first vocab_limit words, so it raised a KeyError for most of the top-10k
    # words. Both mappings give the same index for the words they share.
    pmi_row = pmi_matrix[full_invdict_vocabulary[wi]].toarray()[0]
    false_w2v_emb_wi = np.dot(Ct_rearranged, pmi_row)

    correlations.append(scipy.stats.pearsonr(model.wv.get_vector(wi), false_w2v_emb_wi)[0])

In [None]:
import plotly.graph_objects as go

# Histogram of the per-word correlations computed in the previous cell.
xaxis_title_text = 'Correlation between true and approximated embedding'
yaxis_title_text = 'Count'

fig = go.Figure(data=[go.Histogram(x=correlations)])
fig.update_layout(
    xaxis_title_text=xaxis_title_text,
    yaxis_title_text=yaxis_title_text,
    font={"family": "Times New Roman", "size": 30},
)
fig.show()

# Percentages of co-occurring paraphrases

In [None]:
# For every BATS category, look at each unordered couple of pairs (a, a'), (b, b')
# and classify the two crossed word sets (a, b') and (b, a'):
#   positive      -> the two words co-occur according to count_dict_ij
#   negative      -> no co-occurrence, but both words are keys of count_dict_i
#   not_in_corpus -> at least one word is not a key of count_dict_i
occurences_para_p = []
occurences_para_n = []
occurences_para_not_in_corpus = []

for category_pairs in pairs_sets:
    positive = negative = not_in_corpus = 0

    for i, (p1_a, p1_ap) in enumerate(category_pairs):
        for p2_a, p2_ap in category_pairs[i + 1:]:
            for left, right in ((p1_a, p2_ap), (p2_a, p1_ap)):
                seen_together = (
                    (left in count_dict_ij and right in count_dict_ij[left])
                    or (right in count_dict_ij and left in count_dict_ij[right])
                )
                if seen_together:
                    positive += 1
                elif left in count_dict_i and right in count_dict_i:
                    negative += 1
                else:
                    not_in_corpus += 1

    occurences_para_p.append(positive)
    occurences_para_n.append(negative)
    occurences_para_not_in_corpus.append(not_in_corpus)

In [None]:
# One summary line per category: share of each of the three outcomes.
for k, (p, n, nic) in enumerate(zip(occurences_para_p,
                                    occurences_para_n,
                                    occurences_para_not_in_corpus)):
    total = p + n + nic
    # Percentages are truncated (not rounded) to two decimals.
    pct_p, pct_n, pct_nic = [float(int(count/total*10000))/100 for count in (p, n, nic)]
    print(names[k][:3],
          '; Paraph. with cooc.: ', pct_p,
          '%; Paraph. with NO cooc.: ', pct_n,
          '%; Paraph. with no oc. of 1 of the words: ', pct_nic, '%')

# Analogies (W and W*)

In [None]:
# Creation of the suitable W and W* pairs.

def possible_analogies_and_voc(pairs_sets, count_dict_ij, invdict_top_voc):
    """List, per category, the analogies whose two crossed word sets both co-occur.

    For every unordered couple of pairs (a, a'), (b, b') of a category, the
    entry [(a, b'), (a', b)] is kept when (a, b') co-occur, (b, a') co-occur
    (in either direction of ``count_dict_ij``) and the four words are keys of
    ``invdict_top_voc``.

    Returns:
        (possible_analogies, voc_possible_analogies): the per-category lists of
        kept entries, and the list of distinct words appearing in them.
    """
    def cooccur(first, second):
        return ((first in count_dict_ij and second in count_dict_ij[first])
                or (second in count_dict_ij and first in count_dict_ij[second]))

    possible_analogies = []
    for category_pairs in pairs_sets:
        kept = []
        for i, (p1_a, p1_ap) in enumerate(category_pairs):
            for p2_a, p2_ap in category_pairs[i + 1:]:
                if not cooccur(p1_a, p2_ap):
                    continue
                if not cooccur(p2_a, p1_ap):
                    continue
                if all(word in invdict_top_voc for word in (p1_a, p1_ap, p2_ap, p2_a)):
                    kept.append([(p1_a, p2_ap), (p1_ap, p2_a)])
        possible_analogies.append(kept)

    # Every word used by at least one kept analogy.
    voc_possible_analogies = {
        word
        for kept in possible_analogies
        for analogy in kept
        for word_set in (analogy[0], analogy[1])
        for word in word_set
    }

    return(possible_analogies, list(voc_possible_analogies))

In [None]:
# possible_analogies[k]: analogies kept for the k-th category of `names`;
# voc_possible_analogies: the distinct words appearing in them.
possible_analogies, voc_possible_analogies = possible_analogies_and_voc(pairs_sets, count_dict_ij, invdict_top_voc)

# Examples of paraphrases W and W* forming analogies in the male-female category.
# NOTE(review): index 16 is assumed to be the male-female category; category
# positions follow the order in which the BATS files were listed at load time,
# so confirm with names[16].
print(possible_analogies[16])

In [None]:
print('Number of possibles analogies using paraphrases, where W and W* both are coocuring; using only the top 10 000 words')

# One line per category: its name followed by the number of kept analogies.
for category_index, kept in enumerate(possible_analogies):
    category_name = names[category_index]
    print(category_name, str(len(kept)))

# Ranking

In [None]:
# Ranks of the selected categories for the paraphrase matrix distance

# The vectors [p(cj|W)]_j have coordinates equal to 0 when N(W,cj) == 0,
# which differs slightly from the previous paraphrase definition; but is necessary for computation purposes

# Choose all BATS categories or some in particular
all_categories = False

if all_categories:
    categories = list(range(len(names)))
else:
    k_female = 16
    k_country = 19
    categories = [k_female, k_country]

# ranks_total[ik] holds, for categories[ik], one rank per possible analogy:
# the query is the word set (p0, p1) and the ranked target is (n0, n1).
ranks_total = [
    [
        ww_sim_analogy_para_rank(p0, p1, para_matrix, top_voc_list_full, invdict_top_voc_full,
                                 metric='l2', negative=[n0, n1])
        for (p0, p1), (n0, n1) in possible_analogies[k]
    ]
    for k in categories
]


In [None]:
# Print the mean and median rank of every category

for k, category_ranks in zip(categories, ranks_total):
    mean_rank = np.mean(category_ranks)
    median_rank = np.median(category_ranks)
    print(names[k], ': Mean= ', mean_rank, ' ; Median= ', median_rank)

# Norms computation

In [None]:
# Preparations to compute norms
# n_w_c is long to compute and is thus computed in the previous notebook.

# Count_matrix_tri = N(w1, w2, cj)

# N(w1,w2): row sums of the triple-count matrix, as a flat array
n_w1w2 = np.array(count_matrix_tri.sum(axis=1))[:, 0]

# N(cj): column sums of the triple-count matrix
n_cj = count_matrix_tri.sum(axis=0)

# N(w1): for each word used in an analogy, sum of N(w1, wj) over the top
# vocabulary; idx_oftuple gives the row of the unordered pair (n = 10000).
n_w = {
    wi: sum(int(n_w1w2[idx_oftuple(wi, wj, invdict_top_voc, 10000)]) for wj in top_voc_list)
    for wi in voc_possible_analogies
}

# N(w1, cj)

n_w_c = load_obj("./Models/n_w_c")

In [None]:
# Labels of the six values returned by errors_norms, in the same order.
norms_types = ["Paraphrase error L2", "Dependence errors L2", "All errors L2",
               "Paraphrase error L1", "Dependence errors L1", "All errors L1"]

# Choose all BATS categories or some in particular
all_categories = False

if all_categories:
    categories = list(range(len(names)))
else:
    k_female = 16
    k_country = 19
    categories = [k_female, k_country]

# normes_total[ik] holds, for categories[ik], one list of six norms per analogy.
normes_total = [
    [
        errors_norms(p0, p1, n0, n1, invdict_top_voc, n_w1w2, n_w, n_w_c, count_matrix_tri)
        for (p0, p1), (n0, n1) in possible_analogies[k]
    ]
    for k in categories
]

In [None]:
# Print the mean and median norms for every category

for k, category_norms in zip(categories, normes_total):
    print(names[k])
    # Transpose so that each row gathers one norm type over all analogies.
    per_norm_type = np.array(category_norms).T
    for label, values in zip(norms_types, per_norm_type):
        print(label, ': Mean= ', np.mean(values), ' ; Median= ', np.median(values))

# Context words in co-occurrences of a paraphrase

In [None]:
# Collect the words of every context window in which two given words appear together

def check_context_of_set(text8, vocabulary, w1, w2):
    """Return the set of words seen in windows where ``w1`` and ``w2`` co-occur.

    For every position ``i`` of every document whose word belongs to
    ``vocabulary``, the window spans positions ``i - 5`` to ``i + 5`` (clipped
    to the document). If ``w1`` and ``w2`` both occur in the window at two
    distinct positions other than ``i``, all the words of the window are added
    to the result.

    Args:
        text8: iterable of documents, each a list of tokens.
        vocabulary: container of the words accepted as window centre.
        w1, w2: the two words that must appear together in the window.

    Returns:
        Set of all words found in the matching windows.
    """
    set_ctxt = set()
    for doc in text8:
        doc_len = len(doc)
        for i, wi in enumerate(doc):
            # The centre-word filter applies whatever the order of w1 and w2
            # (a missing pair of parentheses used to bypass it for one order).
            if wi not in vocabulary:
                continue
            # Clip the window start at 0: a negative slice start would wrap
            # around the end of the document.
            lo, hi = max(0, i - 5), min(doc_len, i + 6)
            others = [doc[j] for j in range(lo, hi) if j != i]
            if w1 == w2:
                # Two distinct positions are needed.
                both_present = others.count(w1) >= 2
            else:
                both_present = w1 in others and w2 in others
            if both_present:
                set_ctxt.update(doc[lo:hi])
    return(set_ctxt)

In [None]:
# Calls that produced the sets below (left commented out; the results are
# stored as literals instead):
#set_context_kw = check_context_of_set(text8, vocabulary, 'king', 'woman')
#set_context_qm = check_context_of_set(text8, vocabulary, 'queen', 'man')

# For a context of 5, the computed context words are:
# Context words stored for the ('king', 'woman') word set.
set_context_kw = {
 'a','about','alfred','ammonite','an','and','apologized','as','assigns','at','be','beautiful','beget','bells','born','brought','but','by',
 'clad','consequently','danish','day','died','donald','edmund','elf','encourage','established','evil','exactly','famed','final','finds','foreign',
 'from','grimhild','had','happens','have','he','heart','helgi','her','herself','high','his','i','identity','in','insisted','is','king',
 'kings','la','line','lizard','marriage','married','murdered','named','of','official','offspring','olympias','on','one','over',
 'pass','peirithous','philip','plottest','priced','profusely','queen','realizing','refrain','relationship','resurrection','return',
 'returns','s','saga','says','servant','she','short','silk','single','solomon','son','stomach','task','telling','term','that','the',
 'thessalian','thou','to','two','upon','was','where','who','will','wise','with','woman','wouldst','xiv'}

# Context words stored for the ('queen', 'man') word set.
set_context_qm = {
 'a','adelaide','all','an','and','anointed','around','as','celebrating','celebrations','channel','charge','crowning','dancing','day',
 'deputy','for','green','her','his','in','include','is','islands','isle','iv','james','k','karaoke','king','kingdom','man','mary','may',
 'maypole','more','morris','much','of','office','prosecution','queen','representative','s','scots','seasons','she','simpler','since','singing',
 'sophisticated','tastes','the','thomas','try','ultimately','united','was','who','wife','william','with'}

# Context words shared by the two sets, i.e. words present both in
# set_context_kw and in set_context_qm.
print(set(set_context_qm).intersection(set(set_context_kw)))