This notebook should be run once, in full, before any other notebook. Its goal is to pre-compute the word2vec embeddings, the PMI and PCI matrices, and the N[W,c] counts: steps that consume a large amount of time and memory.

The computations can hardly be optimised further, as we need the complete counts and accurate matrices to perform our analyses. However, if needed, it is possible to limit the vocabulary size (and thus also the context words taken into account in the corpus); this significantly limits the scope of the conclusions that can be drawn, but the memory used scales quadratically with the vocabulary size, so the savings are substantial.

In [None]:
import gensim
from gensim import utils, matutils
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

import time
import logging
from itertools import chain
import logging
import numpy as np
import pickle
import os

import scipy
from scipy import sparse

# Load the text8 corpus through gensim's downloader API; it is iterated below
# as a sequence of documents, each document being a sequence of word tokens.
text8 = api.load("text8")

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The '.pkl' suffix is appended to ``name`` and the object is serialised
    with the highest pickle protocol available.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

# word2vec model

In [None]:
# Train a word2vec model on text8 and save it to disk.
# Every artefact of this notebook is written to ./Models, so make sure the
# directory exists before the first save.
os.makedirs("./Models", exist_ok=True)
model = Word2Vec(text8, negative=1, hs=0, sg=1, ns_exponent=1, vector_size=500, epochs=15, sample=0)
model.save("./Models/word2vec_clean.model")

# Loading a previously saved model instead of training it again:
# model = Word2Vec.load("./Models/word2vec_clean.model")

In [None]:
# Sanity check of the trained embeddings: 'queen' is expected to appear among
# the nearest neighbours of 'king'.

model.wv.most_similar('king')

In [None]:
# Vocabulary, truncated vocabulary and word -> index lookup.

# For memory purposes, we recommend limiting the vocabulary for tests.
# Set vocab_limit to None to use the entire vocabulary.
vocab_limit = 2000

vocabulary_keys = model.wv.index_to_key
# Full (untruncated) vocabulary as a set.
vocabulary = set(vocabulary_keys)
# The first vocab_limit words, in the order given by the model.
vocabulary_list = np.array(list(vocabulary_keys[:vocab_limit]))
len_vocabulary = len(vocabulary_list)

# Inverse mapping: word -> position in vocabulary_list.
invdict_vocabulary = {word: position for position, word in enumerate(vocabulary_list)}

# Count dictionaries

In [None]:
def count_bioccurences(corpus, vocabulary_list, context_size=5, print_size=200):
    '''
    Count, for each word of the vocabulary, the occurrences of its context words in a corpus.

    corpus: iterable of documents, each document being an indexable sequence of tokens.
    vocabulary_list: words to count; tokens outside of it are ignored both as
        centre words and as context words.
    context_size: number of positions taken into account on each side of the centre word.
    print_size: the document index is printed every `print_size` documents
        (a falsy value disables the progress output).

    Returns a dict of dict such that result[w][c] is the number of times c was
    seen in the window of w. Every vocabulary word has an entry, possibly empty.
    '''
    vocabulary = set(vocabulary_list)
    count_dict_ij = {word: dict() for word in vocabulary_list}

    for nd, doc in enumerate(corpus):
        if print_size and nd % print_size == 0:
            print(nd)

        n_tokens = len(doc)
        for i, wi in enumerate(doc):
            # The centre word is the same for the whole window: test it once.
            if wi not in vocabulary:
                continue
            counts_wi = count_dict_ij[wi]
            window_start = max(0, i - context_size)
            window_stop = min(n_tokens, i + context_size + 1)
            for j in range(window_start, window_stop):
                if j == i:
                    continue
                cj = doc[j]
                if cj in vocabulary:
                    counts_wi[cj] = counts_wi.get(cj, 0) + 1

    return count_dict_ij


def count_trioccurences(corpus, top_voc, vocabulary_list, context_size=5, print_size=200):
    '''
    Count, for each word of the vocabulary, the occurrences of pairs of context words in a corpus.

    corpus: iterable of documents, each document being an indexable sequence of tokens.
    top_voc: container of the words accepted as context words (membership is
        tested for every candidate, so a set is recommended).
    vocabulary_list: words accepted as centre words.
    context_size: number of positions taken into account on each side of the centre word.
    print_size: the document index is printed every `print_size` documents.

    Returns a dict of dict of dict such that result[w][cj][ck] is the number of
    times the context words cj and ck were seen, cj positioned before ck, in the
    same window around w. result[w] stays None for a vocabulary word without any
    counted pair.
    '''
    vocabulary = set(vocabulary_list)
    count_dict_cj_tri = dict.fromkeys(vocabulary_list)

    for nd, doc in enumerate(corpus):
        if nd % print_size == 0:
            print(nd)

        n_tokens = len(doc)
        for i, wi in enumerate(doc):
            # Checks that do not depend on the pair are done before entering
            # the quadratic loop over the window.
            if wi not in vocabulary:
                continue
            window_start = max(0, i - context_size)
            window_stop = min(n_tokens, i + context_size + 1)
            for j in range(window_start, window_stop):
                if j == i:
                    continue
                cj = doc[j]
                if cj not in top_voc:
                    continue
                for k in range(j + 1, window_stop):
                    if k == i:
                        continue
                    ck = doc[k]
                    if ck not in top_voc:
                        continue
                    # Nested dicts are created lazily, only once a valid pair exists.
                    pairs_of_wi = count_dict_cj_tri[wi]
                    if pairs_of_wi is None:
                        pairs_of_wi = count_dict_cj_tri[wi] = dict()
                    counts_after_cj = pairs_of_wi.setdefault(cj, dict())
                    counts_after_cj[ck] = counts_after_cj.get(ck, 0) + 1
    return count_dict_cj_tri

In [None]:
## Bioccurences count

count_dict_ij = count_bioccurences(text8, vocabulary_list, context_size=5, print_size=200)
# A context manager guarantees that the pickle file is flushed and closed.
with open("./Models/count_dict_ij.pkl", "wb") as fout:
    pickle.dump(count_dict_ij, fout)

# count_dict_ij = pickle.load(open("./Models/count_dict_ij.pkl", "rb"))

In [None]:
### Trioccurence count

# Top indexes vocabulary
# Total number of counted context occurrences for each vocabulary word.
count_dict_i = dict.fromkeys(vocabulary_list)
for w in count_dict_ij:
    count_dict_i[w] = sum(count_dict_ij[w].values())
# Only the n_max words with the highest totals are kept as context words.
# NOTE: n_max must not exceed 10000, the stride hard-coded in
# get_count_matrix_tri and get_n_w_c to encode a pair of top-vocabulary
# indexes into a single row index.
n_max = 10000
sorted_count_dict = {k: v for k, v in sorted(count_dict_i.items(), key=lambda item: item[1])}
# Reversing the ascending order gives the words by decreasing total.
sorted_vocabulary = list(reversed([[k, v] for (k, v) in sorted_count_dict.items()]))
top_voc_list = [w[0] for w in sorted_vocabulary[:n_max]]
top_voc = set(top_voc_list)
invdict_top_voc = {w:i for i,w in enumerate(top_voc_list)}

# Computation
count_dict_cj_tri = count_trioccurences(text8, top_voc, vocabulary_list, context_size=5, print_size=200)
# A context manager guarantees that the pickle file is flushed and closed.
with open("./Models/count_dict_cj_tri.pkl", "wb") as fout:
    pickle.dump(count_dict_cj_tri, fout)

# count_dict_cj_tri = pickle.load(open("./Models/count_dict_cj_tri.pkl", "rb"))

# Count matrices

In [None]:
# Heavily inspired by https://www.kaggle.com/kenshoresearch/kdwd-pmi-word-vectors
def get_count_matrix(skipgrams, tok2indx):
    '''
    Turn a dict of dict of bioccurence counts into a sparse CSR matrix.

    skipgrams[w][c] is stored at row tok2indx[w], column tok2indx[c]. The shape
    of the matrix is inferred from the largest indexes encountered. The position
    of the current word is printed every 500 words as a progress indicator.
    '''
    rows, cols, values = [], [], []

    for position, (word, contexts) in enumerate(skipgrams.items()):
        if position % 500 == 0:
            print(position)
        for context, count in contexts.items():
            rows.append(tok2indx[word])
            cols.append(tok2indx[context])
            values.append(count)

    print('building sparse bicount matrix')
    coordinates = (rows, cols)
    return sparse.csr_matrix((values, coordinates))

def get_count_matrix_tri(skipgrams, tok2indx, tok2indx_top):
    '''
    Turn a dict of dict of dict of trioccurence counts into a 2-dimensional sparse CSR matrix.

    skipgrams[w][c1][c2] is the count of the ordered context pair (c1, c2)
    around the word w (None when w has no pair). A pair is encoded into a single
    row index, 10000 * smaller_top_index + larger_top_index, so that (c1, c2)
    and (c2, c1) share one row holding the sum of both counts. The column is
    tok2indx[w]. The position of the current word is printed every 3000 words.
    '''
    rows, cols, values = [], [], []

    for position, center in enumerate(skipgrams):
        if position % 3000 == 0:
            print(position)
        pair_counts = skipgrams[center]
        if pair_counts is None:
            continue

        for first, followers in pair_counts.items():
            for second, count in followers.items():
                rank_first = tok2indx_top[first]
                rank_second = tok2indx_top[second]

                if rank_first > rank_second:
                    # When the mirrored entry exists, the pair is emitted (with
                    # both counts summed) while visiting that mirrored entry.
                    if second in pair_counts and first in pair_counts[second]:
                        continue
                    row = 10000 * rank_second + rank_first
                    value = count
                else:
                    mirrored = 0
                    if first != second and second in pair_counts:
                        if first in pair_counts[second]:
                            mirrored = pair_counts[second][first]
                    row = 10000 * rank_first + rank_second
                    value = count + mirrored

                rows.append(row)
                cols.append(tok2indx[center])
                values.append(value)

    print('building sparse tricount matrix')
    return sparse.csr_matrix((values, (rows, cols)))

In [None]:
# Sparse bioccurence matrix: rows and columns are both indexed through
# invdict_vocabulary. Saved to disk in scipy's .npz sparse format.
count_matrix = get_count_matrix(count_dict_ij, invdict_vocabulary)

scipy.sparse.save_npz('./Models/count_matrix.npz', count_matrix)

In [None]:
# Sparse trioccurence matrix: one row per encoded pair of top-vocabulary
# context words, one column per vocabulary word. Saved to disk in .npz format.
count_matrix_tri = get_count_matrix_tri(count_dict_cj_tri, invdict_vocabulary, invdict_top_voc)

scipy.sparse.save_npz('./Models/count_matrix_tri.npz', count_matrix_tri)

# PMI and PCI matrix

In [None]:
def get_pci_matrix(count_matrix_tri, proba=False):
    '''
    Build the sparse (proba) paraphrase error matrix from the sparse trioccurences matrix.

    Each row of count_matrix_tri is scaled by the inverse of its sum (the
    inverse being clipped to [0, 1]), which turns the counts of a row into
    proportions. When proba is False (default), the natural logarithm of the
    stored values is returned instead of the proportions themselves.

    Returns a CSR matrix with the same shape as count_matrix_tri.
    '''
    sum_over_contexts = np.array(count_matrix_tri.sum(axis=1)).flatten()
    # Rows without any count sum to 0: their inverse is inf, clipped to 1 and
    # then multiplied with an empty row, so it never reaches the result. Only
    # the spurious divide-by-zero warning has to be silenced.
    with np.errstate(divide='ignore'):
        div_sum_over_contexts = np.clip(1 / sum_over_contexts, 0, 1)
    # The 1-D vector broadcasts along the columns of the transpose, i.e. along
    # the rows of the original matrix.
    para_csr = count_matrix_tri.T.multiply(div_sum_over_contexts).T.tocsr()
    if not proba:
        para_csr.data = np.log(para_csr.data)
    return para_csr

def get_pmi_matrix(skipgrams, count_matrix, tok2indx, alpha=0.75, ppmi_bool=False, min_context=False, not_pmi=False):
    '''
    Build the sparse (P)PMI matrix from the bioccurence counts.

    For every stored pair (w, c) the value is
        log2( P(w,c) / (P(w) * P_alpha(c)) )
    with P(w,c) = N(w,c) / D, P(w) = N(w) / D and
    P_alpha(c) = N(c)**alpha / sum_c' N(c')**alpha (context distribution
    smoothing), where D is the total of all the counts of skipgrams, N(w) the
    row sums and N(c) the column sums of count_matrix.

    skipgrams: dict of dict of bioccurence counts.
    count_matrix: sparse matrix of the same counts, indexed through tok2indx.
    tok2indx: word -> row/column index.
    alpha: smoothing exponent applied to the context counts.
    ppmi_bool: when True, negative values are replaced by 0 (positive PMI).
    min_context: when True, counts are floored at 1. This has no effect on
        counts that are already >= 1, and pairs that never co-occur are not
        stored in the sparse structure at all.
    not_pmi: when True, P_alpha(c) is replaced by 1, so the stored value is
        log2( P(w,c) / P(w) ).

    Returns a CSR matrix whose shape is inferred from the largest indexes used.
    '''
    # Total number of counted (word, context) occurrences, for standard PPMI.
    DD = sum(sum(contexts.values()) for contexts in skipgrams.values())

    sum_over_contexts = np.array(count_matrix.sum(axis=1)).flatten()
    sum_over_words = np.array(count_matrix.sum(axis=0)).flatten()

    # for context distribution smoothing (cds)
    sum_over_words_alpha = sum_over_words**alpha
    Pc_alpha_denom = np.sum(sum_over_words_alpha)

    row_indxs = []
    col_indxs = []
    pmi_dat_values = []

    for i, (wi, contexts) in enumerate(skipgrams.items()):
        if i % 5000 == 0:
            print(i)
        if not contexts:
            continue

        # Quantities that only depend on the word are computed once per row.
        tok_word_indx = tok2indx[wi]
        Pw = sum_over_contexts[tok_word_indx] / DD

        for wj, pound_wc in contexts.items():
            tok_context_indx = tok2indx[wj]
            pound_c_alpha = sum_over_words_alpha[tok_context_indx]

            if min_context:
                pound_wc = max(pound_wc, 1)
            Pwc = pound_wc / DD
            Pc_alpha = 1 if not_pmi else pound_c_alpha / Pc_alpha_denom

            pmi = np.log2(Pwc / (Pw * Pc_alpha))
            if ppmi_bool:
                pmi = max(pmi, 0)

            row_indxs.append(tok_word_indx)
            col_indxs.append(tok_context_indx)
            pmi_dat_values.append(pmi)

    if ppmi_bool:
        print('building ppmi matrix')
    else:
        print('building pmi matrix')
    return sparse.csr_matrix((pmi_dat_values, (row_indxs, col_indxs)))


In [None]:
# Plain PMI matrix: alpha=1 leaves the context counts unsmoothed and
# ppmi_bool=False keeps the negative values.
pmi_matrix = get_pmi_matrix(count_dict_ij, count_matrix, invdict_vocabulary, alpha=1, ppmi_bool=False, min_context=False)

scipy.sparse.save_npz('./Models/pmi_matrix_nominctxt.npz', pmi_matrix)

In [None]:
# PCI matrix in log form (proba defaults to False), saved in .npz format.
para_matrix = get_pci_matrix(count_matrix_tri)

scipy.sparse.save_npz('./Models/paraphrase_matrix.npz', para_matrix)

# N[W,c] 

In [None]:
## Preparation of the analogy pairs used in the main notebook to pre-compute N[W,c].

# Loading of the BATS dataset: one set of (word, first answer) pairs per file,
# `names` holding the file names without their extension.

directory = './BATS_3.0'
names = []
pairs_sets = []

# os.listdir returns entries in arbitrary order: sorting makes `names` and
# `pairs_sets` reproducible from one run to the next.
for d in sorted(os.listdir(directory)):
    category_dir = os.path.join(directory, str(d))
    # Skip metadata.json and any other stray file next to the category folders.
    if d == 'metadata.json' or not os.path.isdir(category_dir):
        continue
    for f in sorted(os.listdir(category_dir)):
        names.append(os.path.splitext(str(f))[0])
        pairs_sets.append(set())
        with utils.open_file(os.path.join(category_dir, str(f))) as fin:
            for line in fin:
                fields = utils.to_unicode(line).split()
                if not fields:
                    continue  # blank line
                a, b = [word.lower() for word in fields]
                list_b = b.split('/')
                if list_b[0] != a: #Keeping only the first analogy pair
                    pairs_sets[-1].add((a, list_b[0]))

# Sets remove duplicated pairs; sorting gives them a deterministic order.
pairs_sets = [sorted(pairs) for pairs in pairs_sets]

# Creation of the suitable W and W* pairs.

def possible_analogies_and_voc(pairs_sets, count_dict_ij, invdict_top_voc):
    """Select the usable analogies of each pair set and collect their words.

    Two pairs (a, a') and (b, b') of the same set are kept when a co-occurs
    with b', b co-occurs with a' (in either direction of count_dict_ij) and the
    four words all belong to invdict_top_voc. A kept analogy is stored as
    [(a, b'), (a', b)].

    Returns (possible_analogies, voc_possible_analogies): one list of kept
    analogies per pair set, and the list of the distinct words they involve.
    """
    def cooccur(u, v):
        # True when one of the two words was counted in the window of the other.
        return (u in count_dict_ij and v in count_dict_ij[u]) or (v in count_dict_ij and u in count_dict_ij[v])

    possible_analogies = []
    for pairs in pairs_sets:
        kept = []
        for i, (p1_a, p1_ap) in enumerate(pairs):
            for p2_a, p2_ap in pairs[i + 1:]:
                if not cooccur(p1_a, p2_ap):
                    continue
                if not cooccur(p2_a, p1_ap):
                    continue
                if all(word in invdict_top_voc for word in (p1_a, p1_ap, p2_ap, p2_a)):
                    kept.append([(p1_a, p2_ap), (p1_ap, p2_a)])
        possible_analogies.append(kept)

    voc_possible_analogies = {
        word
        for analogies in possible_analogies
        for analogy in analogies
        for pair in analogy
        for word in pair
    }

    return (possible_analogies, list(voc_possible_analogies))

In [None]:
# Usable analogies of each BATS file, and the distinct words they involve.
possible_analogies, voc_possible_analogies = possible_analogies_and_voc(pairs_sets, count_dict_ij, invdict_top_voc)

In [None]:
def get_n_w_c(voc_possible_analogies, vocabulary_list, top_voc_list, invdict_top_voc, count_matrix_tri):
    '''
    Pre-compute N[W,c] for every word of the possible analogies.

    For each word wi of voc_possible_analogies, the rows of count_matrix_tri
    encoding the pairs (wi, wj), for every wj of top_voc_list, are summed.

    voc_possible_analogies: words to process; all must be keys of invdict_top_voc.
    vocabulary_list: column index -> word.
    top_voc_list: context words paired with each wi.
    invdict_top_voc: word -> top-vocabulary index.
    count_matrix_tri: CSR matrix whose row 10000 * smaller_index + larger_index
        holds the counts of a pair of top-vocabulary words (the encoding used
        by get_count_matrix_tri).

    Returns a dict of dict: result[wi][w] is the summed count found in the
    column of the word w.
    '''
    # The matrix shape is inferred from the largest pair index observed, so a
    # pair encoded beyond the last row was never seen: it contributes nothing
    # and must not be indexed (it would raise an IndexError).
    n_rows = count_matrix_tri.shape[0]

    n_w_c = dict()
    for wi in voc_possible_analogies:
        indx_i = invdict_top_voc[wi]
        n_wi_c = dict()
        for wj in top_voc_list:
            indx_j = invdict_top_voc[wj]
            row = 10000 * min(indx_i, indx_j) + max(indx_i, indx_j)
            if row >= n_rows:
                continue
            n_wiwj_c = count_matrix_tri[row]

            for ind, count in zip(n_wiwj_c.indices, n_wiwj_c.data):
                w_ind = vocabulary_list[ind]
                n_wi_c[w_ind] = n_wi_c.get(w_ind, 0) + count
        n_w_c[wi] = n_wi_c

    return n_w_c

In [None]:
# Pre-compute N[W,c] for the analogy words and pickle it; save_obj appends the
# '.pkl' suffix to the given name.
n_w_c = get_n_w_c(voc_possible_analogies, vocabulary_list, top_voc_list, invdict_top_voc, count_matrix_tri)
save_obj(n_w_c, "./Models/n_w_c")