# Static methods for LSCD on SemEval2020 Task 1.2

## Imports

In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import mangoes
import pickle
from tqdm import tqdm

In [3]:
from tools.readers import SemEvalReader
from tools.utils import OrthogProcrustAlign, standardize, unitcenter, centerunit, shared_vocabulary
from tools.count_based import *
from tools.pipelines import *

## Our method

### Corpora and targets

In [4]:
# Reader over the SemEval public test data (path is relative to this notebook).
reader = SemEvalReader('../semeval2020_ulscd_posteval/starting_kit/test_data_public')

In [5]:
# Language to process; also used below to build the storage folder path.
language = 'english'
# Targets and gold scores for this language, as returned by the reader.
targets, gold_scores = reader.read_targets(language)
# Build both corpora; 'lemma' presumably selects the lemmatised text — TODO confirm in tools.readers.
corpus1, corpus2 = reader.load_corpora(language,'lemma')

[INFO] Building corpus 1...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 1: 253644 sentences 	86557 words
[INFO] Building corpus 2...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 2: 353692 sentences 	149891 words


In [6]:
# Per-language folder where the matrices computed below are stored.
folder = f'./matrices/{language}/'

### Count matrices and vocabulary

In [10]:
# Vocabulary shared by the two corpora (the pipeline log further down reports 33954 shared types);
# the second return value is kept as vocabs_len.
vocabulary, vocabs_len = shared_vocabulary(corpus1,corpus2)

In [12]:
# Count matrices for both corpora over the shared vocabulary, with window_size=10.
counts_matrix1, counts_matrix2  = creates_count_matrices_pair(corpus1,corpus2, vocabulary, window_size=10)

[INFO] Computing count matrix for corpus 1...
[INFO] Success!
[INFO] Computing count matrix for corpus 2...
[INFO] Success!


In [13]:
# Two-way lookup between vocabulary words and their positions in vocabulary.words.
idx2word = dict(enumerate(vocabulary.words))
word2index = {word: position for position, word in idx2word.items()}

### PPMI

In [14]:
# PPMI hyper-parameters; `shift` is the value reported as k in the log line printed by the call below.
alpha = 0.75
shift = 1

# Compute the PPMI matrices of both corpora and store them under folder+'ppmi'.
create_ppmi_matrices_pair(counts_matrix1, counts_matrix2, alpha, shift, storage_folder=folder+'ppmi', verbose=True)

[INFO] Computing PPMI matrices with alpha=0.75 and k=1.
[INFO] Computing PPMI matrix for Corpus 1...
[INFO] Success!
[INFO] Computing PPMI matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/ppmi/.


In [15]:
# Read the stored PPMI matrices back (the repr shown in the next cell confirms CSR format).
ppmi1, ppmi2 = load_ppmi_matrices_as_csr(folder+'ppmi')

In [16]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
ppmi1

<33954x33954 sparse matrix of type '<class 'numpy.float64'>'
	with 8484175 stored elements in Compressed Sparse Row format>

In [17]:
# Spearman score of the PPMI pair (presumably against the gold scores held by the reader — TODO confirm).
# Both matrices share the same row order, hence the same word2index is passed twice.
reader.spearman_score(ppmi1,ppmi2,word2index,word2index,language,out=False)

Spearman's rho: 0.26795 	p-value: 0.1088


### SVD aligned with OP

In [20]:
# gamma does not matter here because the vectors are standardized before OP alignment.
create_svd_matrices_pair(
    ppmi1,
    ppmi2,
    folder + 'svd',
    standardise=False,
    dim=300,
    gamma=0.0,
    random_state=None,
    n_iter=5,
    verbose=True,
)

[INFO] Computing SVD matrices with gamma=0.0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/svd/.


In [21]:
# Read back the SVD matrices stored by the previous cell.
svd1, svd2 = load_svd_matrices(folder+'svd')

In [22]:
# Standardize both embedding matrices with the same function before alignment.
std_func = unitcenter
svd1_std, svd2_std = std_func(svd1), std_func(svd2)

# Alignment matrix computed from the two standardized spaces, then applied to the first one.
W_align_SVD = OrthogProcrustAlign(
    svd1_std, svd2_std, standard=True, backward=True
)
svd1_std_aligned = svd1_std.dot(W_align_SVD)

# Score the aligned pair; both matrices use the same word2index.
reader.spearman_score(
    svd1_std_aligned, svd2_std, word2index, word2index, language, out=False
)

Spearman's rho: 0.33849 	p-value: 0.0404


In [5]:
# Run the whole static pipeline in one call; the returned score tuples are
# shown as the cell output.
static_SemEval_pipeline(
    data_folder='../semeval2020_ulscd_posteval/starting_kit/test_data_public',
    storage_folder='./matrices',
    language='english',
    window_size=10,
    ppmi_alpha=0.75,
    ppmi_k=5,
    svd_dim=300,
    svd_niter=5,
    OP_func=centerunit,
    rng_seed=None,
)

[START] Collecting data
[INFO] Building corpus 1...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 1: 253644 sentences 	86557 words
[INFO] Building corpus 2...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 2: 353692 sentences 	149891 words
[INFO] Number of types: 86557 in C1, 149891 in C2, 33954 shared
[INFO] Counting occurences...
[INFO] Computing count matrix for corpus 1...
[INFO] Success!
[INFO] Computing count matrix for corpus 2...
[INFO] Success!
[INFO] Done.
[INFO] Computing and scoring PPMI
[INFO] Computing PPMI matrices with alpha=0.75 and k=5.
[INFO] Computing PPMI matrix for Corpus 1...
[INFO] Success!
[INFO] Computing PPMI matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/ppmi/.
[SCORE PPMI] Spearman's rho: 0.21946 	p-value: 0.1919
[INFO] Computing SVD
[INFO] Computing SVD matrices with gamma=0.0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/svd/.
[INFO] Aligning SVD with OP
[INFO] Scoring SVD
[SCORE SVD] Spearman's rho: 0.22586 	p-value: 0.1789
[END] End of pipeline.


((0.21946, 0.1919), (0.22586, 0.1789))

In [4]:
# Locations of the SemEval data and of the matrices written by this notebook.
data_folder= '../semeval2020_ulscd_posteval/starting_kit/test_data_public'
storage_folder= './matrices'
language= 'english'

# Hyper-parameters for the step-by-step run below (same values as the pipeline call above).
window_size=10
ppmi_alpha = 0.75
ppmi_k = 5
svd_dim = 300
svd_niter = 5
rng_seed = None  # no fixed seed, so the SVD scores vary between runs
op_func = centerunit

# Build the reader, storage folder, corpora and shared vocabulary in one call.
reader, folder, corpus1, corpus2, vocabulary = prepare_SemEval_data(data_folder, storage_folder, language)
# The return value is discarded here; presumably the reader keeps the targets internally — TODO confirm.
reader.read_targets(language,out=False)

[START] Collecting data
[INFO] Building corpus 1...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 1: 253644 sentences 	86557 words
[INFO] Building corpus 2...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 2: 353692 sentences 	149891 words
[INFO] Number of types: 86557 in C1, 149891 in C2, 33954 shared


In [None]:
# Map each vocabulary word to its position in vocabulary.words.
word2index = {word: position for position, word in enumerate(vocabulary.words)}

In [22]:
# NOTE(review): hard-coded placeholder — the second value (1.254) cannot be a p-value.
# It looks like leftover scratch state; ppmi_score is reassigned by the PPMI scoring cell below. Confirm and remove.
ppmi_score = (0.678,1.254)
# Load a stored count representation for corpus 1 (the file name hard-codes window size 10).
c1 = mangoes.base.CountBasedRepresentation.load(folder+'count/count1/count_ws10')

In [12]:
# Base name for the stored matrices; later cells append the PPMI and SVD settings to it.
matrix_name = f'count_ws{window_size}'
# Count matrices for both corpora over the shared vocabulary.
counts_matrix1,counts_matrix2 = count_SemEval_occurences(corpus1,corpus2,vocabulary,window_size, folder)

[INFO] Counting occurences...
[INFO] Computing count matrix for corpus 1...
[INFO] Success!
[INFO] Computing count matrix for corpus 2...
[INFO] Success!
[INFO] Done.


In [None]:
# Compute the PPMI matrices of both corpora together with their score.
ppmi1,ppmi2,ppmi_score = compute_score_PPMI(counts_matrix1,counts_matrix2,ppmi_alpha,ppmi_k,word2index,folder,reader,language)
# The count matrices are not used after this point; drop the references.
del counts_matrix1, counts_matrix2
# Extend the matrix name with the PPMI settings and pass it to the rename/clean step.
matrix_name += f'-ppmi_a{ppmi_alpha}_k{ppmi_k}'
rename_and_clean_PPMIs(folder+'ppmi', matrix_name)

In [9]:
# NOTE: not idempotent — re-running this cell appends the SVD suffix to matrix_name again.
matrix_name += f'-svd_d{svd_dim}'
# Compute the SVD embeddings, align them with op_func and score the result.
svd_score = compute_score_SVD(ppmi1,ppmi2,svd_dim,rng_seed,svd_niter,word2index ,folder, reader, language, op_func, matrix_name)
# The PPMI matrices are not used after this point; drop the references.
del ppmi1, ppmi2

[INFO] Computing SVD matrices with gamma=0.0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/svd/.
[INFO] Aligning SVD with OP
[INFO] Scoring SVD
[SCORE SVD] Spearman's rho: 0.36351 	p-value: 0.027


**Comment:** This Spearman's $\rho$ should be interpreted with care, because it depends on the randomness of the SVD algorithm (no seed is set): across runs it varies between approximately 0.23 and 0.35. The only robust conclusion is that it is an improvement over PPMI.

## Matrices from https://github.com/Garrafao/LSCDetection

In [65]:
# Storage folder for everything derived from the LSCDetection matrices.
folder = f'./matrices/from_others/{language}/'

### From their SVD matrices to OP alignment

In [None]:
def _load_text_embeddings(path):
    """Load a space-separated text embedding file.

    The first line of the file is skipped. On every remaining line the first
    field is the word and the other fields are the vector components.

    Returns:
        (matrix, rows, w2i): the float64 matrix, the list of row words in file
        order, and a dict mapping each word to its row index.
    """
    raw = np.loadtxt(path, dtype=object, comments=None, delimiter=' ', skiprows=1, encoding='utf-8')
    # `np.float` was deprecated in NumPy 1.20 and removed in 1.24 (AttributeError);
    # it was an alias of the builtin float, i.e. 64-bit, so np.float64 is equivalent.
    matrix = raw[:, 1:].astype(np.float64)
    rows = list(raw[:, 0].flatten())
    return matrix, rows, {word: i for i, word in enumerate(rows)}


# SVD embeddings produced by the LSCDetection code for each corpus.
svd1, rows1, w2i1 = _load_text_embeddings('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/svd/win10.count-k5.ppmi-dim300-iter1.svd')
svd2, rows2, w2i2 = _load_text_embeddings('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/svd/win10.count-k5.ppmi-dim300-iter1.svd')

In [None]:
# Words present in both row vocabularies. The set intersection has no defined
# order (it is deliberately left unsorted), but every structure below is built
# from this same list, so the two index lists stay aligned with each other.
shared = list(set(rows1) & set(rows2))

# Lookup tables over the shared words.
word2index = {word: position for position, word in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Row index of each shared word in the first and second matrix respectively.
to_keep1 = [w2i1[word] for word in shared]
to_keep2 = [w2i2[word] for word in shared]

In [None]:
# Standardize the full matrices, then keep the shared-word rows for fitting.
std_func = unitcenter
svd1_std, svd2_std = std_func(svd1), std_func(svd2)
svd1_std_red, svd2_std_red = svd1_std[to_keep1], svd2_std[to_keep2]

# The alignment matrix is computed on the shared rows only ...
W_align_SVD = OrthogProcrustAlign(
    svd1_std_red, svd2_std_red, standard=True, backward=True
)
# ... and applied to the whole first matrix.
svd1_std_aligned = svd1_std.dot(W_align_SVD)

# Each matrix keeps its own row order, hence the two separate word->index maps.
reader.spearman_score(
    svd1_std_aligned, svd2_std, w2i1, w2i2, language, out=False
)

Spearman's rho: 0.26534 	p-value: 0.1125


**Comment:** We match their results. Thus we can make reliable comparisons between their results and ours after Orthogonal Procrustes alignment.

### From their count matrices

In [66]:
# Count matrix of corpus 1 from the LSCDetection run: a sparse matrix file plus
# pickled row and column word lists stored next to it ('_rows' / '_columns').
path1 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/count/win10.count'
P1 = sc.sparse.load_npz(path1)
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open(path1+'_rows','rb') as f:
    rows1 = pickle.load(f)
with open(path1+'_columns','rb') as f:
    cols1 = pickle.load(f)
# Word -> row index for corpus 1.
w2i1 = {word : i for i,word in enumerate(rows1)}

# Same files for corpus 2.
path2 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/count/win10.count'
P2 = sc.sparse.load_npz(path2)
with open(path2+'_rows','rb') as f:
    rows2 = pickle.load(f)
with open(path2+'_columns','rb') as f:
    cols2 = pickle.load(f)
# Word -> row index for corpus 2.
w2i2 = {word : i for i,word in enumerate(rows2)}


In [67]:
# Wrap their word lists and sparse count matrices as mangoes objects so that
# our PPMI function can be applied to them.
# NOTE(review): c1 was bound to a loaded CountBasedRepresentation in an earlier cell; it is rebound to a Vocabulary here.
c1 = mangoes.Vocabulary(cols1)
c2 = mangoes.Vocabulary(cols2)
r1 = mangoes.Vocabulary(rows1)
r2 = mangoes.Vocabulary(rows2)
counts1 =  mangoes.base.CountBasedRepresentation(r1, c1, P1)
counts2 =  mangoes.base.CountBasedRepresentation(r2, c2, P2)

# PPMI hyper-parameters; `shift` is the value reported as k in the log line printed below.
alpha = 0.75
shift = 5

# Compute the PPMI matrices and store them under folder+'ppmi'.
create_ppmi_matrices_pair(counts1, counts2, alpha, shift, storage_folder=folder+'ppmi', verbose=True)

[INFO] Computing PPMI matrices with alpha=0.75 and k=5.
[INFO] Computing PPMI matrix for Corpus 1...
[INFO] Success!
[INFO] Computing PPMI matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/ppmi/.


In [68]:
# Read back the PPMI matrices stored by the previous cell.
ppmi1, ppmi2 = load_ppmi_matrices_as_csr(folder+'ppmi')

In [69]:
# Words present in both row vocabularies. The set intersection has no defined
# order (it is deliberately left unsorted), but every structure below is built
# from this same list, so the two index lists stay aligned with each other.
shared = list(set(rows1) & set(rows2))

# Lookup tables over the shared words.
word2index = {word: position for position, word in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Index of each shared word in the first and second matrix respectively.
to_keep1 = [w2i1[word] for word in shared]
to_keep2 = [w2i2[word] for word in shared]


In [70]:
# Keep only the columns of the shared words, in the same order for both matrices.
# NOTE(review): to_keep1/to_keep2 come from the *row* word indices (w2i1/w2i2); using them
# as column indices assumes rows and columns follow the same word order — confirm.
ppmi1_ci = ppmi1[:,to_keep1]
ppmi2_ci = ppmi2[:,to_keep2]

In [71]:
# Score the column-restricted PPMI pair; each matrix keeps its own row order, hence w2i1 and w2i2.
reader.spearman_score(ppmi1_ci,ppmi2_ci,w2i1,w2i2,language,out=False)

Spearman's rho: 0.2159 	p-value: 0.1994


**Comment:** We match their results. Our PPMI function works, we can trust it.

In [82]:
# gamma does not matter here because the vectors are standardized before OP alignment.
create_svd_matrices_pair(
    ppmi1,
    ppmi2,
    folder + 'svd',
    standardise=False,
    dim=300,
    gamma=0,
    random_state=None,
    n_iter=5,
    verbose=True,
)

[INFO] Computing SVD matrices with gamma=0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/svd/.


In [83]:
# Read back the SVD matrices stored by the previous cell.
svd1, svd2 = load_svd_matrices(folder+'svd')

In [85]:
# Words present in both row vocabularies. The set intersection has no defined
# order (it is deliberately left unsorted), but every structure below is built
# from this same list, so the two index lists stay aligned with each other.
shared = list(set(rows1) & set(rows2))

# Lookup tables over the shared words.
word2index = {word: position for position, word in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Row index of each shared word in the first and second matrix respectively.
to_keep1 = [w2i1[word] for word in shared]
to_keep2 = [w2i2[word] for word in shared]

# Standardize the full matrices, then keep the shared-word rows for fitting.
std_func = unitcenter
svd1_std, svd2_std = std_func(svd1), std_func(svd2)
svd1_std_red, svd2_std_red = svd1_std[to_keep1], svd2_std[to_keep2]

# The alignment matrix is computed on the shared rows only ...
W_align_SVD = OrthogProcrustAlign(
    svd1_std_red, svd2_std_red, standard=True, backward=True
)
# ... and applied to the whole first matrix.
svd1_std_aligned = svd1_std.dot(W_align_SVD)

# Each matrix keeps its own row order, hence the two separate word->index maps.
reader.spearman_score(
    svd1_std_aligned, svd2_std, w2i1, w2i2, language, out=False
)

Spearman's rho: 0.28597 	p-value: 0.0862


**Comment:** There is a gap between our aligned SVD results and theirs. We know that this gap does not come from the OP alignment, and it should not come from PPMI either, since our function has been shown to produce the same results. In the next part we rule out any remaining issue with PPMI by starting from their PPMI matrices, computing the SVD on top of them, and aligning the result with OP.

### From their PPMI matrices to SVD+OP

In [48]:
# PPMI matrix of corpus 1 from the LSCDetection run: a sparse matrix file plus
# pickled row and column word lists stored next to it ('_rows' / '_columns').
path1 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/ppmi/win10.count-k5.ppmi'
P1 = sc.sparse.load_npz(path1)
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open(path1+'_rows','rb') as f:
    rows1 = pickle.load(f)
with open(path1+'_columns','rb') as f:
    cols1 = pickle.load(f)
# Word -> row index for corpus 1.
w2i1 = {word : i for i,word in enumerate(rows1)}

# Same files for corpus 2.
path2 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/ppmi/win10.count-k5.ppmi'
P2 = sc.sparse.load_npz(path2)
with open(path2+'_rows','rb') as f:
    rows2 = pickle.load(f)
with open(path2+'_columns','rb') as f:
    cols2 = pickle.load(f)
# Word -> row index for corpus 2.
w2i2 = {word : i for i,word in enumerate(rows2)}


In [62]:
# gamma does not matter here because the vectors are standardized before OP alignment.
create_svd_matrices_pair(
    P1,
    P2,
    folder + 'svd/from_their_ppmi',
    standardise=False,
    dim=300,
    gamma=0,
    random_state=None,
    n_iter=5,
    verbose=True,
)

[INFO] Computing SVD matrices with gamma=0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/svd/from_their_ppmi/.


In [63]:
# Read back the SVD matrices computed from their PPMI matrices.
svd1, svd2 = load_svd_matrices(folder+'svd/from_their_ppmi')

In [64]:
# Words present in both row vocabularies. The set intersection has no defined
# order (it is deliberately left unsorted), but every structure below is built
# from this same list, so the two index lists stay aligned with each other.
shared = list(set(rows1) & set(rows2))

# Lookup tables over the shared words.
word2index = {word: position for position, word in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Row index of each shared word in the first and second matrix respectively.
to_keep1 = [w2i1[word] for word in shared]
to_keep2 = [w2i2[word] for word in shared]

# Standardize the full matrices, then keep the shared-word rows for fitting.
std_func = unitcenter
svd1_std, svd2_std = std_func(svd1), std_func(svd2)
svd1_std_red, svd2_std_red = svd1_std[to_keep1], svd2_std[to_keep2]

# The alignment matrix is computed on the shared rows only ...
W_align_SVD = OrthogProcrustAlign(
    svd1_std_red, svd2_std_red, standard=True, backward=True
)
# ... and applied to the whole first matrix.
svd1_std_aligned = svd1_std.dot(W_align_SVD)

# Each matrix keeps its own row order, hence the two separate word->index maps.
reader.spearman_score(
    svd1_std_aligned, svd2_std, w2i1, w2i2, language, out=False
)

Spearman's rho: 0.25965 	p-value: 0.1207


**Comment:** We observe the same gap between our results and theirs. The `randomized_svd` function, used by both our implementation and theirs, introduces randomness. The results do not match perfectly, but they are close to each other: sometimes ours are better, sometimes worse.