# Static methods for LSCD on SemEval2020 Task 1.2

## Imports

In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import mangoes
from tqdm import tqdm

In [29]:
from tools.readers import SemEvalReader
from tools.utils import standardize, OrthogProcrustAlign
from tools.count_based import *

## Our method

### Corpora and targets

In [4]:
# Folder holding the public SemEval test data, given relative to this notebook.
semeval_data_dir = '../semeval2020_ulscd_posteval/starting_kit/test_data_public'
reader = SemEvalReader(semeval_data_dir)

In [5]:
# Language used for every reader call below.
language = 'english'
# Target words together with their gold scores.
targets, gold_scores = reader.read_targets(language)
# The two corpora to compare, loaded with the 'lemma' option.
corpus1, corpus2 = reader.load_corpora(language,'lemma')

[INFO] Building corpus 1...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 1: 253644 sentences 	86557 words
[INFO] Building corpus 2...


Counting words: 0it [00:00, ?it/s]

[INFO] Corpus 2: 353692 sentences 	149891 words


### Count matrices and vocabulary

In [6]:
# Build one count matrix per corpus plus the vocabulary they share (window_size=10).
counts_matrix1, counts_matrix2, vocabulary = creates_count_matrices_pair(corpus1,corpus2,window_size=10)

[INFO] Creating shared vocabulary...
86557 words in corpus 1
149891 words in corpus 2
Shared vocabulary size: 33954
[INFO] Computing count matrix for corpus 1...
[INFO] Success!
[INFO] Computing count matrix for corpus 2...
[INFO] Success!


In [7]:
# Two-way lookup between the shared-vocabulary words and their positions.
vocabulary_words = list(vocabulary.words)
word2index = {token: position for position, token in enumerate(vocabulary_words)}
idx2word = dict(enumerate(vocabulary_words))

### PPMI

In [8]:
# PPMI hyper-parameters; the log output of the call reports them as alpha and k.
alpha = 0.75
shift = 5
# Base folder for every matrix stored for this language.
folder= './matrices/{}/'.format(language)

# Compute the PPMI matrix of each corpus from its count matrix and store both on disk.
create_ppmi_matrices_pair(counts_matrix1, counts_matrix2, alpha, shift, storage_folder=folder+'ppmi', verbose=True)

[INFO] Computing PPMI matrices with alpha=0.75 and k=5.
[INFO] Computing PPMI matrix for Corpus 1...
[INFO] Success!
[INFO] Computing PPMI matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english//.


In [9]:
# Read back the two PPMI matrices stored by the previous cell.
ppmi1, ppmi2 = load_ppmi_matrices_as_csr(folder+'ppmi')

In [10]:
# Show the matrix summary (shape, dtype, number of stored elements).
ppmi1

<33954x33954 sparse matrix of type '<class 'numpy.float64'>'
	with 3657682 stored elements in Compressed Sparse Row format>

In [11]:
# Score the PPMI representations of the two corpora; prints Spearman's rho and its p-value.
reader.spearman_score(ppmi1,ppmi2,word2index,language,out=False)

Spearman's rho: 0.21946 	p-value: 0.1919


### SVD aligned with OP

In [40]:
# gamma has no effect here because the vectors are standardised before OP alignment.
create_svd_matrices_pair(ppmi1,ppmi2,folder+'svd',standardise=True, dim=300,gamma=1.0,random_state=0,n_iter=5, verbose=True)

[INFO] Computing standardised SVD matrices with gamma=1.0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/english/svd/.


In [41]:
# Read back the two SVD matrices stored by the previous cell.
svd1, svd2 = load_svd_matrices(folder+'svd')

In [42]:
# Alignment matrix from the project's OrthogProcrustAlign helper.
# NOTE(review): the meaning of the positional True flag is not visible here — confirm in tools.utils.
W_align_SVD = OrthogProcrustAlign(svd1,svd2,True)
# Apply the alignment to the corpus-2 vectors.
svd2_aligned = svd2.dot(W_align_SVD)

In [43]:
# Score corpus-1 SVD vectors against the aligned corpus-2 vectors.
reader.spearman_score(svd1,svd2_aligned,word2index,language,out=False)

Spearman's rho: 0.28063 	p-value: 0.0925


## Matrices from https://github.com/Garrafao/LSCDetection

### From their count matrices

In [118]:
import pickle


def _load_matrix_with_labels(matrix_path):
    """Load a sparse matrix and its pickled row/column labels.

    Parameters
    ----------
    matrix_path : str
        Path passed to ``sc.sparse.load_npz``. The labels are read from
        ``matrix_path + '_rows'`` and ``matrix_path + '_columns'``.

    Returns
    -------
    tuple
        ``(matrix, rows, columns)`` in that order.
    """
    matrix = sc.sparse.load_npz(matrix_path)
    # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
    with open(matrix_path + '_rows', 'rb') as f:
        rows = pickle.load(f)
    with open(matrix_path + '_columns', 'rb') as f:
        columns = pickle.load(f)
    return matrix, rows, columns


# Corpus 1: count matrix, labels, and word -> row-index lookup.
path1 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/count/win10.count'
P1, rows1, cols1 = _load_matrix_with_labels(path1)
w2i1 = {word: i for i, word in enumerate(rows1)}

# Corpus 2: same layout.
path2 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/count/win10.count'
P2, rows2, cols2 = _load_matrix_with_labels(path2)
w2i2 = {word: i for i, word in enumerate(rows2)}


In [58]:
# Wrap the loaded labels and count matrices in mangoes objects so that they can be
# passed to create_ppmi_matrices_pair.
c1 = mangoes.Vocabulary(cols1)
c2 = mangoes.Vocabulary(cols2)
r1 = mangoes.Vocabulary(rows1)
r2 = mangoes.Vocabulary(rows2)
counts1 =  mangoes.base.CountBasedRepresentation(r1, c1, P1)
counts2 =  mangoes.base.CountBasedRepresentation(r2, c2, P2)

# Same values as in the "Our method" section.
alpha = 0.75
shift = 5
# Rebinds `folder`: every later folder+... path now points to ./matrices/from_others/<language>/.
folder= './matrices/from_others/{}/'.format(language)

create_ppmi_matrices_pair(counts1, counts2, alpha, shift, storage_folder=folder+'ppmi', verbose=True)

[INFO] Computing PPMI matrices with alpha=0.75 and k=5.
[INFO] Computing PPMI matrix for Corpus 1...
[INFO] Success!
[INFO] Computing PPMI matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/ppmi/.


In [119]:
# Read back the PPMI matrices stored by the previous cell (rebinds ppmi1/ppmi2).
ppmi1, ppmi2 = load_ppmi_matrices_as_csr(folder+'ppmi')

In [120]:
# Words present in both row vocabularies, in sorted order.
shared = sorted(set(rows1).intersection(rows2))
# Row index of every shared word in each original matrix.
to_keep1 = [w2i1[token] for token in shared]
to_keep2 = [w2i2[token] for token in shared]

# Lookups for the reduced matrices: position in `shared` <-> word.
word2index = {token: position for position, token in enumerate(shared)}
idx2word = dict(enumerate(shared))

In [121]:
# Restrict both PPMI matrices to the shared words so that row i refers to the same word in each.
# NOTE(review): the columns are sliced with the row indices too, which assumes rows and
# columns of each matrix follow the same word order — TODO confirm against cols1/cols2.
ppmi1_red = ppmi1[to_keep1][:,to_keep1]
ppmi2_red = ppmi2[to_keep2][:,to_keep2]

In [122]:
# Score the reduced PPMI matrices built from their count matrices.
reader.spearman_score(ppmi1_red,ppmi2_red,word2index,language,out=False)

Spearman's rho: 0.2159 	p-value: 0.1994


**Comment:** Nearly the same score as with our own count matrices (0.2159 vs. 0.21946), as expected: our PPMI function works.

In [123]:
# gamma has no effect here because the vectors are standardised before OP alignment.
# NOTE(review): random_state=None presumably leaves the SVD unseeded, so the score below
# may vary between runs — confirm in create_svd_matrices_pair.
create_svd_matrices_pair(ppmi1,ppmi2,folder+'svd',standardise=True, dim=300,gamma=0,random_state=None,n_iter=1, verbose=True)

[INFO] Computing standardised SVD matrices with gamma=0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/svd/.


In [125]:
# NOTE(review): the "From their PPMI matrices to SVD+OP" section stores its SVD matrices in
# the same folder+'svd' location, so what is loaded here depends on which
# create_svd_matrices_pair cell ran last.
svd1, svd2 = load_svd_matrices(folder+'svd')

In [128]:
# Words present in both row vocabularies, sorted, and their row index in each matrix.
shared = sorted(set(rows1).intersection(rows2))
to_keep1 = [w2i1[token] for token in shared]
to_keep2 = [w2i2[token] for token in shared]

# Lookups for the reduced matrices: position in `shared` <-> word.
word2index = {token: position for position, token in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Keep only the shared rows, so that row i is the same word in both matrices.
svd1_red = svd1[to_keep1]
svd2_red = svd2[to_keep2]

# With backward=True the resulting matrix is applied to corpus 1 instead of corpus 2.
W_align_SVD = OrthogProcrustAlign(svd1_red, svd2_red, True, backward=True)
svd1_red_aligned = svd1_red.dot(W_align_SVD)

In [129]:
# Score the aligned corpus-1 vectors against the corpus-2 vectors.
reader.spearman_score(svd1_red_aligned,svd2_red,word2index,language,out=False)

Spearman's rho: 0.18507 	p-value: 0.2728


### From their PPMI matrices to SVD+OP

In [111]:
def _read_pickled(file_path):
    """Return the object pickled in ``file_path``."""
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Corpus 1: their PPMI matrix, its row/column labels and the word -> row-index lookup.
path1 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/ppmi/win10.count-k5.ppmi'
P1 = sc.sparse.load_npz(path1)
rows1 = _read_pickled(path1+'_rows')
cols1 = _read_pickled(path1+'_columns')
w2i1 = {label: position for position, label in enumerate(rows1)}

# Corpus 2: same layout.
path2 = '/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/ppmi/win10.count-k5.ppmi'
P2 = sc.sparse.load_npz(path2)
rows2 = _read_pickled(path2+'_rows')
cols2 = _read_pickled(path2+'_columns')
w2i2 = {label: position for position, label in enumerate(rows2)}


In [112]:
# gamma has no effect here because the vectors are standardised before OP alignment.
# NOTE(review): this writes to the same folder+'svd' location as the SVD cell of the
# "From their count matrices" section, so each run overwrites the other's matrices.
create_svd_matrices_pair(P1,P2,folder+'svd',standardise=True, dim=300,gamma=0,random_state=None,n_iter=1, verbose=True)

[INFO] Computing standardised SVD matrices with gamma=0 and d=300.
[INFO] Computing SVD matrix for Corpus 1...
[INFO] Success!
[INFO] Computing SVD matrix for Corpus 2...
[INFO] Success!
[INFO] Matrices stored in ./matrices/from_others/english/svd/.


In [115]:
# NOTE(review): folder+'svd' is also written by the "From their count matrices" section,
# so what is loaded here depends on which create_svd_matrices_pair cell ran last.
svd1, svd2 = load_svd_matrices(folder+'svd')

In [116]:
# Sorted list of the words found in both row vocabularies.
common_rows = set(rows1)
common_rows &= set(rows2)
shared = sorted(common_rows)

# Row index of each shared word in the two original matrices.
to_keep1 = [w2i1[entry] for entry in shared]
to_keep2 = [w2i2[entry] for entry in shared]

# Lookups for the reduced matrices: position in `shared` <-> word.
idx2word = dict(enumerate(shared))
word2index = {entry: rank for rank, entry in idx2word.items()}

# Keep only the shared rows, so that row i is the same word in both matrices.
svd1_red = svd1[to_keep1]
svd2_red = svd2[to_keep2]

# Alignment matrix applied to the corpus-2 vectors.
W_align_SVD = OrthogProcrustAlign(svd1_red, svd2_red, True)
svd2_red_aligned = svd2_red.dot(W_align_SVD)

In [117]:
# Score the corpus-1 vectors against the aligned corpus-2 vectors.
reader.spearman_score(svd1_red,svd2_red_aligned,word2index,language,out=False)

Spearman's rho: 0.23914 	p-value: 0.154


### From their SVD matrices to OP alignment

In [98]:
# Their SVD matrices are text files: the first line is skipped, and every following line
# holds a word followed by its space-separated vector components.
matrix_array = np.loadtxt('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/svd/win10.count-k5.ppmi-dim300-iter1.svd', dtype=object, comments=None, delimiter=' ', skiprows=1, encoding='utf-8')
# np.float was only an alias of the builtin float; it is deprecated and removed in NumPy >= 1.24.
svd1 = matrix_array[:,1:].astype(float)
rows1 = list(matrix_array[:,0].flatten())

matrix_array = np.loadtxt('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/svd/win10.count-k5.ppmi-dim300-iter1.svd', dtype=object, comments=None, delimiter=' ', skiprows=1, encoding='utf-8')
svd2 = matrix_array[:,1:].astype(float)
rows2 = list(matrix_array[:,0].flatten())

In [108]:
# Rebuild the word -> row-index lookups from the rows of the SVD files loaded just above.
# The cell previously reused w2i1/w2i2 left over from the matrices of another section,
# which only gives the right rows if both row orders happen to be identical.
w2i1 = {word: i for i, word in enumerate(rows1)}
w2i2 = {word: i for i, word in enumerate(rows2)}

# Words present in both row vocabularies, sorted, and their row index in each matrix.
shared = sorted(set(rows1) & set(rows2))
to_keep1 = [w2i1[word] for word in shared]
to_keep2 = [w2i2[word] for word in shared]

# Lookups for the reduced matrices: position in `shared` <-> word.
word2index = {word: i for i, word in enumerate(shared)}
idx2word = dict(enumerate(shared))

# Keep only the shared rows, so that row i is the same word in both matrices.
svd1_red = svd1[to_keep1]
svd2_red = svd2[to_keep2]

# With backward=True the resulting matrix is applied to corpus 1 instead of corpus 2.
W_align_SVD = OrthogProcrustAlign(svd1_red,svd2_red,True, backward=True)
svd1_red_aligned = svd1_red.dot(W_align_SVD)

In [110]:
# Score the aligned corpus-1 vectors against the corpus-2 vectors.
reader.spearman_score(svd1_red_aligned,svd2_red,word2index,language,out=False)

Spearman's rho: 0.25336 	p-value: 0.1303


### Using SVD matrices

In [None]:
# Reloads the same two SVD text files as the "From their SVD matrices to OP alignment" section:
# the first line is skipped, and every following line holds a word followed by its
# space-separated vector components.
matrix_array = np.loadtxt('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus1/svd/win10.count-k5.ppmi-dim300-iter1.svd', dtype=object, comments=None, delimiter=' ', skiprows=1, encoding='utf-8')
# np.float was only an alias of the builtin float; it is deprecated and removed in NumPy >= 1.24.
svd1 = matrix_array[:,1:].astype(float)
rows1 = list(matrix_array[:,0].flatten())

matrix_array = np.loadtxt('/home/bastien/lscd_garrafao/LSCDetection/matrices/semeval_eng_sim/corpus2/svd/win10.count-k5.ppmi-dim300-iter1.svd', dtype=object, comments=None, delimiter=' ', skiprows=1, encoding='utf-8')
svd2 = matrix_array[:,1:].astype(float)
rows2 = list(matrix_array[:,0].flatten())