Calculates accuracies for the Synonym Judgement Task for several word vector models.

In [32]:
from pathlib import Path
import pandas as pd
from gensim import models
from semsim.constants import DATA_DIR, SEMD_DIR

sjt_dir = DATA_DIR / "metrics" / "synonym_judgement"

In [33]:
def closest_match(terms, vectors):
    """
    Pick the candidate word that lies nearest to a probe word.

    The first entry of ``terms`` is the probe; every following entry is a
    candidate. Distances come from ``vectors.distances`` and the candidate
    with the smallest distance wins.

    Parameters
    ----------
    terms : object with a ``to_list()`` method (e.g. one row of the test table)
        Probe word followed by the candidate words.
    vectors : object offering ``distances(probe, candidates)`` and ``in``
        Word-vector lookup. A ``KeyError`` raised by the lookup is treated
        as "at least one word is unknown".

    Returns
    -------
    integer
        Position within ``terms`` (so always >= 1) of the closest candidate,
        or ``-1`` when the lookup raised ``KeyError``. In the latter case
        every term that is not contained in ``vectors`` is printed.
    """
    words = terms.to_list()
    try:
        # argmin counts from the first candidate; +1 shifts the result so it
        # indexes into ``words``, where slot 0 is occupied by the probe.
        return vectors.distances(words[0], words[1:]).argmin() + 1
    except KeyError:
        unknown_words = (word for word in words if word not in vectors)
        for word in unknown_words:
            print(f"missing in vectors: '{word}'")
        return -1

    
def synonym_judgement_accuracy(wordvectors, tests, target_idx=1):
    """
    Print the synonym-judgement accuracy of a word-vector model.

    Each row of ``tests`` is handed to ``closest_match``; rows for which it
    returns a non-positive value (unknown words) are left out of the score.

    Parameters
    ----------
    wordvectors : word-vector lookup passed through to ``closest_match``
    tests : table whose rows hold the probe followed by the candidates
    target_idx : int, default 1
        Position within a row that counts as the correct answer.

    Prints the accuracy rounded to three decimals and the number of rows
    that were excluded. Nothing is returned.
    """
    def _predict(row):
        return closest_match(row, wordvectors)

    predictions = tests.apply(_predict, axis=1)
    # closest_match signals "unknown word" with -1; keep only scorable rows.
    scorable = predictions[predictions > 0]
    n_scorable = len(scorable)
    n_correct = (scorable == target_idx).sum()
    accuracy = n_correct / n_scorable
    print()
    print('Accuracy:', round(accuracy, 3))
    print('Tests with unknown words:', len(tests) - n_scorable)

### ENGLISH

In [34]:
# English stimuli for the synonym judgement task.
sj_file_en = sjt_dir / "cueing_study_stimuli_for_distribution.csv"
sj_en_full = pd.read_csv(sj_file_en)

# Column order matters: position 0 is used as the probe by closest_match and
# position 1 is the default target_idx of synonym_judgement_accuracy.
# Rows with any missing word are discarded.
sj_en = sj_en_full[['Probe', 'Target', 'Foil1', 'Foil2']].dropna()
sj_en

Unnamed: 0,Probe,Target,Foil1,Foil2
0,abstain,refrain,covet,specify
1,accordance,agreement,substitute,analogy
2,advantage,benefit,tendency,condition
3,alias,pseudonym,aspect,reprisal
4,alternative,substitute,ambition,discretion
...,...,...,...,...
195,villain,crook,herring,aluminium
196,violin,viola,shed,rabbit
197,waist,torso,goddess,chorus
198,window,door,eye,plant


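Download the GoogleNews word vectors from https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300 and place the file `GoogleNews-vectors-negative300.bin` in the `vectors` folder inside `DATA_DIR`, where the next cell expects it.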



In [35]:
# Load the pretrained GoogleNews vectors (binary word2vec format).
google_w2v = models.KeyedVectors.load_word2vec_format(
    DATA_DIR / 'vectors' / 'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [36]:
# Score the GoogleNews vectors on the English stimuli.
synonym_judgement_accuracy(wordvectors=google_w2v, tests=sj_en)

missing in vectors: 'pretence'
missing in vectors: 'theatre'
missing in vectors: 'axe'
missing in vectors: 'aluminium'

Accuracy: 0.939
Tests with unknown words: 4


In [37]:
# Locations of the BNC LSI term vectors: the CSV source and the word2vec-format
# file (same name, '.w2v' suffix) that is derived from it.
dir_path = SEMD_DIR / 'bnc_cs1000_minsz50_lc_filtered'
file = 'bnc_lsi_gensim_term_vectors.csv'
file_path = dir_path.joinpath(file)
w2v_file_path = file_path.with_suffix('.w2v')

In [38]:
# Convert the CSV term-vector table to word2vec text format so that gensim's
# load_word2vec_format can read it.

lsi_wv = pd.read_csv(file_path, index_col=0)

# encoding='utf-8' matches the decoding gensim's loader applies by default,
# instead of relying on the platform's default encoding.
# newline='' is what pandas asks for when to_csv is given an open file object.
with open(w2v_file_path, 'w', encoding='utf-8', newline='') as fp:
    # First line of the word2vec text format: "<number of vectors> <dimensions>".
    fp.write(f'{lsi_wv.shape[0]} {lsi_wv.shape[1]}\n')
    # One line per term: the term followed by its space-separated components.
    lsi_wv.to_csv(fp, sep=' ', header=False)

lsi_wv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
's,0.115285,0.141942,0.090127,0.036433,0.019808,0.006016,0.009087,0.012758,0.020618,0.017084,...,0.010214,0.007080,0.011617,0.000036,0.012509,0.023662,0.009905,0.002908,0.010690,0.010289
've,0.070124,0.105965,0.089680,0.024757,-0.004964,-0.000286,0.005302,0.007845,0.006689,0.012744,...,0.020262,-0.002677,0.005711,-0.001196,0.005764,-0.003499,0.013321,0.002315,0.025031,-0.016147
071,0.000620,-0.000476,-0.000216,0.000346,0.002867,0.000345,0.000839,0.000255,-0.000500,-0.000811,...,-0.001839,-0.001220,0.000112,-0.000474,0.001482,-0.002238,0.000328,0.001863,0.000372,0.000194
0800,0.000562,-0.000697,0.000328,0.000363,0.002579,-0.002336,0.001268,-0.000540,-0.001732,0.000161,...,-0.003286,0.000713,0.000464,0.000712,-0.000940,0.001268,0.001290,-0.000695,0.000258,-0.000735
081,0.000344,-0.000304,-0.000046,0.000147,0.001892,0.000042,0.000692,0.000151,-0.000196,-0.000528,...,-0.001153,-0.000899,0.000916,0.000991,0.000081,-0.001822,0.000042,-0.000358,-0.000872,0.000021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
linthorpe,0.000232,-0.000235,-0.000457,0.001058,0.001230,0.000735,0.001692,-0.000064,-0.000884,-0.000930,...,0.000810,0.000836,-0.000002,-0.000141,-0.000491,0.000694,-0.000093,0.000512,-0.000699,-0.000984
skinnergate,0.000295,-0.000319,-0.000347,0.001271,0.001082,0.000196,0.001986,0.000286,-0.001346,-0.000589,...,-0.001104,-0.000611,-0.001362,-0.000738,-0.000136,0.000010,-0.000460,-0.000370,0.000671,0.001297
hurworth,0.000318,-0.000307,-0.000524,0.001482,0.001447,0.000743,0.002465,0.000008,-0.001269,-0.001076,...,-0.000117,0.000713,-0.002158,-0.000966,-0.000178,-0.000584,-0.000396,0.001742,0.000394,-0.000581
feethams,0.000235,-0.000130,-0.000378,0.000970,0.001388,0.001149,0.001762,-0.000587,0.000242,0.000736,...,-0.001587,0.001468,0.000041,0.000452,0.001719,0.000525,0.000548,-0.000857,0.000236,0.001415


In [39]:
# Load the converted BNC LSI vectors (text word2vec format).
bnc_lsi = models.KeyedVectors.load_word2vec_format(w2v_file_path, binary=False)

In [40]:
# Score the BNC LSI vectors on the English stimuli.
synonym_judgement_accuracy(wordvectors=bnc_lsi, tests=sj_en)

missing in vectors: 'covet'
missing in vectors: 'impel'
missing in vectors: 'debase'
missing in vectors: 'depose'
missing in vectors: 'digress'
missing in vectors: 'hone'
missing in vectors: 'dirge'
missing in vectors: 'emanation'
missing in vectors: 'foible'
missing in vectors: 'proffer'
missing in vectors: 'implore'
missing in vectors: 'irk'
missing in vectors: 'foretell'
missing in vectors: 'foretell'
missing in vectors: 'sadden'
missing in vectors: 'sadden'
missing in vectors: 'idiocy'
missing in vectors: 'toughen'
missing in vectors: 'revere'
missing in vectors: 'centipede'
missing in vectors: 'bison'
missing in vectors: 'conker'
missing in vectors: 'embroider'
missing in vectors: 'uncork'
missing in vectors: 'grasshopper'
missing in vectors: 'honeycomb'
missing in vectors: 'hyena'
missing in vectors: 'jackal'
missing in vectors: 'crayfish'
missing in vectors: 'choker'
missing in vectors: 'shoplift'
missing in vectors: 'sunbathe'
missing in vectors: 'zipper'
missing in vectors: 't

### GERMAN

In [41]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec

In [42]:
# German stimuli for the synonym judgement task.
sj_file_de = sjt_dir / "SJT_stimuli.csv"
sj_de_full = pd.read_csv(sj_file_de)

# Column order matters: position 0 is used as the probe by closest_match and
# position 1 is the default target_idx of synonym_judgement_accuracy.
# Rows with any missing word are discarded.
sj_de = sj_de_full[['probe', 'target', 'foil1', 'foil2']].dropna()
sj_de

Unnamed: 0,probe,target,foil1,foil2
0,verzichten,unterlassen,begehren,spezifizieren
1,Übereinstimmung,Vereinbarung,Austausch,Vergleich
2,Vorteil,Nutzen,Tendenz,Bedingung
3,Alias,Pseudonym,Aspekt,Vergeltung
4,Alternative,Austausch,Ehrgeiz,Diskretion
...,...,...,...,...
195,Verbrecher,Gauner,Hering,Aluminium
196,Geige,Bratsche,Schuppen,Kaninchen
197,Taille,Hüfte,Göttin,Chor
198,Fenster,Tür,Auge,Pflanze


In [None]:
# Doc2Vec model: score its word vectors (.wv) on the German stimuli.
file = DATA_DIR / 'vectors' / 'd2v'
d2v = Doc2Vec.load(str(file))
synonym_judgement_accuracy(wordvectors=d2v.wv, tests=sj_de)

In [None]:
# Word2Vec model: score its word vectors (.wv) on the German stimuli.
file = DATA_DIR / 'vectors' / 'w2v'
w2v = Word2Vec.load(str(file))
synonym_judgement_accuracy(wordvectors=w2v.wv, tests=sj_de)

In [None]:
# OnlineParticipation LSI term vectors, scored on the German stimuli.
# NOTE(review): the English section converts a '*_lsi_gensim_term_vectors.csv'
# file to word2vec text format before loading it; confirm that this CSV is
# already in a format load_word2vec_format accepts.
file = SEMD_DIR / 'OP2/OnlineParticipation_lsi_gensim_term_vectors.csv'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the 'DEWAC_1000_40k' folder, scored on the German stimuli.
# NOTE(review): the variable is still called op_lsi, the name used for the
# OnlineParticipation vectors; it is overwritten here.
file = SEMD_DIR / 'DEWAC_1000_40k/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the 'DEWAC_1000' folder, scored on the German stimuli.
# NOTE(review): the variable is still called op_lsi, the name used for the
# OnlineParticipation vectors; it is overwritten here.
file = SEMD_DIR / 'DEWAC_1000/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the 'DEWAC' folder, scored on the German stimuli.
# NOTE(review): the variable is still called op_lsi, the name used for the
# OnlineParticipation vectors; it is overwritten here.
file = SEMD_DIR / 'DEWAC/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [43]:
# LSI word vectors from the 'DEWAC_1000_40k_v2' folder, scored on the German stimuli.
# NOTE(review): the variable is still called op_lsi, the name used for the
# OnlineParticipation vectors; it is overwritten here.
file = SEMD_DIR / 'DEWAC_1000_40k_v2/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

missing in vectors: 'Gepard'

Accuracy: 0.829
Tests with unknown words: 1
