Calculates accuracies for the Synonym Judgement Task for several word vector models.

In [1]:
from pathlib import Path

import pandas as pd
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec

from semsim.constants import DATA_DIR, SEMD_DIR

# Directory from which the synonym judgement stimulus CSV files are read.
sjt_dir = DATA_DIR.joinpath("metrics", "synonym_judgement")

In [2]:
def csv_to_w2v(csv_file, w2v_file):
    """
    Converts a CSV file of term vectors into the word2vec text format.

    The CSV is read with its first column as the index (the terms) and its
    header row as column labels. The output file starts with a header line
    ``"<number of rows> <number of columns>"`` followed by one
    space-separated line per term: the term and then its vector components.

    :param csv_file: path of the CSV file to read.
    :param w2v_file: path of the word2vec text file to write (overwritten).
    :return: the DataFrame that was read from ``csv_file``.
    """
    # keep_default_na=False: terms such as "null", "NA" or "nan" must stay
    # strings. With the default NA parsing they become NaN and would be
    # written as an empty token, producing a malformed vector line.
    df = pd.read_csv(csv_file, index_col=0, keep_default_na=False)
    # Explicit UTF-8 so that non-ASCII terms are written independently of the
    # platform's default encoding; newline='' is what pandas asks for when a
    # text-mode file object is handed to `to_csv`.
    with open(w2v_file, 'w', encoding='utf-8', newline='') as fp:
        fp.write(f'{df.shape[0]} {df.shape[1]}\n')
        df.to_csv(fp, sep=' ', header=False)
    return df


def closest_match(terms, vectors):
    """
    Finds which candidate word lies closest to the probe word.

    The first entry of `terms` is the probe; every following entry is a
    candidate. The result is the position (within `terms`, hence >= 1) of the
    candidate with the smallest distance to the probe as reported by
    `vectors.distances`.

    If the lookup raises a KeyError, each term that is not contained in
    `vectors` is reported on stdout and -1 is returned instead.
    """
    words = terms.to_list()
    try:
        dists = vectors.distances(words[0], words[1:])
        # argmin is relative to the candidates, which start at position 1.
        best = dists.argmin() + 1
    except KeyError:
        for word in words:
            if word in vectors:
                continue
            print(f"missing in vectors: '{word}'")
        return -1
    else:
        return best

    
def synonym_judgement_accuracy(wordvectors, tests, target_idx=1):
    """
    Prints the synonym judgement accuracy of `wordvectors` on `tests`.

    Each row of `tests` is passed to `closest_match`, which treats the first
    entry as the probe and the remaining entries as candidates. Rows for which
    `closest_match` returns -1 (unknown words) are left out of the accuracy. A
    row counts as correct when the predicted position equals `target_idx`.

    The accuracy (rounded to three decimals) and the number of rows skipped
    because of unknown words are printed; nothing is returned.
    """
    def predict(row):
        return closest_match(row, wordvectors)

    predictions = tests.apply(predict, axis=1)
    # Only positive values are real predictions; -1 marks unknown words.
    answered = predictions[predictions > 0]
    n_correct = (answered == target_idx).sum()
    accuracy = n_correct / len(answered)
    n_skipped = len(tests) - len(answered)
    print()
    print('Accuracy:', round(accuracy, 3))
    print('Tests with unknown words:', n_skipped)

### ENGLISH

In [3]:
# Load the English stimuli and keep only rows where probe, target and both
# foils are all present.
sj_file_en = sjt_dir.joinpath("cueing_study_stimuli_for_distribution.csv")
sj_en_full = pd.read_csv(sj_file_en)
sj_en = sj_en_full.loc[:, ['Probe', 'Target', 'Foil1', 'Foil2']].dropna(how='any')
sj_en

Unnamed: 0,Probe,Target,Foil1,Foil2
0,abstain,refrain,covet,specify
1,accordance,agreement,substitute,analogy
2,advantage,benefit,tendency,condition
3,alias,pseudonym,aspect,reprisal
4,alternative,substitute,ambition,discretion
...,...,...,...,...
195,villain,crook,herring,aluminium
196,violin,viola,shed,rabbit
197,waist,torso,goddess,chorus
198,window,door,eye,plant


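Download the GoogleNews word vectors from https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300 and place `GoogleNews-vectors-negative300.bin` in the `vectors` subdirectory of `DATA_DIR`, where the next cell loads it.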



In [4]:
# Evaluate the GoogleNews vectors (binary word2vec format) on the English stimuli.
google_w2v_file = DATA_DIR / 'vectors' / 'GoogleNews-vectors-negative300.bin'
google_w2v = KeyedVectors.load_word2vec_format(google_w2v_file, binary=True)
synonym_judgement_accuracy(google_w2v, sj_en)

missing in vectors: 'pretence'
missing in vectors: 'theatre'
missing in vectors: 'axe'
missing in vectors: 'aluminium'

Accuracy: 0.939
Tests with unknown words: 4


In [5]:
# Convert the term vectors from CSV to a word2vec text file stored next to the
# CSV (same name, '.w2v' suffix), load that file and evaluate on the English stimuli.
csv_file_path = SEMD_DIR.joinpath('bnc_cs1000_minsz50_lc_filtered', 'bnc_lsi_gensim_term_vectors.csv')
w2v_file_path = csv_file_path.with_suffix('.w2v')
csv_to_w2v(csv_file=csv_file_path, w2v_file=w2v_file_path)
bnc_lsi = KeyedVectors.load_word2vec_format(w2v_file_path)
synonym_judgement_accuracy(bnc_lsi, sj_en)

missing in vectors: 'covet'
missing in vectors: 'impel'
missing in vectors: 'debase'
missing in vectors: 'depose'
missing in vectors: 'digress'
missing in vectors: 'hone'
missing in vectors: 'dirge'
missing in vectors: 'emanation'
missing in vectors: 'foible'
missing in vectors: 'proffer'
missing in vectors: 'implore'
missing in vectors: 'irk'
missing in vectors: 'foretell'
missing in vectors: 'foretell'
missing in vectors: 'sadden'
missing in vectors: 'sadden'
missing in vectors: 'idiocy'
missing in vectors: 'toughen'
missing in vectors: 'revere'
missing in vectors: 'centipede'
missing in vectors: 'bison'
missing in vectors: 'conker'
missing in vectors: 'embroider'
missing in vectors: 'uncork'
missing in vectors: 'grasshopper'
missing in vectors: 'honeycomb'
missing in vectors: 'hyena'
missing in vectors: 'jackal'
missing in vectors: 'crayfish'
missing in vectors: 'choker'
missing in vectors: 'shoplift'
missing in vectors: 'sunbathe'
missing in vectors: 'zipper'
missing in vectors: 't

### GERMAN

In [6]:
# Load the German stimuli and keep only rows where probe, target and both
# foils are all present.
sj_file_de = sjt_dir.joinpath("SJT_stimuli.csv")
sj_de_full = pd.read_csv(sj_file_de)
sj_de = sj_de_full.loc[:, ['probe', 'target', 'foil1', 'foil2']].dropna(how='any')
sj_de

Unnamed: 0,probe,target,foil1,foil2
0,verzichten,unterlassen,begehren,spezifizieren
1,Übereinstimmung,Vereinbarung,Austausch,Vergleich
2,Vorteil,Nutzen,Tendenz,Bedingung
3,Alias,Pseudonym,Aspekt,Vergeltung
4,Alternative,Austausch,Ehrgeiz,Diskretion
...,...,...,...,...
195,Verbrecher,Gauner,Hering,Aluminium
196,Geige,Bratsche,Schuppen,Kaninchen
197,Taille,Hüfte,Göttin,Chor
198,Fenster,Tür,Auge,Pflanze


In [7]:
# Load the saved Word2Vec model and evaluate its word vectors on the German stimuli.
file = DATA_DIR.joinpath('vectors', 'w2v')
w2v = Word2Vec.load(str(file))
w2v_word_vectors = w2v.wv
synonym_judgement_accuracy(w2v_word_vectors, sj_de)

missing in vectors: 'entwürdigen'
missing in vectors: 'Sekte'
missing in vectors: 'präzise'
missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Quelle'
missing in vectors: 'Anziehsachen'
missing in vectors: 'Hecke'
missing in vectors: 'Riese'
missing in vectors: 'Kohle'
missing in vectors: 'Eltern'
missing in vectors: 'sonnenbaden'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'
missing in vectors: 'Narzisse'

Accuracy: 0.908
Tests with unknown words: 15


In [8]:
# Convert the OnlineParticipation term vectors from CSV to a word2vec text
# file stored next to the CSV, load that file and evaluate on the German stimuli.
csv_file_path = DATA_DIR.joinpath('vectors', 'OnlineParticipation_lsi_gensim_term_vectors.csv')
w2v_file_path = csv_file_path.with_suffix('.w2v')
csv_to_w2v(csv_file=csv_file_path, w2v_file=w2v_file_path)
op_lsi = KeyedVectors.load_word2vec_format(w2v_file_path)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'unterlassen'
missing in vectors: 'begehren'
missing in vectors: 'spezifizieren'
missing in vectors: 'Übereinstimmung'
missing in vectors: 'Vereinbarung'
missing in vectors: 'Tendenz'
missing in vectors: 'Bedingung'
missing in vectors: 'Alias'
missing in vectors: 'Pseudonym'
missing in vectors: 'Vergeltung'
missing in vectors: 'Ehrgeiz'
missing in vectors: 'Diskretion'
missing in vectors: 'Analogie'
missing in vectors: 'Metapher'
missing in vectors: 'Amtszeit'
missing in vectors: 'Talent'
missing in vectors: 'Begabung'
missing in vectors: 'Dividende'
missing in vectors: 'Schlichter'
missing in vectors: 'Vermittler'
missing in vectors: 'Reformation'
missing in vectors: 'Blickpunkt'
missing in vectors: 'Harmonie'
missing in vectors: 'abbilden'
missing in vectors: 'Annahme'
missing in vectors: 'Erwartung'
missing in vectors: 'Ironie'
missing in vectors: 'Eigenschaft'
missing in vectors: 'Merkmal'
missing in vectors: 'Vorliebe'
missing in vectors: 'Einklang'
missing in 

In [10]:
# Evaluate the vectors stored under 'DEWAC' on the German stimuli.
# NOTE(review): `op_lsi` is reused here and overwrites the OnlineParticipation
# vectors loaded in an earlier cell.
file = SEMD_DIR.joinpath('DEWAC', 'dewac_lsi_word_vectors.vec')
op_lsi = KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Essiggurke'
missing in vectors: 'sonnenbaden'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'

Accuracy: 0.799
Tests with unknown words: 6


In [9]:
# Evaluate the vectors stored under 'DEWAC_1000' on the German stimuli.
# NOTE(review): `op_lsi` is reused here and overwrites whatever vectors a
# previously executed cell bound to that name.
file = SEMD_DIR.joinpath('DEWAC_1000', 'dewac_lsi_word_vectors.vec')
op_lsi = KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'Ungehörigkeit'
missing in vectors: 'U-Boot'
missing in vectors: 'Kappes'
missing in vectors: 'Bison'
missing in vectors: 'Gepard'
missing in vectors: 'Marone'
missing in vectors: 'Dummy'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Essiggurke'
missing in vectors: 'Apfelwein'
missing in vectors: 'sonnenbaden'
missing in vectors: 'verquirlen'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'

Accuracy: 0.807
Tests with unknown words: 13


In [11]:
# Evaluate the vectors stored under 'DEWAC_1000_40k_v2' on the German stimuli.
# NOTE(review): `op_lsi` is reused here and overwrites whatever vectors a
# previously executed cell bound to that name.
file = SEMD_DIR.joinpath('DEWAC_1000_40k_v2', 'dewac_lsi_word_vectors.vec')
op_lsi = KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'Gepard'

Accuracy: 0.829
Tests with unknown words: 1


In [12]:
# Evaluate the word vectors of the model stored under 'DEWAC_1000_40k_d2v'
# on the German stimuli.
file = SEMD_DIR.joinpath('DEWAC_1000_40k_d2v', 'dewac_d2v_epoch200')
# NOTE(review): the path suggests a Doc2Vec checkpoint, yet it is loaded via
# Word2Vec.load (Doc2Vec is imported but unused) - confirm this is intended.
w2v = Word2Vec.load(str(file))
d2v_word_vectors = w2v.wv
synonym_judgement_accuracy(d2v_word_vectors, sj_de)


Accuracy: 0.89
Tests with unknown words: 0
