In [1]:
from pathlib import Path
import pandas as pd
from gensim import models

In [2]:
# English synonym-judgement stimuli: read the full sheet, keep the four word
# columns (probe, target, two foils) and discard every item with a missing cell.
sj_file_en = "../tmp/Psycho-Paper/synonym_judgement/cueing study stimuli for distribution.csv"

sj_en_full = pd.read_csv(sj_file_en)
sj_en = sj_en_full.loc[:, ['Probe', 'Target', 'Foil1', 'Foil2']].dropna(how='any')
sj_en

Unnamed: 0,Probe,Target,Foil1,Foil2
0,abstain,refrain,covet,specify
1,accordance,agreement,substitute,analogy
2,advantage,benefit,tendency,condition
3,alias,pseudonym,aspect,reprisal
4,alternative,substitute,ambition,discretion
...,...,...,...,...
195,villain,crook,herring,aluminium
196,violin,viola,shed,rabbit
197,waist,torso,goddess,chorus
198,window,door,eye,plant


In [3]:
# German synonym-judgement stimuli: read the full sheet, keep the four word
# columns (probe, target, two foils) and discard every item with a missing cell.
# NOTE(review): some entries look misspelled in the stimulus file itself
# (e.g. 'Psyeudonym' in the preview of this frame); such words are reported as
# missing by the models evaluated on sj_de -- confirm and correct at the source.
sj_file_de = "../tmp/Psycho-Paper/synonym_judgement/SJT_stimuli.csv"

sj_de_full = pd.read_csv(sj_file_de)
sj_de = sj_de_full.loc[:, ['probe', 'target', 'foil1', 'foil2']].dropna(how='any')
sj_de

Unnamed: 0,probe,target,foil1,foil2
0,verzichten,unterlassen,begehren,spezifizieren
1,Übereinstimmung,Vereinbarung,Austausch,Vergleich
2,Vorteil,Nutzen,Tendenz,Bedingung
3,Alias,Psyeudonym,Aspekt,Vergeltung
4,Alternative,Austausch,Ehrgeiz,Diskretion
...,...,...,...,...
195,Verbrecher,Gauner,Hering,Aluminium
196,Geige,Bratsche,Schuppen,Kaninchen
197,Taille,Hüfte,Göttin,Chor
198,Fenster,Tür,Auge,Pflanze


In [7]:
# Load the GoogleNews vectors from their binary word2vec file as gensim KeyedVectors.
google_w2v = models.KeyedVectors.load_word2vec_format('../data/vectors/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
def closest_match(terms, vectors):
    """
    Pick the candidate word that lies nearest to the probe word.

    `terms` is one row of words (any object offering `.to_list()`, e.g. a
    pandas Series). Position 0 is the probe; every later position is a
    candidate that is compared against it via `vectors.distances`.

    Returns the position within `terms` of the candidate with the smallest
    distance to the probe (so the result is always >= 1). If the lookup
    raises a KeyError, each term not contained in `vectors` is reported on
    stdout and -1 is returned instead.
    """

    words = terms.to_list()

    try:
        # argmin is relative to the candidate list, hence the shift by one
        nearest = vectors.distances(words[0], words[1:]).argmin()
        return nearest + 1
    except KeyError:
        # name every out-of-vocabulary word of this item, then flag the item
        for word in words:
            if word in vectors:
                continue
            print(f"missing in vectors: '{word}'")
        return -1

    
def synonym_judgement_accuracy(wordvectors, tests, target_idx=1):
    """
    Score a set of word vectors on synonym-judgement items and print the result.

    Every row of `tests` is handed to `closest_match` together with
    `wordvectors`. Rows for which it yields a non-positive value (it returns -1
    when a word is unknown) are excluded from scoring. A remaining row counts
    as correct when the chosen position equals `target_idx`.

    Prints a blank line, the accuracy rounded to three decimals and the number
    of excluded rows. Nothing is returned.
    """
    def _choose(row):
        return closest_match(row, wordvectors)

    choices = tests.apply(_choose, axis=1)
    scored = choices[choices > 0]
    n_scored = len(scored)
    n_correct = (scored == target_idx).sum()
    accuracy = n_correct / n_scored

    print()
    print('Accuracy:', round(accuracy, 3))
    print('Tests with unknown words:', len(tests) - n_scored)

In [None]:
# Score the GoogleNews vectors on the English synonym-judgement items.
synonym_judgement_accuracy(google_w2v, sj_en)

In [15]:
# Location of the LSI term-vector CSV, plus the sibling path (same name,
# '.w2v' suffix) used for its word2vec-format copy.
dir_path = Path('../data/out/SemD/bnc_cs1000_minsz50_lc_filtered')
file = 'bnc_lsi_gensim_term_vectors.csv'
file_path = dir_path.joinpath(file)
w2v_file_path = file_path.with_suffix('.w2v')

In [None]:
# convert csv file to w2v format
#
# The file written here starts with a header line "<rows> <columns>" taken from
# the frame's shape, followed by one space-separated line per row of the frame
# (index value first, then the row's values).

lsi_wv = pd.read_csv(file_path, index_col=0)

# Write UTF-8 explicitly: gensim's load_word2vec_format decodes the file as
# UTF-8 by default, while a bare open() falls back to the platform's locale
# encoding and could corrupt non-ASCII terms. newline='' is what pandas asks
# for when to_csv is handed an already-open file object.
with open(w2v_file_path, 'w', encoding='utf-8', newline='') as fp:
    fp.write(f'{lsi_wv.shape[0]} {lsi_wv.shape[1]}\n')
    lsi_wv.to_csv(fp, sep=' ', header=False)

lsi_wv

In [16]:
# Read the word2vec-format file at `w2v_file_path` as gensim KeyedVectors
# (text format, since `binary` is not set) and show the resulting object.
bnc_lsi = models.KeyedVectors.load_word2vec_format(w2v_file_path)
bnc_lsi

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f8a3d58cd90>

In [17]:
# Score the vectors loaded into `bnc_lsi` on the English synonym-judgement items.
synonym_judgement_accuracy(bnc_lsi, sj_en)

missing in vectors: 'covet'
missing in vectors: 'impel'
missing in vectors: 'debase'
missing in vectors: 'depose'
missing in vectors: 'digress'
missing in vectors: 'hone'
missing in vectors: 'dirge'
missing in vectors: 'emanation'
missing in vectors: 'foible'
missing in vectors: 'proffer'
missing in vectors: 'implore'
missing in vectors: 'irk'
missing in vectors: 'foretell'
missing in vectors: 'foretell'
missing in vectors: 'sadden'
missing in vectors: 'sadden'
missing in vectors: 'idiocy'
missing in vectors: 'toughen'
missing in vectors: 'revere'
missing in vectors: 'centipede'
missing in vectors: 'bison'
missing in vectors: 'conker'
missing in vectors: 'embroider'
missing in vectors: 'uncork'
missing in vectors: 'grasshopper'
missing in vectors: 'honeycomb'
missing in vectors: 'hyena'
missing in vectors: 'jackal'
missing in vectors: 'crayfish'
missing in vectors: 'choker'
missing in vectors: 'shoplift'
missing in vectors: 'sunbathe'
missing in vectors: 'zipper'
missing in vectors: 't

In [7]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec

### GERMAN

In [11]:
# Load the saved Doc2Vec model and score its word vectors (`.wv`) on the
# German synonym-judgement items.
file = '../data/vectors/d2v'
d2v = Doc2Vec.load(file)
synonym_judgement_accuracy(d2v.wv, sj_de)

missing in vectors: 'Psyeudonym'
missing in vectors: 'entwürdigen'
missing in vectors: 'Sekte'
missing in vectors: 'forden'
missing in vectors: 'präzise'
missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Tausenfüßler'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Quelle'
missing in vectors: 'Essigkurke'
missing in vectors: 'Anziehsachen'
missing in vectors: 'Hecke'
missing in vectors: 'Riese'
missing in vectors: 'Kohle'
missing in vectors: 'Eltern'
missing in vectors: 'Pfirsisch'
missing in vectors: 'sonnenbaden'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'
missing in vectors: 'Narzisse'

Accuracy: 0.811
Tests with unknown words: 20


In [12]:
# Load the saved Word2Vec model and score its word vectors (`.wv`) on the
# German synonym-judgement items.
file = '../data/vectors/w2v'
w2v = Word2Vec.load(file)
synonym_judgement_accuracy(w2v.wv, sj_de)

missing in vectors: 'Psyeudonym'
missing in vectors: 'entwürdigen'
missing in vectors: 'Sekte'
missing in vectors: 'forden'
missing in vectors: 'präzise'
missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Tausenfüßler'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Quelle'
missing in vectors: 'Essigkurke'
missing in vectors: 'Anziehsachen'
missing in vectors: 'Hecke'
missing in vectors: 'Riese'
missing in vectors: 'Kohle'
missing in vectors: 'Eltern'
missing in vectors: 'Pfirsisch'
missing in vectors: 'sonnenbaden'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'
missing in vectors: 'Narzisse'

Accuracy: 0.917
Tests with unknown words: 20


In [28]:
# Load the OnlineParticipation LSI term vectors and score them on the German
# synonym-judgement items.
# NOTE(review): the path ends in '.csv' but is read with the word2vec-format
# loader -- presumably the file already has the word2vec header line; confirm.
file = '../data/out/SemD/OP2/OnlineParticipation_lsi_gensim_term_vectors.csv'
op_lsi = models.KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'begehren'
missing in vectors: 'spezifizieren'
missing in vectors: 'Übereinstimmung'
missing in vectors: 'Alias'
missing in vectors: 'Psyeudonym'
missing in vectors: 'Vergeltung'
missing in vectors: 'Ehrgeiz'
missing in vectors: 'Diskretion'
missing in vectors: 'Analogie'
missing in vectors: 'Metapher'
missing in vectors: 'Amtszeit'
missing in vectors: 'Begabung'
missing in vectors: 'Dividende'
missing in vectors: 'Schlichter'
missing in vectors: 'Vermittler'
missing in vectors: 'Reformation'
missing in vectors: 'Blickpunkt'
missing in vectors: 'Eigenschaft'
missing in vectors: 'Merkmal'
missing in vectors: 'Vorliebe'
missing in vectors: 'Buchprüfung'
missing in vectors: 'Inspektion'
missing in vectors: 'Ableitung'
missing in vectors: 'Mittelmaß'
missing in vectors: 'Axiom'
missing in vectors: 'Gamma'
missing in vectors: 'Untersagung'
missing in vectors: 'Vermächtnis'
missing in vectors: 'Chronologie'
missing in vectors: 'Wagnis'
missing in vectors: 'antreiben'
miss

In [7]:
# Load the LSI word vectors stored under DEWAC_1000 and score them on the
# German synonym-judgement items.
# NOTE(review): the result is bound to `op_lsi`, the same name another cell
# uses for the OnlineParticipation vectors, so whichever cell ran last wins.
file = '../data/out/SemD/DEWAC_1000/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'Psyeudonym'
missing in vectors: 'forden'
missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Tausenfüßler'
missing in vectors: 'Uboot'
missing in vectors: 'Kappes'
missing in vectors: 'Bison'
missing in vectors: 'Gepard'
missing in vectors: 'Marone'
missing in vectors: 'Dummy'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Essigkurke'
missing in vectors: 'Pfirsisch'
missing in vectors: 'Apfelwein'
missing in vectors: 'sonnenbaden'
missing in vectors: 'verquirlen'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'

Accuracy: 0.805
Tests with unknown words: 15


In [6]:
# Load the LSI word vectors stored under DEWAC and score them on the German
# synonym-judgement items.
# NOTE(review): the result is bound to `op_lsi`, the same name other cells use
# for different vector sets, so whichever cell ran last wins.
file = '../data/out/SemD/DEWAC/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file)
synonym_judgement_accuracy(op_lsi, sj_de)

missing in vectors: 'Psyeudonym'
missing in vectors: 'forden'
missing in vectors: 'Ungehörigkeit'
missing in vectors: 'Tausenfüßler'
missing in vectors: 'Uboot'
missing in vectors: 'Knuff'
missing in vectors: 'entkorken'
missing in vectors: 'Essigkurke'
missing in vectors: 'Pfirsisch'
missing in vectors: 'sonnenbaden'
missing in vectors: 'Knuff'
missing in vectors: 'Wringen'

Accuracy: 0.795
Tests with unknown words: 10
