In [None]:
from pathlib import Path
import pandas as pd
from gensim import models

In [None]:
# English synonym judgement stimuli: one row per trial with the probe word,
# the target and the two foils.
sj_file_en = "../data/synonym_judgement/cueing study stimuli for distribution.csv"
sj_en_full = pd.read_csv(sj_file_en)

# Keep only the four word columns and discard trials with any missing word.
sj_en = sj_en_full[['Probe', 'Target', 'Foil1', 'Foil2']].dropna()
sj_en

In [None]:
# German synonym judgement stimuli, same layout as the English set
# (probe, target, two foils) but with lower-case column names.
sj_file_de = "../data/synonym_judgement/SJT_stimuli.csv"
sj_de_full = pd.read_csv(sj_file_de)

# Keep only the four word columns and discard trials with any missing word.
sj_de = sj_de_full[['probe', 'target', 'foil1', 'foil2']].dropna()
sj_de

In [None]:
def closest_match(terms, vectors):
    """
    Finds which candidate word lies closest to the probe word.

    The first entry of `terms` is the probe; every following entry is a
    candidate. Distances are obtained from `vectors.distances(probe, candidates)`.

    Returns the 1-based position (within `terms`) of the candidate with the
    smallest distance, or -1 if the lookup raised a KeyError. In the latter
    case every term that is not contained in `vectors` is printed.
    """
    words = terms.to_list()

    try:
        probe, candidates = words[0], words[1:]
        nearest = vectors.distances(probe, candidates).argmin()
    except KeyError:
        # Report all out-of-vocabulary words of this trial, not only the first.
        for word in words:
            if word in vectors:
                continue
            print(f"missing in vectors: '{word}'")
        return -1

    # argmin() is relative to the candidates, which start at position 1.
    return nearest + 1

    
def synonym_judgement_accuracy(wordvectors, tests, target_idx=1):
    """
    Prints the accuracy of a set of word vectors on a synonym judgement task.

    Every row of `tests` is one trial and is handed to `closest_match`
    together with `wordvectors`. A trial counts as correct if the returned
    position equals `target_idx`. Trials for which `closest_match` returns a
    non-positive value (it returns -1 when a word is missing in the vectors)
    are excluded from the accuracy and reported as a separate count.

    Parameters
    ----------
    wordvectors : object forwarded unchanged to `closest_match`.
    tests : DataFrame with one trial per row.
    target_idx : position of the correct answer within a row (default 1).

    Prints the rounded accuracy and the number of excluded trials. If no
    trial could be scored the accuracy is reported as nan instead of
    dividing by zero. Returns None.
    """
    if len(tests) > 0:
        pred = tests.apply(lambda row: closest_match(row, wordvectors), axis=1)
        pred = pred[pred > 0]
        n_scored = len(pred)
        n_correct = int((pred == target_idx).sum())
    else:
        # Nothing to score; avoid calling apply() on an empty frame.
        n_scored = 0
        n_correct = 0

    # Guard against a zero denominator when every trial had unknown words.
    acc = n_correct / n_scored if n_scored else float('nan')

    print()
    print('Accuracy:', round(acc, 3))
    print('Tests with unknown words:', len(tests) - n_scored)

In [None]:
# Load the GoogleNews word2vec vectors from their binary word2vec file.
google_w2v = models.KeyedVectors.load_word2vec_format(
    '../data/vectors/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Score the GoogleNews vectors on the English stimuli.
synonym_judgement_accuracy(wordvectors=google_w2v, tests=sj_en)

In [None]:
# Location of the BNC LSI term vectors (csv) and of the word2vec-format copy
# that sits next to it with a .w2v suffix.
dir_path = Path('../data/SemD/bnc_cs1000_minsz50_lc_filtered')
file = 'bnc_lsi_gensim_term_vectors.csv'
file_path = dir_path.joinpath(file)
w2v_file_path = file_path.with_suffix('.w2v')

In [None]:
# convert csv file to w2v format

# The first csv column is used as the index (the terms); the remaining
# columns are the vector components.
lsi_wv = pd.read_csv(file_path, index_col=0)

# The word2vec text format starts with a "<number of rows> <number of columns>"
# header line, followed by one space-separated line per term.
# The file is written explicitly as UTF-8 because load_word2vec_format decodes
# it as UTF-8 by default, while open() would otherwise use the platform
# encoding. newline='' is what pandas asks for when a file handle is passed
# to to_csv.
with open(w2v_file_path, 'w', encoding='utf-8', newline='') as fp:
    fp.write(f'{lsi_wv.shape[0]} {lsi_wv.shape[1]}\n')
    lsi_wv.to_csv(fp, sep=' ', header=False)

lsi_wv

In [None]:
# Load the converted BNC LSI vectors (text word2vec format).
bnc_lsi = models.KeyedVectors.load_word2vec_format(w2v_file_path, binary=False)

In [None]:
# Score the BNC LSI vectors on the English stimuli.
synonym_judgement_accuracy(wordvectors=bnc_lsi, tests=sj_en)

In [None]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec

### GERMAN

In [None]:
# Display the absolute location the relative Doc2Vec model path points to.
(Path('../data/vectors') / 'd2v').resolve()

In [None]:
# Doc2Vec model: score its word vectors (.wv) on the German stimuli.
file = '../data/vectors/d2v'
d2v = Doc2Vec.load(file)
synonym_judgement_accuracy(wordvectors=d2v.wv, tests=sj_de)

In [None]:
# Word2Vec model: score its word vectors (.wv) on the German stimuli.
file = '../data/vectors/w2v'
w2v = Word2Vec.load(file)
synonym_judgement_accuracy(wordvectors=w2v.wv, tests=sj_de)

In [None]:
# OnlineParticipation LSI term vectors, scored on the German stimuli.
# NOTE(review): a .csv file is passed straight to load_word2vec_format here,
# whereas the BNC term vectors csv was converted to word2vec text format
# first - confirm this file already has that format.
file = '../data/SemD/OP2/OnlineParticipation_lsi_gensim_term_vectors.csv'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the DEWAC_1000_40k directory, scored on the German
# stimuli. The variable name op_lsi is reused for these vectors.
file = '../data/SemD/DEWAC_1000_40k/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the DEWAC_1000 directory, scored on the German
# stimuli. The variable name op_lsi is reused for these vectors.
file = '../data/SemD/DEWAC_1000/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the DEWAC directory, scored on the German stimuli.
# The variable name op_lsi is reused for these vectors.
file = '../data/SemD/DEWAC/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)

In [None]:
# LSI word vectors from the DEWAC_1000_40k_v2 directory, scored on the German
# stimuli. The variable name op_lsi is reused for these vectors.
file = '../data/SemD/DEWAC_1000_40k_v2/dewac_lsi_word_vectors.vec'
op_lsi = models.KeyedVectors.load_word2vec_format(file, binary=False)
synonym_judgement_accuracy(wordvectors=op_lsi, tests=sj_de)