In [1]:
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import KeyedVectors
import argparse
import time
import pathlib
from pathlib import Path


# Relative location of the SemBias data (not referenced in the visible cells).
SEM_BIAS = './SemBias/SemBias'

# Single directory holding the embedding files and evaluation datasets;
# every path below is this prefix plus a file (or sub-directory) name.
DATA_DIR = '/media/pavel/tmp/glove'

# Embedding / vocabulary text files.
GP_GN = DATA_DIR + '/gp_gn.txt'
GP = DATA_DIR + '/vectors300.txt'
GLOVE = DATA_DIR + '/gp_glove.txt'

# Evaluation datasets.
GOOGLE_ANT = DATA_DIR + '/google_anthology_ds.txt'
RG65 = DATA_DIR + '/EN-RG-65.txt'
WSA = DATA_DIR + '/WS_A.csv'
WSB = DATA_DIR + '/WS_B.csv'
MTURK = DATA_DIR + '/MTurk.csv'
RW = DATA_DIR + '/RW2034.csv'
MEN = DATA_DIR + '/MEN3000.csv'
SIMLEX = DATA_DIR + '/SimLex.txt'
SEM_EVAL = DATA_DIR + '/SemEval'

In [2]:
# Load the embeddings stored at GP_GN as a text-format (binary=False) word2vec file.
# The resulting object is indexed by word in the cells below (emb[word]).
emb_gp_gn = KeyedVectors.load_word2vec_format(GP_GN, binary=False)

In [6]:
def get_vocabulary(fn = GLOVE):
    """Read the word column of a text embedding file.

    The first line of the file is treated as a header and skipped. From every
    remaining non-blank line the first whitespace-separated token is kept,
    lower-cased. Blank lines are ignored instead of raising ``IndexError``.

    Args:
        fn: path of the embedding text file (one word followed by its vector
            components per line, after the header line).

    Returns:
        List of lower-cased words in file order.
    """
    # NOTE(review): words are lower-cased here; if the embedding object was
    # loaded with mixed-case keys, lookups by these words may miss - confirm.
    # utf-8 is given explicitly so the words decode the same way regardless of
    # the platform's locale default.
    with open(fn, "r", encoding="utf-8") as f:
        next(f, None)  # skip the header line without materialising it
        rows = (line.lower().split() for line in f)
        return [tokens[0] for tokens in rows if tokens]


def get_anthology_ds(fn = GOOGLE_ANT):
    """Load the analogy questions from ``fn``.

    Lines starting with ':' are skipped; every other line is lower-cased and
    split on whitespace.

    Args:
        fn: path of the analogy text file.

    Returns:
        List of token lists, one per kept line, in file order.
    """
    rows = []
    with open(fn, "r") as f:
        for line in f:
            if line.startswith(':'):
                continue
            rows.append(line.lower().split())
    return rows


def get_semeval(path = SEM_EVAL):
    """Collect word-pair rows from every ``*.txt`` file below ``path``.

    Files are visited in sorted path order so the result is reproducible.
    Lines starting with '#' and blank lines are skipped. For the remaining
    lines double quotes are removed, the text is lower-cased and split on
    whitespace; each of the first four tokens is then split on ':'.

    Args:
        path: directory that is searched recursively for ``*.txt`` files.

    Returns:
        List of rows; each row is a list of four lists (the ':'-separated
        parts of the first four tokens of a line).

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than four tokens.
    """
    rows = []
    # rglob yields files in filesystem order; sorting makes the row order stable.
    for file_path in sorted(Path(path).rglob("*.txt")):
        with open(str(file_path), "r") as f:
            for line in f:
                if line.startswith('#'):
                    continue
                tokens = line.replace('"', "").lower().split()
                if tokens:  # blank lines carry no pairs
                    rows.append(tokens)
    return [[row[i].split(':') for i in range(4)] for row in rows]


def cosmul_sim(w, a, b, c, to_do, eps = 0.01):
    """3COSMUL-style score of candidate vector ``w`` for the analogy a : b :: c : w.

    Computes ``cos(w, c) * cos(w, b) / (cos(w, a) + eps)``. When ``cos(w, a)``
    is negative, ``eps`` is applied with a negative sign so the offset pushes
    the denominator away from zero rather than towards it.

    Args:
        w: candidate vector.
        a, b, c: vectors of the three analogy words.
        to_do: when truthy, print the three cosines and the resulting score.
        eps: magnitude of the offset added to the denominator.

    Returns:
        The score as a float. Zero-length vectors are not guarded against.
    """
    def _cos(u, v):
        # Plain cosine similarity. The original body called an undefined
        # name `cos`, so the function could not run at all.
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    c1, c2, c3 = _cos(w, c), _cos(w, b), _cos(w, a)
    if c3 < 0:
        # honour the caller's eps instead of a hard-coded -0.01
        eps = -abs(eps)
    score = c1 * c2 / (c3 + eps)
    if to_do:
        print(c1, c2, c3, score)
    return score

def cosine_sim(v, w, eps = 0.001, eps2 = 0.000001):
    """Cosine-style similarity of ``v`` and ``w`` with padded norms.

    A small constant is added to each norm before they are multiplied
    (``eps`` to the norm of ``v``, ``eps2`` to the norm of ``w``), so with the
    default positive offsets a zero vector yields 0 instead of a division by
    zero. Used as the 3COSADD similarity.
    """
    padded_w = np.linalg.norm(w) + eps2
    padded_v = np.linalg.norm(v) + eps
    return np.dot(w, v) / (padded_w * padded_v)


def find_simi(lst, a, b, c, vocab, emb):
    """Pick the vocabulary word most similar to ``b - a + c`` (3COSADD).

    Args:
        lst: words that must not be returned.
        a, b, c: vectors of the three analogy words.
        vocab: iterable of candidate words.
        emb: mapping from word to vector, indexed as ``emb[word]``.

    Returns:
        ``(word, similarity)`` of the best candidate. Only strictly positive
        similarities can win and the first of equal scores is kept; when no
        candidate qualifies the result is ``("", 0.)``.
    """
    target = b - a + c
    best_word, best_score = "", 0.
    for candidate in vocab:
        if candidate in lst:
            continue
        score = cosine_sim(emb[candidate], target)
        if score > best_score:
            best_word, best_score = candidate, score
    return best_word, best_score

# Loading of the SemEval rows is currently disabled.
#anth_semeval = get_semeval()
# Analogy questions read from the default file (GOOGLE_ANT).
anthologies = get_anthology_ds()
# Candidate words taken from the same file the embeddings were loaded from (GP_GN).
vocab_gp_gn = get_vocabulary(fn = GP_GN)

In [7]:
# NOTE: this cell repeats the definition from the previous cell; the copy here
# is the one in effect once the cell has run.
def cosmul_sim(w, a, b, c, to_do, eps = 0.01):
    """3COSMUL-style score of candidate vector ``w`` for the analogy a : b :: c : w.

    Computes ``cos(w, c) * cos(w, b) / (cos(w, a) + eps)``. When ``cos(w, a)``
    is negative, ``eps`` is applied with a negative sign so the offset pushes
    the denominator away from zero rather than towards it.

    Args:
        w: candidate vector.
        a, b, c: vectors of the three analogy words.
        to_do: when truthy, print the three cosines and the resulting score.
        eps: magnitude of the offset added to the denominator.

    Returns:
        The score as a float. Zero-length vectors are not guarded against.
    """
    def _cos(u, v):
        # Plain cosine similarity. The original body called an undefined
        # name `cos`, so the function could not run at all.
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    c1, c2, c3 = _cos(w, c), _cos(w, b), _cos(w, a)
    if c3 < 0:
        # honour the caller's eps instead of a hard-coded -0.01
        eps = -abs(eps)
    score = c1 * c2 / (c3 + eps)
    if to_do:
        print(c1, c2, c3, score)
    return score

def cosine_sim(v, w, eps = 0.001, eps2 = 0.000001):
    """Cosine-style similarity of ``v`` and ``w`` with padded norms.

    A small constant is added to each norm before they are multiplied
    (``eps`` to the norm of ``v``, ``eps2`` to the norm of ``w``), so with the
    default positive offsets a zero vector yields 0 instead of a division by
    zero. Used as the 3COSADD similarity.
    """
    padded_w = np.linalg.norm(w) + eps2
    padded_v = np.linalg.norm(v) + eps
    return np.dot(w, v) / (padded_w * padded_v)


def find_simi(lst, a, b, c, vocab, emb):
    """Pick the vocabulary word most similar to ``b - a + c`` (3COSADD).

    Args:
        lst: words that must not be returned.
        a, b, c: vectors of the three analogy words.
        vocab: iterable of candidate words.
        emb: mapping from word to vector, indexed as ``emb[word]``.

    Returns:
        ``(word, similarity)`` of the best candidate. Only strictly positive
        similarities can win and the first of equal scores is kept; when no
        candidate qualifies the result is ``("", 0.)``.
    """
    target = b - a + c
    best_word, best_score = "", 0.
    for candidate in vocab:
        if candidate in lst:
            continue
        score = cosine_sim(emb[candidate], target)
        if score > best_score:
            best_word, best_score = candidate, score
    return best_word, best_score


def calculate_acc_for_analogies(anthologies, emb_gn, vocab_gn):
    """Count how many analogy questions the 3COSADD search answers correctly.

    Args:
        anthologies: sequence of questions; items 0-2 of each are the query
            words and item 3 is the expected answer.
        emb_gn: mapping from word to vector, indexed as ``emb_gn[word]``.
        vocab_gn: candidate words handed to ``find_simi``.

    Returns:
        ``(corrects, total)``: number of exact matches and number of questions.
        A progress line is printed after every 200 questions.
    """
    n_correct = 0
    n_seen = 0
    prev_words = ["", "", ""]
    prev_vecs = [[0], [0], [0]]
    for tup in anthologies:
        query = tup[:3]
        # A slot whose word is unchanged from the previous question reuses
        # the vector fetched last time instead of looking it up again.
        vecs = [
            prev_vecs[i] if prev_words[i] == query[i] else emb_gn[query[i]]
            for i in range(3)
        ]
        # best word for vec(b) - vec(a) + vec(c), excluding the query words
        word, sim = find_simi(query, vecs[0], vecs[1], vecs[2], vocab_gn, emb_gn)
        n_seen += 1
        if word == tup[3]:
            n_correct += 1
        prev_words, prev_vecs = [tup[0], tup[1], tup[2]], vecs
        if n_seen % 200 == 0:
            print(n_seen, *tup, word, sim, "actual:", tup[3], n_correct/n_seen)
    return n_correct, n_seen


# Score every loaded analogy question. Each question makes find_simi loop over
# the whole vocabulary in Python, so this cell is slow.
corrects, total = calculate_acc_for_analogies(anthologies, emb_gp_gn, vocab_gp_gn)
# Prints accuracy, number of correct answers, number of questions.
print(f"Final score: {corrects/total} {corrects} {total}")

200 havana cuba islamabad pakistan pakistan 0.6373196279240257 actual: pakistan 0.96
400 paris france tokyo japan japan 0.6296925698790207 actual: japan 0.965
600 algiers algeria belgrade serbia serbia 0.708826016842488 actual: serbia 0.97
800 ashgabat turkmenistan cairo egypt egypt 0.595151121535518 actual: egypt 0.9675
1000 baku azerbaijan dublin ireland ireland 0.5962528090608104 actual: ireland 0.972
1200 beirut lebanon jakarta indonesia indonesia 0.6490071423248388 actual: indonesia 0.9725
1400 bishkek kyrgyzstan lima peru peru 0.5495437757975783 actual: peru 0.9707142857142858
1600 cairo egypt caracas venezuela venezuela 0.7132207770556203 actual: venezuela 0.969375
1800 copenhagen denmark funafuti tuvalu tuvalu 0.46459788067411767 actual: tuvalu 0.9688888888888889
2000 dublin ireland kampala uganda uganda 0.6038106880559133 actual: uganda 0.9705
2200 hanoi vietnam ljubljana slovenia slovenia 0.5541343648369021 actual: slovenia 0.9686363636363636
2400 jakarta indonesia mogadishu 

17200 thinking thought hitting hit hit 0.45592083463097943 actual: hit 0.6898837209302325
17400 bird birds machine machines machines 0.6135082378812335 actual: machines 0.6897701149425287
17600 cloud clouds donkey donkeys donkeys 0.3652489444464547 actual: donkeys 0.690625
17800 dollar dollars bird birds birds 0.4779135821040585 actual: birds 0.6906741573033708
18000 finger fingers mouse mice mickey 0.5046365748423155 actual: mice 0.6910555555555555
18200 machine machines elephant elephants elephants 0.5408956606179126 actual: elephants 0.6918681318681319
18400 onion onions child children children 0.5061565602422232 actual: children 0.6927717391304348
18600 road roads pig pigs pigs 0.45635719960984766 actual: pigs 0.693763440860215
18800 estimate estimates say says know 0.5408191099736609 actual: says 0.6931914893617022
19000 play plays shuffle shuffles composes 0.32336827837727344 actual: shuffles 0.6928421052631579
19200 shuffle shuffles speak speaks speaks 0.4804177180933778 actual:

In [8]:
# Hard-coded (correct answers, questions) pair that overrides whatever the
# scoring cell produced.
# NOTE(review): presumably the final counts of a completed full run, pasted in
# to avoid re-running it - verify against that run's output.
corrects, total = 13487, 19544

In [13]:
def calculate_acc_for_sem_analogies(anthologies, emb_gn, vocab_gn):
    """Finish scoring the semantic analogies, resuming from a saved checkpoint.

    The counts for the first 8800 questions are reconstructed from the progress
    row '8800 ...' of an earlier full run (kept in ``previous_res``); only
    questions 8800..8866 are actually evaluated here and added on top.

    Args:
        anthologies: full sequence of analogy questions; items 0-2 of each are
            the query words and item 3 is the expected answer.
        emb_gn: mapping from word to vector, indexed as ``emb_gn[word]``.
        vocab_gn: candidate words handed to ``find_simi``.

    Returns:
        ``(corrects, total)`` for the checkpointed questions plus the ones
        scored here.
    """
    previous_res = "8800 stepbrother stepsister policeman policewoman boyfriend 0.6182800965333808 actual: policewoman 0.7822727272727272"
    print(previous_res)
    # Question range still to be scored; `start` is also the checkpoint size.
    start, stop = 8800, 8867
    # Start from the checkpoint. The previous version reset both counters to
    # zero right after computing them, which discarded the checkpoint.
    total = start
    # round(), not int(): the product can land a hair below the true integer.
    corrects = round(0.7822727272727272 * total)
    a, b, c = "", "", ""
    emb1, emb2, emb3 = [0], [0], [0]
    tup = None
    for tup in anthologies[start:stop]:
        # saving vectors to speed up computation
        e1 = emb_gn[tup[0]] if a != tup[0] else emb1
        e2 = emb_gn[tup[1]] if b != tup[1] else emb2
        e3 = emb_gn[tup[2]] if c != tup[2] else emb3
        # trying to find similar word as argmax(3COSADD(e2-e1+e3, E(w))) for all words w in vocab except tup[:3]
        word, sim = find_simi(tup[:3], e1, e2, e3, vocab_gn, emb_gn)
        if word == tup[3]:
            corrects +=1
        total +=1
        emb1, emb2, emb3 = e1, e2, e3
        a, b, c = tup[0], tup[1], tup[2]
        if total % 200 == 0:
            print(total, *tup, word, sim, "actual:", tup[3], corrects/total)
    # Summary row for the last scored question; skipped when the slice was
    # empty, because tup/word/sim would not exist in that case.
    if tup is not None:
        print(total, *tup, word, sim, "actual:", tup[3], corrects/total)
    return corrects , total


In [12]:
# Report semantic, syntactic and overall analogy accuracy.
# NOTE(review): this cell's execution count (12) is lower than that of the cell
# defining calculate_acc_for_sem_analogies (13), so the captured output below
# may come from an older version of that function - re-run in order to confirm.
print(f"GP-GN-GLOVE check")
sem_corrects, sem_total = calculate_acc_for_sem_analogies(anthologies, emb_gp_gn, vocab_gp_gn)
print(f"GP-GN-GLOVE accuracy of sem anthologies: {sem_corrects/sem_total} {sem_corrects} {sem_total}")
# Syntactic figures are obtained by subtracting the semantic counts from the
# overall counts held in `corrects` / `total`.
syn_acc = (corrects - sem_corrects) / (total - sem_total)
print(f"GP-GN-GLOVE accuracy of syn anthologies: {syn_acc} correct answers: {corrects - sem_corrects}")
print(f"GP-GN-GLOVE overal accuracy for anthologies: {corrects/ total} ")

GP-GN-GLOVE check
200 havana cuba islamabad pakistan pakistan 0.6373196279240257 actual: pakistan 0.96
400 paris france tokyo japan japan 0.6296925698790207 actual: japan 0.965
600 algiers algeria belgrade serbia serbia 0.708826016842488 actual: serbia 0.97
800 ashgabat turkmenistan cairo egypt egypt 0.595151121535518 actual: egypt 0.9675
1000 baku azerbaijan dublin ireland ireland 0.5962528090608104 actual: ireland 0.972
1200 beirut lebanon jakarta indonesia indonesia 0.6490071423248388 actual: indonesia 0.9725
1400 bishkek kyrgyzstan lima peru peru 0.5495437757975783 actual: peru 0.9707142857142858
1600 cairo egypt caracas venezuela venezuela 0.7132207770556203 actual: venezuela 0.969375
1800 copenhagen denmark funafuti tuvalu tuvalu 0.46459788067411767 actual: tuvalu 0.9688888888888889
2000 dublin ireland kampala uganda uganda 0.6038106880559133 actual: uganda 0.9705
2200 hanoi vietnam ljubljana slovenia slovenia 0.5541343648369021 actual: slovenia 0.9686363636363636
2400 jakarta in

KeyboardInterrupt: 