In [1]:
from gensim.models import KeyedVectors

import itertools

In [2]:
# Location of the pretrained word2vec-format binary, relative to the notebook's working directory.
path = '../models/en/pretrained/GoogleNews-vectors-negative300.bin'

# Load the stored vectors as a KeyedVectors object; binary=True selects the binary word2vec file format.
model = KeyedVectors.load_word2vec_format(path, binary=True)

Let's see how the pretrained model performs on our keywords. The model was trained without lowercasing, so proper nouns have to be written capitalized when they are used as query words.

In [3]:
# Query words for the nearest-neighbour inspection below; proper nouns are kept capitalized.
keywords = [
    'Syria', 'Damascus', 'Assad', 'politics', 'Putin',
    'Trump', 'Russia', 'USA', 'Skripal', 'UK',
]

In [4]:
# For every keyword, collect the words the model ranks as most similar to it.
# `model` is already a KeyedVectors object, so `most_similar` is called on it
# directly rather than through `model.wv`, which is only a deprecated alias on
# KeyedVectors in gensim 3.x and no longer exists in gensim 4.
xs = [[word for word, _ in model.most_similar(keyword)] for keyword in keywords]

# Transpose the result: printed row i holds the i-th nearest neighbour of each
# keyword, with the columns in the same order as `keywords`.
for a in zip(*xs):
    print(a)

  import sys


('Syrian', 'Syria', 'President_Bashar_Assad', 'partisan_politics', 'Medvedev', 'Donald_Trump', 'Ukraine', 'lifts_Squaw_Valley', 'Sergei_Skripal', 'Britain')
('Syrians', 'Syrian', 'al_Assad', 'Politics', 'Vladimir_Putin', 'impersonator_entertained', 'Moscow', 'Mobility_NASDAQ_USMO', 'Gennady_Vasilenko', 'United_Kingdom')
('Iran', 'Beirut', 'Mubarak', 'political', 'President_Vladimir_Putin', 'Ivanka_Trump', 'Russian', 'lifts_Sugarloaf', 'Gennady_Vasilenko_former', 'UKs')
('Damascus', 'Tehran', 'Bashar_Assad', 'politcs', 'Prime_Minister_Vladimir_Putin', 'Ivanka', 'Belarus', 'proudly_proclaims_Made', 'Zaporozhsky', 'British')
('Hezbollah', 'Assad', 'Syrian', 'poltics', 'Kremlin', 'mogul_Donald_Trump', 'Kremlin', 'World_Premiere_Narrative', 'Herman_Simm', 'Britains')
('Hizbullah', 'Teheran', 'President_Bashar', 'Lisa_Vorderbrueggen_covers', 'Lukashenko', 'Trump_Tower', 'Kazakhstan', 'Mobility_Sets_Date', 'businessman_Andrei_Lugovoi', 'Wiley_Chichester')
('Egypt', 'Cairo', 'Syria', 'partisan

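Let's see how well the pretrained model performs on our custom analogical reasoning task. Each question combines two (country, president) pairs and asks the model for the missing fourth word, e.g. "Putin is to Russia as Assad is to ?".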

In [5]:
# Seed data for the custom analogy task.
# Despite the variable name, every tuple is ordered (country, president).
president_country = [
    ('Russia', 'Putin'),
    ('America', 'Trump'),
    ('Britain', 'May'),
    ('Syria', 'Assad'),
    ('Germany', 'Merkel'),
    ('France', 'Macron'),
    ('Japan', 'Abe'),
    ('Turkey', 'Erdogan'),
]

# All unordered pairs of the tuples above; each pair yields one analogy setup.
combinations_pres_country = list(itertools.combinations(president_country, 2))

In [6]:
def evaluateModel(m, debug=0, combinations=None):
    """Score a model on the (country, president) analogy task.

    For every pair of (country, president) tuples, each of the four words is
    predicted in turn from the other three with a vector-offset query,
    ``most_similar(positive=[x, y], negative=[z])``. When the expected word
    appears at 1-based position ``p`` of the returned list, the query adds
    ``1 / p`` to the corresponding score. A query is skipped (adds nothing)
    when any of its three input words is missing from the vocabulary.

    Parameters
    ----------
    m : object
        Either a model that stores its word vectors in the instance attribute
        ``wv``, or the word-vector object itself. The vectors object must
        offer ``most_similar(positive=..., negative=...)`` returning
        ``(word, similarity)`` items, and a vocabulary container exposed as
        ``key_to_index`` or ``vocab``.
    debug : int or bool, optional
        When truthy, print one line for every hit.
    combinations : iterable, optional
        Items of the form ``((country1, president1), (country2, president2))``.
        Defaults to the notebook-level ``combinations_pres_country``.

    Returns
    -------
    tuple
        ``(find_country, find_president)``: the summed reciprocal positions of
        the queries that predict a country and a president respectively.
    """
    if combinations is None:
        combinations = combinations_pres_country

    # A full model keeps its vectors in the instance attribute `wv`; a bare
    # KeyedVectors object is used as-is. Looking in the instance dict instead
    # of reading `m.wv` avoids the `wv` alias that KeyedVectors itself only
    # offers as a deprecated property in gensim 3.x (removed in gensim 4).
    vectors = getattr(m, '__dict__', {}).get('wv', m)

    # gensim 4 names the vocabulary mapping `key_to_index`; gensim 3 uses `vocab`.
    vocab = getattr(vectors, 'key_to_index', None)
    if vocab is None:
        vocab = vectors.vocab

    def reciprocal_rank(positive, negative, expected, relation):
        """Run one analogy query and return 1/position of `expected` (0 if absent)."""
        if not all(word in vocab for word in positive + [negative]):
            return 0
        gained = 0
        hits = vectors.most_similar(positive=positive, negative=[negative])
        for position, hit in enumerate(hits, start=1):
            if hit[0] == expected:
                gained += 1.0 / position
                if debug:
                    print("Found %s for relation %s -> %s and %s -> ? in position %d"
                          % ((expected,) + relation + (position,)))
        return gained

    find_country = 0
    find_president = 0
    for (country1, president1), (country2, president2) in combinations:
        # president1 -> country1, so president2 -> country2
        find_country += reciprocal_rank(
            [country1, president2], president1, country2,
            (president1, country1, president2))
        # president2 -> country2, so president1 -> country1
        find_country += reciprocal_rank(
            [president1, country2], president2, country1,
            (president2, country2, president1))
        # country1 -> president1, so country2 -> president2
        find_president += reciprocal_rank(
            [president1, country2], country1, president2,
            (country1, president1, country2))
        # country2 -> president2, so country1 -> president1
        find_president += reciprocal_rank(
            [country1, president2], country2, president1,
            (country2, president2, country1))

    return find_country, find_president

In [7]:
# Evaluate the pretrained vectors on the country/president task, printing every hit.
score_country, score_president = evaluateModel(model, debug=1)

# Overall score: country predictions and president predictions combined.
score = score_country + score_president
print(score)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


Found Russia for relation Trump -> America and Putin -> ? in position 1




Found Russia for relation May -> Britain and Putin -> ? in position 1
Found Syria for relation Putin -> Russia and Assad -> ? in position 1
Found Russia for relation Assad -> Syria and Putin -> ? in position 1
Found Assad for relation Russia -> Putin and Syria -> ? in position 1
Found Putin for relation Syria -> Assad and Russia -> ? in position 2
Found Germany for relation Putin -> Russia and Merkel -> ? in position 1
Found Russia for relation Merkel -> Germany and Putin -> ? in position 1
Found Merkel for relation Russia -> Putin and Germany -> ? in position 4
Found Putin for relation Germany -> Merkel and Russia -> ? in position 1
Found Russia for relation Macron -> France and Putin -> ? in position 4
Found Japan for relation Putin -> Russia and Abe -> ? in position 1
Found Russia for relation Abe -> Japan and Putin -> ? in position 1
Found Putin for relation Japan -> Abe and Russia -> ? in position 1
Found Turkey for relation Putin -> Russia and Erdogan -> ? in position 1
Found Rus

In [8]:
# Absolute path to the analogy question file; it is specific to the machine this ran on, so adjust it when running elsewhere.
eval_path = '/work/courses/unix/T/ELEC/E5550/data/eval/analogical_reasoning_questions-words.txt'

In [9]:
# Parse the question file into ((a, b), (c, d)) tuples, one per question line.
combinations = []

with open(eval_path, 'r') as f:
    # Iterate the file lazily instead of loading every line up front.
    for line in f:
        line = line.strip()

        # Lines starting with ':' carry no question (presumably section
        # headers); blank lines are skipped so they cannot break the
        # four-way unpacking below.
        if not line or line.startswith(':'):
            continue

        # A question line must hold exactly four whitespace-separated words;
        # anything else still raises ValueError so malformed data is noticed.
        a, b, c, d = line.split()
        combinations.append(((a, b), (c, d)))

print(len(combinations))

19544


In [10]:
def evaluateModel2(m, combinations):
    """Count analogy hits over a list of four-word questions.

    Each item of ``combinations`` is ``((w1, w2), (w3, w4))``. Every one of the
    four words is predicted in turn from the other three with
    ``most_similar(positive=[x, y], negative=[z])``; a query counts as a hit
    when the expected word occurs anywhere in the returned list. A query is
    skipped when any of its three input words is missing from the vocabulary.

    Parameters
    ----------
    m : object
        Either a model that stores its word vectors in the instance attribute
        ``wv``, or the word-vector object itself. The vectors object must
        offer ``most_similar(positive=..., negative=...)`` returning
        ``(word, similarity)`` items, and a vocabulary container exposed as
        ``key_to_index`` or ``vocab``.
    combinations : iterable
        The questions, as ``((w1, w2), (w3, w4))`` items.

    Returns
    -------
    int
        Total number of hits (at most four per question).
    """
    # A full model keeps its vectors in the instance attribute `wv`; a bare
    # KeyedVectors object is used as-is. Looking in the instance dict instead
    # of reading `m.wv` avoids the `wv` alias that KeyedVectors itself only
    # offers as a deprecated property in gensim 3.x (removed in gensim 4).
    vectors = getattr(m, '__dict__', {}).get('wv', m)

    # gensim 4 names the vocabulary mapping `key_to_index`; gensim 3 uses `vocab`.
    vocab = getattr(vectors, 'key_to_index', None)
    if vocab is None:
        vocab = vectors.vocab

    def count_hits(positive, negative, expected):
        """Return how often `expected` occurs in the result list of one query."""
        if not all(word in vocab for word in positive + [negative]):
            return 0
        hits = vectors.most_similar(positive=positive, negative=[negative])
        return sum(1 for hit in hits if hit[0] == expected)

    found = 0
    for (w1, w2), (w3, w4) in combinations:
        found += count_hits([w1, w4], w2, w3)  # w1 + w4 - w2 -> w3
        found += count_hits([w2, w3], w4, w1)  # w2 + w3 - w4 -> w1
        found += count_hits([w2, w3], w1, w4)  # w2 + w3 - w1 -> w4
        found += count_hits([w1, w4], w3, w2)  # w1 + w4 - w3 -> w2

    return found

In [None]:
# Runs up to four most_similar queries per question (19544 questions in the run recorded above), so expect this cell to take a long time.
print(evaluateModel2(model, combinations))

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  app.launch_new_instance()
