In [56]:
import gensim 
from gensim.models import KeyedVectors
from pprint import pprint
from itertools import combinations

In [3]:
model_path = './models/GoogleNews-vectors-negative300.bin'

In [8]:
model = KeyedVectors.load_word2vec_format(model_path, binary=True)  

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [24]:
red_words = [
    "whip",
    "wind",
    "pole",
    "life",
    "washington",
    "ray",
    "tokyo",
    "change",
    "screen",
]
blue_words = [
    "china",
    "antarctica",
    "stream",
    "fighter",
    "stone",
    "space",
    "spring",
    "belt",
]
neutral_words = [
    "buck",
    "nurse",
    "laser",
    "ball",
    "crane",
    "revolution",
    "mass",
]
black_words = [
    "pan",
]

In [10]:
def most_similar(*args, **kwargs):
    """Wrap gensim's most_similar function to filter similar words or n_grams.
    
    Use like:
    most_similar(
        positive = ["belt", "stone"],
        negative = ["buck", "nurse"],
        topn = 10
    )
    
    """
    topn = kwargs.get("topn", 10)
    kwargs["topn"] = topn + 20
    words = model.most_similar(*args, **kwargs)
    words = [(w.lower(), n) for w, n in words]
    
    exclude_substrings=True
    if exclude_substrings:
        input_words = kwargs["positive"]
        words = [
            (w.lower(), n)
            for w, n in words
            if not (any(w in i_w for i_w in input_words) or
                    any(i_w in w for i_w in input_words) or
                    "_" in w)
        ]
    return words[:topn]

In [11]:
help(most_similar)

Help on function most_similar in module __main__:

most_similar(*args, **kwargs)
    Wrap gensim's most_similar function to filter similar words or n_grams.
    
    Use like:
    most_similar(
        positive = ["belt", "stone"],
        negative = ["buck", "nurse"],
        topn = 10
    )



In [136]:
most_similar(
    positive = ["belt", "stone"],
#     negative = ["buck", "nurse"],
    topn = 10
)

[('marble', 0.4827648401260376),
 ('granite', 0.47009357810020447),
 ('knapped', 0.4621773660182953),
 ('chedi', 0.45354604721069336),
 ('rocks', 0.4529286026954651),
 ('bricks', 0.44156622886657715),
 ('flint', 0.44095659255981445),
 ('brick', 0.43910306692123413),
 ('ring', 0.43649399280548096),
 ('lintels', 0.4330446124076843),
 ('fireclay', 0.4248562157154083),
 ('posthole', 0.4231618344783783)]

In [120]:
def compute_all_similarities(words, clue_size=2):
    """Compute the pairwise similarities for all words in the given `words` list."""
    word_pairs = []
    for w1, w2 in combinations(words, r=clue_size):
        # TODO: support more than 2 words here
        # Do it by doing all pairwise similarities
        # Then averaging them, and include the std dev of similarities for ref
        word_pairs.append((w1, w2, model.similarity(w1, w2)))
        
    word_pairs = sorted(word_pairs, key=lambda v: v[2], reverse=True)
    return word_pairs

In [140]:
in_words = ["doubt", "surrender", "agreement", "avoid", "freeze", "election", "fleet", "shot"]

In [141]:
compute_all_similarities(in_words, clue_size=2)

[('surrender', 'freeze', 0.20315844),
 ('surrender', 'shot', 0.17242557),
 ('surrender', 'avoid', 0.17037044),
 ('agreement', 'freeze', 0.16948375),
 ('surrender', 'agreement', 0.14466019),
 ('surrender', 'election', 0.13724972),
 ('doubt', 'election', 0.1198404),
 ('doubt', 'agreement', 0.11001658),
 ('avoid', 'freeze', 0.10682899),
 ('doubt', 'fleet', 0.10228983),
 ('agreement', 'election', 0.09806852),
 ('agreement', 'fleet', 0.083731815),
 ('freeze', 'election', 0.071638785),
 ('doubt', 'shot', 0.066546075),
 ('avoid', 'election', 0.062058914),
 ('election', 'shot', 0.06052341),
 ('agreement', 'avoid', 0.052880652),
 ('election', 'fleet', 0.051850107),
 ('freeze', 'fleet', 0.048861697),
 ('doubt', 'surrender', 0.046274703),
 ('avoid', 'shot', 0.03572711),
 ('doubt', 'freeze', 0.020384984),
 ('surrender', 'fleet', 0.011949156),
 ('avoid', 'fleet', -0.0014913228),
 ('fleet', 'shot', -0.014988079),
 ('freeze', 'shot', -0.02550612),
 ('agreement', 'shot', -0.026432715),
 ('doubt', 'avo

In [167]:
most_similar(positive=["cheese", "plane"], topn=30)

[('jet', 0.5682516098022461),
 ('jetliner', 0.5431833863258362),
 ('aircraft', 0.5421854257583618),
 ('cessna', 0.5411485433578491),
 ('camembert', 0.5361970067024231),
 ('airliner', 0.5288451910018921),
 ('emmenthal', 0.5261411070823669),
 ('camembert', 0.5080846548080444),
 ('helicopter', 0.505513072013855),
 ('cheddar', 0.50484699010849),
 ('turboprop', 0.5037637948989868),
 ('jets', 0.5007143020629883),
 ('mozzarella', 0.49749767780303955),
 ('emmenthaler', 0.4910329282283783)]

In [152]:
model.similarity("tokyo", "washington")

0.5545972

In [40]:
# Todo, see if this other similarity func is ever worth using
model.most_similar_cosmul(positive=["space", "belt"])

[('belts', 0.465261310338974),
 ('spaces', 0.43275758624076843),
 ('Feustel_replied', 0.429858922958374),
 ('Shuttle_Atlantis_arrives', 0.42639511823654175),
 ('Shuttle_Discovery_blasts', 0.4256429374217987),
 ('black_grosgrain', 0.42392557859420776),
 ('Shuttle_docks', 0.42277610301971436),
 ('Shuttle_Endeavour_undocks', 0.4217846393585205),
 ('Shuttle_Discovery_docks', 0.41901007294654846),
 ('stowage_space', 0.4154626429080963)]

In [50]:
model.most_similar_to_given("china", blue_words[1:])

'stone'

In [73]:
word_dist = [*zip(blue_words[1:], model.distances("china", blue_words[1:]))]
word_dist = sorted([(n, w) for w, n in word_dist])

In [74]:
word_dist

[(0.70375335, 'stone'),
 (0.911139, 'antarctica'),
 (0.9438042, 'space'),
 (0.952535, 'belt'),
 (0.9542776, 'fighter'),
 (0.9791071, 'spring'),
 (1.0015758, 'stream')]