### Vector Math

You don't need to run the code below unless you've trained your own model. Otherwise, simply download the word vectors from the URL below:

In [65]:
!wget https://zenodo.org/record/49902/files/vocab.npy

--2016-04-17 12:37:26--  https://zenodo.org/record/49902/files/vocab.npy
Resolving zenodo.org (zenodo.org)... 188.184.66.202
Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81754640 (78M) [application/octet-stream]
Saving to: ‘vocab.npy.1’


2016-04-17 12:38:00 (2.95 MB/s) - ‘vocab.npy.1’ saved [81754640/81754640]



In [69]:
!wget https://zenodo.org/record/49902/files/word_vectors.npy

--2016-04-17 12:38:40--  https://zenodo.org/record/49902/files/word_vectors.npy
Resolving zenodo.org (zenodo.org)... 188.184.66.202
Connecting to zenodo.org (zenodo.org)|188.184.66.202|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 116273232 (111M) [application/octet-stream]
Saving to: ‘word_vectors.npy’


2016-04-17 12:39:19 (3.07 MB/s) - ‘word_vectors.npy’ saved [116273232/116273232]



In [17]:
#from lda2vec_model import LDA2Vec
#from chainer import serializers
#import numpy as np
#import pandas as pd
#import pickle
#
#features = pd.read_pickle("../data/features.pd")
#vocab = np.load("../data/vocab")
#npz = np.load(open('topics.story.pyldavis.npz', 'r'))
#dat = {k: v for (k, v) in npz.iteritems()}
#vocab = dat['vocab'].tolist()
#dat = np.load("../data/data.npz")
#n_stories = features.story_id_codes.max() + 1
#n_units = 256
#n_vocab = dat['flattened'].max() + 1
#model = LDA2Vec(n_stories=n_stories, n_story_topics=40,
#                n_authors=5664, n_author_topics=20,
#                n_units=n_units, n_vocab=n_vocab, counts=np.zeros(n_vocab),
#                n_samples=15)
#serializers.load_hdf5("lda2vec.hdf5", model)
#np.save("word_vectors", model.sampler.W.data)
#np.save("vocab", vocab)

In [52]:
import numpy as np
word_vectors_raw = np.load("word_vectors.npy")
vocab = np.load("vocab.npy").tolist()

L2 Normalize the word vectors

In [53]:
word_vectors = word_vectors_raw / np.linalg.norm(word_vectors_raw, axis=-1)[:, None]

In [54]:
def get_vector(token):
    index = vocab.index(token)
    return word_vectors[index, :].copy()

def most_similar(token, n=20):
    word_vector = get_vector(token)
    similarities = np.dot(word_vectors, word_vector)
    top = np.argsort(similarities)[::-1][:n]
    return [vocab[i] for i in top]

# This is Levy & Goldberg's 3Cosmul Metric
# Based on the Gensim implementation: https://github.com/piskvorky/gensim/blob/master/gensim/models/word2vec.py
def cosmul(positives, negatives, topn=20):
    positive = [get_vector(p) for p in positives]
    negative = [get_vector(n) for n in negatives]
    pos_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in positive]
    neg_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in negative]
    dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 1e-6)
    idxs = np.argsort(dists)[::-1][:topn]
    return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]
def most_similar_posneg(positives, negatives, topn=20):
    positive = np.sum([get_vector(p) for p in positives], axis=0)
    negative = np.sum([get_vector(n) for n in negatives], axis=0)
    vector = positive - negative
    dists = np.dot(word_vectors, vector)
    idxs = np.argsort(dists)[::-1][:topn]
    return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]

In [55]:
most_similar('san francisco')

[u'san francisco',
 u'seattle',
 u'sf',
 u'new york',
 u'mountain view',
 u'nyc',
 u'palo alto',
 u'new york city',
 u'austin',
 u'los angeles',
 u'atlanta',
 u'chicago',
 u'boston',
 u'soma',
 u'portland',
 u'london',
 u'sunnyvale',
 u'san jose',
 u'ny',
 u'oakland']

In [19]:
cosmul(['california', 'technology'], [], topn=20)

[u'silicon valley',
 u'industry',
 u'u.s.',
 u'in',
 u'west',
 u'agriculture',
 u'area',
 u'tech',
 u'manufacturing',
 u'city',
 u'finance',
 u'valley',
 u'dc',
 u'cities',
 u'america',
 u'sf',
 u'new york',
 u'many areas']

In [39]:
cosmul(['digital', 'currency'], [], topn=20)

[u'currencies',
 u'bitcoin',
 u'bitcoins',
 u'gold',
 u'btc',
 u'analog',
 u'commodities',
 u'trading',
 u'bitcoin&#x27;s',
 u'commodity',
 u'digital goods',
 u'cryptocurrency',
 u'mining',
 u'fiat currency',
 u'goods',
 u'fiat',
 u'coins',
 u'consumer']

In [59]:
cosmul(['text editor', 'terminal'], [], topn=20)

[u'vim',
 u'emacs',
 u'editor',
 u'sublime',
 u'iterm',
 u'notepad',
 u'gui',
 u'vi',
 u'window manager',
 u'command line',
 u'tmux',
 u'web browser',
 u'terminals',
 u'ide',
 u'editing',
 u'textmate',
 u'debugger',
 u'gvim']

In [68]:
cosmul(['continental', 'germany'], [], topn=20)

[u'canada',
 u'france',
 u'europe',
 u'australia',
 u'uk',
 u'poland',
 u'paris',
 u'hong kong',
 u'spain',
 u'usa',
 u'quebec',
 u'new zealand',
 u'japan',
 u'netherlands',
 u'italy',
 u'abroad',
 u'montreal',
 u'denmark']

In [88]:
cosmul(['microsoft'], [], topn=20)

[u'apple',
 u'ms',
 u'nokia',
 u'hp',
 u'google',
 u'rim',
 u'adobe',
 u'samsung',
 u'msft',
 u'ibm',
 u'oracle',
 u'valve',
 u'motorola',
 u'ballmer',
 u'sony',
 u'canonical',
 u'intel',
 u'cisco',
 u'yahoo']

In [87]:
cosmul(['microsoft', 'cloud'], [], topn=20)

[u'apple',
 u'azure',
 u'ms',
 u'enterprise',
 u'google',
 u'oracle',
 u'nokia',
 u'adobe',
 u'cloud services',
 u'samsung',
 u'android',
 u'ibm',
 u'carriers',
 u'intel',
 u'hardware',
 u'hp',
 u'chromeos',
 u'mobile os']

Queen is several rankings down, so not exactly the same as out of the box word2vec!

In [432]:
cosmul(['king', 'woman'], ['man'], topn=20)

[u'female',
 u'prussia',
 u'teen',
 u'females',
 u'male',
 u'queen',
 u'rapist',
 u'males',
 u'young woman',
 u'girl',
 u'stairwell',
 u'white',
 u'predominately',
 u'she',
 u'pronoun',
 u'accuser',
 u'celebrity',
 u'newspaper']

In [461]:
print 'Most similar'
print '\n'.join(most_similar('mark zuckerberg'))
print '\nCosmul'
pos = ['mark zuckerberg', 'amazon']
neg = ['facebook']
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
mark zuckerberg
bill gates
larry page
zuck
zuckerberg
steve jobs
larry ellison
jeff bezos
sergey brin
paul allen
richard branson
peter thiel
mark pincus
jack dorsey
mark cuban
eric schmidt
paul graham
warren buffet
sergey
billionaire

Cosmul
jeff bezos
bezos
richard branson
elon musk
elon
sells
hp
dell
tesla
musk
bill gates
john carmack
amazon&#x27;s
warren buffet
michael dell
prime
edison
hitachi

Traditional Similarity
jeff bezos
bezos
richard branson
bill gates
amazon&#x27;s
sells
hp
elon musk
dell
warren buffet
prime
john carmack
paul allen
michael dell
edison
tesla
elon
ibm


In [464]:
pos = ['hacker news', 'question']
neg = ['story']

print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
hacker news
hn
hn.
front page
reddit
posted
hackernews
upvoted
comment
frontpage
commenting
comments
post
slashdot
posting
quora
forum
news.yc
thread
techcrunch

Cosmul
stackoverflow
stack overflow
answer
answering
answers
quora
questions
answered
ask
asking
programming questions
obvious question
technical questions
hn
important question
first question
such questions
stack exchange

Traditional Similarity
answer
stackoverflow
stack overflow
answering
quora
answers
answered
ask
questions
hn
asking
obvious question
first question
important question
begs
real question
such questions
stack exchange


In [58]:
pos = ['san francisco']
neg = []

print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
san francisco
seattle
sf
new york
mountain view
nyc
palo alto
new york city
austin
los angeles
atlanta
chicago
boston
soma
portland
london
sunnyvale
san jose
ny
oakland

Cosmul
seattle
sf
new york
mountain view
nyc
palo alto
new york city
austin
los angeles
atlanta
chicago
boston
soma
portland
london
sunnyvale
san jose
ny
oakland

Traditional Similarity
seattle
sf
new york
mountain view
nyc
palo alto
new york city
austin
los angeles
atlanta
chicago
boston
soma
portland
london
sunnyvale
san jose
ny
oakland


In [511]:
pos = ['nlp', 'image']
neg = ['text']

print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
nlp
machine learning
natural language processing
data mining
algorithms
computer vision
clustering
ml
analysis
image processing
hadoop
visualization
information retrieval
classification
numerical
data analysis
algorithm design
statistical
opencv
analytics

Cosmul
machine learning
computer vision
natural language processing
ai
data mining
analysis
algorithm
randomized
simulations
engine
image processing
visualization
computational
statistical
information retrieval
probabilistic graphical models
opencv
clustering
machine

Traditional Similarity
machine learning
natural language processing
computer vision
data mining
analysis
ai
algorithm
image processing
randomized
visualization
engine
clustering
simulations
information retrieval
statistical
opencv
algorithms
computational


In [498]:
pos = ['vim', 'graphics']
neg = ['terminal']

print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
vim
emacs
vi
textmate
sublime
zsh
tmux
terminal
sublime text
eclipse
macvim
intellij
xmonad
iterm
st2
netbeans
ide
text editor
gedit
editor

Cosmul
photoshop
typography
animations
design
programming
gradients
gameplay
textures
illustrator
inkscape
fonts
colors
ides
visual
graphic design
algorithms
usability
gimp
layouts

Traditional Similarity
photoshop
typography
animations
textures
gameplay
gradients
inkscape
design
illustrator
programming
ides
fonts
colors
visual
gimp
layouts
canvas
uis


In [517]:
pos = ['vegetables', 'drink']
neg = ['eat']

print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))

Most similar
vegetables
meat
rice
protein
eat
veggies
fruits
meats
cheese
soy
pasta
veg
beans
foods
cook
milk
eating
grains
fresh fruit
bread

Cosmul
tea
drinking
beer
coffee
alcohol
cup
soda
milk
cups
rice
vodka
drank
drinks
sugar
beans
red wine
pot
wine

Traditional Similarity
tea
drinking
beer
coffee
alcohol
cup
soda
milk
rice
cups
drank
drinks
vodka
sugar
beans
pot
red wine
wine


In [60]:
pos = ['lda', '']
neg = ['']

print 'Most similar'
print '\n'.join(most_similar(pos[0]))

Most similar
lda
kmeans
173
classification
stdev
linear
clustering
regression
g(
scala&#62
fns
f(n
haruki murakami
f(a
.map
vec
chroma


         
sqrt
cache-control
