# word2vec
- Google's pretrained model, trained on roughly 100 billion words of Google News text; it stores vectors for about 3 million words and phrases
- similar words have similar vectors, and similarity can be measured using cosine similarity
- each vector has 300 dimensions

## Applications 
- text similarity
- word analogy 
- language translation
- finding odd words

# Word embeddings
- numerical representation of words as vectors of length 300
- we use the pretrained word2vec model
- we work with gensim, a popular NLP package

## Here we start the implementation

# Choose the odd one out

In [3]:
import gensim


In [5]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Load the pretrained GoogleNews vectors, stored in binary word2vec format.
GOOGLE_NEWS_PATH = "../GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_PATH, binary=True)

In [41]:
# Candidate words for the odd-one-out demo further down.
input1 = ["apple", "orange", "party", "juice", "guava"]

# Fetch four of the vectors for the pairwise similarity checks
# ("juice" is not looked up here).
word1, word2, word3, word4 = (
    word_vectors[w] for w in ("apple", "orange", "party", "guava")
)


In [42]:
# Pairwise cosine similarities: apple/orange, apple/party, orange/guava.
for left, right in ((word1, word2), (word1, word3), (word2, word4)):
    print(cosine_similarity([left], [right]))

[[0.39203462]]
[[0.06456922]]
[[0.3344945]]


In [43]:
import numpy as np

In [78]:
def odd_one_out(words, vectors=None):
    """Return the word whose vector is least similar to the group's mean vector.

    Parameters
    ----------
    words : iterable of str
        Candidate words. Every word must be present in the vector lookup.
    vectors : mapping-like, optional
        Any object supporting ``vectors[word] -> 1-D vector``. Defaults to the
        notebook-level ``word_vectors`` model, so existing calls keep working.

    Returns
    -------
    str or None
        The word with the lowest cosine similarity to the mean vector (the
        first such word wins ties), or ``None`` when ``words`` is empty.

    Raises
    ------
    KeyError
        If a word is missing from the vector lookup.

    Notes
    -----
    Prints each word's similarity to the mean as a side effect.
    """
    if vectors is None:
        vectors = word_vectors
    words = list(words)
    if not words:
        return None

    all_vectors = [np.asarray(vectors[w], dtype=float) for w in words]
    mean_vector = np.mean(all_vectors, axis=0)
    mean_norm = np.linalg.norm(mean_vector)

    odd_word = None
    # Start from +inf rather than 1.0: with a 1.0 sentinel and a strict "<",
    # a similarity of exactly 1.0 (e.g. a single-word list) never wins and the
    # function would return None.
    min_similarity = float("inf")
    for w, vec in zip(words, all_vectors):
        denom = mean_norm * np.linalg.norm(vec)
        # Plain Python float so "%.2f" formatting and comparisons are scalar;
        # a zero-length vector is treated as similarity 0.
        sim = float(np.dot(mean_vector, vec) / denom) if denom else 0.0
        print("for the word %s the similarity is %.2f " % (w, sim))
        if sim < min_similarity:
            min_similarity = sim
            odd_word = w
    return odd_word

In [79]:
# Activity-themed words.
input2 = [
    "sleep",
    "dance",
    "dancer",
    "music",
    "party",
]
# Four country names and one city name, all lower-case.
input3 = [
    "russia",
    "india",
    "paris",
    "germany",
    "france",
]


In [80]:
# Fruit-related words plus "party"; the recorded run returns 'party'.
odd_one_out(input1)

for the word apple the similarity is 0.76 
for the word orange the similarity is 0.67 
for the word party the similarity is 0.35 
for the word juice the similarity is 0.71 
for the word guava the similarity is 0.75 


'party'

In [81]:
# In the recorded run "party" (0.42) and "sleep" (0.48) both sit far from the mean.
odd_one_out(input2)

for the word sleep the similarity is 0.48 
for the word dance the similarity is 0.83 
for the word dancer the similarity is 0.74 
for the word music the similarity is 0.65 
for the word party the similarity is 0.42 


'party'

In [82]:
# Countries with one city mixed in; the recorded run returns 'paris'.
odd_one_out(input3)

for the word russia the similarity is 0.79 
for the word india the similarity is 0.81 
for the word paris the similarity is 0.75 
for the word germany the similarity is 0.84 
for the word france the similarity is 0.81 


'paris'

In [94]:
# Upper-case "USA" against two lower-case place names.
# NOTE(review): "gujrat" is presumably meant to be "gujarat" — confirm which
# spelling exists in the model's vocabulary before trusting this result.
input4 = ["USA", "rajasthan", "gujrat"]
odd_one_out(input4)

for the word USA the similarity is 0.70 
for the word rajasthan the similarity is 0.68 
for the word gujrat the similarity is 0.61 


'gujrat'

# Word analogies
- for example, given the analogy man : woman :: prince : a, predict the missing word a

In [101]:
def predict(a, b, c, word_vectors):
    """Solve the analogy ``a : b :: c : ?`` by scanning the vocabulary.

    The answer is the word ``d`` whose offset ``c - d`` has the highest cosine
    similarity to the offset ``a - b``. The three query words are lower-cased
    before lookup and are never returned as the answer.

    Parameters
    ----------
    a, b, c : str
        Query words of the analogy.
    word_vectors : gensim ``KeyedVectors`` or mapping
        Must support ``word_vectors[word] -> 1-D vector``. The vocabulary is
        taken from ``key_to_index`` (gensim >= 4), else ``vocab`` (older
        gensim), else by iterating the object itself (e.g. a plain dict).

    Returns
    -------
    str or None
        The best-matching word, or ``None`` if the vocabulary holds no word
        other than ``a``, ``b`` and ``c``.

    Raises
    ------
    KeyError
        If the lower-cased ``a``, ``b`` or ``c`` is missing from the vectors.
    """
    a, b, c = a.lower(), b.lower(), c.lower()

    # ``vocab`` was removed in gensim 4 in favour of ``key_to_index``.
    vocab = getattr(word_vectors, "key_to_index", None)
    if vocab is None:
        vocab = getattr(word_vectors, "vocab", None)
    if vocab is None:
        vocab = word_vectors

    wc = np.asarray(word_vectors[c], dtype=float)
    rel1 = (np.asarray(word_vectors[a], dtype=float)
            - np.asarray(word_vectors[b], dtype=float))
    rel1_norm = np.linalg.norm(rel1)  # loop-invariant, computed once
    excluded = {a, b, c}

    d = None
    similarity = float("-inf")

    # Find d such that a - b is most similar (cosine) to c - d.
    for w in vocab:
        # Skip the query words. The previous ``for w in [a, b, c]: continue``
        # rebound ``w`` to ``c`` on every pass instead of skipping, so the
        # function always returned ``c``.
        if w in excluded:
            continue
        rel2 = wc - np.asarray(word_vectors[w], dtype=float)
        denom = rel1_norm * np.linalg.norm(rel2)
        # A zero-length offset is treated as similarity 0.
        sim = float(np.dot(rel1, rel2) / denom) if denom else 0.0
        if sim > similarity:
            similarity = sim
            d = w
    return d


In [102]:
# Analogy query man : woman :: prince : ?
triad1 = ["man", "woman", "prince"]
predict(*triad1, word_vectors)

man woman prince


# Using most similar method

In [99]:
# Built-in analogy lookup: woman + king - man, keeping only the single best match.
word_vectors.most_similar(
    positive=["woman", "king"],
    negative=["man"],
    topn=1,
)

[('queen', 0.7118192911148071)]