## Use Glove Pretrained

**Objective: Use pretrained GloVe (Global Vectors) embeddings to find word analogies and nearest neighbours.**

The pretrained vectors are available at the following websites.

#### GloVe: https://nlp.stanford.edu/projects/glove/
#### Direct link: http://nlp.stanford.edu/data/glove.6B.zip
#### GitHub: https://github.com/stanfordnlp/GloVe

In [1]:
import numpy as np

In [2]:
# print() with parentheses runs on both Python 2 and Python 3
print(np.__version__)

1.14.2


# Load GloVe 

In [3]:
import os

In [6]:
# Directory holding the unzipped GloVe files. Set the GLOVE_DIR environment
# variable to point somewhere else; the default is the original local path.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/abalaji/myData/NLP/Glove/glove.6B')
os.listdir(glove_dir)

['glove.6B.100d.txt',
 'glove.6B.200d.txt',
 'glove.6B.300d.txt',
 'glove.6B.50d.txt']

In [10]:
# GloVe ships in several vector sizes (the folder above holds 50, 100, 200 and 300
# dimensions); the 50-d file is used here because it has the smaller footprint.
# The file is plain text, so print its first line to see the format:
# a word followed by the components of its vector.
!head -1 '/Users/abalaji/myData/NLP/Glove/glove.6B/glove.6B.50d.txt'

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581


In [15]:
# Module-level containers that the GloVe loading cell fills in
idx2word = list()   # row index -> word
embedding = list()  # row index -> vector
word2vec = dict()   # word -> vector

In [16]:
print('Loading word vectors...')
# Each line of the file is a word followed by the components of its vector.
# The directory can be overridden with the GLOVE_DIR environment variable;
# the default is the original local path.
dataLocation = os.path.join(
    os.environ.get('GLOVE_DIR', '/Users/abalaji/myData/NLP/Glove/glove.6B'),
    'glove.6B.50d.txt')

# Start from empty containers so that re-running this cell does not append a
# second copy of every vector, and so that it still works after `embedding`
# has been rebound to a numpy array.
word2vec = {}    # word -> vector
embedding = []   # row index -> vector
idx2word = []    # row index -> word

with open(dataLocation) as f:
    for line in f:
        # format: word followed by the vector components, whitespace separated
        tokens = line.split()
        if not tokens:
            # a blank line carries no word; tokens[0] would raise IndexError
            continue
        word = tokens[0]
        vector = np.asarray(tokens[1:], dtype='float32')

        # keep the three containers in step: the same position in
        # `embedding` and `idx2word` always refers to the same word
        word2vec[word] = vector
        embedding.append(vector)
        idx2word.append(word)


print('Done')
print('Length of dictionary %s' % len(word2vec))

Loading word vectors...
Done
Length of dictionary 400000


In [17]:
# Sanity check: at this point the vectors are still held in a plain Python list
type(embedding)

list

In [18]:
# Turn the list of per-word vectors into a single 2-D numpy array (one row per word)
embedding = np.array(embedding)

In [19]:
# Rows = number of words loaded, columns = vector dimensionality
embedding.shape

(400000, 50)

In [20]:
# Unpack the matrix shape: number of words (rows) and vector dimensionality (columns)
VocabSize, Dim = embedding.shape

In [21]:
# Formatted print() call: same "rows cols" text on both Python 2 and Python 3
print('%s %s' % (VocabSize, Dim))

400000 50


### Cosine Distance

In [24]:

# For a given vector, find the nearest one (by cosine distance) by searching
# through the global variable `embedding`.
from sklearn.metrics import pairwise_distances

In [49]:
# About np.argsort(): it returns the indices that would sort the array in
# ascending order. Here index 3 (value -1) comes first and index 5 (value 6) last.
demo_order = np.argsort(np.array([3, 1, 0, -1, 5, 6]))
print(demo_order)

[3 2 1 0 4 5]


In [75]:
# In a word analogy three words are given and the fourth one has to be found.
def find_analogies(w1, w2, w3):
    """Print the word whose vector is closest to ``w1 - w2 + w3``.

    Reads the module-level ``word2vec`` (word -> vector), ``embedding``
    (one vector per row) and ``idx2word`` (row index -> word).

    Parameters
    ----------
    w1, w2, w3 : str
        The three known words of the analogy.

    Returns
    -------
    None
        The result is printed, not returned. If any input word has no vector,
        ``'<word> not found'`` is printed for the first such word and nothing
        else happens.

    Notes
    -----
    The printed line lists ``w1``, ``w2``, ``w3`` and the found word in that
    order. The quantity actually computed is ``w1 - w2 + w3``, so the found
    word is the one nearest to that vector.
    """
    # sanity check: every query word needs a vector
    for word in (w1, w2, w3):
        if word not in word2vec:
            print('%s not found' % word)
            return

    # popular example: king - man + woman is expected to land near queen
    input_vector = word2vec[w1] - word2vec[w2] + word2vec[w3]

    # Cosine distance from the query vector to every row of `embedding`.
    # reshape(1, -1) and ravel() take the sizes from the arrays themselves,
    # so the function does not depend on shape globals set in another cell.
    distances = pairwise_distances(
        input_vector.reshape(1, -1), embedding, metric='cosine').ravel()

    # Look at the four nearest rows only: up to three of them can be the input
    # words themselves (assuming each word occurs once in idx2word), so the
    # first remaining one is the answer.
    best_word = None
    for index in distances.argsort()[:4]:
        word = idx2word[index]
        if word not in (w1, w2, w3):
            best_word = word
            break

    if best_word is None:
        # e.g. a vocabulary that holds nothing but the input words
        print('no analogy found for %s, %s, %s' % (w1, w2, w3))
        return

    print('%10s - %10s = %10s - %10s' % (w1, w2, w3, best_word))

In [76]:
# Each triple (w1, w2, w3) is one analogy query; they are run in the listed order.
# The last two triples repeat queries that already appear earlier in the list.
analogy_queries = [
    ('king', 'man', 'woman'),
    ('france', 'paris', 'london'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
    ('france', 'french', 'english'),
    ('japan', 'japanese', 'chinese'),
    ('japan', 'japanese', 'italian'),
    ('japan', 'japanese', 'australian'),
    ('december', 'november', 'june'),
    ('miami', 'florida', 'texas'),
    ('einstein', 'scientist', 'painter'),
    ('china', 'rice', 'bread'),
    ('man', 'woman', 'she'),
    ('man', 'woman', 'aunt'),
    ('man', 'woman', 'sister'),
    ('man', 'woman', 'wife'),
    ('man', 'woman', 'actress'),
    ('man', 'woman', 'mother'),
    ('heir', 'heiress', 'princess'),
    ('nephew', 'niece', 'aunt'),
    ('france', 'paris', 'tokyo'),
    ('france', 'paris', 'beijing'),
    ('february', 'january', 'november'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
]
for query in analogy_queries:
    find_analogies(*query)

      king -        man =      woman -      queen
    france -      paris =     london -    britain
    france -      paris =       rome -      italy
     paris -     france =      italy -       rome
    france -     french =    english -    england
     japan -   japanese =    chinese -      china
     japan -   japanese =    italian -      italy
     japan -   japanese = australian -  australia
  december -   november =       june -       july
     miami -    florida =      texas -    houston
  einstein -  scientist =    painter -    matisse
     china -       rice =      bread -    chinese
       man -      woman =        she -         he
       man -      woman =       aunt -      uncle
       man -      woman =     sister -    brother
       man -      woman =       wife -     friend
       man -      woman =    actress -      actor
       man -      woman =     mother -     father
      heir -    heiress =   princess -      queen
    nephew -      niece =       aunt -      uncle
