In [1]:
"""
-----------------------------------------------------------------------------
Word2vec test demo
-----------------------------------------------------------------------------
AUTHOR: Soumitra Samanta (soumitra.samanta@gm.rkmvu.ac.in)
-----------------------------------------------------------------------------
Package required:
Numpy: https://numpy.org/
scikit-learn: https://scikit-learn.org/stable/
gensim: https://radimrehurek.com/gensim/
-----------------------------------------------------------------------------
"""
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

# import gensim.downloader

In [2]:
import gensim.downloader

# List all the pre-trained model for word representation available in gensim

In [3]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


# Load one model

In [4]:

word2vec_model = gensim.downloader.load('word2vec-ruscorpora-300')



# Find similar words

In [5]:
word2vec_model.most_similar('Swami_Vivekananda')

KeyError: "Key 'Swami_Vivekananda' not present in vocabulary"

In [8]:
word2vec_model.most_similar('Swami_Vivekananda', topn=20)

In [9]:
word2vec_model.most_similar('Swami_Vivekanand', topn=20)

In [10]:
word2vec_model.most_similar('Vivekananda')

In [11]:
word2vec_model.most_similar('paratha')

In [12]:
word2vec_model.most_similar('bangla')

In [13]:
word2vec_model.most_similar('bengali')

In [14]:
word2vec_model.most_similar('English', topn=20)

# Find word analogy

In [15]:
def word_analogy(x1, x2, y1):
    result = word2vec_model.most_similar(positive=[y1, x2], negative=[x1])
    return result[0][0]

In [16]:
word_analogy('tall', 'tallest', 'long')

In [17]:
word_analogy('Kolkata', 'Durga_Puja', 'Delhi')

In [18]:
word_analogy('Vivekananda', 'Kolkata', 'Abdul_Kalam')

In [19]:
word_analogy('West_Bengal', 'Kolkata', 'Bihar')

In [20]:
word_analogy('West_Bengal', 'Barasat', 'Bihar')


In [21]:
word_analogy('Man', 'computer_programmer', 'woman')

In [22]:
word_analogy('Father', 'computer_programmer', 'Mother')

In [23]:
word_analogy('Father', 'doctor', 'Mother')

In [24]:
word_analogy('Vivekananda', 'monk', 'Abdul_Kalam')

# Display 2-dims projection of some words 

In [25]:
def display_pca_scatterplot(model, words=None, sample=0):
    if words == None:
        if sample > 0:
            words = np.random.choice(list(glove_vectors.key_to_index.keys()), sample)
        else:
            words = [ word for word in model.vocab ]
        
    word_vectors = np.array([model[w] for w in words])

    twodim = PCA().fit_transform(word_vectors)[:,:2]
    
    plt.figure(figsize=(10,10))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)

In [26]:
display_pca_scatterplot(word2vec_model, 
                        ['eye', 'blood', 'ear', 'hand', 'leg', 'lips', 'face',
                         'coffee', 'tea', 'water', 'paratha', 'idli', 'khichri',
                         'bed', 'bedroom', 'bottle', 'cup', 'comb', 'door',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute', 
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'cow', 
                         'Kolkata', 'Delhi', 'Mumbai', 'Chennai', 'Hyderabad', 'Ranchi',
                         'India', 'France', 'Germany', 'australia', 'Fiji', 'China'
                        ])