# Word2Vec using Google's Pre-Trained ML Model
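- Google has published a pre-trained Word2Vec model. `gensim.models.Word2Vec` is the class used to train a new model; the pre-trained vectors are loaded through `gensim.downloader` instead (see Step-1 below).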
- This model uses pre-trained vectors trained on Google's News dataset (about 100 billion words).
- This model contains 300-dimensional vectors for 3 million words and phrases.
- Every word vector returned by this model has 300 dimensions. This size was fixed by Google when the model was trained and cannot be changed by us.


In [22]:
import gensim
from gensim.models import Word2Vec, keyedvectors
import gensim.downloader as api
import numpy as np


In [None]:
# Step-1) Fetch and load Google's pre-trained News Word2Vec vectors via gensim's downloader.
# "word2vec-google-news-300" is the downloader's name for that pre-trained model.
# The download is about 1600 MB, so this call may take a few minutes to finish.
google_pretrained_model = api.load(name="word2vec-google-news-300")

In [None]:
# Step-2) Use the pre-defined word functions provided by this model.

# Step 2.1) Look up the vector stored for a word ('king' in this case)
vector_king = google_pretrained_model.get_vector('king')
print(f"Vector representation of word 'king' is : \n{vector_king}")

# Inspect the shape of that vector.
# - It is always 300 dimensions: the size was configured by Google when the
#   model was trained and cannot be changed by us.
print('vector_king.shape = ', vector_king.shape)


Vector representation of word 'king' is : 
[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-0

In [None]:
# Step 2.2) Look up the words closest to a given word (here 'cricket')
# - topn=5 keeps only the five highest-scoring matches
similar_words = google_pretrained_model.most_similar(positive=['cricket'], topn=5)
print(f"\nTop 5 words similar to 'cricket' are : \n{similar_words}")


Top 5 words similar to 'cricket' are : 
[('cricketing', 0.8372225761413574), ('cricketers', 0.8165745735168457), ('Test_cricket', 0.8094819784164429), ('Twenty##_cricket', 0.8068488240242004), ('Twenty##', 0.7624265551567078)]


In [None]:
# Step 2.3) Score how similar two words are (here 'hockey' and 'sports')
# Output observation: the recorded score is about 0.535 (a similarity score, not a literal percentage).
# The call is left as the last expression so the notebook displays its value.
google_pretrained_model.similarity(w1='hockey', w2='sports')

np.float32(0.5354152)

In [None]:
# Step 2.4) Addition and subtraction of vectors
# Build a candidate 'queen' vector with the analogy: king - man + woman.
vector_queen_generated = google_pretrained_model['king'] - google_pretrained_model['man'] + google_pretrained_model['woman']
print(f"\nVector representation of word 'queen' is : \n{vector_queen_generated}")

# Scale the generated vector to unit length (divide by its L2 norm).
# This name is passed to most_similar() below, so it must be defined in this
# cell; otherwise the cell fails with a NameError on a fresh kernel.
vector_queen_generated_normalized = vector_queen_generated / np.linalg.norm(vector_queen_generated)

# Get similar vectors to vector_queen_generated
# Output observation: king and queen are most similar vectors to 'vector_queen_generated'
similar_vectors_to_vector_queen_generated = google_pretrained_model.most_similar(positive=[vector_queen_generated_normalized], topn=10)
print(f"\nTop 10 words similar to generated 'queen' vector are : \n{similar_vectors_to_vector_queen_generated}")


# Get the actual queen vector from Google's model
vector_queen_actual = google_pretrained_model['queen']

# Test for similarity between the generated vector and actual 'queen' vector
# Output observation: Similarity = 0.73, i.e. 73 % similar
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    The dot product of the two inputs is divided by the product of their
    norms as computed by ``np.linalg.norm``.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

# Score the vector built by arithmetic against the model's own 'queen' vector.
similarity_between_generated_and_actual_queen_vectors = cosine_similarity(
    vec1=vector_queen_generated,
    vec2=vector_queen_actual,
)
print(
    "Cosine Similarity between generated and actual 'queen' vectors:",
    similarity_between_generated_and_actual_queen_vectors,
)


Vector representation of word 'queen' is : 
[ 4.29687500e-02 -1.78222656e-01 -1.29089355e-01  1.15234375e-01
  2.68554688e-03 -1.02294922e-01  1.95800781e-01 -1.79504395e-01
  1.95312500e-02  4.09919739e-01 -3.68164062e-01 -3.96484375e-01
 -1.56738281e-01  1.46484375e-03 -9.30175781e-02 -1.16455078e-01
 -5.51757812e-02 -1.07574463e-01  7.91015625e-02  1.98974609e-01
  2.38525391e-01  6.34002686e-02 -2.17285156e-02  0.00000000e+00
  4.72412109e-02 -2.17773438e-01 -3.44726562e-01  6.37207031e-02
  3.16406250e-01 -1.97631836e-01  8.59375000e-02 -8.11767578e-02
 -3.71093750e-02  3.15551758e-01 -3.41796875e-01 -4.68750000e-02
  9.76562500e-02  8.39843750e-02 -9.71679688e-02  5.17578125e-02
 -5.00488281e-02 -2.20947266e-01  2.29492188e-01  1.26403809e-01
  2.49023438e-01  2.09960938e-02 -1.09863281e-01  5.81054688e-02
 -3.35693359e-02  1.29577637e-01  2.41699219e-02  3.48129272e-02
 -2.60009766e-01  2.42309570e-01 -3.21777344e-01  1.45416260e-02
 -1.59179688e-01 -8.37402344e-02  1.65039062e