
We will create an embedding using a small text corpus, called text8. The text8 dataset is the first $10^8$ bytes of the Large Text Compression Benchmark, which
consists of the first $10^9$ bytes of English Wikipedia [7]. The text8 dataset is accessible from within the gensim API as an iterable of tokens, essentially a list of tokenized sentences.

[GitHub](https://github.com/PacktPublishing/Deep-Learning-with-TensorFlow-2-and-Keras/blob/master/Chapter%207/create_embedding_with_text8.py)



In [0]:
!mkdir data

In [3]:
import gensim.downloader as api
from gensim.models import Word2Vec

# Sanity check: the downloader must return a non-empty description of "text8".
info = api.info("text8")
assert len(info) > 0

# Download and load the text8 corpus, then fit a Word2Vec embedding on it
# using the constructor's default settings.
dataset = api.load("text8")
model = Word2Vec(dataset)

# Persist the trained model under data/ so that it can be reloaded later.
model.save("data/text8-word2vec.bin")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Let us now explore the saved model

In [4]:
from gensim.models import KeyedVectors

def print_most_similar(word_conf_pairs, k):
    """Print at most ``k`` leading ``(word, score)`` pairs, one per line.

    Each pair is printed as ``"<score to 3 decimals> <word>"``. If the input
    holds more than ``k`` pairs, a final ``"..."`` line marks the truncation.

    Args:
        word_conf_pairs: Sized iterable of ``(word, score)`` pairs; ``word``
            must be a ``str`` and ``score`` a number.
        k: Maximum number of pairs to print. Values ``<= 0`` print no pairs.

    Returns:
        None. The output goes to stdout, so do not wrap calls in ``print()``.
    """
    for i, (word, conf) in enumerate(word_conf_pairs):
        # Check the limit *before* printing so that k <= 0 prints nothing.
        if i >= k:
            break
        print("{:.3f} {:s}".format(conf, word))
    if k < len(word_conf_pairs):
        print("...")



# Reload the model saved to disk; the word vectors themselves are exposed
# through the loaded object's `.wv` attribute.
model = KeyedVectors.load("data/text8-word2vec.bin")
word_vectors = model.wv

# get words in the vocabulary
# NOTE(review): `vocab` is the gensim 3.x attribute; gensim 4 is understood to
# expose the same mapping as `key_to_index` -- confirm against the installed
# version before upgrading.
words = word_vectors.vocab.keys()
print([x for i, x in enumerate(words) if i < 10])
assert "king" in words


print("# words similar to king")
print_most_similar(word_vectors.most_similar("king"), 5)

# Analogy query: vectors of the `positive` words are added and those of the
# `negative` words subtracted before looking up the nearest neighbours.
print("# vector arithmetic with words (cosine similarity)")
print("# france + berlin - paris = ?")
print_most_similar(
    word_vectors.most_similar(
        positive=["france", "berlin"], negative=["paris"]),
    1,
)

# Same analogy, scored with the multiplicative objective instead.
print("# vector arithmetic with words (Levy and Goldberg)")
print("# france + berlin - paris = ?")
print_most_similar(
    word_vectors.most_similar_cosmul(
        positive=["france", "berlin"], negative=["paris"]),
    1,
)

print("# find odd one out")
print("# [hindus, parsis, singapore, christians]")
print(word_vectors.doesnt_match(
    ["hindus", "parsis", "singapore", "christians"]))

print("# similarity between words")
for word in ["woman", "dog", "whale", "tree"]:
    print("similarity({:s}, {:s}) = {:.3f}".format(
        "man", word,
        word_vectors.similarity("man", word)
    ))

print("# similar by word")
# print_most_similar() writes to stdout itself and returns None, so it must
# not be wrapped in print() -- doing so emits a stray "None" line.
print_most_similar(word_vectors.similar_by_word("singapore"), 5)

print("# distance between vectors")
print("distance(singapore, malaysia) = {:.3f}".format(
    word_vectors.distance("singapore", "malaysia")
))

# Two ways of fetching the embedding of a single word.
vec_song = word_vectors["song"]
print("\n# output vector obtained directly, shape:", vec_song.shape)

# use_norm=True presumably requests the unit-normalised vector -- verify
# against the gensim 3.x `word_vec` documentation.
vec_song_2 = word_vectors.word_vec("song", use_norm=True)
print("# output vector obtained using word_vec, shape:", vec_song_2.shape)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
# words similar to king
0.738 queen
0.732 prince
0.713 emperor
0.699 kings
0.686 throne
...
# vector arithmetic with words (cosine similarity)
# france + berlin - paris = ?
0.796 germany
...
# vector arithmetic with words (Levy and Goldberg)
# france + berlin - paris = ?
0.929 germany
...
# find odd one out
# [hindus, parsis, singapore, christians]
singapore
# similarity between words
similarity(man, woman) = 0.741
similarity(man, dog) = 0.442
similarity(man, whale) = 0.262
similarity(man, tree) = 0.255
# similar by word
0.865 malaysia
0.827 indonesia
0.826 uganda
0.821 tanzania
0.819 zimbabwe
...
None
# distance between vectors
distance(singapore, malaysia) = 0.135

# output vector obtained directly, shape: (100,)
# output vector obtained using word_vec, shape: (100,)


  if np.issubdtype(vec.dtype, np.int):
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
