# Hands on word embeddings

Pre-trained embeddings are available from many companies and organisations. You can adopt them directly, saving yourself some time and resources.

In [None]:
# Downloading the pre-trained word2vec vectors through nlpia's data loader (run it only once)
# This didn't work for me (and the book even warns that it might not work!),
# so the next cell loads the vectors from a local copy instead.
from nlpia.data.loaders import get_data
word_vectors = get_data('word2vec')

Once the resource has been downloaded, we have to load it

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# Local copy of the GoogleNews word2vec vectors (binary word2vec format)
GOOGLE_VECTORS = "/Users/albarron/corpora/embeddings/GoogleNews/GoogleNews-vectors-negative300.bin.gz"

# Maximum number of vectors read from the file (400k here).
# The aim is speeding up and saving some memory (just for the class)
VECTOR_LIMIT = 400000

word_vectors = KeyedVectors.load_word2vec_format(
    GOOGLE_VECTORS, binary=True, limit=VECTOR_LIMIT)
# Back to the slides

Retrieving the most similar vectors

In [None]:
# Top-5 words closest to 'cooking' and 'potatoes' combined
word_vectors.most_similar(positive=['cooking', 'potatoes'], topn=5)

In [None]:
# Top-5 words closest to 'cooking' on its own
word_vectors.most_similar(positive=['cooking'], topn=5)

In [None]:
# Single closest word to 'bush' and 'clinton' combined
word_vectors.most_similar(positive=['bush', 'clinton'], topn=1) # not there with 200k
# Alternative query to try:
# word_vectors.most_similar(positive=['bush', 'president'], topn=1)

In [None]:
# Top-3 words closest to 'bologna' and 'pasta' combined
word_vectors.most_similar(positive=['bologna', 'pasta'], topn=3)

In [None]:
# Top-3 words closest to 'chicago' and 'football' combined
word_vectors.most_similar(positive=['chicago', 'football'], topn=3)

In [None]:
# Something else? Here: top-3 words closest to 'china' and 'italy' combined
word_vectors.most_similar(positive=["china", "italy"] , topn=3)

Retrieving the most similar vectors, after subtraction

In [None]:
# 'germany' and 'france' pull the query towards them, 'europe' pushes it away
# Not Germany with 200k
word_vectors.most_similar(positive=['germany', 'france'], negative=['europe'], topn=3)

In [None]:
# Same pattern: 'spain' and 'america' as positive words, 'europe' as the negative one
word_vectors.most_similar(positive=['spain', 'america'], negative=['europe'], topn=3)

Finding the "outlier" (or indeed the least similar word)

In [None]:
# Which of these four words does not go with the others?
word_vectors.doesnt_match("potatoes milk cake computer".split())

In [None]:
# No obvious outlier here: see which one the model picks
word_vectors.doesnt_match("spanish italian french".split())

In [None]:
# Outlier among four drinks
word_vectors.doesnt_match("beer wine spritz water".split())

Adding and subtracting

In [None]:
# The classic analogy: 'king' - 'man' + 'woman'
word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=2)

In [None]:
# 'pizza' + 'mozzarella' - 'pineapple'
word_vectors.most_similar(positive=['pizza', 'mozzarella'], negative=['pineapple'], topn=3)

In [None]:
# 'york' + 'mafia' - 'italy'
word_vectors.most_similar(positive=['york', 'mafia'], negative=['italy'], topn=3)

In [None]:
# Top-10 words closest to 'black'
word_vectors.most_similar(positive=['black'], topn=10)

In [None]:
# Some other interesting example? Replace the words below with your own.
# (most_similar needs at least one input word, so the query cannot be left empty.)
# Here: the king/queen analogy in the other direction, 'queen' - 'woman' + 'man'
word_vectors.most_similar(positive=['queen', 'man'], negative=['woman'], topn=2)

Similarity between two words

In [None]:
# Cosine similarity between the vectors of the two words
word_vectors.similarity('princess', 'queen')

In [None]:
# Cosine similarity between 'prince' and 'frog'
word_vectors.similarity('prince', 'frog')

In [None]:
# Cosine similarity between 'god' and 'monster'
word_vectors.similarity('god', 'monster')

In [None]:
# Cosine similarity between 'gaze' and 'watch'
word_vectors.similarity('gaze', 'watch')

In [None]:
# Cosine similarity between 'frog' and 'toad'
word_vectors.similarity('frog', 'toad')

In [None]:
# Cosine similarity between 'headache' and 'flu'
word_vectors.similarity('headache', 'flu')

In [None]:
# Capitalised words are looked up exactly as written
word_vectors.similarity('Aztec', 'Mayan')

In [None]:
# Cosine similarity between 'Rome' and 'Athens'
word_vectors.similarity('Rome', 'Athens')

In [None]:
# Cosine similarity between 'automobile' and 'car'
word_vectors.similarity('automobile', 'car')

In [None]:
# Cosine similarity between 'rail' and 'train'
word_vectors.similarity('rail', 'train')

In [None]:
# Some other interesting example? Replace the two words below with your own.
# (similarity needs two words from the vocabulary; None is not a valid key.)
word_vectors.similarity('pasta', 'pizza')

Accessing the actual vectors

In [None]:
# Indexing by word returns its embedding vector
word_vectors['phone']

## Training a word2vec model

In [None]:
# Setup: imports and hyper-parameters for training
from gensim.models.word2vec import Word2Vec
from nltk.corpus import brown

num_features = 300   # Dimensionality of the embedding space (length of each word vector)
min_word_count = 3   # Words appearing fewer times than this are meant to be discarded (depends on the size of the corpus)
num_workers = 2      # Number of worker threads (cores) to be used
window_size = 6      # Size of the context window
subsampling = 1e-3   # Threshold for configuring which higher-frequency words are randomly downsampled

In [None]:
# Loading some data: the sentences of the Brown corpus, as provided by NLTK

token_list = brown.sents()
len(token_list)  # number of sentences

In [None]:
# Peek at the loaded sentences
token_list

In [None]:
# Model initialisation (passing the sentences to the constructor also trains the model)
# I RAN THIS EARLIER. I won't do it now, as it takes a few minutes!!
# NOTE(review): min_count is commented out below, so min_word_count is not applied
# and gensim's default threshold is used instead; confirm this is intended.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
model = Word2Vec(
    token_list,
    workers=num_workers,
    size=num_features,
    #min_count=min_word_count,
    window=window_size,
    sample=subsampling)

In [None]:
# Precompute the L2-normalised vectors; replace=True keeps only the normalised ones,
# which saves memory but means the model can no longer be trained further
model.init_sims(replace=True)

In [None]:
# Saving the model to disk (the name is a path relative to the working directory)
model_name = "my_domain_specific_word2vec_model"
model.save(model_name)

In [None]:
# Loading the saved model back and querying it

model = Word2Vec.load(model_name)
model.wv.most_similar('brown')
# Query through model.wv: calling model.most_similar('brown') directly is deprecated
# (and no longer available in gensim 4)

## fastText

In [None]:
from gensim.models.fasttext import FastText
from gensim.models.fasttext import load_facebook_vectors

# Pre-trained English fastText model (wiki.en), in Facebook's native binary format
MODEL_PATH = "/Users/albarron/corpora/embeddings/PretrainedFastText/en/wiki.en.bin"
# load_facebook_vectors replaces the deprecated FastText.load_fasttext_format.
# It loads only the word vectors: enough for similarity queries, but the
# result cannot be trained further.
ft_model = load_facebook_vectors(MODEL_PATH)


In [None]:
# Querying an Italian word ('calcio' = football) against the model loaded above
ft_model.most_similar('calcio')

In [None]:
# Nearest neighbours of 'football'
ft_model.most_similar('football')

In [None]:
from gensim.models import fasttext
# Switching to the Italian fastText vectors (cc.it.300); this rebinds MODEL_PATH and ft_model
MODEL_PATH = "/Users/albarron/corpora/embeddings/FastText/it/cc.it.300.bin.gz"
# Alternative location:
# MODEL_PATH = "~/corpora/embeddings/FastText/cc.it.300.bin.gz"
ft_model = fasttext.load_facebook_vectors(MODEL_PATH)
# The query is run in the next cell:
# ft_model.most_similar('calcio')


In [None]:
# Same query as before, now against the vectors loaded in the previous cell
ft_model.most_similar('calcio')