<h2>Compare new sentences with a list of fixed words,<br/>
    using fastText word embeddings</h2>

In [5]:
import fasttext
import fasttext.util
import numpy as np
from collections import OrderedDict
from operator import itemgetter

In [2]:
def cosine_similarity(a, b):
    """
    Return the cosine similarity of two vectors a and b.

    Derived from the definition of the dot product,
    a . b = |a| * |b| * cos(theta)
    (https://masongallo.github.io/machine/learning,/python/2016/07/29/cosine-similarity.html)

    Parameters
    ----------
    a, b : array_like
        Vectors accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        ``np.dot(a, b) / (norm(a) * norm(b))``. If either vector has zero
        norm the angle is undefined, and 0.0 is returned instead of the
        ``nan`` (plus RuntimeWarning) that the raw 0/0 division produces.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        # A nan score would make any later sort-by-similarity ill-defined,
        # so treat "no direction" as "no similarity".
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)

In [3]:
def compare_sentence(s, words_vectors, model=None):
    """
    Rank the reference words by cosine similarity to a new sentence/word.

    Parameters
    ----------
    s : str
        Sentence (or single word) to embed with ``get_sentence_vector``.
    words_vectors : dict
        Mapping of reference word -> embedding vector.
    model : optional
        Object exposing ``get_sentence_vector``. When omitted, the
        notebook-level ``ft`` model is used, so the cell that loads it
        must have been executed before calling this function.

    Returns
    -------
    list of (word, similarity) tuples
        Sorted from the most to the least similar reference word.
    """
    if model is None:
        # Resolved at call time rather than definition time, so this
        # function can be defined before the model is loaded.
        model = ft
    sentence_vec = model.get_sentence_vector(s)
    # sorted() already yields an ordered list, so no intermediate
    # (Ordered)dict is needed to hold the scores.
    scored = (
        (word, cosine_similarity(sentence_vec, word_vec))
        for word, word_vec in words_vectors.items()
    )
    return sorted(scored, key=itemgetter(1), reverse=True)

In [6]:
# Download the pretrained English fastText model.
# NOTE(review): if_exists='ignore' presumably skips the download when the
# model file is already present locally — confirm against the fastText docs.
fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the model binary; `ft` is the model object that compare_sentence and
# the word-vector cell rely on, so this cell must run before they are used.
ft = fasttext.load_model('cc.en.300.bin')

In [8]:
# Fixed reference words that every new sentence is compared against
words_list = ["fruit", "metal", "books"]

In [9]:
# Map each reference word to its fastText word embedding
words_vectors = dict(
    (word, ft.get_word_vector(word)) for word in words_list
)

In [11]:
# Example 1: a sentence about fruit juice
compare_sentence(s='I like pineapple juice', words_vectors=words_vectors)

[('fruit', 0.49017134), ('metal', 0.26521403), ('books', 0.1630114)]

In [12]:
# Example 2: a sentence about iron extraction
compare_sentence(
    s='iron extraction is a dangerous activity',
    words_vectors=words_vectors,
)

[('metal', 0.36723247), ('fruit', 0.27444276), ('books', 0.09892692)]

In [13]:
# Example 3: a sentence about a novel and its author
compare_sentence(
    s='Alice in Wonderland was written in 1865 by Lewis Carroll',
    words_vectors=words_vectors,
)

[('books', 0.22377338), ('metal', 0.109032266), ('fruit', 0.07240309)]