In [1]:
import gensim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim import models
import warnings
# Suppress every warning category for the rest of the session, then print the
# installed gensim version so the recorded outputs can be tied to a release.
warnings.simplefilter('ignore')
print(gensim.__version__)

3.8.3


# load model

In [2]:
# Both embedding sets are read in word2vec format from the relative models/
# directory: the Google file is loaded as binary, the fastText file as text.
googleModel = models.KeyedVectors.load_word2vec_format(
    r'models/google-negative300.bin',
    binary=True,
)
fasttextModel = models.KeyedVectors.load_word2vec_format(
    r'models/fasttext-wiki-news-subwords-300.gz',
    binary=False,
)

In [3]:
# The two short phrases whose similarity is measured below.
# Longer alternative inputs, kept disabled for reference:
# doc1 = "Soup is a primarily liquid food, generally served warm or hot (but may be cool or cold), that is made by combining ingredients of meat or vegetables with stock, juice, water, or another liquid."
# doc2 = "Noodles are a staple food in many cultures. They are made from unleavened dough which is stretched, extruded, or rolled flat and cut into one of a variety of shapes."
doc1, doc2 = "girl in a pink dress", "man in a pink shirt"

# Documents and dictionary

In [4]:
# Collect the raw texts, then build the token-to-id dictionary from their
# simple_preprocess token lists.
documents = [doc1, doc2]
dictionary = corpora.Dictionary(list(map(simple_preprocess, documents)))

# Bag-of-words corpus and TF-IDF model (passed to the similarity matrices below)

In [5]:
# Tokenize each document and convert it to a bag-of-words vector, then fit a
# TF-IDF model on the resulting corpus.
corpus = list(map(dictionary.doc2bow, map(simple_preprocess, documents)))
tfidfModel = TfidfModel(corpus)

# similarity matrix

In [6]:
# One term-similarity matrix per embedding model, built over the same
# dictionary and TF-IDF model with identical settings.
# NOTE(review): similarity_matrix appears to be deprecated in gensim 3.8 and
# absent from 4.x — confirm before upgrading beyond the 3.8.3 used here.
simMatFastText = fasttextModel.similarity_matrix(
    dictionary,
    tfidf=tfidfModel,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)
simMatGoogle = googleModel.similarity_matrix(
    dictionary,
    tfidf=tfidfModel,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

# Preprocess the documents and convert them to bag-of-words vectors

In [7]:
# Tokenize each phrase and map it onto the shared dictionary as a
# bag-of-words vector.
processedDoc1, processedDoc2 = (
    dictionary.doc2bow(simple_preprocess(text)) for text in (doc1, doc2)
)

In [8]:
# Soft cosine similarity of the two bag-of-words vectors, reported once per
# embedding-derived similarity matrix.
print(
    "fasttext ",
    softcossim(processedDoc1, processedDoc2, simMatFastText),
)
print(
    "google ",
    softcossim(processedDoc1, processedDoc2, simMatGoogle),
)


fasttext  0.6932604312896729
google  0.5876667499542236


- MatrixSimilarity: Index similarity (dense with cosine distance).

- SparseMatrixSimilarity: Index similarity (sparse with cosine distance).

- WmdSimilarity: Index similarity (with word-mover distance).

### similar words test

In [11]:
# Run the same analogy-style query against each model in turn (Google first,
# then fastText) and print the top-ranked (key, similarity) pair.
# The generator keeps the query lazy, so each model is queried immediately
# before its own line is printed; `result` ends up holding the fastText list.
for result in (
    embedding.most_similar(positive=['adult', 'old'], negative=['women'])
    for embedding in (googleModel, fasttextModel)
):
    print("most_similar_key, similarity : ", result[0])


most_similar_key, similarity :  ('yearold', 0.4635971188545227)
most_similar_key, similarity :  ('older', 0.5929399728775024)
