In [2]:
#TFIDF

documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
processed_docs = [doc.lower().replace(".","") for doc in documents]
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

#IDF for all words in the vocabulary
print("IDF for all words in the vocabulary",tfidf.idf_)
print("-"*10)
#All words in the vocabulary.
print("All words in the vocabulary",tfidf.get_feature_names())
print("-"*10)

#TFIDF representation for all documents in our corpus 
print("TFIDF representation for all documents in our corpus\n",bow_rep_tfidf.toarray()) 
print("-"*10)

temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

IDF for all words in the vocabulary [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
----------
All words in the vocabulary ['bites', 'dog', 'eats', 'food', 'man', 'meat']
----------
TFIDF representation for all documents in our corpus
 [[0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.         0.44809973 0.55349232 0.         0.         0.70203482]
 [0.         0.         0.55349232 0.70203482 0.44809973 0.        ]]
----------
Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]




In [None]:
# Using gensim, spacy, nltk to train a doc2vec model

In [None]:
#Packages needed for this experiment
#!pip install gensim==3.6.0
#!pip install spacy==2.2.4
#!pip install nltk==3.2.5

In [2]:
import warnings
warnings.filterwarnings('ignore')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiv1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
data = ["dog bites man",
        "man bites dog",
        "dog eats meat",
        "man eats food",
        "man is not sheep",
        "king is made",
        "queen is chosen"]

tagged_data = [TaggedDocument(words=word_tokenize(word.lower()), tags=[str(i)]) for i, word in enumerate(data)]

In [16]:
tagged_data

[TaggedDocument(words=['dog', 'bites', 'man'], tags=['0']),
 TaggedDocument(words=['man', 'bites', 'dog'], tags=['1']),
 TaggedDocument(words=['dog', 'eats', 'meat'], tags=['2']),
 TaggedDocument(words=['man', 'eats', 'food'], tags=['3']),
 TaggedDocument(words=['man', 'is', 'not', 'sheep'], tags=['4']),
 TaggedDocument(words=['king', 'is', 'made'], tags=['5']),
 TaggedDocument(words=['queen', 'is', 'chosen'], tags=['6'])]

In [17]:
#dbow
model_dbow = Doc2Vec(tagged_data,vector_size=20, min_count=1, epochs=2,dm=0)

In [18]:
# Feature vector of man eats food
print(model_dbow.infer_vector(['man','eats','food']))

[ 0.01955524 -0.01711161 -0.01388226 -0.00497426  0.00237928 -0.00134615
 -0.02040341 -0.02425806  0.01450477 -0.018003    0.00464677 -0.00692584
  0.01673512  0.01154605 -0.02399587 -0.01162305  0.01260396  0.01774428
  0.01493157  0.02138604]


In [20]:
# Top 5 most simlar words
model_dbow.wv.most_similar("man",topn=5)

[('king', 0.4304395914077759),
 ('chosen', 0.2972422242164612),
 ('dog', 0.25138211250305176),
 ('food', 0.20270395278930664),
 ('not', 0.1977836638689041)]

In [21]:
# Check similarity score between dog and man
model_dbow.wv.n_similarity(["dog"],["man"])

0.2513821

In [22]:
#dm
model_dm = Doc2Vec(tagged_data, min_count=1, vector_size=20, epochs=2,dm=1)

print("Inference Vector of man eats food\n ",model_dm.infer_vector(['man','eats','food']))

print("Most similar words to cat in food corpus\n",model_dm.wv.most_similar("food",topn=5))
print("Similarity between man and queen: ",model_dm.wv.n_similarity(["queen"],["man"]))

Inference Vector of man eats food
  [ 0.01956423 -0.01711311 -0.01390795 -0.00498287  0.00240287 -0.00136158
 -0.02038174 -0.02426959  0.01448358 -0.01801188  0.0046541  -0.00692043
  0.01671058  0.01152617 -0.02401017 -0.01160503  0.01260363  0.01773504
  0.01495108  0.02139715]
Most similar words to cat in food corpus
 [('king', 0.3562750220298767), ('man', 0.20270395278930664), ('made', 0.16116014122962952), ('not', 0.13961383700370789), ('queen', 0.1377629041671753)]
Similarity between man and queen:  -0.066151686


In [11]:
# Error when we compare words not in vocabulary
# model_dm.wv.n_similarity(['covid'],['man'])