In [1]:
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Sample documents: a tiny three-sentence corpus that later cells feed to the
# vectorizers and to the spaCy pipeline.
# NOTE(review): "do" (second sentence) and "ct" (third sentence) look like typos
# for "dog" and "cat" — confirm. As written, "cat" and "dog" each occur in only one
# document, and "do" / "ct" become vocabulary terms of their own. The strings are
# left untouched here because the recorded outputs were produced from them.
docs = [
    "The cat sat on the mat.",
    "The do lay on the mat.",
    "The ct and dog played together."
]

In [3]:
# ==== Bag of Words ====
# Fit a CountVectorizer on the corpus and tabulate the result:
# one row per document, one column per vocabulary term.
print("\n ==== Bag of Words ====")
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(docs)
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
print(bow_df)


 ==== Bag of Words ====
   and  cat  ct  do  dog  lay  mat  on  played  sat  the  together
0    0    1   0   0    0    0    1   1       0    1    2         0
1    0    0   0   1    0    1    1   1       0    0    2         0
2    1    0   1   0    1    0    0   0       1    0    1         1


In [4]:
# ==== TF-IDF ====
# Same corpus, but vectorized with TfidfVectorizer; the resulting table has
# one row per document and one column per vocabulary term.
print("\n ==== TF-IDF ====")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
# Bare last expression so the notebook renders the frame with its rich display.
tfidf_df


 ==== TF-IDF ====


Unnamed: 0,and,cat,ct,do,dog,lay,mat,on,played,sat,the,together
0,0.0,0.468699,0.0,0.0,0.0,0.0,0.356457,0.356457,0.0,0.468699,0.553642,0.0
1,0.0,0.0,0.0,0.468699,0.0,0.468699,0.356457,0.356457,0.0,0.0,0.553642,0.0
2,0.432385,0.0,0.432385,0.0,0.432385,0.0,0.0,0.0,0.432385,0.0,0.255374,0.432385


In [None]:
# ==== Word2Vec using spaCy ====
# Load the `en_core_web_lg` spaCy pipeline; the resulting `nlp` object is reused by
# later cells to turn text into vectors.
# The model is a separate package (the recorded output shows en-core-web-lg 3.8.0,
# a ~400 MB wheel, being downloaded), so it has to be installed in the kernel's
# environment before this cell can run.
# NOTE(review): this cell has no execution count (`In [None]`) and no visible
# install step — confirm the notebook survives Restart Kernel -> Run All.
print(" ==== Word2Vec (spaCy averaged vectors ) ====")
nlp = spacy.load("en_core_web_lg")

 ==== Word2Vec (spaCy averaged vectors ) ====
Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     - ------------------------------------ 19.4/400.7 MB 95.6 MB/s eta 0:00:04
     -- ----------------------------------- 28.0/400.7 MB 67.5 MB/s eta 0:00:06
     --- ---------------------------------- 35.1/400.7 MB 55.0 MB/s eta 0:00:07
     --- ---------------------------------- 41.4/400.7 MB 48.3 MB/s eta 0:00:08
     ---- --------------------------------- 47.4/400.7 MB 44.2 MB/s eta 0:00:08
     ---- --------------------------------- 51.4/400.7 MB 40.1 MB/s eta 0:00:09
     ----- -------------------------------- 54.8/400.7 MB 36.3 MB/s eta 0:00:10
     ----- -------------------------------- 59.0/400.7 MB 34.2 MB/s eta 0:00:1

In [7]:
def document_vector(doc):
    """Run the text ``doc`` through the spaCy pipeline and return its ``.vector``.

    Relies on the module-level ``nlp`` object created by ``spacy.load`` in an
    earlier cell.
    """
    parsed = nlp(doc)
    return parsed.vector

In [8]:
# Encode every document with document_vector and collect the results as rows
# of a single array.
word2vec_matrix = np.array(list(map(document_vector, docs)))
print("Shape of Word2Vec matrix : ", word2vec_matrix.shape)

# ==== Document Similarity using Word2Vec ====
# Pairwise cosine similarity between the document vectors, labelled
# "Doc 1" .. "Doc N" on both axes.
print("Word2 Vec Document Similarity Matrix : ")
doc_labels = [f'Doc {position}' for position in range(1, len(docs) + 1)]
similarity_df = pd.DataFrame(
    cosine_similarity(word2vec_matrix),
    index=doc_labels,
    columns=doc_labels,
)
print(similarity_df)

Shape of Word2Vec matrix :  (3, 300)
Word2 Vec Document Similarity Matrix : 
          Doc 1     Doc 2     Doc 3
Doc 1  1.000000  0.922870  0.783612
Doc 2  0.922870  1.000000  0.753228
Doc 3  0.783612  0.753228  1.000000


In [9]:
# ==== Word Similarity 'cat' vs 'dog' ====

print("==== Word Similarity : 'cat' vs 'dog' ====")

# ------ Using BOW ------
# Each word is represented as a one-hot vector over the fitted BoW vocabulary
# (left all-zero when the word is not in the vocabulary); the two vectors are
# then compared with cosine similarity.
bow_vocab = bow_vectorizer.vocabulary_
vocab_size = len(bow_vocab)
cat_bow, dog_bow = np.zeros(vocab_size), np.zeros(vocab_size)
for term, one_hot in (('cat', cat_bow), ('dog', dog_bow)):
    if term in bow_vocab:
        one_hot[bow_vocab[term]] = 1
sim_bow = cosine_similarity(cat_bow.reshape(1, -1), dog_bow.reshape(1, -1))[0, 0]
print(f"BoW Similarity :  {sim_bow:.4f}")

==== Word Similarity : 'cat' vs 'dog' ====
BoW Similarity :  0.0000


In [10]:
# ------ Using TF-IDF ------
# Only the TF-IDF vectorizer's vocabulary indices are used here; the weights
# stored in tfidf_matrix are not. Each word is therefore a one-hot vector
# (all-zero when the word is missing from the vocabulary).

tfidf_vocab = tfidf_vectorizer.vocabulary_
cat_tfidf = np.zeros(len(tfidf_vocab))
dog_tfidf = np.zeros(len(tfidf_vocab))
cat_index = tfidf_vocab.get('cat')
dog_index = tfidf_vocab.get('dog')
if cat_index is not None:
    cat_tfidf[cat_index] = 1
if dog_index is not None:
    dog_tfidf[dog_index] = 1
pairwise = cosine_similarity([cat_tfidf], [dog_tfidf])
sim_tfidf = pairwise[0][0]
print(f"TF-IDF Similarity : {sim_tfidf:.4f}")

TF-IDF Similarity : 0.0000


In [11]:
# ------ Using Word2Vec ------
# Take the spaCy vector of each single-word text and compare the two with
# cosine similarity.
cat_vec, dog_vec = (nlp(word).vector for word in ("cat", "dog"))
sim_word2vec = cosine_similarity(cat_vec.reshape(1, -1), dog_vec.reshape(1, -1))[0, 0]
print(f"Word2Vec Similarity : {sim_word2vec:.4f}")

Word2Vec Similarity : 0.8017
