**Bag of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short documents; the TF-IDF and cosine-similarity
# cells below reuse this same list.
corpus = [
    "Computers can analyze text",
    "They do it using vectors and matrices",
    "Computers can process massive amounts of text data",
]

# stop_words="english" removes common function words ("can", "they", "of", ...)
# so only content-bearing terms end up in the vocabulary.
vectorizer = CountVectorizer(stop_words="english")
# Learn the vocabulary and build the document-term count matrix (sparse) in one pass.
bow_matrix = vectorizer.fit_transform(corpus)

print("Vocabulary", vectorizer.vocabulary_)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() keeps the printed value a plain list of strings as before.
print("Feature names", vectorizer.get_feature_names_out().tolist())
# toarray() densifies the sparse matrix for display: one row per document,
# one column per vocabulary term.
print("BoW matrix", bow_matrix.toarray())
print("BoW matrix shape", bow_matrix.shape)

Vocabulary {'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
Feature names ['amounts', 'analyze', 'computers', 'data', 'massive', 'matrices', 'process', 'text', 'using', 'vectors']
BoW matrix [[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]
BoW matrix shape (3, 10)


**TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same preprocessing as the bag-of-words cell (English stop words removed),
# but each term count is re-weighted by TF-IDF.
vectorizer = TfidfVectorizer(stop_words="english")
# Fit on `corpus` (defined in the Bag of Words cell) and return the sparse
# document-term TF-IDF matrix.
tf_idf_matrix = vectorizer.fit_transform(corpus)

print("Corpus", corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() keeps the printed value a plain list of strings as before.
print("Feature names", vectorizer.get_feature_names_out().tolist())
# Densify for display: one row per document, one column per vocabulary term.
print("TF-IDF matrix", tf_idf_matrix.toarray())

Corpus ['Computers can analyze text', 'They do it using vectors and matrices', 'Computers can process massive amounts of text data']
Feature names ['amounts', 'analyze', 'computers', 'data', 'massive', 'matrices', 'process', 'text', 'using', 'vectors']
TF-IDF matrix [[0.         0.68091856 0.51785612 0.         0.         0.
  0.         0.51785612 0.         0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.57735027]
 [0.44036207 0.         0.3349067  0.44036207 0.44036207 0.
  0.44036207 0.3349067  0.         0.        ]]


**Cosine similarity**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build the TF-IDF representation of `corpus` (defined in the Bag of Words
# cell). `vectorizer` and `tf_idf_matrix` are reused by the next cell.
vectorizer = TfidfVectorizer(stop_words="english")
tf_idf_matrix = vectorizer.fit_transform(corpus)

# Pairwise cosine similarity of every document against every document.
# Passing the matrix as both X and Y is the same as the single-argument form.
similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

# Show the documents, then their similarity matrix on the following lines.
print(corpus, similarity_matrix, sep="\n")

['Computers can analyze text', 'They do it using vectors and matrices', 'Computers can process massive amounts of text data']
[[1.         0.         0.34686697]
 [0.         1.         0.        ]
 [0.34686697 0.         1.        ]]


In [None]:
import numpy as np

# Query sentence(s) to compare against the documents in `corpus`.
corpus_2 = ["Computers can store data"]

# Use transform (not fit_transform) so the query is projected into the
# vocabulary and IDF weights already learned from `corpus`.
tf_idf_matrix_2 = vectorizer.transform(corpus_2)
# Rows follow corpus_2 (queries), columns follow corpus (documents).
similarity_matrix = cosine_similarity(tf_idf_matrix_2, tf_idf_matrix)

print("Comparing", corpus_2, "and", corpus)
print("The similaryty matrix is", similarity_matrix)

max_similary_value = np.amax(similarity_matrix)
# Floating-point rounding can push a cosine marginally outside [-1, 1], which
# would make arccos return NaN; clip before converting to an angle in degrees.
max_similary_angle = np.rad2deg(np.arccos(np.clip(max_similary_value, -1.0, 1.0)))
# np.argmax on a 2-D array returns an index into the flattened array. Unravel
# it into (query row, corpus column) so the lookups below stay correct when
# corpus_2 holds more than one sentence.
max_query_index, max_similary_index = np.unravel_index(
    np.argmax(similarity_matrix), similarity_matrix.shape
)

print(f"The most similar sentence in both corpora are (angle {max_similary_angle}°)")
print(corpus[max_similary_index])
print(corpus_2[max_query_index])

Comparing ['Computers can store data'] and ['Computers can analyze text', 'They do it using vectors and matrices', 'Computers can process massive amounts of text data']
The similaryty matrix is [[0.31348343 0.         0.5532461 ]]
The most similar sentence in both corpora are (angle 56.410004560487295°)
Computers can process massive amounts of text data
Computers can store data
