In [5]:
import copy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict

In [2]:
import math

def cosine_sim(vec1, vec2):
    """
    Return the cosine similarity of two vectors stored as dictionaries.

    Only the dictionaries' values are used, paired up by iteration order, so
    both arguments must list their dimensions in the same order.

    Parameters:
        vec1: mapping whose values are the numeric components of the first vector.
        vec2: mapping whose values are the numeric components of the second vector.

    Returns:
        The dot product divided by the product of the two magnitudes, or 0.0
        when that product is zero (the ratio is undefined for a zero vector).

    Raises:
        IndexError: if vec2 has fewer values than vec1.
    """
    # Since our vectors are dictionaries, convert them to lists for easier math.
    vec1 = list(vec1.values())
    vec2 = list(vec2.values())

    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]

    mag_1 = math.sqrt(sum(x**2 for x in vec1))
    mag_2 = math.sqrt(sum(x**2 for x in vec2))

    # An all-zero vector has no direction; without this guard the division
    # below would raise ZeroDivisionError.
    denominator = mag_1 * mag_2
    if denominator == 0:
        return 0.0

    return dot_prod / denominator

In [3]:
# Toy corpus: three short sentences with overlapping vocabulary.
doc_0, doc_1, doc_2 = (
    "The faster Harry got to the store, the faster Harry, the faster, would get home.",
    "Harry is hairy and faster than Jill.",
    "Jill is not as hairy as Harry.",
)

In [6]:
# Fit TF-IDF on the three documents. The result is a sparse matrix, since in a
# large corpus there would be mostly zeros to deal with.
corpus = [doc_0, doc_1, doc_2]

vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(corpus)

# toarray() expands the sparse matrix into a regular dense ndarray for our
# viewing pleasure; todense() would return the discouraged np.matrix type.
print(model.toarray())

[[0.         0.         0.42662402 0.18698644 0.18698644 0.
  0.22087441 0.18698644 0.         0.         0.         0.18698644
  0.         0.74794576 0.18698644 0.18698644]
 [0.46312056 0.         0.35221512 0.         0.         0.35221512
  0.27352646 0.         0.35221512 0.35221512 0.         0.
  0.46312056 0.         0.         0.        ]
 [0.         0.75143242 0.         0.         0.         0.28574186
  0.22190405 0.         0.28574186 0.28574186 0.37571621 0.
  0.         0.         0.         0.        ]]


In [7]:
# Zero-filled vector with one slot per vocabulary term, in feature order.
vector_template = OrderedDict.fromkeys(vectorizer.get_feature_names_out(), 0)
print(vector_template)

OrderedDict([('and', 0), ('as', 0), ('faster', 0), ('get', 0), ('got', 0), ('hairy', 0), ('harry', 0), ('home', 0), ('is', 0), ('jill', 0), ('not', 0), ('store', 0), ('than', 0), ('the', 0), ('to', 0), ('would', 0)])


In [24]:
# Build one {term: weight} OrderedDict per row of the fitted matrix.
feature_names = vectorizer.get_feature_names_out()  # loop-invariant, so look it up once
document_vectors = []
for doc in model.toarray().tolist():

    vec = copy.copy(vector_template)  # So we are dealing with new objects, not multiple references to the same object

    for key, value in zip(feature_names, doc):
        vec[key] = value
    document_vectors.append(vec)

In [35]:
# Transform the query with the already-fitted vectorizer and show it densely.
query = "How long does it take to get to the store?"
query_tfidf = vectorizer.transform([query])
test_vec = query_tfidf.todense()
print(test_vec)

[[0.         0.         0.         0.37796447 0.         0.
  0.         0.         0.         0.         0.         0.37796447
  0.         0.37796447 0.75592895 0.        ]]


In [36]:
# Same dict layout as the document vectors, filled with the query's weights.
test = copy.copy(vector_template)
test.update(zip(vectorizer.get_feature_names_out(), test_vec.tolist()[0]))

In [37]:
# Display the query vector: one (term, weight) entry per vocabulary term.
test

OrderedDict([('and', 0.0),
             ('as', 0.0),
             ('faster', 0.0),
             ('get', 0.37796447300922725),
             ('got', 0.0),
             ('hairy', 0.0),
             ('harry', 0.0),
             ('home', 0.0),
             ('is', 0.0),
             ('jill', 0.0),
             ('not', 0.0),
             ('store', 0.37796447300922725),
             ('than', 0.0),
             ('the', 0.37796447300922725),
             ('to', 0.7559289460184545),
             ('would', 0.0)])

In [38]:
# Score the query against each of the three document vectors in turn.
for doc_index in range(3):
    print(cosine_sim(test, document_vectors[doc_index]))

0.5653938522457077
0.0
0.0
