In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Notebook-wide array display settings: three decimals, a very wide line so
# each matrix row prints on a single line, and up to 30 items shown at each
# edge before NumPy starts summarising with "...".
np.set_printoptions(
    precision=3,
    linewidth=100000,
    edgeitems=30,
)

In [3]:
# Toy corpus: five short documents, one string per document.
# Every document contains "news"; "about", "campaign", "of", "organic",
# "food" and "presidential" are shared by some of them, and the remaining
# words are filler.
# NOTE: the second document contains a double space and a trailing space, and
# the fourth starts with a capital letter; the strings are kept exactly as-is.
corpus = [
    "lorem ipsum news dolor about set",
    "sed do news about organic food campaign incididunt  eiusmod tempor ",
    "labore et news of presidential campaign dolore magna about",
    "Ut enim news of presidential campaign ad minim presidential candidate veniam",
    "sint news of organic food campaign occaecat campaign non about campaign"
]

In [4]:
# TF-IDF vectorizer with idf smoothing turned off. Per the scikit-learn docs
# this uses idf(t) = ln(n_docs / df(t)) + 1 instead of the smoothed variant
# that adds one to both counts. All other options keep their defaults.
vectorizer = TfidfVectorizer(smooth_idf=False)

In [5]:
# Fit the vocabulary and idf weights on the corpus and weight every document.
# The result is converted to a dense array (one row per document, one column
# per vocabulary term) so it can be printed and indexed with plain NumPy.
doc_term_sparse = vectorizer.fit_transform(corpus)
tfidf = doc_term_sparse.toarray()
tfidf

array([[0.224, 0.   , 0.   , 0.   , 0.   , 0.479, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.479, 0.   , 0.479, 0.   , 0.   , 0.183, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.479, 0.   , 0.   , 0.   , 0.   ],
       [0.182, 0.   , 0.182, 0.   , 0.387, 0.   , 0.   , 0.387, 0.   , 0.   , 0.284, 0.387, 0.   , 0.   , 0.   , 0.   , 0.   , 0.148, 0.   , 0.   , 0.   , 0.284, 0.   , 0.387, 0.   , 0.   , 0.387, 0.   , 0.   ],
       [0.201, 0.   , 0.201, 0.   , 0.   , 0.   , 0.428, 0.   , 0.   , 0.428, 0.   , 0.   , 0.   , 0.428, 0.   , 0.428, 0.   , 0.164, 0.   , 0.   , 0.248, 0.   , 0.314, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.336, 0.157, 0.336, 0.   , 0.   , 0.   , 0.   , 0.336, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.336, 0.129, 0.   , 0.   , 0.195, 0.   , 0.493, 0.   , 0.   , 0.   , 0.   , 0.336, 0.336],
       [0.18 , 0.   , 0.541, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.282, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.147, 0.385, 0.385, 0.22

In [12]:
# Map the query text into the vocabulary fitted on the corpus.
# transform() is given a one-element list, so the dense result has a single
# row; that row is taken as a 1-D query vector.
query_text = "news about presidential campaign"
query = vectorizer.transform([query_text]).toarray()[0]
query

array([0.442, 0.   , 0.442, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.361, 0.   , 0.   , 0.   , 0.   , 0.692, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ])

In [13]:
# Score every document against the query: dot product of each document row
# with the query, written as a (n_docs, n_terms) x (n_terms, 1) product so
# the result is one score per row.
np.matmul(tfidf, query[:, np.newaxis])

array([[0.165],
       [0.214],
       [0.454],
       [0.458],
       [0.372]])

In [15]:
# Vocabulary indices (columns of the tf-idf matrix) with a non-zero weight in
# the query, i.e. the tokens the query actually contains.
tokens_in_query = np.nonzero(query)[0]
tokens_in_query

array([ 0,  2, 17, 22])

In [18]:
# Row indices of the documents with a non-zero weight in column 0
# (the first index listed in tokens_in_query).
docs_with_tok_0 = np.nonzero(tfidf[:, 0])[0]
docs_with_tok_0

array([0, 1, 2, 4])

In [19]:
# Row indices of the documents with a non-zero weight in column 2
# (the second index listed in tokens_in_query).
docs_with_tok_2 = np.nonzero(tfidf[:, 2])[0]
docs_with_tok_2

array([1, 2, 3, 4])

In [20]:
# Row indices of the documents with a non-zero weight in column 17
# (the third index listed in tokens_in_query).
docs_with_tok_17 = np.nonzero(tfidf[:, 17])[0]
docs_with_tok_17

array([0, 1, 2, 3, 4])

In [22]:
# Row indices of the documents with a non-zero weight in column 22
# (the fourth index listed in tokens_in_query).
docs_with_tok_22 = np.nonzero(tfidf[:, 22])[0]
docs_with_tok_22

array([2, 3])

In [28]:
# Join the four per-token document lists end to end into one 1-D array.
# Documents that contain several query tokens appear several times here.
per_token_docs = [
    docs_with_tok_0,
    docs_with_tok_2,
    docs_with_tok_17,
    docs_with_tok_22,
]
all_docs = np.hstack(per_token_docs)
all_docs

array([0, 1, 2, 4, 1, 2, 3, 4, 0, 1, 2, 3, 4, 2, 3])

In [29]:
# Collapse the repeated document indices: np.unique returns each index once,
# in ascending order, giving the set of documents that contain at least one
# query token.
relevant_docs = np.unique(all_docs)
relevant_docs

array([0, 1, 2, 3, 4])

In [32]:
# Score only the candidate documents: select their rows, then take the dot
# product of each selected row with the query (as a column vector).
# Row i of the result belongs to document relevant_docs[i].
np.matmul(tfidf[relevant_docs], query[:, np.newaxis])

array([[0.165],
       [0.214],
       [0.454],
       [0.458],
       [0.372]])

In [47]:
# Second example query. NOTE: this rebinds `query`, so the vector for the
# first query is no longer available after this cell runs.
# transform() gets a one-element list, so the single dense row is taken as
# the 1-D query vector.
query_matrix = vectorizer.transform(["lorem presidential"])
query = query_matrix.toarray()[0]
query

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.806, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.592, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ])

In [52]:
# Vocabulary indices whose weight in the query is strictly positive,
# returned as a flat 1-D index array.
toks_in_query = np.flatnonzero(query > 0)
toks_in_query

array([14, 22])

In [60]:
# Candidate retrieval: find every document that contains at least one of the
# query tokens.
#
# `docs` holds, for each query token, the row indices of the documents whose
# weight for that token is strictly positive.
docs = [np.flatnonzero(tfidf[:, tok] > 0) for tok in toks_in_query]

if docs:
    # A document containing several query tokens shows up once per token in
    # the concatenation. np.unique keeps each document once (in ascending
    # order) so that no row is selected, and scored, more than once.
    relevant_docs = np.unique(np.concatenate(docs, axis=0))
else:
    # The query has no in-vocabulary token: np.concatenate would raise on an
    # empty list, so return an empty integer index array instead.
    relevant_docs = np.array([], dtype=np.intp)
relevant_docs

array([0, 2, 3])

In [61]:
# Keep only the rows of the tf-idf matrix that belong to the candidate
# documents; row i of sub_tfidf is document relevant_docs[i].
sub_tfidf = np.take(tfidf, relevant_docs, axis=0)

In [62]:
# Score the candidate documents only: dot product of each row of sub_tfidf
# with the query, returned as a column of scores.
np.matmul(sub_tfidf, query[:, np.newaxis])

array([[0.386],
       [0.186],
       [0.292]])

In [64]:
# Reference result: score all documents against the query. Documents that
# share no token with the query get a score of exactly 0, and the non-zero
# scores match the ones computed on the candidate subset.
np.matmul(tfidf, query[:, np.newaxis])

array([[0.386],
       [0.   ],
       [0.186],
       [0.292],
       [0.   ]])