In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# assign documents
d0 = 'geeks for geeks'
d1 = 'geeks'
d2 = 'r2j'
 
# merge documents into a single corpus
string = [d0, d1, d2]

In [10]:
# create object
tfidf = TfidfVectorizer()

# get tf-df values
result = tfidf.fit_transform(string)

In [12]:
# get idf values
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
	print(ele1, ':', ele2)



idf values:
for : 1.6931471805599454
geeks : 1.2876820724517808
r2j : 1.6931471805599454


In [13]:
# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values
print('\ntf-idf value:')
print(result)

# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())


Word indexes:
{'geeks': 1, 'for': 0, 'r2j': 2}

tf-idf value:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (3, 3)>
  Coords	Values
  (0, 1)	0.8355915419449176
  (0, 0)	0.5493512310263033
  (1, 1)	1.0
  (2, 2)	1.0

tf-idf values in matrix form:
[[0.54935123 0.83559154 0.        ]
 [0.         1.         0.        ]
 [0.         0.         1.        ]]


In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def load_data():
    return fetch_20newsgroups(subset='all')

def preprocess(data):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(data)
    return X, vectorizer

In [3]:
class SearchEngine:
    def __init__(self, tfidf_matrix, vectorizer):
        self.tfidf_matrix = tfidf_matrix
        self.vectorizer = vectorizer
        self.ranked_indices = None

    def get_ranked_indices(self, query):
        query_vec = self.vectorizer.transform([query])
        scores = np.dot(self.tfidf_matrix, query_vec.T).toarray()
        self.ranked_indices = np.argsort(scores, axis=0)[::-1]
        
        return self.ranked_indices
    
    def get_top_k_results(self, data, k=10):
        if isinstance(data, list):
            data = np.array(data)
        else:
            data = np.array(data.data)

        top_k_indices = self.ranked_indices[:k]

        return data[top_k_indices]

In [5]:
if __name__ == "__main__":
    data = load_data()
    tfidf_matrix, vectorizer = preprocess(data.data)
    engine = SearchEngine(tfidf_matrix, vectorizer)
    
    query = "machine learning"
    results = engine.get_ranked_indices(query)
    text_results = engine.get_top_k_results(data.data)
    print(text_results)


[["Organization: University of Central Florida - Computer Services\nFrom: Mark Woodruff <CDA90038@UCF1VM.BITNET>\nSubject: Many people on one machine\nLines: 9\n\nI have several people sharing my machine and would like to set up separate\nenvironments under Windows for each of them.  Is there some way of setting\nthings up separate desktops/directories for each of them?  Ideally,\nI'd like totally separate virtual machines.  I'd be willing to settle for\nless, and may end up having batch files that copy .ini files around\ndepending on who wants to use the machine.\n\nmark\nAlas, Setup/n doesn't work if you don't have a network.\n"]
 ['Organization: City University of New York\nFrom: <NT3QC@CUNYVM.BITNET>\nSubject: RE: Hot new 3D software\nLines: 16\n\nI don\'t think speed has been determined, since it has never run on Intel chips.\nBut on the Amiga\'s Motorola Chips, it was one of the fastest true \'Ray Tracers\'\n\nI don\'t think Impulse would port it over and not take speed into cons

In [5]:
len(data.data), len(results)

(18846, 18846)

In [6]:
data.keys()
len(data.data)

18846

In [7]:
temp = np.array(data.data)

In [20]:
# take top k results; NOTE: otherwise the kernel crashes? 
k = 10
top_k_indices = results[:k]
top_k_indices

array([[ 4601],
       [ 1035],
       [ 5687],
       [ 8905],
       [ 9693],
       [16344],
       [ 6582],
       [ 4012],
       [  168],
       [ 1269]])

In [21]:
temp[top_k_indices]

array([["Organization: University of Central Florida - Computer Services\nFrom: Mark Woodruff <CDA90038@UCF1VM.BITNET>\nSubject: Many people on one machine\nLines: 9\n\nI have several people sharing my machine and would like to set up separate\nenvironments under Windows for each of them.  Is there some way of setting\nthings up separate desktops/directories for each of them?  Ideally,\nI'd like totally separate virtual machines.  I'd be willing to settle for\nless, and may end up having batch files that copy .ini files around\ndepending on who wants to use the machine.\n\nmark\nAlas, Setup/n doesn't work if you don't have a network.\n"],
       ['Organization: City University of New York\nFrom: <NT3QC@CUNYVM.BITNET>\nSubject: RE: Hot new 3D software\nLines: 16\n\nI don\'t think speed has been determined, since it has never run on Intel chips.\nBut on the Amiga\'s Motorola Chips, it was one of the fastest true \'Ray Tracers\'\n\nI don\'t think Impulse would port it over and not take sp

In [18]:
print(temp[np.array([1035])][0])

Organization: City University of New York
From: <NT3QC@CUNYVM.BITNET>
Subject: RE: Hot new 3D software
Lines: 16

I don't think speed has been determined, since it has never run on Intel chips.
But on the Amiga's Motorola Chips, it was one of the fastest true 'Ray Tracers'

I don't think Impulse would port it over and not take speed into consideration.

In terms of features, and learning curve... ALL that you stated for 3DS is also
true for Imagine, and lots more... But I'll have to admit that after 3 years of
use on the Amiga, the learning curve is very steep. This is due ONLY to the
manual. It is realy BAD. However, there is a lot of after market support for
this product, including regular 'Tips' articles in many magazines such as "AVID
and a great book by Steve Worley called "Understanding Imagine 2.0" This book i
is not just recommened, IT IS A MUST!

I think an important consideration should be price......
$3000 for 3DS (Not including "tool" packages)
Under $500 for Imagine comple

In [21]:
# Print the ranked indices
print("Ranked document indices for the query '{}':".format(query))
for idx in results:
    print(idx, data.data[idx][:200])

Ranked document indices for the query 'machine learning':


TypeError: only integer scalar arrays can be converted to a scalar index