In [1]:
from pyserini.index.lucene import IndexReader
import json

# Open the Lucene index (MS MARCO passage) for low-level statistics access
index_reader = IndexReader('indexes/lucene-index-msmarco-passage')

# The passage under inspection. Named once so the term vector and the BM25
# weights below can never end up referring to different documents.
docid = '7187158'

# Term-frequency (TF) vector of that document, keyed by index term
tf = index_reader.get_document_vector(docid)
if tf is None:
    # pyserini documents a None return for a docid that is not in the index;
    # stop here with a clear message instead of failing on None further down.
    raise KeyError(f'docid {docid!r} not found in index')

# BM25 weight of every term in the document. The keys of `tf` are already
# index terms (note the stemmed forms in the output), so analyzer=None is
# passed to look them up as-is rather than analyzing them a second time.
bm25_weights = {
    term: index_reader.compute_bm25_term_weight(docid, term, analyzer=None)
    for term in tf
}

# Show the weights as pretty-printed JSON, keys in alphabetical order
print(json.dumps(bm25_weights, indent=4, sort_keys=True))


Jul 06, 2024 9:20:09 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


{
    "be": 2.637899875640869,
    "brother": 4.09124231338501,
    "bubba": 7.102361679077148,
    "bubba's\u00e2": 11.091651916503906,
    "deen": 7.4197235107421875,
    "earl": 5.663764953613281,
    "former": 3.8262834548950195,
    "gener": 2.2932770252227783,
    "her": 2.7393782138824463,
    "hier": 8.24051284790039,
    "manag": 2.832794189453125,
    "paula": 6.438521862030029,
    "su": 5.404428005218506,
    "uncl": 5.362298488616943,
    "w": 3.9339818954467773
}


In [2]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

# Wrap the default Lucene analyzer so it can be applied to raw text
analyzer = Analyzer(get_lucene_analyzer())

# Run the query text through the analyzer to obtain its tokens
query_tokens = analyzer.analyze("what is paula deen's brother")

# Multi-hot query representation: every query token gets weight 1
multihot_query_weights = dict.fromkeys(query_tokens, 1)

# Show the tokens and the resulting sparse query vector
print("Query Tokens:", query_tokens)
print("Multi-Hot Query Weights:", multihot_query_weights)


Query Tokens: ['what', 'paula', 'deen', 'brother']
Multi-Hot Query Weights: {'what': 1, 'paula': 1, 'deen': 1, 'brother': 1}


In [3]:
import numpy as np

# Vector dimensions: every term that occurs in the document or in the query.
# Sorted so the dimension order is identical on every run; the iteration order
# of a plain set of strings changes with Python's per-process hash seed.
terms = sorted(bm25_weights.keys() | multihot_query_weights.keys())

# Dense document and query vectors over those shared dimensions
# (a term missing from one side contributes 0 on that side)
bm25_vec = np.array([bm25_weights.get(t, 0) for t in terms])
multihot_qvec = np.array([multihot_query_weights.get(t, 0) for t in terms])

# The query-document score is the dot product of the two vectors
score = np.dot(multihot_qvec, bm25_vec)
print("Dot Product:", score)


Dot Product: 17.949487686157227


In [4]:
# Same inner product computed directly on the sparse dictionaries: only terms
# present in both the document and the query can contribute. Each document
# weight is multiplied by the matching query weight, so the result remains a
# true dot product even when the query weights are not all 1.
shared_terms = bm25_weights.keys() & multihot_query_weights.keys()
score_alt = sum(bm25_weights[term] * multihot_query_weights[term] for term in shared_terms)
print("Dot Product (Alternative):", score_alt)


Dot Product (Alternative): 17.949487686157227


In [5]:
from pyserini.search.lucene import LuceneSearcher

# Open a searcher over the Lucene index (MS MARCO passage)
searcher = LuceneSearcher('indexes/lucene-index-msmarco-passage')

# Retrieve at most the top 10 passages for the query
hits = searcher.search("what is paula deen's brother", k=10)

# Print rank, docid and score. Iterating over the hits that were actually
# returned (rather than indexing positions 0..9) avoids an IndexError when
# the query matches fewer than 10 documents.
for rank, hit in enumerate(hits, start=1):
    print(f'{rank:2} {hit.docid:7} {hit.score:.5f}')


  from .autonotebook import tqdm as notebook_tqdm


 1 7187158 17.94950
 2 7187157 17.66560
 3 7187163 17.39060
 4 7546327 17.03410
 5 7187160 16.56520
 6 8227279 15.74180
 7 2298838 15.60820
 8 7617404 15.40040
 9 7187156 15.27550
10 2298839 14.97780
