In [12]:
from rank_bm25 import BM25Okapi
import pandas as pd

#### BM25 Formula

$BM25(q, d) =
\sum_{t \in q}
\mathrm{IDF}(t)
\cdot
\frac{
f(t, d)\,(k_1 + 1)
}{
f(t, d) + k_1 \left(1 - b + b \cdot \frac{|d|}{\mathrm{avgdl}}\right)
}$


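$\mathrm{IDF}(t) =
\log \left(
\frac{N - df_t + 0.5}{df_t + 0.5} + 1
\right)$

**Note:** this is the smoothed IDF variant, which is always positive. `rank_bm25.BM25Okapi`, used in the example below, instead computes $\log \frac{N - df_t + 0.5}{df_t + 0.5}$, which is negative whenever a term occurs in more than half of the documents; such values are then replaced by $\epsilon \cdot \overline{\mathrm{IDF}}$ (default $\epsilon = 0.25$), where $\overline{\mathrm{IDF}}$ is the mean IDF over the vocabulary. On a very small corpus that mean can itself be negative, which is why the scores in the example below are below zero.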

```
q       : query
d       : document
t       : term in query
f(t,d)  : term frequency of t in d
|d|     : document length
avgdl   : average document length
df_t    : document frequency of term t
N       : total number of documents
k_1     : term saturation parameter
b       : length normalization parameter
V       : vocabulary
```



In [29]:
def tokenise(text):
    """Split ``text`` into whitespace-delimited tokens.

    Documents and queries both go through this function so the index and
    the query always share one tokenisation. ``str.split()`` with no
    argument collapses runs of whitespace and never yields empty tokens,
    unlike ``str.split(" ")``.
    """
    return text.split()


docs = [
    "a red blue car is fast",
    "a red book has a blue cover. but engine is a car part",
]
tokenised_docs = [tokenise(doc) for doc in docs]

# k1=0 switches off term-frequency weighting (the TF factor in the formula
# collapses to f/f = 1 for every matching term) and b=0 switches off length
# normalisation, so each matching query term contributes only its IDF.
# Caveat: with k1=0 a query term that is absent from a document makes the
# TF factor 0/0, so use k1 > 0 for queries that are not fully matched.
bm25_index = BM25Okapi(tokenised_docs, k1=0, b=0)

query = "red blue car"
bm25_scores = bm25_index.get_scores(tokenise(query))

# Every query term occurs in both documents, so with TF and length
# normalisation disabled both rows receive the same score. The score is
# negative because each of these terms appears in more than half of the
# corpus, where the Okapi IDF drops below zero.
df = pd.DataFrame({"doc": docs, "query": query, "bm25_score": bm25_scores})
df

Unnamed: 0,doc,query,bm25_score
0,a red blue car is fast,red blue car,-0.502949
1,a red book has a blue cover. but engine is a c...,red blue car,-0.502949
