In [None]:
import nltk as nltk
# Fetch the NLTK data packages used by this notebook: the stop-word corpus
# and the punkt / punkt_tab tokenizer data.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from tqdm import tqdm
from collections import defaultdict, Counter
import numpy as np
import math as math



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
import pandas as pd
import numpy as np

# English stop words, held in a set so the membership test in tokenize()
# is a hash lookup.
stop_words = {word for word in stopwords.words('english')}

# Load the JSON-lines corpus (one record per line) and discard the
# 'metadata' column.
df = pd.read_json('sample_data/corpus.jsonl', lines=True).drop(columns=['metadata'])

# Maps each document '_id' to its list of filtered tokens; filled in by the
# tokenization loop that follows.
corpus_tokens = dict()

def tokenize(text):
    """Split ``text`` into lower-cased, filtered tokens.

    The text is lower-cased and passed to ``word_tokenize``; a token is kept
    only if it is alphanumeric (``str.isalnum``) and is not in the
    module-level ``stop_words`` set.

    Args:
        text: The string to tokenize.

    Returns:
        The surviving tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every document exactly once. tokenize() already removes
# non-alphanumeric tokens and stop words, so its result is stored as-is
# instead of being filtered a second time.
# Iterating over the two needed columns avoids building a Series per row,
# which is what DataFrame.iterrows() does.
for doc_id, text in tqdm(zip(df['_id'], df['text']), total=len(df)):
    corpus_tokens[doc_id] = tokenize(text)

100%|██████████| 171332/171332 [03:26<00:00, 829.47it/s]


In [6]:
# Inverted index: term -> {doc_id: number of times the term occurs in that
# document's filtered tokens}.
inverted_index = defaultdict(dict)
for doc_id, tokens in tqdm(corpus_tokens.items(), desc='Indexing...'):
    term_counts = Counter(tokens)
    for term in term_counts:
        inverted_index[term][doc_id] = term_counts[term]

Indexing...: 100%|██████████| 171332/171332 [00:21<00:00, 7919.93it/s]


In [9]:
# Document length = number of filtered tokens in the document.
# corpus_tokens already holds one entry per '_id', so the lengths are read
# straight from it rather than walking the DataFrame row by row with
# iterrows() just to perform dictionary lookups.
docs_len = {
    doc_id: len(tokens)
    for doc_id, tokens in tqdm(corpus_tokens.items(), desc='Calculating doc stats...')
}

Calculating doc stats...: 100%|██████████| 171332/171332 [00:07<00:00, 21527.21it/s]


In [10]:
# N: number of rows (documents) in df.
# average_dl: mean document length, i.e. total filtered-token count over N.
N = df.shape[0]
average_dl = sum(length for length in docs_len.values()) / N

def bm25_score(term, doc_id, k1=0.50, b=0.75):
    """Return the BM25 score contribution of ``term`` for document ``doc_id``.

    Reads the module-level ``inverted_index``, ``docs_len``, ``N`` and
    ``average_dl``.

    Args:
        term: A single (already tokenized) query term.
        doc_id: Identifier of the document being scored.
        k1: BM25 free parameter multiplying the length-normalisation factor.
        b: BM25 free parameter weighting ``dl / average_dl``.

    Returns:
        ``0.0`` if ``term`` is not indexed or does not occur in ``doc_id``;
        otherwise ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / average_dl))``.
    """
    postings = inverted_index.get(term)
    if postings is None or doc_id not in postings:
        return 0.0

    term_freq = postings[doc_id]   # occurrences of the term in this document
    doc_len = docs_len[doc_id]     # filtered-token count of this document
    doc_freq = len(postings)       # number of documents containing the term

    # NOTE(review): this idf is negative when the term occurs in more than
    # half of the N documents, which lowers the score of matching documents.
    idf = math.log((N - doc_freq + 0.5) / (doc_freq + 0.5))

    length_norm = 1 - b + b * doc_len / average_dl
    return idf * (term_freq * (k1 + 1) / (term_freq + k1 * length_norm))


In [11]:
query = 'serological tests for coronavirus'
query_tokens = tokenize(query)

# Candidate set: every document that contains at least one query term.
# inverted_index is a defaultdict, so indexing it with an unseen term would
# insert an empty posting dict and silently grow the index on every query;
# .get() performs a read-only lookup instead.
union_docs = set().union(*(inverted_index.get(t, {}) for t in query_tokens))

# Total BM25 score of each candidate = sum of its per-term contributions.
scores = defaultdict(float)
for doc_id in tqdm(union_docs, desc='Calculating scores...'):
    scores[doc_id] = sum(bm25_score(t, doc_id) for t in query_tokens)

# Rank by descending score and keep the 50 best matches; the bare name on
# the last line makes the notebook display the result.
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
sorted_scores = sorted_scores[:50]
sorted_scores

Calculating scores...: 100%|██████████| 32364/32364 [00:00<00:00, 167898.54it/s]


[('upwn9o2m', 13.262579642878489),
 ('r1yf75bo', 12.93898716833149),
 ('q2b4ig1h', 12.933873642886558),
 ('8hrjgcas', 12.933873642886558),
 ('923jpec0', 12.924054121568695),
 ('3ea1ngo2', 12.73053316260355),
 ('8y0v6d2i', 12.69486123713582),
 ('hkvh2lc9', 12.525952883867507),
 ('px4fe7mn', 12.434294183914886),
 ('1dbeh8q7', 12.434294183914886),
 ('qjma4rsp', 12.368386360263894),
 ('9skvbk8m', 12.27905056986098),
 ('7ayg3typ', 12.204081839954071),
 ('rko7qdqk', 12.201964367381942),
 ('5jtzt8um', 12.161446626393719),
 ('84yjdlab', 12.08466273133778),
 ('82iy2prw', 12.08466273133778),
 ('0yj3xp9s', 12.065366536001829),
 ('0jl6qu0i', 12.065366536001829),
 ('m60w5dnl', 11.995399166047573),
 ('91872v0l', 11.995399166047573),
 ('vijh6x1l', 11.995399166047573),
 ('xw0o5ca7', 11.968254963975365),
 ('bj8wn9dh', 11.885209178111644),
 ('8cg5yj20', 11.868722638743765),
 ('cxt9oq0j', 11.777941772341551),
 ('ovlb53ek', 11.706928200266706),
 ('wf5cozst', 11.612425823688675),
 ('g693adjd', 11.598373773