## Replicate metapy functionality using TfidfVectorizer

This notebook explores how to replicate certain MeTA functionality found in metapy using sklearn's TfidfVectorizer.

The code extends TfidfVectorizer functionality to create a doc_term_matrix, score queries using BM25, and calculate nDCG.

MP2.4 was used to facilitate a side-by-side feature evaluation.

### MP2.4 using metapy

In [6]:
!pip install metapy pytoml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting metapy
  Downloading metapy-0.2.13-cp37-cp37m-manylinux1_x86_64.whl (14.3 MB)
[K     |████████████████████████████████| 14.3 MB 4.1 MB/s 
[?25hCollecting pytoml
  Downloading pytoml-0.1.21-py2.py3-none-any.whl (8.5 kB)
Installing collected packages: pytoml, metapy
Successfully installed metapy-0.2.13 pytoml-0.1.21


In [56]:
import math
import sys
import time
import metapy
import pytoml

class myBM25Ranker(metapy.index.RankingFunction):
  """
  Create a new ranking function in Python that can be used in MeTA.
  """
  def __init__(self, k1=1.2, b=0.75):    
    # You *must* call the base class constructor here!
    self.k1 = k1
    self.b = b
    super(myBM25Ranker, self).__init__()

  def score_one(self, sd):
    """
    You need to override this function to return a score for a single term.
    For fields available in the score_data sd object,
    @see https://meta-toolkit.org/doxygen/structmeta_1_1index_1_1score__data.html
    """
    b = self.b
    k1 = self.k1
    idf = math.log((sd.num_docs - sd.doc_count + 0.50)/(sd.doc_count + 0.50) + 1)         
    num = sd.doc_term_count*(k1 + 1)     
    den = sd.doc_term_count + k1*(1 - b + b*sd.doc_size/sd.avg_dl)    
    score = idf*(num/den)          
    return score

def load_ranker(cfg_file):
  """
  Use this function to return the Ranker object to evaluate, 
  The parameter to this function, cfg_file, is the path to a
  configuration file used to load the index.
  """
  return myBM25Ranker()  

if __name__ == '__main__':
  
  cfg = "config.toml"

  print('Building or loading index...')
  idx = metapy.index.make_inverted_index(cfg)
  
  ev = metapy.index.IREval(cfg)

  with open(cfg, 'r') as fin:
    cfg_d = pytoml.load(fin)

  query_cfg = cfg_d['query-runner']
  if query_cfg is None:
    print("query-runner table needed in {}".format(cfg))
    sys.exit(1)

  start_time = time.time()
  top_k = 10
  query_path = query_cfg.get('query-path', 'queries.txt')
  query_start = query_cfg.get('query-id-start', 0)

  query = metapy.index.Document()

  ranker = load_ranker(cfg)
  ndcg = 0.0
  num_queries = 0
  print('Running queries')
  with open(query_path) as query_file:
    for query_num, line in enumerate(query_file):
      query.content(line.strip())
      results = ranker.score(idx, query, top_k)  
      if results != []:          
        ndcg += ev.ndcg(results, query_start + query_num, top_k)
        num_queries += 1      
      break
  ndcg = ndcg / num_queries
          
  print("NDCG@{}: {}".format(top_k, ndcg))
  print("Elapsed: {} seconds".format(round(time.time() - start_time, 4)))

Building or loading index...
Running queries
NDCG@10: 0.26664379756251727
Elapsed: 0.0085 seconds


### MP2.4 using TfidfVectorizer

In [417]:
import pytoml
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [418]:
np.set_printoptions(edgeitems=10, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))

In [419]:
cfg = "config.toml"

with open(cfg, 'r') as fin:
  cfg_d = pytoml.load(fin)

query_cfg = cfg_d['query-runner']
if query_cfg is None:
  print("query-runner table needed in {}".format(cfg))
  sys.exit(1)

query_path = query_cfg.get('query-path', 'queries.txt')
query_start = query_cfg.get('query-id-start', 1)

In [420]:
with open(query_path, 'r') as fin:
  queries = [query.strip('\n') for query in fin.readlines()][(query_start - 1):]

In [421]:
num_queries = len(queries)
num_queries

225

In [422]:
dataset = cfg_d.get('dataset', 'data')
dataset

'cranfield'

In [423]:
with open(f'{dataset}/{dataset}.dat', 'r') as fin:
  corpus = [doc.strip('\n') for doc in fin.readlines()]

In [424]:
len(corpus)

1400

In [425]:
stop_words_file = cfg_d.get('stop-words', 'stopwords.txt')

with open(stop_words_file) as fin:
  stop_words = [word.strip('\n') for word in fin.readlines()]

In [426]:
vectorizer = TfidfVectorizer()

In [427]:
X = vectorizer.fit_transform(stop_words)

In [428]:
stop_word_tokens = list(vectorizer.get_feature_names_out())

In [429]:
vectorizer = TfidfVectorizer(stop_words=stop_word_tokens, smooth_idf=False, norm=None)

In [431]:
idx_tf = vectorizer.fit_transform(corpus)

In [432]:
num_docs = idx_tf.shape[0]
num_docs

1400

In [433]:
vocabulary_size = idx_tf.shape[1]
vocabulary_size

7166

In [434]:
tf_matrix = idx_tf.toarray()
tf_matrix.shape

(1400, 7166)

In [435]:
idx_count = super(TfidfVectorizer, vectorizer).transform(corpus)

In [436]:
doc_term_matrix = idx_count.toarray()
doc_term_matrix.shape

(1400, 7166)

In [437]:
doc_count = np.sum(doc_term_matrix > 0, axis=0)

In [438]:
avg_doc_length = idx_count.sum(axis=1).mean()
avg_doc_length

88.34714285714286

In [439]:
doc_lengths = idx_count.sum(axis=1).A1
doc_lengths

array([77, 112, 17, 46, 33, 50, 126, 81, 174, 29, ..., 120, 159, 103, 84, 57, 55, 47, 109, 45, 61])

In [440]:
query_term_matrix = super(TfidfVectorizer, vectorizer).transform(queries).toarray()
query_term_matrix.shape

(225, 7166)

In [441]:
idf = vectorizer.idf_ - 1
idf.shape

(7166,)

In [442]:
idf

array([5.16, 3.29, 6.55, 7.24, 7.24, 6.55, 7.24, 6.55, 7.24, 7.24, ..., 7.24, 2.33, 7.24, 7.24, 7.24, 4.76, 6.55, 7.24, 7.24, 6.55])

In [443]:
idf_ = np.log((num_docs - doc_count + 0.50)/(doc_count + 0.50) + 1)
idf_.shape

(7166,)

In [444]:
idf_

array([5.1, 3.28, 6.33, 6.84, 6.84, 6.33, 6.84, 6.33, 6.84, 6.84, ..., 6.84, 2.33, 6.84, 6.84, 6.84, 4.72, 6.33, 6.84, 6.84, 6.33])

### BM25 Calculator

In [461]:
k1 = 1.2
b = 0.75
num = idf_ * doc_term_matrix * (k1 + 1)
num.shape

(1400, 7166)

In [462]:
den = doc_term_matrix + (k1*(1 - b + b*doc_lengths/avg_doc_length))[:, np.newaxis]
den.shape

(1400, 7166)

In [463]:
scores = np.divide(num, den2, out=np.zeros_like(num), where=den!=0)
scores.shape

(1400, 7166)

In [464]:
bm25_scores = np.dot(query_term_matrix, scores.T)
bm25_scores.shape

(225, 1400)

In [465]:
top_k = 10
idx_k = np.argpartition(bm25_scores, -top_k, axis=1)
idx_k = idx_k[:, -top_k:]

In [466]:
ranks = np.take_along_axis(idx_k, np.argsort(-np.take_along_axis(bm25_scores, idx_k, axis=1)), axis=1)
ranks

array([[ 183,   12,  485,   11,  877,   50, 1267, 1143,  140,  874],
       [  11,  745,   50,  723, 1168,  883,  140,   13, 1088, 1169],
       [   4,  398,  180,  143,  484,  541,  578,  583,  158,  581],
       [ 165,  487, 1188, 1084,  184, 1274, 1060, 1311,  235,  574],
       [ 102, 1031,  942, 1295,  745, 1378, 1271,  649,  171,  848],
       [ 490,  256,  385,  384,  609,  405, 1373, 1272, 1195,  766],
       [ 491,  972,   55,   56,  433,  121, 1230,  123, 1039,  247],
       [ 121,  491,  906,  710,  442,  236,  231,  568, 1351, 1082],
       [  20,   44,  549,  305,  570,   21,  101, 1214,  269,  302],
       [ 492, 1198,  948,  523,  301,  690,  404, 1008, 1285, 1009],
       ...,
       [  38,  957, 1318,  174, 1273,  406, 1155,  904,  973,  691],
       [ 982, 1392,  665,  322, 1394, 1393,  323,   43, 1160,  945],
       [1392, 1394,  322, 1393,  982,  665,  669,  294, 1160,  558],
       [ 992,  991, 1220, 1221, 1081,  808,  656,  267,  341,  207],
       [ 291, 1374,  2

In [467]:
np.take_along_axis(bm25_scores, ranks, axis=1)

array([[15.7, 15, 14.8, 14.3, 11, 9.62, 8.82, 8.61, 8.26, 8.2],
       [24.2, 13.1, 11.7, 10.7, 10.3, 10.3, 10.2, 10.1, 9.77, 9.68],
       [18.3, 17, 15.6, 14.2, 13.7, 11.7, 8.04, 7.9, 7.23, 7.21],
       [20.3, 15.9, 15.3, 12.8, 12.7, 12.7, 12.5, 12.4, 11.8, 11.4],
       [12.3, 10.4, 8.78, 8.27, 8.18, 7.77, 7.58, 7.18, 6.64, 6.58],
       [12.9, 8.67, 8.28, 8.04, 6.61, 6.39, 6.36, 6.35, 6.32, 6.07],
       [55.4, 27.4, 25.7, 24.7, 24.3, 23.5, 20.8, 20.1, 20.1, 17.5],
       [16.2, 14, 13.3, 12.9, 12.6, 11.9, 11.4, 10.9, 10.8, 10.7],
       [15, 12.9, 11.5, 10.6, 10.4, 10.2, 10.2, 9.15, 9.04, 8.59],
       [18.2, 12.8, 12.5, 12.4, 12.2, 11.9, 11.2, 11, 11, 10.8],
       ...,
       [10.3, 9.34, 9.05, 9.02, 8.91, 8.87, 8.74, 8.6, 8.41, 8.22],
       [12.2, 11.8, 11.5, 11.4, 11.3, 11.2, 11, 10.7, 10.6, 10],
       [10.3, 9.76, 9.74, 9.29, 9.17, 9.03, 8.74, 8.74, 8.51, 8.46],
       [8.96, 8.81, 8.68, 7.64, 7.61, 7.47, 7.44, 7.35, 7.26, 7.16],
       [14.2, 13.2, 13.1, 12.3, 12.2, 11.8,

### nDCG Calculator

standard formula for nDCG:

$$
DCG_p = \sum_{i=1}^p \frac{rel_i}{\log_2(i+1)}, \quad p = num\_docs \\
 nDCG_p = \frac{DCG_p}{IDCG_p}
$$

metapy uses a slightly different formula for calculating nDCG:
[Link](https://meta-toolkit.org/doxygen/classmeta_1_1index_1_1ir__eval.html#a23827b8671dffbfbc85494def49d66c1)

 $$
DCG_p = \sum_{i=1}^p \frac{2^{rel_i}-1}{\log_2(i+1)}, \quad p = num\_docs \\
 nDCG_p = \frac{DCG_p}{IDCG_p}
 $$

In [468]:
query_judgements = cfg_d.get('query-judgements', 'qrels.txt')
query_judgements

'cranfield-qrels.txt'

In [469]:
with open(query_judgements, 'r') as fin:
  qranks = [doc.strip('\n').split(' ') for doc in fin.readlines()]

qranks = np.array(qranks, dtype=int)

In [470]:
relevant = np.zeros((num_queries, num_docs), dtype=int)
relevant[qranks[:, 0]-1, qranks[:, 1]-1] = qranks[:, 2]

In [471]:
rel = np.take_along_axis(relevant, ranks-1, axis=1)
rel

array([[3, 1, 0, 2, 0, 2, 0, 0, 0, 3],
       [4, 4, 2, 0, 0, 0, 0, 1, 0, 0],
       [2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
       [0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 2, 2, 0, 0, 0, 0, 0, 0],
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 0, 3, 0, 0, 3, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 0, 3, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 4, 0, 4, 3, 0, 0, 0, 0],
       [4, 0, 0, 0, 0, 2, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 2, 1, 2, 0, 0, 0, 0, 0, 0],
       [0, 2, 1, 2, 2, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 2],
       [0, 1, 0, 2, 0, 0, 2, 0, 0, 0]])

In [472]:
irel = np.sort(relevant, axis=1)[:, -top_k:][:, ::-1]
irel

array([[3, 3, 3, 3, 3, 3, 3, 2, 2, 2],
       [4, 4, 3, 3, 3, 3, 3, 2, 2, 2],
       [2, 2, 2, 2, 2, 2, 2, 2, 0, 0],
       [2, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [4, 4, 2, 2, 0, 0, 0, 0, 0, 0],
       [3, 2, 2, 2, 0, 0, 0, 0, 0, 0],
       [3, 2, 2, 2, 1, 0, 0, 0, 0, 0],
       [4, 4, 4, 4, 4, 2, 2, 2, 2, 2],
       [3, 3, 3, 0, 0, 0, 0, 0, 0, 0],
       [3, 3, 2, 2, 2, 2, 2, 2, 0, 0],
       ...,
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [4, 4, 4, 3, 3, 2, 2, 2, 2, 2],
       [4, 4, 4, 2, 2, 2, 2, 2, 2, 2],
       [4, 3, 3, 3, 3, 3, 2, 2, 2, 2],
       [3, 3, 2, 2, 2, 2, 2, 2, 2, 2],
       [3, 3, 3, 2, 2, 2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2, 1, 1, 1, 1, 0],
       [2, 2, 2, 2, 0, 0, 0, 0, 0, 0],
       [2, 2, 2, 2, 2, 2, 2, 1, 0, 0],
       [3, 3, 3, 2, 2, 2, 2, 2, 2, 2]])

In [473]:
# discounted cumulative gain
def dcg_p_np(rel):
  i = np.full(rel.shape, np.arange(1, rel.shape[1]+1))
  return np.sum((2**rel - 1)/np.log2(i+1), axis=1)

In [474]:
# normalized discounted cumulative gain
def ndcg_p_np(rel, irel):
  return dcg_p_np(rel)/dcg_p_np(irel)

In [475]:
ndcg_p_np(rel, irel)

array([0.426, 0.637, 0.648, 0.798, 0.237, 0.162, 0.231, 0.306, 0.871, 0.19, 0.072, 0.268, 0, 0.987, 0.738, 0.182, 0.339, 0.202, 0, 0.44, 0.24, 0, 0.338, 0.523, 0.502, 0.622, 0.103, 0, 0.468, 0.108, 0, 0.0645, 0.683, 0.327, 0, 0.106, 0.213, 0, 0.219, 0.032, 0.873, 0.289, 0.751, 0, 0.394, 0.583, 0.345, 0.357, 0.413, 0, 0.465, 0.536, 0.174, 0.284, 0.324, 0.13, 0.323, 0.251, 0, 0.39, 0.405, 0, 0, 0.881, 0.609, 0.287, 0.594, 0.344, 0, 0.0797, 0.117, 0.181, 0.587, 0.144, 0.234, 0.253, 0.433, 0.643, 0.223, 0, 0.347, 0.534, 0.139, 0.286, 0.039, 0.693, 0, 0.534, 0.284, 0.199, 0.489, 0.301, 0.631, 0.318, 0.064, 0.334, 0.16, 0.0373, 0.507, 0.497, 0.8, 0.141, 0, 0.233, 0.544, 0.369, 0.378, 0.726, 0, 0, 0.0446, 0.434, 0.559, 0.195, 0, 0.0644, 0, 0.798, 1, 0.576, 0.65, 0.18, 0, 0, 0.392, 0.571, 0.218, 0, 0.547, 0.655, 0.304, 0.49, 0.211, 0.12, 0.115, 0.587, 0.21, 0.204, 0, 0.298, 0.557, 0, 0.613, 0.661, 0.568, 0.556, 0.34, 0.403, 0.526, 0.693, 0, 0.274, 0.322, 0.6, 0.241, 0.342, 0.351, 0.609, 0.142,

In [476]:
ndcg_p_np(rel, irel).mean()

0.3434456122724809