In [1]:
import nltk
from nltk.corpus import wordnet

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')

from collections import defaultdict
from typing import Union, Callable, Iterable, Literal
import os
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


In [None]:
# Root folder of the dataset on the mounted Drive.
path = '/content/drive/MyDrive/School/IR/Fourth/cranfield-trec-dataset-main'

# Documents: keep only the title/text/docno columns and index the frame by docno.
docs = (
    pd.read_xml(f"{path}/cran.all.1400.xml")
    [['title', 'text', 'docno']]
    .set_index('docno')
)

# Queries, indexed by their `num` column.
queries = pd.read_xml(f"{path}/cran.qry.xml").set_index('num')

# Space-separated judgment file; only the topic/doc/rel columns are read.
relevance = pd.read_csv(
    f"{path}/cranqrel.trec.txt",
    sep=' ',
    usecols=['topic', 'doc', 'rel'],
)

# Preview the first rows of each frame, one after the other.
print(docs.head(), queries.head(), relevance.head(), sep="\n")

                                                   title  \
docno                                                      
1      experimental investigation of the aerodynamics...   
2      simple shear flow past a flat plate in an inco...   
3      the boundary layer in simple shear flow past a...   
4      approximate solutions of the incompressible la...   
5      one-dimensional transient heat conduction into...   

                                                    text  
docno                                                     
1      experimental investigation of the aerodynamics...  
2      simple shear flow past a flat plate in an inco...  
3      the boundary layer in simple shear flow past a...  
4      approximate solutions of the incompressible la...  
5      one-dimensional transient heat conduction into...  
                                                 title
num                                                   
1    what similarity laws must be obeyed when const...
2

In [None]:
# Tokenizing and lemmatizing words

def preprocess(Doc:str, tokenizer:Callable=None, stop_words:Iterable=None, lemmatizer:Callable=None, verbose:int=0) -> list[str]:
    """
    Tokenize a document, drop stop words and lemmatize the remaining tokens.

    Parameters:
    ---
    `Doc`: the document text

    `tokenizer`: function mapping a string to a list of tokens
    (default: NLTK `word_tokenize`)

    `stop_words`: collection of tokens to discard (default: the NLTK English
    stop words without 'not', plus the marks `.`, `,`, `?` and `!`)

    `lemmatizer`: function called as `lemmatizer(word, pos=<wordnet pos>)`
    (default: `WordNetLemmatizer().lemmatize`). Passing a falsy value other
    than `None` (e.g. `False`) skips POS tagging and lemmatization.

    `verbose`: when > 0, print the tokens before and after lemmatization

    Returns:
    ---
    The list of processed tokens
    """
    Marks = {".", ",", "?", "!"}
    if tokenizer is None:
        tokenizer = word_tokenize
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer().lemmatize
    if stop_words is None:
        stop_words = set(stopwords.words('english')).difference({'not'}).union(Marks)

    def get_wordnet_pos(treebank_tag):
        """Map a Penn Treebank tag to the matching WordNet POS constant."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # fallback for every other tag (e.g. punctuation)

    # Put a space after every mark so the tokenizer can split it off.
    # The output is built from the *original* string: inserting into `Doc`
    # while indexing it shifts later characters out of the scanned range and
    # reads past the end when the text finishes with "<digit>." or "<digit>,".
    last = len(Doc) - 1
    pieces = []
    for i, ch in enumerate(Doc):
        pieces.append(ch)
        if ch not in Marks:
            continue
        if ch in {'.', ','}:
            before_ok = i == 0 or Doc[i - 1].isnumeric()
            after_ok = i == last or Doc[i + 1].isnumeric()
            if before_ok and after_ok:
                continue  # thousands separator or decimal point: leave untouched
        pieces.append(" ")
    Doc = "".join(pieces)

    Words = [w for w in tokenizer(Doc) if w not in stop_words]

    if verbose > 0:
        for w in Words:
            print(w)

    if lemmatizer:
        # The POS tag of each token selects the WordNet category used to lemmatize it.
        tags = nltk.pos_tag(Words)
        Words = [lemmatizer(word, pos=get_wordnet_pos(tag[1])) for word, tag in zip(Words, tags)]

    if verbose > 0:
        print("-"*50)
        for w in Words:
            print(w)

    return Words

In [None]:
# Positional index kept in plain dicts (the TRIE used in older versions was
# dropped because it isn't efficient in Python).
# termDict[term][docno] -> list of positions of `term` in that document's tokens
#   occurrences of a term in a document: len(termDict[term][docno])
#   df of a term:                        len(termDict[term])
# docs_tokens collects the token list of every document that has text, in the
# order of docs.index; it is a list, so its positions are not docnos.
termDict = defaultdict(lambda: defaultdict(list))
docs_tokens = []
for docno in docs.index:
  doc = docs.loc[docno]
  print(end=f"\r{docno}")  # progress counter rewritten in place
  if doc['text']:
    text = preprocess(doc['text'])
    docs_tokens.append(text)
    for i, term in enumerate(text):
      termDict[term][docno].append(i)

1400

In [None]:
# Number of distinct terms in the index
print(len(termDict))

8296


# TF-IDF based Ranking

In [None]:
def rank_tfidf(query_tokens:list[str], term_dict:dict[str, dict[int, list[int]]], docs_tokens:list[list[str]], verbose:int=0) -> list[tuple[int, float]]:
  """
  Rank documents by the cosine similarity between the TF-IDF vector of the
  query and the TF-IDF vector of each document, both restricted to the
  distinct query terms.

  Parameters:
  -----
  `query_tokens`: The query tokens!
  `term_dict`: The dictionary of terms; only `len(term_dict[term])` is used, as the document frequency
  `docs_tokens`: The tokens of each document
  `verbose`: Verbosity (> 0 prints the vectors and scores, > 1 also prints every tf/idf pair)

  Returns:
  -----
  `(index, score)` pairs sorted by decreasing score, where `index` is the
  position of the document in `docs_tokens`
  """
  N = len(docs_tokens)

  def cal_tfidf(term:str, doc_tokens:list[str], tf_mode='n') -> float:
    """
    Parameters:
    -----
    `term`: term
    `doc_tokens`: The tokens in the document
    `tf_mode`: Term freq weight  'n' for natural (count / length), 'l': logarithm, 'a': augmented, 'b': boolean, `L`: log avg
    """
    count = doc_tokens.count(term)
    if count == 0:
      return 0.0
    # .get() instead of [] so that a defaultdict index is not grown by unseen
    # terms; a term with no postings gets weight 0 rather than dividing by 0.
    df = len(term_dict.get(term, ()))
    if df == 0:
      return 0.0
    idf = 1 + np.log(N / df)

    if tf_mode == 'l':
      tf = 1 + np.log(count)  # log of the raw count: never negative
    elif tf_mode == 'a':
      max_count = max(doc_tokens.count(t) for t in set(doc_tokens))
      tf = 0.5 + 0.5 * count / max_count
    elif tf_mode == 'b':
      tf = 1.0
    elif tf_mode == 'L':
      avg_count = np.mean([doc_tokens.count(t) for t in set(doc_tokens)])
      tf = (1 + np.log(count)) / (1 + np.log(avg_count))
    else:
      tf = count / len(doc_tokens)

    if verbose > 1:
      print(f"For term {term}: tf = {tf}, and idf = {idf}")
    return tf * idf

  # Distinct query terms in first-seen order, so both vectors share one
  # reproducible layout.
  query_terms = list(dict.fromkeys(query_tokens))
  qVec = np.array([cal_tfidf(t, query_tokens) for t in query_terms])
  qVecNorm = np.linalg.norm(qVec)
  if verbose > 0:
    print(f"query vector: {qVec}")

  scores = []
  for i, doc_tokens in enumerate(docs_tokens):
    docVec = np.array([cal_tfidf(t, doc_tokens) for t in query_terms])
    inner = np.inner(qVec, docVec)
    # A zero inner product also covers zero-norm vectors, so no 0/0 below.
    score = 0.0 if inner == 0 else float(inner / (np.linalg.norm(docVec) * qVecNorm))
    if verbose > 0:
      print(f"doc vec = {docVec}, and score is {score}")
    scores.append((i, score))

  return sorted(scores, reverse=True, key=lambda x: x[1])

# Okapi (BM25) Ranking
The **RSV** of each document $d$ for a query $q$ is given by: \\
$
RSV_d = \sum_{t\in q}{\left[\log \frac{N}{df_t}\right] \cdot \frac{(k_1 + 1)\,tf_{t,d}}{k_1((1-b) + b \times (L_d / L_{avg})) + tf_{t,d}} \cdot \frac{(k_3 + 1)\,tf_{t,q}}{k_3 + tf_{t,q}}}
$

In [None]:
def rank_okapi(query_tokens:list[str],
               term_dict:dict[str, dict[int, list[int]]],
               docs_tokens:list[list[str]],
               b=0.75, k1=1.2, k3=2, verbose:int=0) -> list[tuple[int, float]]:
  """
  Rank documents with the Okapi BM25 retrieval status value (RSV).

  Parameters:
  -----
  `query_tokens`: The query tokens!
  `term_dict`: The dictionary of terms; only `len(term_dict[term])` is used, as the document frequency
  `docs_tokens`: The tokens of each document
  `b`: weight of the document-length normalisation
  `k1`: scaling of the document term frequency
  `k3`: scaling of the query term frequency
  `verbose`: Verbosity (accepted for parity with the other rankers; nothing is printed)

  Returns:
  -----
  `(index, score)` pairs sorted by decreasing score, where `index` is the
  position of the document in `docs_tokens`
  """
  N = len(docs_tokens)
  # Average document length is a property of the collection: compute it once
  # rather than once per scored document.
  Lavg = np.mean([len(d) for d in docs_tokens]) if N else 0.0

  # idf and query-side weight depend only on the query term, so they are
  # prepared up front. Terms with no postings cannot contribute and are dropped.
  query_weights = {}
  for term in dict.fromkeys(query_tokens):
    df = len(term_dict.get(term, ()))
    if df == 0:
      continue
    tfq = query_tokens.count(term)
    idf = np.log(N / df)
    qweight = ((k3 + 1)*tfq) / (k3 + tfq)
    query_weights[term] = (idf, qweight)

  def RSV(doc:list[str]) -> float:
    """
    Parameters:
    -----
    `doc`: list of doc's tokens

    Returns:
    -----
    the RSV score
    """
    if not doc:
      return 0.0
    length_norm = k1*((1-b) + b*(len(doc) / Lavg))
    ans = 0.0
    # Only query terms have a non-zero query weight, so document terms that
    # are not in the query never need to be visited.
    for term, (idf, qweight) in query_weights.items():
      tfd = doc.count(term)
      if tfd == 0:
        continue
      dweight = ((k1 + 1)*tfd) / (length_norm + tfd)
      ans += idf * dweight * qweight
    return float(ans)

  scores = [(i, RSV(doc)) for i, doc in enumerate(docs_tokens)]
  return sorted(scores, reverse=True, key=lambda x: x[1])

# Language Model Ranking
Rank based on: \\
> $P(q|M_d) = \prod_{1\le i \le |q|}{P(t_i|M_d)}$ \\

Taking the $\log$ preserves the ordering, so we can equivalently rank based on: \\
> $\log P(q|M_d) = \sum_{1\le i \le |q|}{\log P(t_i|M_d)}$ \\

Where
 $P(t_i|M_d) = \frac{tf_{t_i, d}}{L_d}$

 ----
 ## Two Smoothing Methods
 > ## Jelinek-Mercer

 $\quad P = λ * P(t|M_d) + (1-λ) * P(t|M_c)$

 > ## Dirichlet

 $\quad P = \frac{tf_{t,d} + α P(t|M_c)}{L_d + α}$

In [None]:
def rank_langModel(query_tokens:list[str],
               term_dict:dict[str, dict[int, list[int]]],
               docs_tokens:list[list[str]],
               smoothing:Literal['jelinek', 'dirichlet']='jelinek',
               lambdaa:float=0.5, alpha:float=0.5,
               verbose:int=0) -> list[tuple[int, float]]:
  """
  Rank documents by the query likelihood P(q|M_d) under a smoothed unigram
  language model of each document.

  Parameters:
  -----
  `query_tokens`: The query tokens!
  `term_dict`: The dictionary of terms; the position lists of a term are counted to get its collection frequency
  `docs_tokens`: The tokens of each document
  `smoothing`: 'jelinek' (Jelinek-Mercer interpolation) or 'dirichlet'
  `lambdaa`: weight of the document model in Jelinek-Mercer smoothing
  `alpha`: weight of the collection model in Dirichlet smoothing
  `verbose`: Verbosity (> 0 prints the running product, > 1 also prints the per-term probabilities)

  Returns:
  -----
  `(index, probability)` pairs sorted by decreasing probability, where
  `index` is the position of the document in `docs_tokens`

  Raises:
  -----
  `ValueError` if `smoothing` is not one of the supported methods
  """
  # Validate once up front instead of on every (document, term) pair.
  if smoothing not in ('jelinek', 'dirichlet'):
    raise ValueError("You should use some smoothing method")

  N = len(docs_tokens)
  # The collection length is shared by all terms, so it is summed only once.
  total_tokens = sum(len(d) for d in docs_tokens)

  # P(t|M_c): collection frequency of the term over the collection length.
  # .get() keeps unseen query terms from being inserted into a defaultdict index.
  pcols = {}
  for t in query_tokens:
    if t in pcols:
      continue
    coll_freq = sum(len(positions) for positions in term_dict.get(t, {}).values())
    pcols[t] = coll_freq / total_tokens if total_tokens else 0.0
  if verbose > 1:print("P(t|M_c)=\n" + "\n".join(f"{key}: {pcols[key]}" for key in pcols))

  def probQ_Doc(doc:list[str]):
    """Product over the query tokens of the smoothed P(t|M_d)."""
    ans = 1
    doc_len = len(doc)
    for term in query_tokens:
      pcol = pcols[term]
      tf = doc.count(term)
      if smoothing == 'jelinek':
        # An empty document has no model of its own: only the collection part remains.
        pdoc = tf / doc_len if doc_len else 0.0
        p = lambdaa * pdoc + (1-lambdaa) * pcol
        if verbose > 1: print(f"term {term}: P(t|M_d)={pdoc}")
      else:
        denominator = doc_len + alpha
        p = (tf + alpha*pcol) / denominator if denominator else 0.0
        if verbose > 1: print(f"term {term}: p = {doc.count(term) + alpha*pcol} / {len(doc) + alpha}")
      if verbose:print(f"ans={ans}, p={p}")
      ans *= p
    return ans

  scores = [(i, probQ_Doc(docs_tokens[i])) for i in range(N)]
  return sorted(scores, reverse=True, key=lambda x: x[1])


# Testing

In [None]:
# Toy corpus of three whitespace-tokenized sentences for a quick check of the rankers.
qqqq = [
    'machine learning teaches machine how to learn'.split(),
    'machine translation is my favorite subject'.split(),
    'term frequency and inverse document frequency is important'.split(),
]

# Positional index of the toy corpus: wwww[term][doc index] -> positions of the term.
wwww = defaultdict(lambda: defaultdict(list))
for i, d in enumerate(qqqq):
  for j, t in enumerate(d):
    wwww[t][i].append(j)

# Run the same two-term query through each ranker, separated by a rule.
print(rank_tfidf(['machine', 'learning'], wwww, qqqq, verbose=0))
print("-" * 75)

print(rank_okapi(['machine', 'learning'], wwww, qqqq, verbose=0))
print("-" * 75)

print(rank_langModel(['machine', 'learning'], wwww, qqqq, verbose=0))


[(0, 0.9429634084277353), (1, 0.5564505207186616), (2, 0)]
---------------------------------------------------------------------------
[(0, 1.6561268123168358), (1, 0.43063190792177464), (2, 0.0)]
---------------------------------------------------------------------------
[(0, 0.02040816326530612), (1, 0.0036848072562358277), (2, 0.0017006802721088433)]
