# Assignment no.1 - Implementation of information retrieval 

---



## ℹ️ Project info
*   **Course**: PV211 Introduction to information retrieval
*   **Semester**: Summer semester 2021
*   **Author**: Adam Hospodka ([506521@muni.cz](mailto:506521@muni.cz))
*   **UČO**: 506521

## 🧭 Application overview

### Idea:
Ranking the documents for given queries using the standard *TF-IDF* approach with small tweaks.

### Steps:
1. List every (term, document) pair from every document using the ```buildPairs()``` function
2. Remove duplicate pairs using the ```uniq()``` function
3. Based on these pairs, create an inverted index using the ```buildInvertedIndex()``` function
4. Use the inverted index to build a frequency index (an inverted index with word frequencies) using the ```buildFrequencyIndex()``` function
5. Build an index that contains the length of each document using the ```buildDocumentsLengthIndex()``` function
6. Prepare a *Pandas* DataFrame with document instances ready to be ranked using the ```buildRankingDf()``` function
7. Start the search with ```IRSystem().search()```
8. Clean the given query with the ```cleanQuery()``` function
9. Rank the relevant documents with the ```rank()``` function



## 📚 Imports

In [None]:
pip install git+https://gitlab.fi.muni.cz/xstefan3/pv211-utils.git@master | grep "Succ"

  Running command git clone -q https://gitlab.fi.muni.cz/xstefan3/pv211-utils.git /tmp/pip-req-build-jz0cv0_u
Successfully built gdown
Successfully built pv211-utils
      Successfully uninstalled gdown-3.6.4
Successfully installed gdown-3.12.2 pv211-utils-1.0.0


In [None]:
import nltk
import time
import math
import random
import numpy as np
import pandas as pd
from typing import Iterable
from tqdm.notebook import tqdm
from nltk.corpus import stopwords

from pv211_utils.cranfield.loader import load_queries
from pv211_utils.cranfield.loader import load_documents
from pv211_utils.cranfield.loader import load_judgements
from pv211_utils.cranfield.eval import CranfieldEvaluation
from pv211_utils.cranfield.entities import CranfieldQueryBase
from pv211_utils.cranfield.irsystem import CranfieldIRSystemBase
from pv211_utils.cranfield.entities import CranfieldDocumentBase
from pv211_utils.cranfield.leaderboard import CranfieldLeaderboard

# Download the NLTK data packages this notebook relies on.
nltk.download('punkt')
nltk.download('stopwords')

# Single Porter stemmer instance shared by documents, index terms and queries.
stemmer = nltk.PorterStemmer()
# NOTE: this rebinds the name `stopwords` from the imported corpus module to
# the collection of English stop words; all later code uses it for
# membership tests.
stopwords = stopwords.words('english')

## 🏗 Instance load


### Documents

In [None]:
class Document(CranfieldDocumentBase):
    """Cranfield document that also stores stemmed tokens of its body and title.

    Attributes set here:
        stem_body:  stemmed tokens of ``body``; every token is kept.
        stem_title: stemmed tokens of ``title`` with stems that appear in the
                    module-level ``stopwords`` collection removed.
    """

    def __init__(self, document_id: str, authors: str, bibliography: str, title: str, body: str):
        """Tokenize and stem the body and title, then initialize the base class."""
        # Body: tokenize, then stem each token (no stop-word filtering).
        self.stem_body = [stemmer.stem(word) for word in nltk.word_tokenize(body)]

        # Title: tokenize, stem, and only then discard stems that are stop words.
        title_stems = [stemmer.stem(word) for word in nltk.word_tokenize(title)]
        self.stem_title = [stem for stem in title_stems if stem not in stopwords]

        super().__init__(document_id, authors, bibliography, title, body)
    

In [None]:
# Load the collection as Document instances; the rest of the notebook treats
# `documents` as a mapping (uses `.items()` and `documents[doc_id]`).
documents = load_documents(Document)

### Queries

In [None]:
class Query(CranfieldQueryBase):
    """Query entity handed to `load_queries`; adds no state of its own."""

    def __init__(self, query_id: int, body: str):
        # No preprocessing is done here: both arguments are forwarded
        # unchanged to the base class.
        super().__init__(query_id, body)

In [None]:
# Load the queries as Query instances; the evaluation cell iterates over
# `queries.values()`.
queries = load_queries(Query)

## 📜 Index construction

### Listing pairs (term, document)

In [None]:
def isStringNumber(token):
    """Return True when *token* can NOT be parsed as an integer.

    Despite its name, the result is inverted: a token such as ``"12"`` or
    ``"-5"`` yields False, while anything ``int()`` rejects (``"abc"``,
    ``"1.5"``, ``None``) yields True. Callers use the True result to keep
    non-numeric tokens.
    """
    try:
        int(token)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; other exceptions
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        return True
    return False

In [None]:
def buildPairs():
    """Collect a (stemmed term, document id) pair for every indexable token.

    Reads the module-level ``documents`` mapping. A token of a document body
    is kept when it is longer than one character, is not in the module-level
    ``stopwords`` collection (checked on the raw token, before stemming) and
    ``isStringNumber`` reports it as non-numeric. Kept tokens are stemmed
    with the shared ``stemmer``.

    Returns:
        A list of ``(term, doc_id)`` tuples in document/token order;
        duplicates are not removed here.
    """
    # Build the lookup set once: membership in a set is constant time,
    # whereas the original collection was scanned for every token.
    stop_set = set(stopwords)
    pairs = []

    for doc_id, document in documents.items():
        for token in nltk.word_tokenize(str(document.body)):
            if len(token) > 1 and token not in stop_set and isStringNumber(token):
                pairs.append((stemmer.stem(token), doc_id))

    return pairs

### Unique pairs sorter

In [None]:
def uniq(sorted_list):
    """Collapse runs of consecutive equal items into a single item.

    Only neighbouring duplicates are removed, so the input is expected to be
    sorted. Inputs with fewer than two items are returned as-is (the same
    object); otherwise a new sequence is built.
    """
    if len(sorted_list) < 2:
        return sorted_list

    # Seed with the first item; the tail of `kept` is always the most
    # recently kept value.
    kept = sorted_list[:1]
    for candidate in sorted_list[1:]:
        if candidate != kept[-1]:
            kept.append(candidate)

    return kept

### Inverted index construction

In [None]:
def buildInvertedIndex(uniq_pairs):
    """Group document ids by term.

    Args:
        uniq_pairs: iterable of ``(term, document_id)`` tuples, normally the
            de-duplicated output of ``uniq``.

    Returns:
        A dict mapping each term to the list of document ids it was paired
        with, in the order the pairs were supplied.
    """
    inverted_index = {}

    # Iterate the argument itself (not a module-level list) so that the
    # de-duplicated pairs passed by the caller are what gets indexed.
    for term, document_id in uniq_pairs:
        inverted_index.setdefault(term, []).append(document_id)

    return inverted_index

### Frequency index construction

In [None]:
def buildFrequencyIndex():
    """Attach occurrence counts to the module-level ``inverted_index``.

    For every term and every document id listed for it, the count is the
    number of times the term appears in that document's ``stem_body``.

    Returns:
        A dict of the form ``{term: {doc_id: count}}``.
    """
    return {
        term: {
            doc_id: documents[doc_id].stem_body.count(term)
            for doc_id in posting_list
        }
        for term, posting_list in inverted_index.items()
    }

### Documents length index construction

In [None]:
def buildDocumentsLengthIndex():
    """Map each document id to the number of tokens in its body.

    Reads the module-level ``documents`` mapping and counts the tokens that
    ``nltk.word_tokenize`` produces for each document's ``body``.
    """
    return {
        doc_id: len(nltk.word_tokenize(document.body))
        for doc_id, document in documents.items()
    }

## 🧮 Dataframe for ranking results

In [None]:
def buildRankingDf():
    """Create the DataFrame that holds one row per document for ranking.

    The frame is indexed by document id ("Numbers") and has the columns
    "Values" (the document objects), "Rank" (initialised to 0) and
    "Matches" (initialised to an empty string).
    """
    columns = {
        "Numbers": list(documents.keys()),
        "Values": list(documents.values()),
        "Rank": 0,
        "Matches": "",
    }
    return pd.DataFrame(columns).set_index("Numbers")

## 🧹 Query cleaning

In [None]:
def cleanQuery(query):
    """Turn a query object into a list of stemmed search terms.

    The query's ``body`` is tokenized and every token is stemmed; stems that
    are a single character long or appear in the module-level ``stopwords``
    collection are then dropped. Note that filtering happens after stemming.
    """
    stems = [stemmer.stem(word) for word in nltk.word_tokenize(query.body)]
    return [stem for stem in stems if len(stem) > 1 and stem not in stopwords]

## 💯 Ranking mechanism

In [None]:
def rank(query):
    """Score every document against the cleaned query and return them sorted.

    Args:
        query: iterable of stemmed query terms (as produced by ``cleanQuery``).

    Returns:
        The contents of the "Values" column of ``ranking_df`` as a list,
        ordered by descending "Rank".

    Side effects:
        Resets the "Rank" and "Matches" columns of the module-level
        ``ranking_df`` and rebinds that global to the sorted frame, so the
        caller can inspect the scores afterwards.

    Scoring, per query term found in ``frequency_index`` and per document in
    that term's postings:
        * ``1 + tf * idf`` is added, where ``tf`` is the term count divided
          by the document length and ``idf`` is
          ``log2(len(documents) / number of documents containing the term)``;
        * a further ``1`` is added when ``term in document.title`` holds.
    """
    global ranking_df
    ranking_df["Rank"] = 0.0
    ranking_df["Matches"] = ""

    total_documents = len(documents)

    for term in query:
        if term not in frequency_index:
            continue

        postings = frequency_index[term]
        if not postings:
            # Nothing to score, and avoids dividing by zero below.
            continue

        # idf depends only on the term, so compute it once per term rather
        # than once per matching document.
        idf = math.log(total_documents / len(postings), 2)

        for doc_id in postings:
            doc_key = str(doc_id)
            frequency = postings[doc_key]

            # Matched words: record "term-count" for debugging output.
            ranking_df.at[doc_id, "Matches"] = ranking_df.at[doc_id, "Matches"] + " " + term + "-" + str(frequency)

            # Scoring: a constant 1 per matched term plus the TF-IDF weight.
            tf = frequency / documentsLengthIndex[doc_key]
            score = tf * idf
            ranking_df.at[doc_id, "Rank"] += 1 + score

            # Bonus when the term also occurs in the document's title.
            if term in documents[doc_id].title:
                ranking_df.at[doc_id, "Rank"] += 1

    ranking_df = ranking_df.sort_values("Rank", ascending = False)
    sorted_documents = ranking_df["Values"].tolist()

    return sorted_documents

## 🦸‍♂️ Functions, assemble!



In [None]:
# Build order matters: later builders read the module-level results of
# earlier ones.
pairs = buildPairs()
# Sort by lower-cased term, then by document id, before uniq() drops
# consecutive duplicates.
uniq_pairs = uniq(sorted(pairs, key=lambda x: (x[0].lower(), x[1])))
inverted_index = buildInvertedIndex(uniq_pairs)
# Reads the global `inverted_index` and `documents`.
frequency_index = buildFrequencyIndex()
documentsLengthIndex = buildDocumentsLengthIndex()
# Working frame for scores; rank() resets and rebinds it on every query.
ranking_df = buildRankingDf()

## 🚧 IR System testing



In [None]:
class IRSystem(CranfieldIRSystemBase):
    """IR system that ranks documents with the module-level ``rank`` function."""

    def __init__(self, print_matrix):
        """Store the debug flag and a reference to the loaded documents.

        Args:
            print_matrix: when equal to ``True``, ``search`` prints the
                cleaned query and the 20 best rows of ``ranking_df``.
        """
        self.print_matrix = print_matrix
        self.documents = documents

    def search(self, query: Query):
        """Clean *query*, rank the documents and return them best-first."""
        terms = cleanQuery(query)
        ranked = rank(terms)

        # Explicit comparison: only values equal to True enable the output.
        if self.print_matrix != True:
            return ranked

        print("Query: ", terms)
        print(ranking_df.head(20))
        return ranked

In [None]:
# Keep False for a local evaluation; set to True to submit the result to the
# leaderboard under `author_name`.
submit_result = False
author_name = "Hospodka, Adam"

# Debug printing is disabled for the full evaluation run.
system = IRSystem(print_matrix = False)

# Progress message without a trailing newline, flushed immediately.
print('Initializing your system ...', end='', flush=True)
evaluation = CranfieldEvaluation(system, load_judgements(queries, documents), CranfieldLeaderboard(), author_name)
# Carriage return so the next output starts at the beginning of the line.
print(end='\r', flush=True)
# Run every query through the system, showing a tqdm progress bar.
evaluation.evaluate(tqdm(queries.values(), desc="Querying your system", leave=False), submit_result)



HBox(children=(FloatProgress(value=0.0, description='Querying your system', max=225.0, style=ProgressStyle(des…



Your system achieved **37.46% MAP score**.

Congratulations, you passed the **35%** minimum! 🥳

Set `submit_result = True` and write your name to the `author_name` variable to submit your result to [the leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vRRR4eDkQIWx5FSU08Uj5DciWwxNfHJeLruNR1T0WW9xmSsYl457Zqv5SlA1jfvsYHpsaUw_8P3z1OF/pubhtml). 🏆

The best submissions on the leaderboard will receive *small awards during the semester*, and some *__seriously big__ awards* after the personal check at the end of the competition (2021-04-18). Please be polite, do not spoil the game for the others, and **have fun!** 😉