# Information Retrieval

This notebook builds and improves on an IR system using Whoosh, NLTK, and pytrec_eval.

The dataset uses documents from US Government web sites and the topics are 15 needs for government information. Both were part of the TREC conference in 2003.

## Setup

In [None]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget
!pip install transformers
!pip install sentence_transformers



In [None]:
import os.path
import wget

# Download the dataset archive only once. When "government.zip" already exists,
# wget.download saves a renamed duplicate (e.g. "government (1).zip") instead of
# overwriting it, so re-running the cell would only litter the working directory.
if not os.path.exists("government.zip"):
    wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/government.zip", "government.zip")

'government (1).zip'

In [None]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import abc
from abc import abstractmethod
from whoosh.analysis import Filter
from whoosh.scoring import BM25F
import nltk
from nltk.stem import *
from nltk.corpus import wordnet

# Fetch the WordNet data for NLTK (reports "already up-to-date" when it is present).
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    Abstract base class inherited by the IR systems in this notebook.

    Subclasses provide the index, the indexing routine, the query parser /
    searcher and the search call. This class provides the shared helpers that
    run the topics through the system and report pytrec_eval measures.
    """

    def __init__(self, data_dir):
        """
        INPUT:
            data_dir: directory holding "gov.topics", "gov.qrels" and a
                      "documents" sub-directory
        OUTPUT:
            None

        NOTE: Only creates the index and the parser/searcher; documents are
              added separately through add_files().
        """
        self.topic_file = os.path.join(data_dir, "gov.topics")
        self.qrels_file = os.path.join(data_dir, "gov.qrels")
        self.document_dir = os.path.join(data_dir, "documents")
        # every regular file below the documents directory, at any depth
        self.file_list = [str(filePath) for filePath in Path(self.document_dir).glob("**/*") if filePath.is_file()]

        self.create_index()
        self.create_parser_searcher()

    @abstractmethod
    def create_index(self):
        pass

    @abstractmethod
    def add_files(self):
        pass

    @abstractmethod
    def create_parser_searcher(self):
        pass

    @abstractmethod
    def perform_search(self, topic_phrase):
        pass

    @staticmethod
    def post_process_score(score):
        """Hook for transforming a score before it is printed; identity by default."""
        return score

    @staticmethod
    def print_trec_eval_result(results):
        """
        Print every measure for every query, followed by the aggregate ('all') rows.

        INPUT:
            results: dict {query_id: {measure_name: value}}
        OUTPUT:
            None (prints to stdout)
        """
        if not results:
            print('empty results')
            return

        def print_line(name, scope, num):
            print('{:25s}{:8s}{:.4f}'.format(name, scope, num))

        for query_id, query_measures in results.items():
            for measure, value in query_measures.items():
                if measure == "runid":
                    continue
                print_line(measure, query_id, value)

        # The aggregate rows use the measure names of the last query in `results`
        # (made explicit here instead of relying on the leaked loop variable).
        measure_names = list(list(results.values())[-1].keys())
        for measure in measure_names:
            if measure == "runid":
                continue
            print_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [per_query[measure]
                     for per_query in results.values()]))

    def score(self, docnum, topic_results, topic_phrase):
        """
        Return the score of the hit at position `docnum` of `topic_results`.

        `topic_phrase` is unused here; it is part of the signature so that
        subclasses can override the scoring with access to the query text.
        """
        return topic_results.score(docnum)

    def _load_topics(self):
        """Read the topic file into [(topic_id, topic_phrase), ...], skipping blank/malformed lines."""
        topics = []
        with open(self.topic_file, "r") as tf:
            for line in tf.read().splitlines():
                parts = line.split(" ", 1)
                # a usable line is "<id> <phrase>"; anything else is ignored
                if len(parts) != 2 or not parts[0]:
                    continue
                topics.append((parts[0], parts[1]))
        return topics

    def print_rel_name(self, q_id):
        """
        Print the query, the returned documents and the judged-relevant
        documents for one topic.

        INPUT:
            q_id: topic id as a string (compared against the ids in the topic file)
        OUTPUT:
            None (prints to stdout)
        """
        for topic_id, topic_phrase in self._load_topics():
            if topic_id != q_id:
                continue
            print("---------------------------Topic_id and Topic_phrase----------------------------------")
            print(topic_id, topic_phrase)
            # get search result
            topic_results = self.perform_search(topic_phrase)
            print("---------------------------Return documents----------------------------------")
            # docnum is the position of the hit in the result list
            for (docnum, result) in enumerate(topic_results):
                score = self.score(docnum, topic_results, topic_phrase)
                score = self.post_process_score(score)
                print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))
            print("---------------------------Relevant documents----------------------------------")
            with open(self.qrels_file, 'r') as f_qrel:
                for line in f_qrel:
                    # qrels line: "<topic> <iter> <doc> <rel>"; split on any
                    # whitespace and ignore blank or malformed lines
                    fields = line.split()
                    if len(fields) != 4:
                        continue
                    qid, _, doc, rel = fields
                    if qid == q_id and rel == "1":
                        print(line.rstrip())

    def py_trec_eval(self):
        """
        Run every topic through the system, evaluate the run with pytrec_eval
        against the qrels file and print the measures.

        INPUT:
            None
        OUTPUT:
            None (prints to stdout)
        """
        # Re-create the parser/searcher before evaluating
        # (presumably so that documents indexed after __init__ are searched).
        self.create_parser_searcher()
        # Load topic file - a list of topics (search phrases) used for evaluation
        topics = self._load_topics()

        # Write the run to a temporary file in TREC_EVAL format. mkstemp returns
        # an open descriptor: wrap it so it gets closed, and delete the file
        # once it has been parsed.
        fd, temp_output_file = tempfile.mkstemp()
        try:
            with os.fdopen(fd, "w") as outputTRECFile:
                for topic_id, topic_phrase in topics:
                    # get search result
                    topic_results = self.perform_search(topic_phrase)
                    # format the result
                    for (docnum, result) in enumerate(topic_results):
                        score = self.score(docnum, topic_results, topic_phrase)
                        outputTRECFile.write(
                            "%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

            with open(self.qrels_file, 'r') as f_qrel:
                qrel = pytrec_eval.parse_qrel(f_qrel)

            with open(temp_output_file, 'r') as f_run:
                run = pytrec_eval.parse_run(f_run)
        finally:
            os.remove(temp_output_file)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)

        results = evaluator.evaluate(run)

        # Fill the results dictionary for topics that have no evaluation entry
        # (e.g. zero documents returned). The measure names are copied from an
        # evaluated topic; when no topic was evaluated at all, only the num_*
        # statistics below are reported.
        measure_names = list(next(iter(results.values()), {}))
        for topic_id, _ in topics:
            if topic_id in results:
                continue
            num_rel = float(sum(qrel.get(topic_id, {}).values()))
            # nothing returned: 0.0 if relevant documents were missed,
            # 1.0 if there was nothing relevant to find
            fill_value = 0.0 if num_rel > 0 else 1.0
            topic_stats = {measure: fill_value for measure in measure_names}
            topic_stats["num_rel"] = num_rel
            topic_stats["num_ret"] = 0.0
            topic_stats["num_rel_ret"] = 0.0
            topic_stats["num_q"] = 1.0

            results[topic_id] = topic_stats

        self.print_trec_eval_result(results)


In [None]:
# ir_sys_0 = IRSystem("lab-data")

In [None]:
# This filter will run for both the index and the query
class CustomFilter(Filter):
    """
    Analyzer filter that rewrites each token's text with a user-supplied function.

    INPUT:
        filterFunc: callable invoked as filterFunc(token_text, *args, **kwargs);
                    its return value becomes the new token text
        *args, **kwargs: extra arguments forwarded to filterFunc on every call
    """
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # Two filters compare equal when they are of exactly the same class;
        # the wrapped function and its arguments are not compared.
        return other.__class__ is self.__class__

    def __call__(self, tokens):
        # The same transformation is applied whether the token comes from the
        # query parser (mode == 'query') or from the indexer (mode == 'index').
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

## Baseline IR System

In [None]:
class IRQ2(IRSystem):
    """Baseline system: content tokenized by RegexTokenizer only, searcher created without explicit weighting."""

    def create_index(self):
        """
        Create a new, empty index in a fresh temporary directory.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys, which should have type whoosh.index.FileIndex
        """
        # Schema for vanilla whoosh: stored file path plus tokenized content
        baseline_schema = Schema(
            file_path=ID(stored=True),
            file_content=TEXT(analyzer=RegexTokenizer()),
        )

        # The index lives in its own temporary directory
        self.index_sys = index.create_in(tempfile.mkdtemp(), baseline_schema)

    def add_files(self):
        """
        Add every file in self.file_list to the index through a buffered writer.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Adds documents to self.index_sys
        """
        index_writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            for count, path in enumerate(self.file_list, start=1):
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
                index_writer.add_document(file_path=path, file_content=text)

                # progress report every 1000 documents
                if count % 1000 == 0:
                    print("already indexed:", count)
            print("done indexing.")
        finally:
            # the writer is closed even when indexing fails part-way
            index_writer.close()

    def create_parser_searcher(self):
        """
        Create the query parser and the searcher for the current index.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser and self.searcher, which should have type
              whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # DON'T change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        Parse the topic phrase and return all matching documents.

        INPUT:
            topic_phrase: string
        OUTPUT:
            topic_results: whoosh.searching.Results

        NOTE: Uses self.query_parser and self.searcher
        """
        parsed_query = self.query_parser.parse(topic_phrase)
        return self.searcher.search(parsed_query, limit=None)

In [None]:
# Build the baseline system on ./government. The constructor only creates the
# index and the parser/searcher; documents are added by add_files().
q2 = IRQ2("government")

In [None]:
# Index every file found under government/documents (progress printed every 1000 files).
q2.add_files()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [None]:
# Re-create the parser/searcher, run every topic and print the pytrec_eval
# measures per topic and aggregated over all topics.
q2.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       1.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [None]:
# Show the query, the returned documents and the judged-relevant documents for topic 1.
q2.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 26.645398 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


The MAP is 0.2365 for baseline Whoosh

Based on the MAP and RPrec metrics, topics that the baseline Whoosh system did really well on include topics 18 and 24. However, neither of these had particularly high precision@k values.

Based on the MAP and RPrec metrics, some particular topics that the baseline Whoosh system did really poorly on include topics 1, 2, and 6.

## Improved IR System

In [None]:
# Inspect topic 9 on the baseline system: returned vs. judged-relevant documents.
q2.print_rel_name('9')

---------------------------Topic_id and Topic_phrase----------------------------------
9 genealogy searches
---------------------------Return documents----------------------------------
9 Q0 G00-26-1048210 0 12.268873 test
9 Q0 G00-59-3622783 1 5.132722 test
---------------------------Relevant documents----------------------------------
9 0 G00-91-3181951 1


**Analysis for query 9 "genealogy searches":**

One false positive was document G00-59-3622783 which contained a comprehensive list of related resources. In the document, "searches" appeared 1 time and "genealogy" appeared 2 times. While these terms appeared infrequently compared to the length of the document, the document still was returned.

One false negative was document G00-91-3181951, which provides a "guide to sources for genealogy in the california history section". In the document, "searches" never appears, and while the term "genealogy" appears 4 times, only one of those occurrences is all lowercase and therefore matches the query term, which suggests why the document wasn't returned. The document also contained many variations of the query words, such as "genealogical" for "genealogy" and "search" for "searches", which were not accounted for in the retrieval.

In order to improve the performance, we can do the following:
* Lowercasing
* Stemming



In [None]:
class IRQ3(IRSystem):
    """Improved system: baseline pipeline plus lowercasing and stemming in the analyzer."""

    def create_index(self):
        """
        Create a new, empty index in a fresh temporary directory.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys, which should have type whoosh.index.FileIndex
        """
        # DON'T change the name of 'index_sys'
        # Analyzer chain: tokenize -> lowercase -> stem
        analysis_chain = RegexTokenizer() | LowercaseFilter() | StemFilter()
        stemmed_schema = Schema(
            file_path=ID(stored=True),
            file_content=TEXT(analyzer=analysis_chain),
        )

        # The index lives in its own temporary directory
        self.index_sys = index.create_in(tempfile.mkdtemp(), stemmed_schema)

    def add_files(self):
        """
        Add every file in self.file_list to the index through a buffered writer.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Adds documents to self.index_sys
        """
        index_writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            for count, path in enumerate(self.file_list, start=1):
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
                index_writer.add_document(file_path=path, file_content=text)

                # progress report every 1000 documents
                if count % 1000 == 0:
                    print("already indexed:", count)
            print("done indexing.")
        finally:
            # the writer is closed even when indexing fails part-way
            index_writer.close()

    def create_parser_searcher(self):
        """
        Create the query parser and the searcher for the current index.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser and self.searcher, which should have type
              whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # DON'T change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        self.searcher = self.index_sys.searcher()

    def perform_search(self, topic_phrase):
        """
        Parse the topic phrase and return all matching documents.

        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results

        NOTE: Uses self.query_parser and self.searcher
        """
        parsed_query = self.query_parser.parse(topic_phrase)
        return self.searcher.search(parsed_query, limit=None)

In [None]:
# Build the lowercasing + stemming system on ./government (index starts empty).
q3 = IRQ3("government")

In [None]:
# Index every file found under government/documents with the IRQ3 analyzer.
q3.add_files()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [None]:
# Evaluate IRQ3 on all topics and print the pytrec_eval measures.
q3.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

In [None]:
# Topic 1 on IRQ3: returned vs. judged-relevant documents.
q3.print_rel_name('1')

---------------------------Topic_id and Topic_phrase----------------------------------
1 mining gold silver coal
---------------------------Return documents----------------------------------
1 Q0 G00-90-0342721 0 25.321761 test
1 Q0 G00-55-3817584 1 14.643225 test
1 Q0 G00-69-2353421 2 7.818525 test
---------------------------Relevant documents----------------------------------
1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1


In [None]:
# Topic 9 on IRQ3, for comparison with the baseline output for the same topic.
q3.print_rel_name('9')

---------------------------Topic_id and Topic_phrase----------------------------------
9 genealogy searches
---------------------------Return documents----------------------------------
9 Q0 G00-30-0221651 0 14.031603 test
9 Q0 G00-79-2892445 1 13.679620 test
9 Q0 G00-26-1048210 2 12.292053 test
9 Q0 G00-01-2134408 3 10.727103 test
9 Q0 G00-06-1975174 4 10.708617 test
9 Q0 G00-59-0523165 5 10.708617 test
9 Q0 G00-95-3755341 6 10.708617 test
9 Q0 G00-24-0016657 7 10.648241 test
9 Q0 G00-95-3337324 8 10.648241 test
9 Q0 G00-88-2629440 9 10.640403 test
9 Q0 G00-33-1729611 10 10.561911 test
9 Q0 G00-01-2898660 11 10.525253 test
9 Q0 G00-43-3812747 12 10.170332 test
9 Q0 G00-91-3181951 13 9.645810 test
9 Q0 G00-55-0643570 14 9.473688 test
9 Q0 G00-21-1529615 15 9.473688 test
9 Q0 G00-49-2630728 16 9.290957 test
9 Q0 G00-67-1176122 17 9.087153 test
9 Q0 G00-00-2016453 18 8.893623 test
9 Q0 G00-08-3780534 19 8.846570 test
9 Q0 G00-08-0900666 20 8.743332 test
9 Q0 G00-08-1314254 21 8.743332 te

The following modifications were made:
* Lowercasing
* Stemming

These modifications improved the performance overall and the MAP increased to 0.3496.

Specifically, for query 9 that we analyzed in part a), the previous false-negative document (G00-91-3181951) was now retrieved, which increased the MAP. However, many more false-positive documents were also returned. The previous false-positive document was still returned, but it was ranked much lower.

While the changes improved things overall, there were still some queries that got worse. Specifically, the techniques applied were able to improve the overall MAP, but they also increased the number of false positives that were returned. Therefore, there is still room for improvement.

## Further Improvements

In [None]:
class IRQ4(IRSystem):
    """Further-improved system: richer analyzer chain and BM25F with B=0.4, K1=1.2."""

    def create_index(self):
        """
        Create a new, empty index in a fresh temporary directory.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.index_sys, which should have type whoosh.index.FileIndex
        """
        # Analyzer chain: tokenize -> lowercase -> intra-word split -> stop words
        # -> Snowball (English) stemming through CustomFilter
        # NOTE(review): IntraWordFilter runs after LowercaseFilter here, so it
        # presumably cannot split on case transitions any more - confirm intended.
        analysis_chain = (
            RegexTokenizer()
            | LowercaseFilter()
            | IntraWordFilter()
            | StopFilter()
            | CustomFilter(SnowballStemmer("english").stem)
        )
        tuned_schema = Schema(
            file_path=ID(stored=True),
            file_content=TEXT(analyzer=analysis_chain),
        )

        # The index lives in its own temporary directory
        self.index_sys = index.create_in(tempfile.mkdtemp(), tuned_schema)

    def add_files(self):
        """
        Add every file in self.file_list to the index through a buffered writer.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Adds documents to self.index_sys
        """
        index_writer = writing.BufferedWriter(self.index_sys, period=None, limit=1000)

        try:
            for count, path in enumerate(self.file_list, start=1):
                with open(path, "r", encoding="utf-8") as handle:
                    text = handle.read()
                index_writer.add_document(file_path=path, file_content=text)

                # progress report every 1000 documents
                if count % 1000 == 0:
                    print("already indexed:", count)
            print("done indexing.")
        finally:
            # the writer is closed even when indexing fails part-way
            index_writer.close()

    def create_parser_searcher(self):
        """
        Create the query parser and a BM25F-weighted searcher for the current index.

        INPUT:
            None
        OUTPUT:
            None

        NOTE: Sets self.query_parser and self.searcher, which should have type
              whoosh.qparser.default.QueryParser and whoosh.searching.Searcher respectively
        """
        # DON'T change the names of 'query_parser' and 'searcher'
        self.query_parser = QueryParser("file_content", schema=self.index_sys.schema)
        bm25_weighting = BM25F(B=0.4, K1=1.2)
        self.searcher = self.index_sys.searcher(weighting=bm25_weighting)

    def perform_search(self, topic_phrase):
        """
        Parse the topic phrase and return all matching documents.

        INPUT:
            topic_phrase: string
        OUTPUT:
            topicResults: whoosh.searching.Results OR NeuralResults

        NOTE: Uses self.query_parser and self.searcher
        """
        parsed_query = self.query_parser.parse(topic_phrase)
        return self.searcher.search(parsed_query, limit=None)

In [None]:
# Build the tuned system (extended analyzer + BM25F weighting) on ./government.
q4 = IRQ4("government")

In [None]:
# Index every file found under government/documents with the IRQ4 analyzer.
q4.add_files()

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [None]:
# Evaluate IRQ4 on all topics and print the pytrec_eval measures.
q4.py_trec_eval()

num_q                    1       1.0000
num_ret                  1       3.0000
num_rel                  1       5.0000
num_rel_ret              1       0.0000
map                      1       0.0000
gm_map                   1       -11.5129
Rprec                    1       0.0000
bpref                    1       0.0000
recip_rank               1       0.0000
iprec_at_recall_0.00     1       0.0000
iprec_at_recall_0.10     1       0.0000
iprec_at_recall_0.20     1       0.0000
iprec_at_recall_0.30     1       0.0000
iprec_at_recall_0.40     1       0.0000
iprec_at_recall_0.50     1       0.0000
iprec_at_recall_0.60     1       0.0000
iprec_at_recall_0.70     1       0.0000
iprec_at_recall_0.80     1       0.0000
iprec_at_recall_0.90     1       0.0000
iprec_at_recall_1.00     1       0.0000
P_5                      1       0.0000
P_10                     1       0.0000
P_15                     1       0.0000
P_20                     1       0.0000
P_30                     1       0.000

The following modifications were made:  

* Lowercasing - allows for case insensitivity
* Intraword filter and stop filter - split words that are connected and also remove stop words that don't provide any valuable information
* Stemming - try to get root words
* Tuning the b and k1 values of the BM25 metric (final setting: `B=0.4`, `K1=1.2`) - a lower k1 gives less sensitivity to term frequency, and a lower b reduces the document-length normalization

The final MAP performance that these modifications attained was 0.3557.
