In [None]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget



In [None]:
import os
import wget

# Fetch the data set archive only when it is not already present, so that
# re-running this cell does not download it again.
DATA_ARCHIVE = "government.zip"
if not os.path.exists(DATA_ARCHIVE):
    wget.download("https://github.com/MIE451-1513-2019/course-datasets/raw/master/government.zip", DATA_ARCHIVE)

'government.zip'

In [None]:
!unzip government.zip

Archive:  government.zip
replace government/topics-with-full-descriptions.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# imports
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
from whoosh import scoring, qparser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import nltk
from nltk.stem import *

## Loading the Data and Creating a Schema


In [None]:
# Root folder created by unpacking the data set archive.
DATA_DIR = "government"

# Locations inside the data set, all derived from DATA_DIR:
# the document collection, the topics (queries) file and the relevance judgments.
DOCUMENTS_DIR, TOPIC_FILE, QRELS_FILE = (
    os.path.join(DATA_DIR, leaf)
    for leaf in ("documents", "gov.topics", "gov.qrels")
)

In [None]:
def createIndex(schema):
    """Create a new index for `schema` in a fresh temporary directory.

    Args:
        schema: the schema object passed through to index.create_in.

    Returns:
        The index object returned by index.create_in.
    """
    # Every call gets its own directory, so indexes built with different
    # schemas never share storage.
    storageDir = tempfile.mkdtemp()
    newIndex = index.create_in(storageDir, schema)
    return newIndex


In [None]:
# Baseline schema: the stored file path identifies a document, and the file
# content is analyzed with a bare RegexTokenizer (no further filters).
mySchema = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=RegexTokenizer()),
)

# Build the (still empty) baseline index in a temporary directory.
INDEX_Q2 = createIndex(mySchema)

In [None]:
def addFilesToIndex(indexObj, fileList):
    """Read every file in `fileList` and add it to `indexObj` as one document.

    Each document gets two fields: `file_path` (the path as given) and
    `file_content` (the file's full text, read as UTF-8). Progress is printed
    after every 1000 documents and once more when all files are processed.

    Args:
        indexObj: index to write to (wrapped in a writing.BufferedWriter).
        fileList: iterable of file paths to index.
    """
    # NOTE(review): period=None / limit=1000 presumably mean "no time-based
    # commits, flush after 1000 buffered documents" - confirm in the Whoosh docs.
    bufferedWriter = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        for count, path in enumerate(fileList, start=1):
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
                bufferedWriter.add_document(file_path=path, file_content=text)

                if count % 1000 == 0:
                    print("already indexed:", count)
        print("done indexing.")

    finally:
        # Always close the writer, even when reading or indexing a file fails.
        bufferedWriter.close()

In [None]:
# Build the list of files to index: every regular file below DOCUMENTS_DIR.
# Sorted so that documents are added in the same order on every run and every
# machine (the order produced by glob depends on the file system).
filesToIndex = sorted(str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file())

In [None]:
# Populate the baseline index with every file found under DOCUMENTS_DIR.
addFilesToIndex(INDEX_Q2, filesToIndex)

already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


# Running the queries through Whoosh Baseline System

In [None]:
# Baseline retrieval setup: a searcher with default settings and a query
# parser targeting the "file_content" field of the baseline index.
SEARCHER_Q2 = INDEX_Q2.searcher()
QP_Q2 = QueryParser(
    "file_content",
    schema=INDEX_Q2.schema,
)

In [None]:
def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    """Run every topic through `searcher` and print MAP per query and overall.

    The ranked results of all topics are written to a temporary run file in
    TREC_EVAL format, parsed back with pytrec_eval and scored against the
    relevance judgments. Only the 'map' measure is requested.

    Args:
        topicFile: path to a text file with one topic per line, formatted as
            "<topic_id> <topic phrase>". Blank lines are ignored.
        qrelsFile: path to the relevance judgments file, read with
            pytrec_eval.parse_qrel.
        queryParser: object whose parse(phrase) builds the query for a topic.
        searcher: object whose search(query, limit=None) returns the ranked
            hits; hits expose a stored "file_path" and the result object
            exposes score(rank).

    Returns:
        None. The scores are printed, one line per query plus an 'all' line.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    # mkstemp hands back an already-open descriptor together with the path.
    # Wrap the descriptor so it is closed, and delete the file once parsed.
    runFd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(runFd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                if not topic.strip():
                    continue  # ignore blank lines instead of failing on the unpack
                topic_id, topic_phrase = topic.split(" ", 1)
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (docnum, result) in enumerate(topicResults):
                    score = topicResults.score(docnum)
                    outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(qrelsFile, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(tempOutputFile, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)
    finally:
        os.remove(tempOutputFile)

    # Only MAP is requested, so only MAP is printed.
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map'})
    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    if not results:
        # Nothing was scored, so there are no measures to aggregate.
        print("no queries were evaluated")
        return

    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
                continue
            print_line(measure, query_id, value)

    # Aggregate each measure over all queries; the measure names are taken
    # from the last evaluated query.
    measureNames = list(list(results.values())[-1].keys())
    for measure in measureNames:
        if measure == "runid":
            continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [perQuery[measure] for perQuery in results.values()]))

In [None]:
# Evaluate the baseline system (bare RegexTokenizer, default parser and searcher).
pyTrecEval(TOPIC_FILE, QRELS_FILE,QP_Q2,SEARCHER_Q2) 

map                      1       0.0000
map                      2       0.0000
map                      4       0.0312
map                      6       0.0000
map                      7       0.0000
map                      9       0.0000
map                      10      0.1667
map                      14      0.2500
map                      16      0.0000
map                      18      1.0000
map                      22      0.2000
map                      24      1.0000
map                      26      0.1111
map                      28      0.0000
map                      all     0.1971


**MAP** is an appropriate measure because it accounts not only for precision but also for the order in which the relevant documents are ranked: the closer the relevant documents are to the top of the result list, the higher the MAP. MAP is the mean of the average precision over all the queries.

The MAP is .1971 with the baseline model

Some topics performed very poorly and some performed very well. Queries 1, 2, 6, 7, 9, 16 and 28 performed very poorly, with a MAP of 0, while queries 18 and 24 performed very well, with a MAP of 1. The remaining queries had a MAP between 0 and 1.

In [None]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
    """Print the ranked results and the judged-relevant documents of one topic.

    For the topic whose id equals `id`, prints the topic line, every returned
    document in TREC_EVAL format and every qrels line of that topic whose
    relevance label is "1".

    Args:
        topicFile: path to a text file with one topic per line, formatted as
            "<topic_id> <topic phrase>". Blank lines are ignored.
        qrelsFile: path to the relevance judgments, four whitespace-separated
            fields per line (query id, unused field, document, relevance).
            Blank lines are ignored.
        queryParser: object whose parse(phrase) builds the query for a topic.
        searcher: object whose search(query, limit=None) returns the ranked
            hits; hits expose a stored "file_path" and the result object
            exposes score(rank).
        id: topic id to report, as a string (compared against the text read
            from the files).
    """
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    for topic in topics:
        if not topic.strip():
            continue  # ignore blank lines instead of failing on the unpack
        topic_id, topic_phrase = topic.split(" ", 1)
        if topic_id != id:
            continue

        print("---------------------------Topic_id and Topic_phrase----------------------------------")
        print(topic_id, topic_phrase)
        topicQuery = queryParser.parse(topic_phrase)
        topicResults = searcher.search(topicQuery, limit=None)

        print("---------------------------Return documents----------------------------------")
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        print("---------------------------Relevant documents----------------------------------")
        with open(qrelsFile, 'r') as f_qrel:
            for line in f_qrel:
                # Split on any whitespace so tab- or multi-space-separated
                # judgments are read the same way as single-space ones.
                fields = line.split()
                if not fields:
                    continue
                qid, _, _, rel = fields
                if qid == id and rel == "1":
                    print(line.rstrip())

In [None]:
# Compare the returned and the judged-relevant documents for query 4 on the baseline system.
printRelName(TOPIC_FILE, QRELS_FILE,QP_Q2,SEARCHER_Q2,"4") 

---------------------------Topic_id and Topic_phrase----------------------------------
4 wireless communications
---------------------------Return documents----------------------------------
4 Q0 G00-99-2247765 0 16.449155 test
4 Q0 G00-85-1525415 1 13.364613 test
4 Q0 G00-05-1218739 2 12.956314 test
4 Q0 G00-09-0774298 3 11.781349 test
4 Q0 G00-56-4151981 4 11.367248 test
4 Q0 G00-21-2229498 5 10.743958 test
4 Q0 G00-98-4068688 6 10.464865 test
4 Q0 G00-47-2117970 7 10.213356 test
4 Q0 G00-67-0152545 8 8.392871 test
4 Q0 G00-06-1757034 9 6.431556 test
4 Q0 G00-78-2551063 10 3.955775 test
4 Q0 G00-84-0274223 11 2.068438 test
---------------------------Relevant documents----------------------------------
4 0 G00-03-2855342 1
4 0 G00-36-1275993 1
4 0 G00-47-2117970 1
4 0 G00-65-0162935 1


In [None]:
# Show the document ranked first for query 4 by the baseline system.
# The path is built from DOCUMENTS_DIR and the file is read as UTF-8, the same
# way the indexer reads it.
with open(os.path.join(DOCUMENTS_DIR, "99", "G00-99-2247765"), "r", encoding="utf-8") as f:
    print(f.read())

http://w3.antd.nist.gov/wire_tel.shtml

   [USEMAP:wire_te.gif]


   Goals
   Horizontal
   To help the US cellular communications industry test the wideband CDMA
   components (cdma2000 and W-CDMA systems) of the IMT-2000 family of
   standards for third-generation wireless systems over different traffic
   and wireless channel quality conditions
   To facilitate transmission of video signals over 3G wireless systems.

   Technical Approach
   Horizontal
     * Develop SPW models for the forward link, reverse link, and
       multi-carrier mode of the cdma2000 system
     * Measure the performance of the cdma2000 system over different ITU
       channel models and in face of interference from other users
       communicating multimedia information
     * Develop a partitioning method and a multi-priority transmission
       method to send video over 3G wireless systems

   FY02 Plans
   Horizontal
     * Provide industry customers with as needed consulting to understand
       and use

In [None]:
# Show a document judged relevant for query 4 that the baseline system did not return.
# The path is built from DOCUMENTS_DIR and the file is read as UTF-8, the same
# way the indexer reads it.
with open(os.path.join(DOCUMENTS_DIR, "36", "G00-36-1275993"), "r", encoding="utf-8") as f:
    print(f.read())

http://www.ntia.doc.gov/ntiahome/threeg/


                                3G Wireless
     __________________________________________________________________

   INTRODUCTION: "Wireless" Internet: What the 3G Challenge Means for U.S.
   Competitiveness

   10-05-01: New Plan to Identify Spectrum for Advanced Wireless Mobile
   Services (3G)

   Text of 6-Sep-2001 Letter from Commerce Secretary Evans to
   Congressional Leaders, Proposing Legislation to Shift Statutory
   Spectrum Auction Date. (Proposed legislative language)

   Acting Assistant Secretary William T. Hatch testified on July 31, 2001,
   before the Senate Commerce, Science, and Transportation Communications
   Subcommittee on spectrum management and 3rd generation wireless
   service.

   Acting Assistant Secretary William T. Hatch testified before the
   Subcommittee on Telecommunications and the Internet of the House Energy
   and Commerce Committee on 3G wireless issues on July 24, 2001.

   Text of Secretary Evans J

Considering query 4, we can see that a number of documents were returned but only one of them, G00-47-2117970, was relevant, and it was placed only at the 8th position.

For a false positive, consider doc G00-99-2247765, which was returned at the top position of the ranking.
This is probably because it contains "wireless" 4 times in lowercase and "communications" 1 time in lowercase. But when we read it, it does not provide any information about wireless communications and only gives an idea about the plans and goals of the company.

For a false negative, consider doc G00-36-1275993. It is relevant but was not returned at all. When we look at it, "wireless" appears 11 times and different forms of the word "communications" appear 9 times.
The system probably did not match these because the baseline analyzer is a bare RegexTokenizer, so terms are compared exactly: the words were not in lowercase as in the query, and they occur in different forms of the same word. For example, "communication" appears as "Telecommunications", "communication" and "Communications".

Therefore, if we lemmatize and stem the words and convert them to lowercase, this problem should largely disappear and more relevant docs should be returned.




# Modifying Whoosh by using lemmatization and stemming through NLTK to get better MAP value.

In [None]:
# Fetch the WordNet corpus; presumably required by the WordNetLemmatizer used in the analyzers below.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Analysis filter that rewrites each token's text with a user-supplied function.

    The same transformation is applied whether the tokens come from indexing
    or from query parsing, so documents and queries are normalized identically.

    Args:
        filterFunc: callable invoked as filterFunc(text, *args, **kwargs); its
            return value becomes the new token text.
        *args, **kwargs: extra arguments forwarded to `filterFunc` on every call.
    """
    # NOTE(review): flag read by Whoosh; presumably marks filters that change
    # the morphological form of a token - confirm in the Whoosh docs.
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # Equal only to another CustomFilter wrapping the same function with
        # the same extra arguments. (The previous signature took no `other`,
        # so any comparison raised TypeError.)
        return (other is not None
                and self.__class__ is other.__class__
                and self.customFilter == other.customFilter
                and self.args == other.args
                and self.kwargs == other.kwargs)

    def __call__(self, tokens):
        # Index-time and query-time tokens get the same treatment.
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [None]:
# Analysis chain for the improved system: tokenize, lowercase, drop stop
# words, then lemmatize each token with the WordNet lemmatizer.
newfilter = (
    RegexTokenizer()
    | LowercaseFilter()
    | StopFilter()
    | CustomFilter(WordNetLemmatizer().lemmatize)
)
mySchema3 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=newfilter),
)

# Build a fresh index (in its own temporary directory) with the new schema
# and fill it with the same files as the baseline.
INDEX_Q3 = createIndex(mySchema3)
addFilesToIndex(INDEX_Q3, filesToIndex)


already indexed: 1000
already indexed: 2000
already indexed: 3000
already indexed: 4000
done indexing.


In [None]:
# Retrieval setup for the improved index: default searcher, and a query
# parser on "file_content" that uses the new schema (and thus the new analyzer).
SEARCHER_Q3 = INDEX_Q3.searcher()
QP_Q3 = QueryParser(
    "file_content",
    schema=INDEX_Q3.schema,
)

In [None]:
# Evaluate the system built on the lowercase / stop-word / lemmatizer analyzer.
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3)


map                      1       0.0000
map                      2       0.5000
map                      4       0.5395
map                      6       0.0000
map                      7       0.0000
map                      9       0.0417
map                      10      0.3333
map                      14      1.0000
map                      16      0.0000
map                      18      1.0000
map                      19      0.5000
map                      22      0.0370
map                      24      1.0000
map                      26      0.0771
map                      28      0.0729
map                      all     0.3401


In [None]:
# Look at the returned and the judged-relevant documents for query 4 on the improved system.
printRelName(TOPIC_FILE, QRELS_FILE,QP_Q3,SEARCHER_Q3,"4")

---------------------------Topic_id and Topic_phrase----------------------------------
4 wireless communications
---------------------------Return documents----------------------------------
4 Q0 G00-36-1275993 0 16.834373 test
4 Q0 G00-47-2117970 1 16.208461 test
4 Q0 G00-99-2247765 2 15.343711 test
4 Q0 G00-00-1958915 3 14.839544 test
4 Q0 G00-85-1525415 4 14.799983 test
4 Q0 G00-28-2286602 5 14.210035 test
4 Q0 G00-21-2229498 6 13.883219 test
4 Q0 G00-74-4030396 7 13.776356 test
4 Q0 G00-69-0005329 8 13.601438 test
4 Q0 G00-05-1218739 9 13.547566 test
4 Q0 G00-84-3349019 10 13.301167 test
4 Q0 G00-46-1439567 11 13.122084 test
4 Q0 G00-16-0059045 12 12.600262 test
4 Q0 G00-44-1482914 13 12.499380 test
4 Q0 G00-71-3454228 14 12.249361 test
4 Q0 G00-02-1720397 15 12.152140 test
4 Q0 G00-07-3064254 16 11.367740 test
4 Q0 G00-09-0774298 17 11.364895 test
4 Q0 G00-65-0162935 18 10.879121 test
4 Q0 G00-67-0152545 19 10.531362 test
4 Q0 G00-05-1550998 20 10.023960 test
4 Q0 G00-56-4151981 2

I converted all the docs to lowercase, lemmatized them and removed the stop words (no stemmer is used at this stage). The overall MAP increased from .1971 to .3401, which is a 72% increase.

Also, for specific queries, query 2, 9 and 28 now increased from 0 to non zero values and query 14 increased from .25 to 1.
This is a major improvement with just small measures

Another improvement is that 3 of the 4 relevant documents (G00-47-2117970, G00-36-1275993, G00-65-0162935) are now returned, with 2 of them, G00-36-1275993 and G00-47-2117970, at the top 2 positions of the ranking. This is a major improvement over the baseline, where only 1 relevant document was returned, and only at the 8th position.

There is a clear improvement now, since queries 2, 9 and 28 increased from 0 to non-zero values and query 14 increased from 0.25 to 1.
Some queries also went down: the MAP values of queries 22 and 26 decreased.

**Overall, I got a good result increasing the overall MAP by 72% ( from .1971 to .3401) and apart from queries 22 and 26, which decreased in MAP value, all of the queries improved or remained same as before.**
**But more improvements can be made in scoring and query parsing and different types of tokenizers and stemmers can be analyzed**

In [None]:
# Extended analysis chain for the final system. Filters run in the order
# listed: tokenize, lowercase, split intra-word parts, drop stop words, strip
# whitespace, stem (Whoosh StemFilter, then NLTK Lancaster), and finally
# lemmatize with WordNet using the default part of speech, 'v' and 'r'.
newfilter = (
    RegexTokenizer()
    | LowercaseFilter()
    | IntraWordFilter()
    | StopFilter()
    | StripFilter()
    | StemFilter()
    | CustomFilter(LancasterStemmer().stem)
    | CustomFilter(WordNetLemmatizer().lemmatize)
    | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
    | CustomFilter(WordNetLemmatizer().lemmatize, 'r')
)
Schema4 = Schema(
    file_path=ID(stored=True),
    file_content=TEXT(analyzer=newfilter),
)

# Build a fresh index (in its own temporary directory) with the new schema
# and fill it with the same files as before.
INDEX_Q4 = createIndex(Schema4)
addFilesToIndex(INDEX_Q4, filesToIndex)

# Further tuning by adding specific parser group and scoring methods

In [None]:
# Sweep the OrGroup scaling factor from 0.1 to 0.9 and print the MAP for each
# value, with the BM25F parameters held fixed at B=0.5, K1=5.
# The searcher does not depend on the factor, so it is created once.
SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=5)) #AddingBM25F algorithm
for factorStep in range(1, 10):
    # Derive the factor from an integer: repeatedly adding .1 drifts
    # (0.30000000000000004, ...) and sneaks in an extra pass just below 1.0.
    orFactor = factorStep / 10
    print(orFactor)
    QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup.factory(orFactor)) #Adding ORGroup
    pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)

0.1
map                      1       0.0706
map                      2       0.5294
map                      4       0.5390
map                      6       0.1429
map                      7       0.2167
map                      9       0.2500
map                      10      0.3333
map                      14      1.0000
map                      16      0.2428
map                      18      1.0000
map                      19      0.5000
map                      22      0.0370
map                      24      1.0000
map                      26      0.0875
map                      28      0.2111
map                      all     0.4107
0.2
map                      1       0.0706
map                      2       0.5294
map                      4       0.5390
map                      6       0.1429
map                      7       0.2167
map                      9       0.2500
map                      10      0.3333
map                      14      1.0000
map                      16     

In [None]:
# Grid search over the BM25F parameters: B from 0.1 to 0.9 (step 0.1) and
# K1 from 0.5 to 7.5 (step 0.5), printing the MAP for every combination.
# NOTE: QP_Q4 is not defined in this cell; the query parser left behind by the
# OrGroup sweep cell above is used.
for bStep in range(1, 10):
    # Integer-derived values avoid the drift of repeated `+ .1`
    # (0.30000000000000004, ...) and the extra pass just below 1.0.
    bValue = bStep / 10
    print(bValue)
    for k1Step in range(1, 16):
        k1Value = k1Step / 2
        print(k1Value)
        SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=bValue, K1=k1Value))
        pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)

0.1
0.5
map                      1       0.0443
map                      2       0.5263
map                      4       0.5564
map                      6       0.1000
map                      7       0.0639
map                      9       0.3333
map                      10      0.2500
map                      14      1.0000
map                      16      0.0802
map                      18      1.0000
map                      19      0.1667
map                      22      0.0667
map                      24      0.5000
map                      26      0.0784
map                      28      0.1534
map                      all     0.3280
1.0
map                      1       0.0490
map                      2       0.5256
map                      4       0.5546
map                      6       0.1000
map                      7       0.0791
map                      9       0.3333
map                      10      0.2500
map                      14      1.0000
map                      16 

In [None]:
# Final configuration chosen after the two parameter sweeps.
# Query parser: OrGroup with a scaling factor of .5.
QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup.factory(.5))
# Searcher: BM25F scoring with B=0.5 and K1=5.
SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=5))

In [None]:
# Evaluate the final system (extended analyzer, OrGroup parser, tuned BM25F).
pyTrecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)

map                      1       0.0706
map                      2       0.5294
map                      4       0.5400
map                      6       0.1429
map                      7       0.2167
map                      9       0.2500
map                      10      0.3333
map                      14      1.0000
map                      16      0.2428
map                      18      1.0000
map                      19      0.5000
map                      22      0.0370
map                      24      1.0000
map                      26      0.0875
map                      28      0.2111
map                      all     0.4108


In [None]:
# Look at the returned and the judged-relevant documents for query 4 on the final system.
printRelName(TOPIC_FILE, QRELS_FILE,QP_Q4,SEARCHER_Q4,"4")

---------------------------Topic_id and Topic_phrase----------------------------------
4 wireless communications
---------------------------Return documents----------------------------------
4 Q0 G00-36-1275993 0 16.064967 test
4 Q0 G00-47-2117970 1 14.853658 test
4 Q0 G00-99-2247765 2 13.089115 test
4 Q0 G00-74-4030396 3 11.229551 test
4 Q0 G00-72-3133704 4 10.868666 test
4 Q0 G00-54-0905235 5 10.868666 test
4 Q0 G00-84-3349019 6 10.791791 test
4 Q0 G00-85-1525415 7 10.764198 test
4 Q0 G00-21-2229498 8 10.090927 test
4 Q0 G00-69-0005329 9 9.733779 test
4 Q0 G00-19-2625954 10 9.646312 test
4 Q0 G00-05-1218739 11 9.450093 test
4 Q0 G00-28-2286602 12 9.355862 test
4 Q0 G00-00-1958915 13 8.999657 test
4 Q0 G00-46-1439567 14 8.469916 test
4 Q0 G00-16-0059045 15 7.964833 test
4 Q0 G00-67-0152545 16 7.901216 test
4 Q0 G00-10-2871392 17 7.762551 test
4 Q0 G00-02-1720397 18 7.568047 test
4 Q0 G00-98-4068688 19 7.353383 test
4 Q0 G00-44-1482914 20 7.165278 test
4 Q0 G00-71-3454228 21 6.950125 t

**a) A clear list of all final modifications made.**

1. RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter()|      StripFilter() | StemFilter() | CustomFilter(LancasterStemmer().stem) | CustomFilter(WordNetLemmatizer().lemmatize) | CustomFilter(WordNetLemmatizer().lemmatize,'v') | CustomFilter(WordNetLemmatizer().lemmatize, 'r').
Added all these filters to tokenize, lowercase, split words if special characters are present, remove stop words, stem words, lemmatize words ( nouns, verbs and adverbs )

2. QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup.factory(.5))
The query parser was used with the OrGroup and a scaling factor (the argument passed to `factory`).

3. SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.5, K1=5))
Scoring was done with the BM25F method with its tuning parameters B and K1

**b) Why each modification was made – how did it help?**
1. The filters were added to tokenize, lowercase, split words if special characters are present, remove stop words, stem words and lemmatize words (nouns, verbs and adverbs). As we saw in the documents, a lot of words were present in different forms and in uppercase, so all these filters were added to reduce them to their most common form.

2. I used the OrGroup instead of the AND group (the default). This was done so that a document is returned even if only a single term of the query is present in it. The AND group, on the other hand, requires all the words in the query to be present to return a document. That is not always ideal, since sometimes the exact words of the query are not all present and the system treats the document as irrelevant, even though it might be a very relevant document. Hence by using the OrGroup we can return more documents. This means that there will be a lot of false positives, but at least the relevant documents won't be discarded. Overall it showed a better MAP than the AND group. I also used the factory scaling factor. I used a loop to determine the best value from a group of values, and the best value came out to be .5

3. I used the BM25F scoring algorithm. I tried to find the best B and K1 values by looping over a grid of values and looking at the resulting MAP. Here the best values came out to be .5 for B and 5 for K1. Basically, K1 tells us how quickly the score saturates with respect to the term frequency, and B tells us how strongly the score is normalized by the length of the field the term occurs in. For example, with a high B a term occurring in a short document counts for more than the same term occurring in a long document. Here I varied the values from 0.1 to 0.9 for B and from 0.5 to 7.5 for K1. I found .5 and 5 to be the best values.
Overall, the parameters of this scoring algorithm worked as hyperparameters, and by tuning them I was able to increase the MAP

**c) THE FINAL MAP IS 0.4108**
