<a href="https://colab.research.google.com/github/aldolipani/SearchEngine/blob/master/SearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Standard library
import codecs
import glob
import math
import os
import re
import string
import sys

# Third-party
import nltk
import numpy as np
from bs4 import BeautifulSoup, Tag
from nltk.corpus import stopwords
from tqdm import tqdm_notebook

# `google.colab` is only importable inside a Colab runtime. Guard the import so
# the notebook can also run locally (the `else` branch below would otherwise be
# unreachable because the import itself would raise).
try:
  from google.colab import drive
except ImportError:
  drive = None

# Resources required by nltk.word_tokenize and stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

# True when the google.colab package has been loaded into this interpreter.
IN_COLAB = 'google.colab' in sys.modules

# Root directory for the test collection and the cached indices. It keeps its
# trailing slash because other cells build file names by string concatenation.
if IN_COLAB:
  drive.mount('/content/gdrive')
  path = "./gdrive/My Drive/SearchEngine/"
else:
  path = "./SearchEngine/"

In [0]:
# Show what is currently stored under `path`.
!ls "$path"

# Download Test Collection
Note: replace `username:password` in the clone URL below with your own GitLab username and password in order to download the test collection. Remove them again before sharing or committing the notebook.

In [0]:
# Fetch the test collection only when it is not already present under `path`.
# NOTE: `username:password` in the clone URL is a placeholder and must be
# replaced with real GitLab credentials. The URL is stored in plain text, so do
# not save or commit the notebook with real credentials in it.
if not os.path.exists(path + "test-collection"):
  # Defensive clean-up of the target location before moving the clone there.
  !rm -fr "$path"/test-collection
  # Clone into the current working directory ...
  !git clone https://username:password@gitlab.com/aldolipani/adhoc8.git
  # ... then move the checkout to its final location under `path`.
  !mv adhoc8 "$path"/test-collection

In [0]:
# Top-level layout of the downloaded test collection.
!ls "$path"/test-collection

In [0]:
# Contents of the Collection directory (parsed in the "Parse Collection" section).
!ls "$path"/test-collection/Collection

In [0]:
# Contents of the Topics directory (read in the "Read Topics" section).
!ls "$path"/test-collection/Topics

In [0]:
# Contents of the QRels directory (passed to trec_eval in the "Evaluate" section).
!ls "$path"/test-collection/QRels

# Parse Collection

In [0]:
%%time
# Maps document id (the text of the <docno> field) -> document text, i.e. the
# text of every other field joined by newlines.
collection = {}

if not os.path.exists(path + "collection.npy"):
  # Materialise the listing so the progress bar knows the total number of entries.
  file_names = list(glob.iglob(path + "test-collection/Collection/**", recursive = True))
  for file_name in tqdm_notebook(file_names):
    if not os.path.isfile(file_name):
      continue  # the recursive glob also yields directories

    # Wrap the file in a single <DOCS> root so every document hangs off one tag.
    # The context manager closes the file handle as soon as it has been read.
    with codecs.open(file_name, "r", "iso-8859-1") as collection_file:
      raw_text = "<DOCS>" + collection_file.read() + "</DOCS>"
    parsed_text = BeautifulSoup(raw_text)

    for doc in parsed_text.docs:
      # Children of <DOCS> also include the whitespace strings between
      # documents; only real tags are documents.
      if not isinstance(doc, Tag):
        continue

      doc_id = None
      fields = []
      for field in doc:
        if isinstance(field, Tag):
          if field.name == "docno":
            doc_id = field.text.strip()
          else:
            fields.append(field.text.strip())

      # A document without an id cannot be retrieved or evaluated; do not
      # store it under the key None.
      if doc_id is None:
        continue
      collection[doc_id] = "\n".join(fields)

  np.save(path + "collection.npy", collection)
else:
  print("loading", path + "collection.npy")
  # np.save stored the dict inside a 0-d object array; .item() unwraps it.
  # allow_pickle=True is acceptable here only because this notebook wrote the
  # file itself -- never load an untrusted .npy file this way.
  collection = np.load(path + "collection.npy", allow_pickle=True).item()

print(str(len(collection)) + " documents read!")

# Create Direct Index

In [0]:
%%time
# Maps document id -> bag of words ({stemmed term: term frequency}).
direct_index = {}

ps = nltk.stem.PorterStemmer()
preprocess_cache = {}
def preprocess(token):
  """Return the lower-cased Porter stem of `token`.

  Results are memoised in `preprocess_cache` so each distinct surface form is
  stemmed only once.
  """
  if token not in preprocess_cache:
    preprocess_cache[token] = ps.stem(token.lower())
  return preprocess_cache[token]

if not os.path.exists(path + "direct_index.npy"):
  stop_words = set(stopwords.words('english'))
  for doc in tqdm_notebook(collection):
    bag_of_words = {}
    for token in nltk.word_tokenize(collection[doc]):
      # NOTE(review): the stop-word test runs on the raw token, before
      # lower-casing, so capitalised stop words (e.g. "The") are kept.
      # Left unchanged on purpose to keep the index reproducible -- confirm
      # whether this is intended.
      if token in string.punctuation or token in stop_words:
        continue
      term = preprocess(token)
      bag_of_words[term] = bag_of_words.get(term, 0) + 1
    direct_index[doc] = bag_of_words

  np.save(path + "direct_index.npy", direct_index)
else:
  print("loading", path + "direct_index.npy")
  # np.save stored the dict inside a 0-d object array; .item() unwraps it.
  direct_index = np.load(path + "direct_index.npy", allow_pickle=True).item()

print(str(len(direct_index)) + " documents in the direct index!")

In [0]:
# No later cell reads the raw document text, so release it here.
del collection # free the memory used by the object collection

# Create Inverted Index

In [0]:
%%time
# Maps term -> postings ({document id: term frequency in that document}).
inverted_index = {}

if not os.path.exists(path + "inverted_index.npy"):
  for doc in tqdm_notebook(direct_index):
    for term, tf in direct_index[doc].items():
      inverted_index.setdefault(term, {})[doc] = tf
    if IN_COLAB:
      # Drop each bag of words once it has been transposed, to keep peak
      # memory down on Colab. Keys are untouched, so iterating stays valid.
      direct_index[doc] = None

  np.save(path + "inverted_index.npy", inverted_index)
else:
  print("loading", path + "inverted_index.npy")
  # np.save stored the dict inside a 0-d object array; .item() unwraps it.
  inverted_index = np.load(path + "inverted_index.npy", allow_pickle=True).item()

print(str(len(inverted_index)) + " words in the inverted index!")

In [0]:
# Searching only needs the inverted index; the direct index is not read again.
del direct_index # free the memory used by the direct index

# Read Topics

In [0]:
#%%time
# Maps topic number (kept as a string) -> list of stemmed title terms.
queries = {}

# Query terms are normalised with the same cached `preprocess` that built the
# direct index, so query and document terms are guaranteed to match. It is
# deliberately not redefined here: a second definition would shadow the cached
# version and build a new stemmer for every token.

# Raw strings, so that `\d` reaches the regex engine instead of being treated
# as an (invalid) string escape.
reTopic = re.compile(r"<num> Number: (\d+)")
reTitle = re.compile(r"<title> (.+)")

topic = ""
title = ""
with codecs.open(path + "test-collection/Topics/topicsTREC8Adhoc.txt", "r", "iso-8859-1") as topics_file:
    for line in topics_file.readlines():
        mTopic = reTopic.match(line)
        mTitle = reTitle.match(line)

        if mTopic:
            # Remember the current topic; its title follows on a later line.
            topic = mTopic.group(1).strip()
        elif mTitle:
            title = mTitle.group(1).strip()
            queries[topic] = [
                preprocess(token)
                for token in nltk.word_tokenize(title)
                if token not in string.punctuation
            ]

print(str(len(queries)) + " queries read!")

# Search

In [0]:
%%time

# Document lengths: for every document, the sum of the term frequencies
# recorded for it across all postings lists.
lds = {}
for term in tqdm_notebook(inverted_index):
  postings = inverted_index[term]
  for doc, freq in postings.items():
    lds[doc] = lds.get(doc, 0) + freq

# Number of documents that appear in at least one postings list.
D = len(lds)

In [0]:
%%time
# Maps topic -> {document id: retrieval score}.
runs = {}

# Average document length, needed by BM25's length normalisation.
avg_ld = sum(lds.values()) / D if D else 0.0

def lm(tf, df, term, doc):
    """Language-model scorer (placeholder, not implemented).

    Raises:
        NotImplementedError: always, so that selecting this scorer fails with a
            clear message instead of propagating None into the score sums.
    """
    raise NotImplementedError("lm scoring is not implemented yet")

#TF-IDF
def tfidf(tf, df, term, doc):
    """Return tf * log(D / df) for one (term, document) pair.

    Args:
        tf: frequency of the term in the document.
        df: number of documents containing the term.
        term, doc: unused; present so all scorers share one signature.
    """
    return tf * math.log(D/df)

#BM25
def bm25(tf, df, term, doc, k1=1.2, b=0.3):
    """Return the BM25 score of one (term, document) pair.

    Args:
        tf: frequency of the term in the document.
        df: number of documents containing the term.
        term: unused; present so all scorers share one signature.
        doc: document id, used to look up the document length in `lds`.
        k1: term-frequency saturation parameter.
        b: strength of the document-length normalisation (0 disables it).
    """
    ld = lds[doc]
    # Length is normalised by the *average document length*; dividing by the
    # number of documents would make the normalisation term vanish.
    return tf/(tf + k1*((1 - b) + b*ld/avg_ld)) * math.log(D/df)

# Scoring function applied to every (term, document) pair below.
score = bm25

for topic in queries:
    run = {}
    for term in queries[topic]:
        if term not in inverted_index:
            continue  # a term that occurs in no document contributes nothing
        postings = inverted_index[term]
        df = len(postings)  # constant for the term, so computed once
        for doc, tf in postings.items():
            # Always go through `score`, so switching the model above affects
            # every contribution, including the first one for a document.
            run[doc] = run.get(doc, 0) + score(tf, df, term, doc)

    runs[topic] = run

print(str(len(runs)) + " runs retrieved!")

# Evaluate

In [0]:
# Download and compile trec_eval into ./trec_eval.
# NOTE: `git clone` reports an error if ./trec_eval already exists (e.g. when
# this cell is re-run); `make` still runs in the existing checkout.
!git clone https://github.com/usnistgov/trec_eval.git
!(cd trec_eval && make)

In [0]:
# Write the 1000 best-scoring documents of every topic to run.txt, one line per
# document: "<topic> Q0 <doc id> <rank> <score> run".
# Mode 'w' truncates any previous run file, so no separate `rm` is needed and
# the cell does not print an error when the file does not exist yet.
with open('run.txt', 'w') as the_file:
    for topic in queries:
        run = runs[topic]
        ranking = sorted(run, key=run.get, reverse=True)[:1000]
        for n, doc in enumerate(ranking, start=1):
            the_file.write(str(topic) + " Q0 " + doc + " " + str(n) + " " + str(run[doc]) + " run\n")

In [0]:
# Evaluate run.txt against the relevance judgements and keep only the output
# lines that start with "map".
!./trec_eval/trec_eval -q "$path"/test-collection/QRels/qrels.trec8.adhoc.parts1-5 run.txt | grep ^map