<a href="https://colab.research.google.com/github/aldolipani/SearchEngine/blob/master/SearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import codecs
import glob
import math
import os
import re
import string

import nltk
import numpy as np
from bs4 import BeautifulSoup, Tag
from google.colab import drive
from tqdm import tqdm_notebook

# Fetch the Punkt models; nltk.word_tokenize is used later when indexing
# documents and parsing topics.
nltk.download('punkt')
# Mount Google Drive so the collection and the cached indexes persist
# across Colab sessions.
drive.mount('/content/gdrive')
# Base directory for the test collection and the cached .npy files.
# NOTE(review): this is relative while the mount point above is absolute, so
# it presumably relies on the working directory being /content — confirm.
path = "./gdrive/My Drive/"

In [0]:
# Clone the test collection into Drive, but only when it is not there yet.
if not os.path.exists("./gdrive/My Drive/test-collection"):
  # NOTE(review): this branch only runs when the path does not exist, so the
  # rm below looks like a no-op — confirm before removing it.
  !rm -fr ./gdrive/My\ Drive/test-collection
  # NOTE(review): "username:password" must be replaced to clone; never commit
  # real credentials in this URL (read them from the environment or a prompt).
  !git clone https://username:password@gitlab.com/aldolipani/adhoc8.git
  !mv adhoc8 ./gdrive/My\ Drive/test-collection

# Parse Collection

In [0]:
%%time
# Maps document id (the text of <docno>) to the document's text.
collection = {}

collection_cache = path + "collection.npy"

if not os.path.exists(collection_cache):
  collection_files = list(glob.iglob(path + "test-collection/Collection/**", recursive = True))
  for file_name in tqdm_notebook(collection_files):
    if not os.path.isfile(file_name):
      continue
    # Wrap the file content in a synthetic <DOCS> root so that every document
    # in the file hangs off one element. The context manager closes the file.
    with codecs.open(file_name, "r", "iso-8859-1") as collection_file:
      text = "<DOCS>" + collection_file.read() + "</DOCS>"
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one explicitly if results must be reproducible.
    parsed_text = BeautifulSoup(text)
    for doc in parsed_text.docs:
      # The children of <DOCS> also include the whitespace between documents.
      # Those are strings, not tags; treating them as documents used to add a
      # bogus `None -> ""` entry to the collection.
      if not isinstance(doc, Tag):
        continue
      doc_id = None
      doc_text = []
      for field in doc:
        if isinstance(field, Tag):
          if field.name == "docno":
            doc_id = field.text.strip()
          else:
            doc_text.append(field.text.strip())
      # A document without an id cannot be referenced by a run, so skip it
      # instead of storing it under the key None.
      if doc_id is None:
        continue
      collection[doc_id] = "\n".join(doc_text)

  np.save(collection_cache, collection)
else:
  # np.save stores the dict inside a 0-d object array; .item() unwraps it.
  # (.all() only returned the dict by accident of how object-array reductions
  # used to work and yields a plain bool on NumPy >= 2.0.)
  # The cache is a pickle: only load files this notebook wrote itself.
  collection = np.load(collection_cache, allow_pickle=True).item()

print(str(len(collection)) + " documents read!")

# Create Direct Index

In [0]:
%%time
# Maps document id to its bag of words: {stemmed term: term frequency}.
direct_index = {}

ps = nltk.stem.PorterStemmer()

def preprocess(token):
    """Normalise a token: lower-case it, then apply the Porter stemmer."""
    return ps.stem(token.lower())

if not os.path.exists(path + "direct_index.npy"):
  for doc_id in tqdm_notebook(collection):
    bag_of_words = {}
    for token in nltk.word_tokenize(collection[doc_id]):
      # Substring test against the punctuation string: drops tokens that are
      # a single punctuation character.
      if token in string.punctuation:
        continue
      term = preprocess(token)
      bag_of_words[term] = bag_of_words.get(term, 0) + 1
    direct_index[doc_id] = bag_of_words

  np.save(path + "direct_index.npy", direct_index)
else:
  # np.save stores the dict inside a 0-d object array; .item() unwraps it.
  # (.all() only returned the dict by accident of how object-array reductions
  # used to work and yields a plain bool on NumPy >= 2.0.)
  direct_index = np.load(path + "direct_index.npy", allow_pickle=True).item()

print(str(len(direct_index)) + " documents in the direct index!")

In [0]:
%%time
# Maps each term to its posting list: [{"docId": ..., "tf": ...}, ...].
inverted_index = {}

if not os.path.exists(path + "inverted_index.npy"):
  for doc_id in tqdm_notebook(direct_index):
    for term, tf in direct_index[doc_id].items():
      # setdefault creates the posting list the first time a term is seen.
      inverted_index.setdefault(term, []).append({"docId":doc_id, "tf":tf})

  np.save(path + "inverted_index.npy", inverted_index)
else:
  # np.save stores the dict inside a 0-d object array; .item() unwraps it.
  # (.all() only returned the dict by accident of how object-array reductions
  # used to work and yields a plain bool on NumPy >= 2.0.)
  inverted_index = np.load(path + "inverted_index.npy", allow_pickle=True).item()

print(str(len(inverted_index)) + " words in the inverted index!")

# Read Topics

In [0]:
#%%time
# Maps topic id to the list of preprocessed terms of the topic title.
queries = {}

# Built once and shared by every call instead of once per token.
query_stemmer = nltk.stem.PorterStemmer()

def preprocess(token):
    """Normalise a token: lower-case it, then apply the Porter stemmer."""
    return query_stemmer.stem(token.lower())

# Raw strings, so that \d reaches the regex engine instead of being treated
# as an (invalid) string escape.
reTopicId = re.compile(r"<num> Number: (\d+)")
reTitle = re.compile(r"<title> (.+)")

# NOTE(review): absolute path on the author's machine; it will not exist on
# Colab. Point this at the topics file of the mounted test collection.
topics_dir = '/Users/aldolipani/Dropbox/Dropbox/Shared/SOmer/datasets/TREC-8/AdHoc/Topics'

topicId = ""
title = ""
with codecs.open(topics_dir + "/topicsTREC8Adhoc.txt", "r", "iso-8859-1") as topics_file:
    for line in topics_file.readlines():
        mTopicId = reTopicId.match(line)
        mTitle = reTitle.match(line)

        if mTopicId:
            # Remember the id; the title line that follows is filed under it.
            topicId = mTopicId.group(1).strip()
        elif mTitle:
            title = mTitle.group(1).strip()
            queries[topicId] = [
                preprocess(token)
                for token in nltk.word_tokenize(title)
                if token not in string.punctuation
            ]

print(str(len(queries)) + " queries read!")

In [0]:
# Sum over all documents of the size of their bag of words, i.e. of the
# number of distinct terms per document.
lc = sum(len(bag) for bag in direct_index.values())

# Number of documents in the direct index.
D = len(direct_index)

# Search

In [0]:
%%time
# Maps topic id to its run: {docId: accumulated score}.
runs = {}

def lm(tf, df, term, docId):
    """Language-model scoring function; not implemented.

    The body is `pass`, so calling it returns None.
    """
    pass

#TF-IDF
def tfidf(tf, df, term, docId):
    """Score one posting as term frequency times inverse document frequency.

    tf: frequency of the term in the document.
    df: number of documents containing the term.
    term, docId: unused; kept so all scoring functions share one signature.
    Reads the module-level document count D and requires `math` to be
    imported by the notebook's import cell.
    """
    idf = math.log(D/df)
    return tf * idf

#BM25
def bm25(tf, df, term, docId):
    """Score one posting with a BM25-style saturated tf times idf.

    tf: frequency of the term in the document.
    df: number of documents containing the term.
    term: unused; kept so all scoring functions share one signature.
    docId: key into direct_index, used to look up the document's length.
    Reads the module-level D (document count) and lc (summed document
    lengths), both measured over direct_index.
    """
    # Document length, measured as the number of distinct terms; this is the
    # same unit that lc accumulates.
    ld = len(direct_index[docId])
    # Length normalisation is relative to the AVERAGE document length, not to
    # the number of documents.
    avg_ld = lc / D
    return tf/(tf + 1.2*(0.7 + 0.3*ld/avg_ld)) * math.log(D/df)

# Scoring function applied to every posting. It was never bound, so the loop
# raised NameError on a fresh kernel.
# NOTE(review): bm25 is an assumed default; switch to tfidf to compare models.
score = bm25

for topic_id in queries:
    run = {}
    for term in queries[topic_id]:
        if term not in inverted_index:
            continue
        postings = inverted_index[term]
        # Document frequency is the same for every posting of the term.
        df = len(postings)
        for post in postings:
            docId = post['docId']
            run[docId] = run.get(docId, 0) + score(post['tf'], df, term, docId)

    # Key by the loop variable. The stale `topicId` left over from topic
    # parsing made every run overwrite the same single entry.
    runs[topic_id] = run

print(str(len(runs)) + " runs retrieved!")

# Evaluate

In [0]:
# Write the runs in TREC format: "<topic> Q0 <doc> <rank> <score> <tag>".
# Mode 'w', not 'a': appending would duplicate every line each time the cell
# is re-run.
with open('test.txt', 'w') as the_file:
    for topicId in queries:
        run = runs[topicId]
        # Keep the 1000 highest-scoring documents, best first.
        ranked_docs = sorted(run, key=run.get, reverse=True)[:1000]
        for n, doc in enumerate(ranked_docs, start=1):
            the_file.write(str(topicId) + " Q0 " + doc + " " + str(n) + " " + str(run[doc]) + " test\n")

In [0]:
%%time
# Evaluate test.txt against the TREC-8 ad hoc qrels; -q adds per-topic
# figures and grep keeps only the lines that start with "map".
!trec_eval -q ./gdrive/My\ Drive/test-collection/QRels/qrels.trec8.adhoc.parts1-5 test.txt | grep ^map