# Search Engine Implementation


An implementation of a search engine to rank matching documents according to their relevance to a given search query. The ranking is calculated by using simple term frequency and document frequency measures.

Let's first import the libraries we will be using.

In [3]:
import os
import math
import numpy as np
import matplotlib.pyplot as plt

Make a list of all `.txt` files in the specified directory.

In [7]:

# Directory holding the corpus. os.path.join with a trailing '' appends the
# host OS separator, so this is still 'ACL\\' on Windows but also resolves
# correctly on Linux/macOS instead of hard-coding a backslash.
path = os.path.join('ACL', '')

# Collect the path of every .txt file in the corpus directory.
# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the document list stable between runs and machines.
txtFiles = []
for fileName in sorted(os.listdir(path)):
    if fileName.endswith('.txt'):
        txtFiles.append(path+fileName)
print(len(txtFiles),txtFiles[:5])

154 ['ACL\\A00-2036.pdf.txt', 'ACL\\A00-2037.pdf.txt', 'ACL\\A00-2038.pdf.txt', 'ACL\\A00-2039.pdf.txt', 'ACL\\A00-2040.pdf.txt']


Defining a function to take a document as an input and return a list of words in the input document

In [8]:
## function to read text and return list of words 
def wordList(doc):
    """Split a document string into a list of whitespace-separated tokens.

    Args:
        doc (str): Raw text of the document.

    Returns:
        list[str]: The tokens in order of appearance. Punctuation is left
        attached to the tokens and case is preserved. An empty or
        whitespace-only document yields an empty list.
    """
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines). Splitting on a single ' ' left words on either side of a line
    # break glued together ('one\ntwo') and produced '' tokens for repeated
    # spaces.
    return doc.split()

A sample text to try the functions on.

In [29]:
# Sample paragraph used to exercise the helper functions.
# The three pieces are concatenated with no separator between them, so the
# words at the joins run together ("go" + "where", "The" + "spacecraft").
sampleText = (
    "On July 16, 1969, the Apollo 11 spacecraft launched from the Kennedy Space Center in Florida. Its mission was to go"
    "where no human being had gone before—the moon! The crew consisted of Neil Armstrong, Michael Collins, and Buzz Aldrin. The"
    "spacecraft landed on the moon in the Sea of Tranquility, a basaltic flood plain, on 20--July 1969"
)

In [30]:
# Tokenize the sample text and print the resulting list of tokens.
print(wordList(sampleText))

['On', 'July', '16,', '1969,', 'the', 'Apollo', '11', 'spacecraft', 'launched', 'from', 'the', 'Kennedy', 'Space', 'Center', 'in', 'Florida.', 'Its', 'mission', 'was', 'to', 'gowhere', 'no', 'human', 'being', 'had', 'gone', 'before—the', 'moon!', 'The', 'crew', 'consisted', 'of', 'Neil', 'Armstrong,', 'Michael', 'Collins,', 'and', 'Buzz', 'Aldrin.', 'Thespacecraft', 'landed', 'on', 'the', 'moon', 'in', 'the', 'Sea', 'of', 'Tranquility,', 'a', 'basaltic', 'flood', 'plain,', 'on', '20--July', '1969']


In [31]:
###  function to remove punctuation marks from words
# import string.maketrans as textfilter
from string import punctuation as puncs
def removePuncs(wordList):
    """Strip ASCII punctuation characters from every word.

    Args:
        wordList (iterable of str): Tokens to clean.

    Returns:
        list[str]: One cleaned string per input token, in the same order.
        Only the characters in ``string.punctuation`` are removed, so
        non-ASCII marks such as the em dash are kept. A token made up
        entirely of punctuation becomes ``''`` and is still returned.
    """
    # Build the deletion table once per call rather than once per word; the
    # three-argument form maps every character of `puncs` to None.
    deletion_table = str.maketrans('', '', puncs)
    return [w.translate(deletion_table) for w in wordList]

In [32]:
# Tokenize the sample text, then strip punctuation from each token; the bare
# expression is displayed as the cell's output.
removePuncs(wordList(sampleText))

['On',
 'July',
 '16',
 '1969',
 'the',
 'Apollo',
 '11',
 'spacecraft',
 'launched',
 'from',
 'the',
 'Kennedy',
 'Space',
 'Center',
 'in',
 'Florida',
 'Its',
 'mission',
 'was',
 'to',
 'gowhere',
 'no',
 'human',
 'being',
 'had',
 'gone',
 'before—the',
 'moon',
 'The',
 'crew',
 'consisted',
 'of',
 'Neil',
 'Armstrong',
 'Michael',
 'Collins',
 'and',
 'Buzz',
 'Aldrin',
 'Thespacecraft',
 'landed',
 'on',
 'the',
 'moon',
 'in',
 'the',
 'Sea',
 'of',
 'Tranquility',
 'a',
 'basaltic',
 'flood',
 'plain',
 'on',
 '20July',
 '1969']

In [33]:
### function to calculate term frequency in the doc
def termFrequencyInDoc(wordList):
    """Count how many times each term occurs in a list of words.

    Args:
        wordList (iterable of str): Terms of a single document.

    Returns:
        dict: Maps each distinct term to its number of occurrences, with keys
        in order of first appearance. Terms are compared exactly, so 'The'
        and 'the' are counted separately.
    """
    counts = {}
    for term in wordList:
        # An unseen term starts from 0, so its first occurrence stores 1.
        counts[term] = counts.get(term, 0) + 1
    return counts

In [34]:
# Full pipeline on the sample text: tokenize, strip punctuation, then count
# the occurrences of each resulting token.
termFrequencyInDoc(removePuncs(wordList(sampleText)))

{'On': 1,
 'July': 1,
 '16': 1,
 '1969': 2,
 'the': 4,
 'Apollo': 1,
 '11': 1,
 'spacecraft': 1,
 'launched': 1,
 'from': 1,
 'Kennedy': 1,
 'Space': 1,
 'Center': 1,
 'in': 2,
 'Florida': 1,
 'Its': 1,
 'mission': 1,
 'was': 1,
 'to': 1,
 'gowhere': 1,
 'no': 1,
 'human': 1,
 'being': 1,
 'had': 1,
 'gone': 1,
 'before—the': 1,
 'moon': 2,
 'The': 1,
 'crew': 1,
 'consisted': 1,
 'of': 2,
 'Neil': 1,
 'Armstrong': 1,
 'Michael': 1,
 'Collins': 1,
 'and': 1,
 'Buzz': 1,
 'Aldrin': 1,
 'Thespacecraft': 1,
 'landed': 1,
 'on': 2,
 'Sea': 1,
 'Tranquility': 1,
 'a': 1,
 'basaltic': 1,
 'flood': 1,
 'plain': 1,
 '20July': 1}