In [39]:
import pickle
import nltk
# nltk.download('stopwords')
import codecs
import re
import sys
from nltk.tokenize import word_tokenize               # <=== tokenizer 
from nltk.stem.porter import PorterStemmer            # <=== stemmer 
from nltk.corpus import stopwords as nltk_stopwords   # <=== stopwords
# English stop words used to filter tokens. On a fresh NLTK install the
# 'stopwords' corpus is not on disk yet and the lookup raises LookupError,
# so download it once and retry instead of failing in the first cell.
try:
    STOPWORDS = set(nltk_stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(nltk_stopwords.words('english'))
# print(re.__version__)


# 1. Load data set

In [62]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes (the bare open(...).read() left it open).
with codecs.open('data/SearchEngine/wiki-600', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# An article starts where a line begins with " = " followed by a character
# other than "="; the "[^=]" part skips deeper headings such as " = = ...".
starts = [match.start() for match in re.finditer('\n = [^=]', text)]

# Each article runs from its own start offset to the next article's start,
# or to the end of the text for the last one.
articles = list()
for ii, start in enumerate(starts):
    end = starts[ii + 1] if ii + 1 < len(starts) else len(text)
    articles.append(text[start:end])

# Preview of each article: its first 200 characters with every run of
# whitespace (including newlines) collapsed to a single space.
snippets = [' '.join(article[:200].split()) for article in articles]

# Show the first two previews as a sanity check.
for snippet in snippets[:2]:
    print(snippet)

= Valkyria Chronicles III = Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside
= Tower Building of the Little Rock Arsenal = The Tower Building of the Little Rock Arsenal , also known as U.S. Arsenal Building , is a building located in MacArthur Park in downtown Little Roc


# 2. Calculating term frequencies

In [63]:
import nltk
# Fetch the NLTK resources this notebook relies on (a no-op when up to date).
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt model behind word_tokenize from the
# 'punkt_tab' package rather than 'punkt'; fetch it as well so tokenization
# works regardless of the installed NLTK version.
nltk.download('punkt_tab')
STOPWORDS = set(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/balaji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/balaji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Tokenize the articles and calculate term frequencies

In [64]:
import string
import sys
from collections import Counter, defaultdict

from nltk.tokenize import word_tokenize               # <=== tokenizer 
from nltk.stem.porter import PorterStemmer            # <=== stemmer 
from nltk.corpus import stopwords as nltk_stopwords   # <=== stopwords

# Translation table for str.translate(): maps the code point of every
# character in string.punctuation to the code point of a space.
TABLE = {ord(mark): ord(' ') for mark in string.punctuation}
# English stop words, kept in a set for fast membership tests.
STOPWORDS = set(nltk_stopwords.words('english'))
# Single Porter stemmer instance shared by all tokenization calls.
stemmer = PorterStemmer()


def get_tokens(article):
    """Turn raw text into a list of normalized, stemmed search terms.

    The text is converted to ``str``, punctuation is replaced by spaces via
    ``TABLE``, and the result is split with ``word_tokenize``. Every token is
    lower-cased (so matching is case-insensitive), dropped if it appears in
    ``STOPWORDS``, and otherwise reduced to its stem.

    Returns the stems in their original order, duplicates included.
    """
    cleaned = str(article).translate(TABLE)  # punctuation -> spaces
    stems = []
    for word in word_tokenize(cleaned):
        lowered = word.lower()
        if lowered in STOPWORDS:
            continue  # stop words carry no search signal
        stems.append(stemmer.stem(lowered))
    return stems


# Inverted index: term -> {article id -> occurrences of the term in that article}.
term_frequency = defaultdict(dict)

def index(id, article):
    """Record in ``term_frequency`` how often each term occurs in ``article``.

    id      -- key under which the article's counts are stored.
    article -- text of the article; tokenized with ``get_tokens``.
    """
    counts = Counter(get_tokens(article))
    for term in counts:
        term_frequency[term][id] = counts[term]

# Index every article, using its position in `articles` as the document id.
for ii, article in enumerate(articles):
    # Progress marker every 10th article, flushed so it shows up while the
    # loop is still running.
    if ii and ii % 10 == 0:
        print(ii, end=', ')
        sys.stdout.flush()
    index(ii, article)

# The progress markers are printed without a newline; terminate that line so
# the report below does not get appended to it.
if len(articles) > 10:
    print()

print('term_frequency for "einstein"')
# .get() keeps this a read-only lookup: indexing the defaultdict directly
# would insert an empty 'einstein' entry whenever the term is absent.
print(term_frequency.get('einstein', {}))
print('=' * 200)

10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, term_frequency for "einstein"
{84: 5, 294: 1, 300: 1}


# 3. Saving and loading

In [65]:
import pickle

def picklesave(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle.

    obj      -- any picklable object.
    filename -- path of the file to create or overwrite (opened in binary mode).

    Prints a start and a completion message and returns True on success.
    Errors from ``open`` or ``pickle.dump`` propagate to the caller; the file
    handle is closed in either case.
    """
    print('Saving .. ')
    # The context manager closes the handle even when pickle.dump raises.
    with open(filename, 'wb') as ff:
        pickle.dump(obj, ff)
    print('Done')
    return True

def pickleload(filename):
    """Load and return the object pickled in ``filename``.

    filename -- path of a file previously written with pickle.

    Prints a start and a completion message. Errors from ``open`` or
    ``pickle.load`` propagate to the caller; the file handle is closed in
    either case.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) produced.
    """
    print('Loading .. ')
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as ff:
        obj = pickle.load(ff)
    print('Done')
    return obj

# Persist the snippets and the index together in one file ...
picklesave(obj=[snippets, term_frequency], filename='data-26000.pdata')
# ... and read them straight back, rebinding both names to the loaded copies.
snippets, term_frequency = pickleload(filename='data-26000.pdata')

Saving .. 
Done
Loading .. 
Done


# 4. Search


In [68]:
import math

D = len(snippets)  # number of indexed documents; numerator of the IDF ratio
def search(query, nresults=3):
    """Rank indexed articles against ``query`` and return the best matches.

    The query is tokenized with ``get_tokens``. For every query term, each
    article containing it gains ``tf * log(D / df)``, where ``tf`` is the
    term's count in that article and ``df`` is the number of articles that
    contain the term. A term that occurs several times in the query
    contributes once per occurrence.

    query    -- free-text search string.
    nresults -- maximum number of article ids to return (default 3).

    Returns a list of article ids ordered by descending score; the list is
    empty when no query term is present in the index.
    """
    scores = defaultdict(float)
    for token in get_tokens(query):
        # Read-only lookup: ``term_frequency`` is a defaultdict, so indexing
        # it with an unseen term would insert an empty entry and make every
        # search for an unknown word grow the index.
        postings = term_frequency.get(token)
        if not postings:
            continue
        # The IDF depends only on the term, so compute it once per term.
        idf = math.log(1. * D / len(postings))
        for article, frequency in postings.items():
            scores[article] += frequency * idf
    return sorted(scores.keys(), reverse=True, key=scores.get)[:nresults]

def display_results(query, results):
    """Print a header for ``query`` followed by the snippet of each result.

    query   -- text shown in the header line.
    results -- iterable of keys into the module-level ``snippets``.
    """
    header = 'You search for: "%s"' % query
    thin_rule = '-' * 100
    thick_rule = '=' * 100

    print(header)
    print(thin_rule)
    for doc_id in results:
        print(snippets[doc_id])
    print(thick_rule)

# Run a handful of example queries and print the top matches for each.
display_results(query='Smart phone', results=search(query='smart phone'))
display_results(query='einstein', results=search(query='einstein'))
display_results(query='physics', results=search(query='physics'))
display_results(query='india', results=search(query='india'))
display_results(query='director', results=search(query='director'))

You search for: "Smart phone"
----------------------------------------------------------------------------------------------------
= In Bloom = For the 2013 film of the same name , see In Bloom ( 2013 film ) " In Bloom " is a song by American rock band Nirvana . Written by frontman Kurt Cobain , the song addresses people
= Bart vs. Australia = " Bart vs. Australia " is the sixteenth episode of the sixth season of The Simpsons . It originally aired on the Fox network in the United States on February 19 , 1995 . I
= Crash Boom Bang ! = For the Roxette album with a similar name , see Crash ! Boom ! Bang ! Crash Boom Bang ! ( known in Japan as Crash Bandicoot Festival ) ( クラッシュ ・ バンディクー フェスティバル , Kurasshu
You search for: "einstein"
----------------------------------------------------------------------------------------------------
= Edward Creutz = Edward Creutz ( January 23 , 1913 – June 27 , 2009 ) was an American physicist who worked on the Manhattan Project at the Metallurgical Laborat