## 1. Buiding Whoosh Schema

In [14]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(filename=ID(stored=True),
                line_num=ID(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer())
               )

## 2. Loading Data

In [15]:
import os, os.path
from whoosh import index

# Note, this clears the existing index in the directory
ix = index.create_in("hp", schema)

# Get a writer form the created index in 
writer = ix.writer()

In [16]:
def loadFile(writer, fname):
    '''
    Read file contents, load into database.
    '''
    line_no = 1
    with open(fname, 'r') as infile:
        # TODO: create indexes for each line in the input file
        for line in infile:
            line=line.rstrip('\n')
            line_no+=1
            writer.add_document(filename=fname,line_num=str(line_no),content=line)
            
        print("Indexed: ", fname)


def processFolder(writer,folder):
    print('Processing folder: ',folder)
    for root, dirs, files in os.walk(folder):
        print("root = ", root)
        # Process Files
        for file in files:
            if file.endswith(".txt"):
                filename = os.path.join(root, file)
                loadFile(writer,filename)
            else:
                print("Unhandled File")
        # Recurse into subfolders
        for d in dirs:
            print("recursing into ",d)
            processFolder(writer,d)

In [17]:
processFolder(writer,"hp")
writer.commit() # save changes

Processing folder:  hp
root =  hp
Indexed:  hp/CHAPTER 1.txt
Indexed:  hp/CHAPTER 2.txt
Indexed:  hp/CHAPTER 3.txt
Indexed:  hp/CHAPTER 4.txt
Indexed:  hp/CHAPTER 5.txt
Indexed:  hp/CHAPTER 6.txt
Indexed:  hp/CHAPTER 7.txt
Indexed:  hp/CHAPTER 8.txt
Unhandled File
Unhandled File
Unhandled File
Unhandled File
Unhandled File
recursing into  MAIN.tmp
Processing folder:  MAIN.tmp
root =  hp/MAIN.tmp
Unhandled File


## 3. Executing Queries

In [18]:
from whoosh.qparser import QueryParser
qp=QueryParser("content",schema=ix.schema)
q=qp.parse(u"Harry")
with ix.searcher() as s:
    results=s.search(q)
    for hit in results:
        print(hit["filename"])
# Find the indexes of lines where the string 'Harry' is appearing. 


hp/CHAPTER 6.txt
hp/CHAPTER 2.txt
hp/CHAPTER 1.txt
hp/CHAPTER 2.txt
hp/CHAPTER 3.txt
hp/CHAPTER 3.txt
hp/CHAPTER 5.txt
hp/CHAPTER 6.txt
hp/CHAPTER 6.txt
hp/CHAPTER 6.txt


In [19]:
from whoosh.qparser import QueryParser
from whoosh import scoring
qp=QueryParser("content",schema=ix.schema)
q=qp.parse(u"Harry")
with ix.searcher(weighting=scoring.TF_IDF()) as s:
    results=s.search(q)
    for hit in results:
        print(hit["filename"])
    
# Find the indexes of lines where the string 'Harry' using TF_IDF as the scoring mechanism. 

hp/CHAPTER 2.txt
hp/CHAPTER 2.txt
hp/CHAPTER 2.txt
hp/CHAPTER 5.txt
hp/CHAPTER 5.txt
hp/CHAPTER 5.txt
hp/CHAPTER 5.txt
hp/CHAPTER 6.txt
hp/CHAPTER 6.txt
hp/CHAPTER 6.txt


In [35]:
from whoosh.query import *

with ix.searcher(weighting=scoring.TF_IDF()) as s:
    qp = QueryParser("content", ix.schema)
    user_q = qp.parse(u"Harry")

    # Only show documents in the "rendering" chapter
    allow_q = Term("filename", "hp/CHAPTER 6.txt")
    # Don't show any documents where the "content" field contains "hate"
    restrict_q = Term("content","hate")

    results = s.search(user_q, mask=restrict_q, filter=allow_q)      #   
        for hit in results:
            print(hit["filename"], hit["content"])

# Use a filter to list the indexes in chapter 6 corresponding to the search string 'Harry' using TF_IDF as the scoring mechanism. 

IndentationError: unexpected indent (<ipython-input-35-666f3f4f06f7>, line 13)