# Search function examples with Whoosh

## Open the index

In [1]:
from whoosh import qparser
import whoosh.index as index

from pathlib import Path
# The project root is two directory levels above the current working directory.
pd = Path().resolve().parent.parent
# Location of the generated Whoosh index inside the project tree.
index_dir = "{}/data/generated/index".format(pd)

# Open the existing index; `ix` is reused by every query below.
ix = index.open_dir(index_dir)

## Utility class to print results

In [2]:
class ResultDisplayer:
    """Print search hits as ``id - title (year)`` lines.

    Each hit's ``id`` is looked up in the ``papers`` table of the SQLite
    database at ``<project root>/data/generated/database/database.sqlite``,
    where the project root is two levels above the current working directory.
    """

    def __init__(self, results):
        """Store the hits to display.

        Args:
            results: An iterable of hits; each hit must support ``hit['id']``.
                It is iterated when ``displayResults`` is called.
        """
        self.results = results

    def displayResults(self):
        """Print the results object, a blank line, then one line per hit.

        Hits whose id has no row in ``papers`` are reported instead of
        raising.

        Raises:
            sqlite3.Error: If the database cannot be opened or queried.
            ValueError: If a hit's ``id`` is not an integer value.
        """
        import sqlite3
        from contextlib import closing
        from pathlib import Path

        project_dir = Path().resolve().parent.parent
        database = project_dir / "data" / "generated" / "database" / "database.sqlite"

        # sqlite3's own context manager only commits or rolls back a
        # transaction; closing() makes sure the connection is released too.
        with closing(sqlite3.connect(str(database))) as conn:
            cur = conn.cursor()
            # Print the results held by this instance, not a same-named global.
            print(self.results)
            print()

            for result in self.results:
                # Bind the id as a parameter instead of concatenating it into
                # the SQL text. The original spliced the value in as a numeric
                # literal, so it is converted to int to keep that comparison.
                cur.execute(
                    "SELECT id, year, title FROM papers WHERE id = ?",
                    (int(result['id']),),
                )

                row = cur.fetchone()
                if row is None:
                    print("{} - <no matching row in papers table>".format(result['id']))
                    continue
                print("{} - {} ({})".format(row[0], row[2], row[1]))
    

## Default query using AND

In [3]:
from whoosh.qparser import QueryParser

# Parser over the "paper_text" field with no explicit group: the parsed
# query joins the terms with AND (see the And([...]) in the output below).
text_parser = QueryParser("paper_text", schema=ix.schema)
and_query = text_parser.parse(u"machine learning")

with ix.searcher() as searcher:
    results = searcher.search(and_query)

    # Display inside the `with` block, while the searcher is still open.
    ResultDisplayer(results).displayResults()

<Top 10 Results for And([Term('paper_text', 'machine'), Term('paper_text', 'learning')]) runtime=0.2567590760092073>

918 - Limits on Learning Machine Accuracy Imposed by Data Quality (1994)
917 - Temporal Dynamics of Generalization in Neural Networks (1994)
3456 - Human Active Learning (2008)
1384 - Reinforcement Learning with Hierarchies of Machines (1997)
816 - Optimal Stopping and Effective Machine Complexity in Learning (1993)
1065 - A Unified Learning Scheme: Bayesian-Kullback Ying-Yang Machine (1995)
5872 - Efficient and Robust Automated Machine Learning (2015)
6469 - Dual Learning for Machine Translation (2016)
5541 - Optimal Teaching for Limited-Capacity Human Learners (2014)
2236 - Field-Programmable Learning Arrays (2002)


## Query with OR

In [7]:
from whoosh.qparser import QueryParser

# Join the query terms with OR instead of AND.
# The factory argument is a scaling factor between 0 and 1:
#   near 0 -> weight goes to documents matching any one of the terms
#   near 1 -> weight goes to documents matching all of the terms
# NOTE(review): wording follows the original note; confirm the exact
# scoring semantics against the Whoosh documentation.
or_group = qparser.OrGroup.factory(0.5)

or_parser = QueryParser("paper_text", schema=ix.schema, group=or_group)
or_query = or_parser.parse(u"image processing")

with ix.searcher() as searcher:
    results = searcher.search(or_query)

    # Display inside the `with` block, while the searcher is still open.
    ResultDisplayer(results).displayResults()

<Top 10 Results for Or([Term('paper_text', 'image'), Term('paper_text', 'processing')]) runtime=0.10381832249706946>

2981 - Chained Boosting (2006)
2409 - A Mixed-Signal VLSI for Real-Time Generation of Edge-Based Image Vectors (2003)
2689 - Modeling Nonlinear Dependencies in Natural Images using Mixture of Laplacian Distribution (2004)
2930 - An Analog Visual Pre-Processing Processor Employing Cyclic Line Access in Only-Nearest-Neighbor-Interconnects Architecture (2005)
1540 - General-Purpose Localization of Textured Image Regions (1998)
562 - Recurrent Eye Tracking Network Using a Distributed Representation of Image Motion (1991)
5410 - PEWA: Patch-based Exponentially Weighted Aggregation for image denoising (2014)
4686 - Image Denoising and Inpainting with Deep Neural Networks (2012)
1455 - A General Purpose Image Processing Chip: Orientation Detection (1997)
50 - An Adaptive and Heterodyne Filtering Procedure for the Imaging of Moving Objects (1987)


## Query with multiple index fields

In [5]:
from whoosh.qparser import MultifieldParser

# Search the "title" and "paper_text" fields at once. Each term may match
# in either field (see the Or([...]) pairs in the output below).
searched_fields = ["title", "paper_text"]
multi_parser = MultifieldParser(searched_fields, schema=ix.schema)

multi_query = multi_parser.parse(u"database variable")

with ix.searcher() as searcher:
    results = searcher.search(multi_query)

    # Display inside the `with` block, while the searcher is still open.
    ResultDisplayer(results).displayResults()

<Top 10 Results for And([Or([Term('title', 'database'), Term('paper_text', 'database')]), Or([Term('title', 'variable'), Term('paper_text', 'variable')])]) runtime=0.05991102920800806>

3235 - Sparse Overcomplete Latent Variable Decomposition of Counts Data (2007)
1380 - Estimating Dependency Structure as a Hidden Variable (1997)
564 - Image Segmentation with Networks of Variable Scales (1991)
742 - The Parti-Game Algorithm for Variable Resolution Reinforcement Learning in Multidimensional State-Spaces (1993)
1729 - Topographic Transformation as a Discrete Latent Variable (1999)
1580 - Learning a Continuous Hidden Variable Model for Binary Data (1998)
5653 - A Recurrent Latent Variable Model for Sequential Data (2015)
4352 - Learning Probabilistic Non-Linear Latent Variable Models for Tracking Complex Activities (2011)
4022 - Latent Variable Models for Predicting File Dependencies in Large-Scale Software Development (2010)
6456 - Multi-view Anomaly Detection via Robust Probabilistic Latent Variable Models (2016)

## Query with fuzzy terms

In [20]:
from whoosh.qparser import QueryParser
from whoosh.query import FuzzyTerm

# Enable the "term~N" syntax on a parser over the "paper_text" field.
fuzzy_parser = QueryParser("paper_text", schema=ix.schema)
fuzzy_parser.add_plugin(qparser.FuzzyTermPlugin())

# "~N" becomes the term's maxdist in the parsed query (see the output below):
# "database" with maxdist=0, and the misspelled "variaxle" with maxdist=2.
fuzzy_query = fuzzy_parser.parse(u"database~0 variaxle~2")

with ix.searcher() as searcher:
    results = searcher.search(fuzzy_query)

    # Display inside the `with` block, while the searcher is still open.
    ResultDisplayer(results).displayResults()

<Top 10 Results for And([FuzzyTerm('paper_text', 'database', boost=1.000000, maxdist=0, prefixlength=0), FuzzyTerm('paper_text', 'variaxle', boost=1.000000, maxdist=2, prefixlength=0)]) runtime=2.916193322743311>

5069 - Global Solver and Its Efficient Approximation for Variational Bayesian Low-rank Subspace Clustering (2013)
3362 - Regulator Discovery from Gene Expression Time Series of Malaria Parasites: a Hierachical Approach (2007)
4669 - Probabilistic Low-Rank Subspace Clustering (2012)
6328 - The Generalized Reparameterization Gradient (2016)
6074 - Proximal Deep Structured Models (2016)
5011 - Efficient Algorithm for Privately Releasing Smooth Queries (2013)
4095 - Learning Multiple Tasks with a Sparse Matrix-Normal Penalty (2010)
1710 - Learning to Parse Images (1999)
1389 - Serial Order in Reading Aloud: Connectionist Models and Neighborhood Structure (1997)
6060 - A Probabilistic Programming Approach To Probabilistic Data Analysis (2016)


## Query with result filtering

In [5]:
from whoosh.qparser import QueryParser
from whoosh import query

# Run an OR query, restricted by an allow-filter and an exclusion mask.

with ix.searcher() as searcher:
    # Allow-filter: year is 1994 or 1995 AND the authors field matches
    # the wildcard "*shumeet*".
    year_filter = query.Or([query.Term("year", 1994), query.Term("year", 1995)])
    author_filter = query.Wildcard("authors", "*shumeet*")
    allow_q = year_filter & author_filter

    # Mask: drop any document whose "paper_text" field contains "robots".
    restrict_q = query.Term("paper_text", "robots")

    # The user query itself: "image" OR "processing" in the paper text.
    or_group = qparser.OrGroup.factory(0.5)
    or_parser = QueryParser("paper_text", schema=ix.schema, group=or_group)
    user_q = or_parser.parse("image processing")

    results = searcher.search(user_q, filter=allow_q, mask=restrict_q)
    # Display inside the `with` block, while the searcher is still open.
    ResultDisplayer(results).displayResults()

<Top 2 Results for Or([Term('paper_text', 'image'), Term('paper_text', 'processing')]) runtime=0.36271308401569513>

880 - Using a Saliency Map for Active Spatial Selective Attention: Implementation & Initial Results (1994)
1168 - Human Face Detection in Visual Scenes (1995)
