# Naive demonstration of boolean retrieval

In [1]:
import nbimporter
from corpora import FileStream
from indexing import Tokenizer, MIndex
import numpy as np

importing Jupyter notebook from corpora.ipynb
importing Jupyter notebook from indexing.ipynb


In [2]:
# Corpus location; the same folder is reused further down to load queries.json.
folder = 'data/wikisearch/brat_20'
# Document source built from corpora.ipynb's FileStream.
# NOTE(review): file_ext='txt' presumably restricts it to *.txt files -- confirm.
corpus = FileStream(folder, file_ext='txt')
# NOTE(review): preserve_case=False presumably lower-cases tokens -- confirm in
# indexing.ipynb.
tokenizer = Tokenizer(preserve_case=False)
# Index object that receives (doc_id, tokens) pairs via B.boolean(...) below and
# is later exported with B.boolean_to_matrix().
B = MIndex()

In [3]:
# Feed every document of the corpus into the index: fetch its content, run it
# through tweet_tokenizer followed by remove_punctuation, and register the
# resulting tokens under the document id.
for doc_id in corpus.docs:
    doc = corpus.doc(doc_id)
    tokens = tokenizer.remove_punctuation(tokenizer.tweet_tokenizer(doc))
    B.boolean(doc_id, tokens)

## Using term-document matrix

In [4]:
# Export the index as a matrix plus two label sequences.
# NOTE(review): later cells select m[:, features.index(term)] and look up
# docs[row], so rows appear to be documents and columns terms -- confirm against
# MIndex.boolean_to_matrix in indexing.ipynb.
m, features, docs = B.boolean_to_matrix()

In [5]:
# Select the matrix column of each example term.
# NOTE(review): if `features` is a plain list, .index raises ValueError for a
# term that is not in the vocabulary.
school = m[:,features.index('school')]
students = m[:,features.index('students')]

In [6]:
# Element-wise AND of the two columns: true where both terms are present.
a = np.logical_and(school, students)

In [7]:
# Translate the positions where the conjunction holds back into entries of `docs`.
results = [docs[x] for x in np.where(a)[0]]

# Evaluation

In [8]:
import json

In [9]:
# Load the evaluation queries stored alongside the corpus.
# Plain 'r' is used instead of 'rU': the 'U' flag is deprecated on Python 3 and
# rejected outright by Python 3.11+, while JSON parsing treats every newline
# style as whitespace, so the loaded data is the same on Python 2 as well.
with open(folder + '/queries.json', 'r') as inj:
    queries = json.load(inj)

In [10]:
# Query '10' is the worked example. E holds its 'page_ids' entry (used below as
# the ground-truth set for precision/recall); Q holds the raw query text.
E = queries['10']['page_ids']
Q = queries['10']['query']

In [11]:
# Show the raw query text. The parenthesised single-argument form prints the
# same text under the Python 2 print statement and is valid on Python 3.
print(Q)

government and education


In [12]:
# Tokenize the query with the same tokenizer calls used for the documents, then
# drop the literal operator word 'and' so that only the search terms remain.
Qt = tokenizer.remove_punctuation(tokenizer.tweet_tokenizer(Q))
Qt = [x for x in Qt if x != 'and']

In [13]:
# Collect one matrix column per query term, in query order.
# NOTE(review): there is no fallback for a query term missing from `features`;
# if `features` is a list, .index raises ValueError and the cell aborts.
vectors = []
for token in Qt:
    v = m[:,features.index(token)]
    vectors.append(v)

In [14]:
# Fold the per-term columns into one conjunction (vand) and one disjunction
# (vor) in a single pass; both accumulators start from the first term's column.
vand = vor = vectors[0]
for x in vectors[1:]:
    vand = np.logical_and(vand, x)
    vor = np.logical_or(vor, x)

In [15]:
def _matching_page_ids(mask, suffix='.txt'):
    """Return the entries of ``docs`` selected by ``mask`` without ``suffix``.

    mask   -- vector whose non-zero positions mark the matching rows.
    suffix -- file extension to strip; it is removed only when it terminates
              the name, so a name that merely contains '.txt' somewhere in the
              middle is left intact (str.replace would have removed every
              occurrence).

    Reads the notebook-level ``docs`` sequence at call time.
    """
    page_ids = []
    for row in np.where(mask)[0]:
        name = docs[row]
        if suffix and name.endswith(suffix):
            name = name[:-len(suffix)]
        page_ids.append(name)
    return page_ids


# Ra: documents matching ALL query terms; Ro: documents matching ANY query term.
Ra = _matching_page_ids(vand)
Ro = _matching_page_ids(vor)

In [16]:
def precision(R, T):
    """Return the fraction of retrieved items that are relevant.

    R -- collection of retrieved item ids.
    T -- collection of relevant (ground-truth) item ids.

    Both collections are treated as sets, so an id listed more than once is
    counted a single time and membership tests do not rescan ``T`` for every
    retrieved item. Items must be hashable.

    Returns a float in [0, 1], or ``np.nan`` when nothing was retrieved (the
    ratio is undefined in that case).
    """
    retrieved = set(R)
    if not retrieved:
        return np.nan
    relevant = set(T)
    return float(len(retrieved & relevant)) / len(retrieved)

def recall(R, T):
    """Return the fraction of relevant items that were retrieved.

    R -- collection of retrieved item ids.
    T -- collection of relevant (ground-truth) item ids.

    Both collections are treated as sets: a relevant id that appears several
    times in ``R`` is counted once, which keeps the result within [0, 1]
    (counting every occurrence could yield a recall above 1). Items must be
    hashable.

    Returns a float in [0, 1], or ``np.nan`` when ``T`` is empty (the ratio is
    undefined in that case).
    """
    relevant = set(T)
    if not relevant:
        return np.nan
    retrieved = set(R)
    return float(len(retrieved & relevant)) / len(relevant)

In [17]:
# AND query: precision, recall, number of retrieved docs, number of relevant docs.
# A single pre-formatted string keeps the space-separated output of the
# Python 2 print statement while also being valid on Python 3.
print('%s %s %s %s' % (precision(Ra, E), recall(Ra, E), len(Ra), len(E)))

0.55 0.55 20 20


In [18]:
# OR query: precision, recall, number of retrieved docs, number of relevant docs.
# A single pre-formatted string keeps the space-separated output of the
# Python 2 print statement while also being valid on Python 3.
print('%s %s %s %s' % (precision(Ro, E), recall(Ro, E), len(Ro), len(E)))

0.188118811881 0.95 101 20


In [19]:
# Show the query terms that were actually searched for. The parenthesised
# single-argument form prints the same text under the Python 2 print statement
# and is valid on Python 3.
print(Qt)

[u'government', u'education']
