# Boolean retrieval
A toy implementation of a Boolean retrieval system, built and evaluated on the Cranfield collection.

In [1]:
import os
from collections import defaultdict

import numpy as np
import nbimporter
import utilities as utils

Importing Jupyter notebook from utilities.ipynb


In [2]:
class Boolean(object):
    """Toy boolean retrieval engine backed by an inverted index.

    The index (``self.dictionary``) maps each token to the set of ids of
    the texts containing it. Tokens are exactly what ``str.split()``
    produces: no case folding, stemming or punctuation handling.
    """

    # Query modes accepted by ``query``.
    AND = 1  # intersect the posting sets of the query terms
    OR = 0   # unite the posting sets of the query terms

    def __init__(self, corpus):
        """Build the index from ``corpus``.

        Args:
            corpus: iterable of ``(text_id, text)`` pairs, where ``text``
                is a string and ``text_id`` is hashable.
        """
        self.dictionary = defaultdict(set)
        self._indexing(corpus)

    def _tokenize(self, texts):
        """Yield ``(text_id, tokens)`` for every ``(text_id, text)`` pair."""
        for tid, text in texts:
            yield (tid, text.split())

    def _indexing(self, texts):
        """Add the id of each text to the posting set of each of its tokens."""
        for tid, tokens in self._tokenize(texts):
            for token in tokens:
                self.dictionary[token].add(tid)

    def query(self, query, mode=1):
        """Return the ids of the texts matching ``query``.

        Args:
            query: string of whitespace-separated query terms.
            mode: ``Boolean.AND`` (the default) intersects the posting
                sets of the terms; any other value unites them.

        Returns:
            A ``(results, terms)`` tuple. ``results`` is a new set of
            matching ids, so callers may mutate it without touching the
            index. ``terms`` lists the query terms found in the index, in
            query order. Terms missing from the index are dropped before
            the sets are combined, so an unknown word does not empty an
            AND query. When no term is known, ``results`` is an empty set
            and ``terms`` an empty list.
        """
        # Membership test on the mapping itself: it does not create empty
        # entries in the defaultdict for unknown terms.
        q = [t for t in query.split() if t in self.dictionary]
        if not q:
            # Nothing to look up (empty query or only unknown terms).
            return set(), q
        # Copy the first posting set so the index is never aliased.
        results = set(self.dictionary[q[0]])
        for t in q[1:]:
            if mode == Boolean.AND:
                results &= self.dictionary[t]
            else:
                results |= self.dictionary[t]
        return results, q

In [3]:
# Directory holding the Cranfield collection files. Set the CRANFIELD_DIR
# environment variable to point the notebook at another copy; the fallback
# is the path the notebook was originally written against.
folder = os.environ.get('CRANFIELD_DIR', '/Users/alfio/Dati/cranfield/cran')

In [4]:
# Corpus handle for the collection stored in `folder`. Later cells iterate
# over it as (id, text) pairs and use it to read the queries and the
# relevance judgments.
C = utils.Cranfield(folder)

In [5]:
# Index the whole corpus: one set of document ids per whitespace-separated token.
B = Boolean(C)

In [6]:
# Read every query from the collection's query file and keep the text of
# the first one for inspection.
queries = list(C.readdocs(C.queryfile))
q = queries[0]['text']

In [7]:
# Show the raw text of the first query.
print(q)

 what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .


In [23]:
# Run a hand-picked two-term query in AND mode. `docs` is the set of ids of
# the matching documents, `qtokens` the query terms found in the index.
docs, qtokens = B.query("speed aircraft", 
                        mode=Boolean.AND)

In [24]:
# Relevance judgments of the collection; used below as a table with
# `query`, `doc` and `relevance` columns.
relevance = C.relevance()

In [25]:
# Peek at the first few judgments.
relevance.head()

Unnamed: 0,query,doc,relevance
0,1,184,2
1,1,29,2
2,1,31,2
3,1,12,3
4,1,51,3


In [26]:
# Ids of the documents judged for query 1 with a relevance value of at least 3.
# NOTE(review): confirm the direction of the relevance scale in the Cranfield
# judgments before reading `>= 3` as "most relevant".
E = relevance.loc[
    (relevance['query'] == 1) & (relevance['relevance'] >= 3),
    'doc',
].values

## Evaluation

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
# Document ids in corpus iteration order; a document's position in this list
# is its index in the indicator vectors used for the evaluation.
corpus_ids = [doc_id for doc_id, _text in C]

In [29]:
# Binary indicator vectors aligned with corpus_ids: entry i is 1 when
# corpus_ids[i] was retrieved (`retrieved`) or is in the expected set E
# (`expected`).
retrieved = np.zeros(len(corpus_ids))
expected = np.zeros(len(corpus_ids))
# Map every id to its first position once, instead of scanning the list
# with corpus_ids.index() for each document.
position = {}
for pos, doc_id in enumerate(corpus_ids):
    position.setdefault(doc_id, pos)
for x in docs:
    retrieved[position[x]] = 1
for x in E:
    expected[position[x]] = 1

In [16]:
# Inspect the expected-relevance indicator vector.
expected

array([0., 0., 0., ..., 0., 0., 0.])

In [30]:
# Rows follow `expected`, columns follow `retrieved`. Fixing labels=[0, 1]
# keeps the matrix 2x2 even when a vector holds a single class (for example
# when nothing is retrieved), so it can always be unpacked into four counts.
cm = confusion_matrix(expected, retrieved, labels=[0, 1])

In [31]:
# Display the confusion matrix (rows: expected, columns: retrieved).
cm

array([[1372,    7],
       [  20,    1]])

In [32]:
# Flatten the 2x2 matrix row by row: true negatives, false positives,
# false negatives, true positives.
tn, fp, fn, tp = cm.ravel()

In [33]:
# Precision: share of the retrieved documents that are in the expected set.
precision = tp / (fp + tp)
# Recall: share of the expected documents that were retrieved.
recall = tp / (fn + tp)
print(precision)
print(recall)

0.125
0.047619047619047616
