# PROBABILISTIC METHODS FOR SEARCH

In [1]:
import pymongo
import numpy as np
from collections import defaultdict

In [None]:
class BIM(object):
    """Binary Independence Model ranker backed by a MongoDB collection.

    Records of the collection are read through two fields: ``document``
    (a document identifier) and the field named by ``field`` (a term).
    At construction time the class builds an in-memory boolean index and,
    from the supplied relevance judgements, per-term probability
    estimates that :meth:`search` turns into ranking scores.

    Attributes:
        db: pymongo collection the index is built from.
        Td: identifiers of the documents judged relevant, as passed in.
        N: number of distinct ``document`` values in the collection.
        field: name of the record field that holds the term.
        B: boolean index, ``term -> list of documents containing it``.
        P: ``term -> (pi, qi)`` where ``pi`` estimates the probability
            that the term occurs in a relevant document and ``qi`` the
            probability that it occurs in a non-relevant one.
        idfi: value returned by :meth:`idf` at construction time.
    """

    def __init__(self, dbname, collection,
                 true_doc_ids, field='text'):
        """Connect to MongoDB and build the index.

        Args:
            dbname: name of the MongoDB database.
            collection: name of the collection inside ``dbname``.
            true_doc_ids: iterable of identifiers of relevant documents.
            field: record field holding the term (default ``'text'``).
        """
        self.db = pymongo.MongoClient()[dbname][collection]
        self.Td = true_doc_ids
        self.N = len(self.db.distinct('document'))
        self.field = field
        self.P, self.B = {}, {}
        self._boolean_indexing()
        self.idfi = self.idf()

    def _postings(self):
        """Run the grouping aggregation: one record per distinct term.

        Each returned record has the term under ``'_id'`` and the set of
        ``document`` values that contain it under ``'docs'``.
        """
        group = {'$group': {'_id': '$' + self.field,
                            'docs': {'$addToSet': '$document'}}}
        return self.db.aggregate([group])

    def idf(self):
        """Return a freshly queried ``term -> documents`` mapping.

        Despite the name, no inverse document frequency is computed: the
        values are the document lists produced by the aggregation.
        """
        return {record['_id']: record['docs'] for record in self._postings()}

    def _boolean_indexing(self):
        """Query MongoDB and populate ``self.B`` and ``self.P``."""
        self._build_index(self._postings())

    def _build_index(self, records):
        """Populate ``self.B`` and ``self.P`` from aggregation records.

        Uses the smoothed Robertson/Sparck Jones estimates

            pi = (s + 0.5) / (S + 1)
            qi = (n - s + 0.5) / (N - S + 1)

        with ``s`` the relevant documents containing the term, ``n`` all
        documents containing it, ``S`` the relevant documents present in
        the index and ``N`` the collection size (``self.N``).  As long as
        ``self.N`` counts every indexed document, both estimates lie
        strictly between 0 and 1, so the log-odds weight computed in
        :meth:`search` is always finite.

        Args:
            records: iterable of mappings with keys ``'_id'`` (the term)
                and ``'docs'`` (documents containing the term).
        """
        postings = {record['_id']: list(record['docs']) for record in records}
        self.B.update(postings)

        # A set makes each relevance test O(1) instead of scanning Td.
        relevant = set(self.Td)

        # Only count judged documents that actually occur in the index, so
        # stray identifiers in Td cannot distort the denominators.
        indexed = set()
        for docs in postings.values():
            indexed.update(docs)
        n_relevant = len(relevant & indexed)
        n_non_relevant = self.N - n_relevant

        for term, docs in postings.items():
            hits = sum(1 for doc in docs if doc in relevant)
            pi = (hits + 0.5) / (n_relevant + 1.0)
            qi = (len(docs) - hits + 0.5) / (n_non_relevant + 1.0)
            self.P[term] = (pi, qi)

    def search(self, q_tokens):
        """Rank documents against a tokenised query.

        Every query token found in the index adds its log-odds weight
        ``log(pi * (1 - qi) / (qi * (1 - pi)))`` to each document that
        contains it; a token repeated in the query contributes once per
        occurrence.  Tokens missing from the index are ignored.

        Args:
            q_tokens: iterable of query terms.

        Returns:
            List of ``(document, score)`` tuples sorted by decreasing
            score; ties keep the order in which documents were first
            scored.  Documents matching no query token are omitted.
        """
        scores = defaultdict(float)
        for term in q_tokens:
            if term not in self.B:
                continue
            pi, qi = self.P[term]
            # The weight depends only on the term: compute it once rather
            # than once per posting.
            weight = np.log((pi * (1 - qi)) / (qi * (1 - pi)))
            for doc in self.B[term]:
                scores[doc] += weight
        return sorted(scores.items(), key=lambda item: -item[1])