# In-memory indexes for recipes
Classes for indexing recipes in memory, providing unigram, 2-gram, and 3-gram indexing.

In [1]:
import pymongo
import nltk
from collections import defaultdict

In [4]:
class Index(object):
    """In-memory unigram, bigram and trigram index over a MongoDB collection.

    The collection is read through aggregation pipelines that reference the
    record fields ``document``, ``label``, ``sentence`` and the configurable
    token field ``field``.

    Attributes populated by the index:

    * ``docs`` -- set of document ids seen while building the unigram index.
    * ``U``    -- ``U[token][doc]``: occurrences of ``token`` in ``doc``.
    * ``B``    -- ``B[t1][t2]``: occurrences of the bigram ``(t1, t2)``.
    * ``T``    -- ``T[t1][t2][t3]``: occurrences of the trigram
      ``(t1, t2, t3)``.

    Bigrams and trigrams are counted within one (document, sentence) group
    and summed over all groups; they are only built when ``ngrams`` is true.
    """

    def __init__(self, dbname, collection, docids=None,
                 labels=None, field='text', ngrams=False):
        """Connect to MongoDB and build the index immediately.

        Args:
            dbname: Name of the MongoDB database.
            collection: Name of the collection holding the token records.
            docids: Optional iterable of document ids to restrict the index
                to; ``None`` means no restriction on ``document``.
            labels: Optional iterable of labels to restrict the index to;
                ``None`` means no restriction on ``label``.
            field: Name of the record field holding the token value.
            ngrams: When true, also build the bigram and trigram indexes.
        """
        # MongoClient() with no arguments uses the driver's default host/port.
        self.tokens = pymongo.MongoClient()[dbname][collection]
        self.docids = docids
        self.field = field
        self.labels = labels
        self.docs = set()
        self.U = defaultdict(lambda: defaultdict(int))
        self.B = defaultdict(lambda: defaultdict(int))
        self.T = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        self._unigram()
        if ngrams:
            self._ngram()

    def _filtering(self):
        """Return the leading pipeline stages implementing the filters.

        Returns:
            A new list: empty when neither ``docids`` nor ``labels`` is set,
            otherwise a single ``$match`` stage combining the active filters.
        """
        query = {}
        # Copy into lists: BSON can encode lists but not sets, and callers
        # may naturally pass a set (e.g. another index's ``docs``).
        if self.docids is not None:
            query['document'] = {'$in': list(self.docids)}
        if self.labels is not None:
            query['label'] = {'$in': list(self.labels)}
        return [{'$match': query}] if query else []

    def _unigram(self):
        """Fill ``U`` and ``docs`` with per-document token counts."""
        pipeline = self._filtering()
        pipeline.append({'$group': {
            '_id': {'doc': '$document', 'token': '$' + self.field},
            'count': {'$sum': 1}}})
        for record in self.tokens.aggregate(pipeline, allowDiskUse=True):
            token, doc = record['_id']['token'], record['_id']['doc']
            self.docs.add(doc)
            self.U[token][doc] = record['count']

    def _ngram_pipeline(self):
        """Return the pipeline yielding one token list per (doc, sentence).

        The ``$sort`` stage must run before ``$group`` so that ``$push``
        collects the tokens in sorted rather than arbitrary order.
        """
        pipeline = self._filtering()
        # NOTE(review): the sort keys contain no token-position field, so
        # the order of tokens inside one sentence depends on how the server
        # returns ties -- confirm against the collection schema.
        pipeline.append(
            {'$sort': {'document': 1, 'label': 1, 'sentence': 1}})
        pipeline.append({'$group': {
            '_id': {'doc': '$document', 'sent': '$sentence'},
            'tokens': {'$push': '$' + self.field}}})
        return pipeline

    def _ngram(self):
        """Fill ``B`` and ``T`` with bigram and trigram counts."""
        pipeline = self._ngram_pipeline()
        for record in self.tokens.aggregate(pipeline, allowDiskUse=True):
            tokens = record['tokens']
            for first, second in nltk.ngrams(tokens, n=2):
                self.B[first][second] += 1
            for first, second, third in nltk.ngrams(tokens, n=3):
                self.T[first][second][third] += 1
