# VECTOR SPACE MODEL

In [None]:
import pymongo
import numpy as np
from collections import Counter, defaultdict

In [None]:
class VSM(object):
    """Vector space model (TF-IDF ranking) over a tokenised MongoDB collection.

    Every record of the collection is counted as one token occurrence: the
    ``document`` key names the document the token belongs to and the key
    given by ``field`` holds the token itself.  All statistics are loaded
    once, when the object is constructed.

    Attributes:
        db: the pymongo collection the statistics are read from.
        field: name of the record key that holds the token.
        idfi: dict mapping token -> list of documents containing it.
        mtf: dict mapping document -> count of its most frequent token.
        ndocs: number of distinct ``document`` values in the collection.
        word_tf: nested mapping token -> document -> occurrence count.
        doc_tf: nested mapping document -> token -> occurrence count.
    """

    def __init__(self, dbname, collection, field='text'):
        """Open the collection and load the term statistics.

        Args:
            dbname: name of the database.
            collection: name of the collection inside ``dbname``.
            field: record key that holds the token (default ``'text'``).
        """
        # MongoClient() is called without arguments, i.e. with pymongo's
        # default connection settings.
        self.db = pymongo.MongoClient()[dbname][collection]
        self.field = field
        self.idfi = self.idf()
        self.mtf = self.maxtf()
        self.ndocs = len(self.db.distinct('document'))
        self.word_tf, self.doc_tf = self.tf()

    def _term_count_stage(self):
        """Return the ``$group`` stage counting records per (document, token)."""
        return {'$group': {'_id': {'doc': '$document',
                                   'word': '$' + self.field},
                           'count': {'$sum': 1}}}

    def tf(self):
        """Load raw term frequencies from the collection.

        Returns:
            A pair ``(word_tf, doc_tf)`` of nested defaultdicts holding the
            same counts under both index orders:
            ``word_tf[token][document]`` and ``doc_tf[document][token]``.
        """
        cursor = self.db.aggregate([self._term_count_stage()])
        by_word = defaultdict(lambda: defaultdict(int))
        by_doc = defaultdict(lambda: defaultdict(int))
        for record in cursor:
            key, count = record['_id'], record['count']
            by_word[key['word']][key['doc']] = count
            by_doc[key['doc']][key['word']] = count
        return by_word, by_doc

    def maxtf(self):
        """Return a dict mapping each document to its highest token count."""
        # Second stage: regroup the per-(document, token) counts by document
        # and keep the largest one.
        max_stage = {'$group': {'_id': '$_id.doc', 'mtf': {'$max': '$count'}}}
        cursor = self.db.aggregate([self._term_count_stage(), max_stage])
        return {record['_id']: record['mtf'] for record in cursor}

    def idf(self):
        """Return a dict mapping each token to the documents containing it.

        Note that the values are the document lists produced by
        ``$addToSet``, not numeric IDF scores.
        """
        stage = {'$group': {'_id': '$' + self.field,
                            'docs': {'$addToSet': '$document'}}}
        cursor = self.db.aggregate([stage])
        return {record['_id']: record['docs'] for record in cursor}

    def search(self, q_tokens):
        """Rank documents by cosine similarity to a tokenised query.

        A document term is weighted ``tf / max_tf * idf`` where
        ``idf = log(N / df)``, ``N`` is the number of documents in
        ``doc_tf`` and ``df`` the number of documents containing the term.
        The query is a plain occurrence vector, so a token repeated in
        ``q_tokens`` contributes once per repetition.  The query norm is the
        same for every document and is therefore left out of the score.

        Args:
            q_tokens: iterable of query tokens; tokens absent from
                ``word_tf`` are ignored.

        Returns:
            A list of ``(document, score)`` tuples sorted by decreasing
            score.  Only documents containing at least one known query
            token are listed.  A document whose weight vector is all zeros
            (each of its terms occurs in every document) scores ``0.0``.
        """
        n_docs = len(self.doc_tf)
        idf_cache = {}

        def idf_of(word):
            # Each IDF is needed many times; compute it once per search.
            if word not in idf_cache:
                idf_cache[word] = np.log(n_docs / len(self.word_tf[word]))
            return idf_cache[word]

        doc_stats = {}               # document -> (max_tf, Euclidean norm)
        dots = defaultdict(float)    # document -> dot product with the query
        for token in q_tokens:
            if token not in self.word_tf:
                continue
            idf_t = idf_of(token)
            for doc, count in self.word_tf[token].items():
                if doc not in doc_stats:
                    terms = self.doc_tf[doc]
                    max_tf = max(terms.values())
                    weights = np.array([c * idf_of(w)
                                        for w, c in terms.items()]) / max_tf
                    doc_stats[doc] = (max_tf, np.sqrt(np.sum(weights ** 2)))
                max_tf = doc_stats[doc][0]
                dots[doc] += count / max_tf * idf_t

        ranked = []
        for doc, dot in dots.items():
            norm = doc_stats[doc][1]
            # A zero norm means every weight (and so the dot product) is
            # zero; report 0.0 instead of dividing by zero.
            ranked.append((doc, float(dot / norm) if norm > 0 else 0.0))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked