In [1]:
from Tools.Config import CONFIG
from Tools.Logger import logger
from Tools.Document import Document
from Tools.Preprocessing import Preprocessing
from Tools.ReverseIndex import ReverseIndex
from Tools.VectorizeTFIDF import vectorize
from Tools.Model import Model

In [2]:
documents = Document.parse_many(CONFIG['documents'])
documents['pre'] = Preprocessing(**CONFIG['preprocessing']).preprocessing_many(documents['text'])

In [3]:
reverseIndex = ReverseIndex()
reverseIndex.create(documents)
reverseIndex.persist(CONFIG['reverse_index']['filename'])

In [4]:
documents['vectorize'] = vectorize(reverseIndex, documents['pre'])

In [5]:
documents[['id', 'vectorize']].to_csv(CONFIG['path_to_vectors'])

In [165]:
import math
import numpy as np
import pandas as pd

class Model:
    def __init__(self, reverseIndex):
        self.reverseIndex = reverseIndex
        self.cos_lengths = None
        self.y = None

    def fit(self, X, y):
        self.calc_cos_lengths(X)
        self.y = y

    def calc_cos_lengths(self, X):
        cos_lengths = []
        
        for index in range(len(X)):
            zeros = np.zeros(len(X[index]))
            zeros[index] = self.calc_cos_length(X[index])
            cos_lengths.append(zeros)
        
        self.cos_lengths = np.array(cos_lengths)
    
    def calc_cos_length(self, doc):
        summed = sum([ weight*weight for weight in doc ])
        length = math.sqrt(summed)
        return length
    
    def predict(self, doc):
        doc_length = self.calc_cos_length(doc)
        
        div = [sum(cos_length)*doc_length for cos_length in self.cos_lengths]
        
        mult = self.cos_lengths * doc
        
        summeds = np.array([ sum(line) for line in mult ])
        
        results = [[ self.y[index], summeds[index] / div[index]] for index in range(len(summeds))]
        results.sort(reverse=True, key=lambda x: x[1])

        return results

model = Model(reverseIndex)
model.fit(documents['vectorize'], documents['id'])

In [166]:
pd.DataFrame(model.predict(np.array(documents['vectorize'][0]))).head()

Unnamed: 0,0,1
0,129,0.197386
1,77,0.155281
2,1149,0.137228
3,34,0.104434
4,926,0.103168


In [167]:
pd.DataFrame(model.predict(np.array(documents['vectorize'][1]))).head()

Unnamed: 0,0,1
0,260,0.437958
1,1068,0.216507
2,636,0.148705
3,573,0.126396
4,961,0.123665
