In [1]:
import json

inv_index = {}

# считывание инвертированного индекса из файла

with open('./db/inverted_index.txt', 'r') as file:
    for line in file.readlines():
        word = line[:line.index('{')-2]
        rest = line[line.index('{')-1:]
        inv_index[word] = json.loads(rest)

tf_idf = {}
        
# считывание данных tf-idf из файла

with open('./db/tf_idf.txt','r') as file:
    tf_idf = json.loads(file.read())
    
# считывание ссылок документов из файла
    
path='./db/index.txt'
docs = []
with open(path,'r') as file:
    text = ''
    for line in file.readlines():
        docs.append(line.split(' ')[-1].replace('\n', ''))



In [2]:
from scipy import spatial
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()

# получение номеров документов, в котором встречается данное слово

def one_word_query(word, inv_index):
    word = word.strip()
    word = morph.normal_forms(word)[0]
    res = []
    if word in inv_index.keys():
        res = inv_index[word].keys()
    return res

# получение номеров документов, в котором встречается каждое слово из данного запроса

def phrase_query(query, inv_index):
    ll = []
    words = to_words(query)
    for word in words:
        if word in inv_index.keys():
            ll.append(inv_index[word].keys())
    setted = set(ll[0]).intersection(*ll)
    return list(setted)

def similarity(vec1, vec2):
    return 1 - spatial.distance.cosine(vec1, vec2)

def to_words(query):
    words = query.split(' ')
    for i in range(len(words)):
        word = words[i].strip()
        word = morph.normal_forms(word)[0]
        words[i] = word
    return words

# подсчет вектора значений tf-idf для каждого термина в запросе

def query_vec(query, inv_index, lms_res):
    words = to_words(query)
    i = 0
    while i < len(words):
        ww = words[i]
        if ww not in inv_index.keys():
            words.remove(ww)
        else:
            words[i] = ww
            i+=1
    query_vec = [0]*len(words)
    for i in range(len(words)):
        query_vec[i] = query_tf(words[i], words)*idf(words[i],lms_res)
        
    return query_vec
    
def query_tf(word, words):
    c = 0
    for w in words:
        if w == word:
            c+=1
    return c/len(words)

def idf(word, lms_res):
    for key in lms_res.keys():
        if word in lms_res[key].keys():
            return lms_res[key][word][1]

# получение вектора значений tf-idf всех терминов запроса для каждого документа
        
def doc_vecs(lms_res, docs, words):
    doc_vecs = {}
    for i in docs:
        doc_vecs[i] = []
        for w in words:
            if w in lms_res[i].keys():
                doc_vecs[i].append(lms_res[i][w][1])
            else:
                doc_vecs[i].append(0)
    return doc_vecs

# функция поиска

def search(query, inv_index, lms_res, docs):
    res = set(phrase_query(query, inv_index))
    words = to_words(query)
    for word in words:
        for i in one_word_query(word, inv_index):
            res.add(i)
    ds = doc_vecs(lms_res, res, words)
    qc = query_vec(query, inv_index, lms_res)
    smls = [[similarity(ds[i], qc),i] for i in ds.keys() ]
    smls.sort(key=lambda x: x[0])
    result = []
    print(smls)
    for i in range(len(smls)):
        result.append(docs[int(smls[i][1])-1])
    return result


In [3]:
from flask import Flask, render_template, request

app = Flask(__name__)


@app.route("/", methods=["GET","POST"])
def home():
    if request.method == "GET":
        return render_template('index.html')
    elif request.method == "POST":
        query = request.form['search']
        content = search(query, inv_index, tf_idf, docs)
        return render_template('index.html', content=content)


if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
127.0.0.1 - - [31/Mar/2022 21:16:58] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [31/Mar/2022 21:17:01] "POST / HTTP/1.1" 200 -


[[0.9999999999999998, '74'], [1, '14'], [1, '1']]


127.0.0.1 - - [31/Mar/2022 21:17:09] "POST / HTTP/1.1" 200 -


[[0.376695351651557, '23'], [0.376695351651557, '20'], [0.376695351651557, '49'], [0.376695351651557, '60'], [0.376695351651557, '68'], [0.376695351651557, '24'], [0.376695351651557, '34'], [0.376695351651557, '80'], [0.376695351651557, '69'], [0.376695351651557, '64'], [0.376695351651557, '21'], [0.376695351651557, '95'], [0.376695351651557, '81'], [0.376695351651557, '29'], [0.376695351651557, '46'], [0.376695351651557, '88'], [0.376695351651557, '30'], [0.376695351651557, '31'], [0.376695351651557, '86'], [0.376695351651557, '99'], [0.376695351651557, '36'], [0.376695351651557, '97'], [0.376695351651557, '38'], [0.376695351651557, '54'], [0.376695351651557, '25'], [0.8052799292743292, '8'], [0.9263372021268009, '50'], [0.9263372021268009, '85'], [0.9263372021268009, '82'], [0.9263372021268009, '40'], [0.9263372021268009, '93'], [0.9263372021268009, '9'], [0.926337202126801, '74'], [0.926337202126801, '75'], [0.926337202126801, '76'], [0.926337202126801, '90'], [0.926337202126801, '3