In [None]:
import gzip
import json
import os
import time
import pandas as pd
import numpy as np
import pickle
import requests
import configparser

from json import JSONDecodeError
from collections import Counter
from flask import Flask
from flask import request

In [None]:
"use this function for creating inverted index"
# Build the inverted index and print the elapsed wall-clock time in seconds.
# NOTE(review): create_inverted_index is not defined in the visible part of
# this notebook — confirm where it comes from before running this cell.
start = time.time()
inverted_index = create_inverted_index()
print(time.time() - start)

In [None]:
"create idf and normilized tf_idf from saved inverted_index"
# Derive the per-term idf and the per-document normalized tf-idf weights from
# the current inverted_index / docs globals ...
dict_idf = idf(inverted_index)
dict_tf_idf = normalized_tf_idf_docs(inverted_index, docs, dict_idf)
# ... and POST both dictionaries as one JSON payload to the service listening
# on 127.0.0.1:13541 (endpoint /rank/idf).
requests.post('http://127.0.0.1:13541/rank/idf', json={'idf' : dict_idf,
                                                      'tf_idf': dict_tf_idf})

In [None]:
"save inverted index for using at home"
# Serialize the in-memory inverted_index to reverse_index.pickle in the current
# working directory, using the newest pickle protocol this interpreter supports.
with open('reverse_index.pickle', 'wb') as handle:
    pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
"you can use saved inverted index at home"
# Restore inverted_index from reverse_index.pickle in the current working directory.
# WARNING: unpickling can execute arbitrary code — only load a file that you
# produced yourself with the dump cell of this notebook.
with open('reverse_index.pickle', 'rb') as file:
    inverted_index = pickle.load(file)

In [None]:
# Load the raw documents: every gzip file below ../../Data/by whose name
# contains '-01-' is read line by line, one JSON record per line.
# Files with 'text' in their name feed docs_text, all others feed meta_docs.
docs_text = []
meta_docs = []
for root, dirs, files in os.walk("../../Data/by"):
    for filename in files:
        if '-01-' not in filename:
            continue
        records = docs_text if 'text' in filename else meta_docs
        # os.walk also descends into sub-directories, so the file has to be
        # opened relative to the directory it was found in (root) rather than
        # relative to the top-level data directory.
        with gzip.open(os.path.join(root, filename), 'rb') as f:
            for line in f:
                try:
                    records.append(json.loads(line))
                except JSONDecodeError:
                    # Skip the malformed line but keep reading the file.
                    print("Can't read file " + filename)

# Keep the first record per key, then join metadata with text on id == id_job.
meta_docs = pd.DataFrame(meta_docs).drop_duplicates(subset='id', keep='first')
docs_text = pd.DataFrame(docs_text).drop_duplicates(subset='id_job', keep='first')
docs = pd.merge(meta_docs, docs_text, how='inner', left_on='id', right_on='id_job')

In [None]:
def read_docs(path, files):
    '''
    Load gzip-compressed JSON-lines files and join metadata with text.

    path  - directory that contains the files (with or without a trailing
            path separator)
    files - iterable of file names inside path; a name containing 'text' is
            read as document texts, any other name as document metadata

    A line that is not valid JSON is reported on stdout and skipped.

    Return a DataFrame: metadata records (first occurrence per 'id')
    inner-joined with text records (first occurrence per 'id_job')
    on id == id_job.
    '''
    meta_docs = []
    docs_text = []
    for file_name in files:
        records = docs_text if 'text' in file_name else meta_docs
        # os.path.join keeps working when path has no trailing separator.
        with gzip.open(os.path.join(path, file_name), 'rb') as f:
            for line in f:
                try:
                    records.append(json.loads(line))
                except JSONDecodeError:
                    print("Can't read file " + file_name)

    meta_docs = pd.DataFrame(meta_docs).drop_duplicates(subset='id', keep='first')
    docs_text = pd.DataFrame(docs_text).drop_duplicates(subset='id_job', keep='first')
    docs = pd.merge(meta_docs, docs_text, how='inner', left_on='id', right_on='id_job')

    return docs


In [None]:
def position_in_text(lst, term):
    '''
    Find every occurrence of a term in a flat token list.

    lst - list of tokens
    term - the token to look for
    Return the 0-based indices at which term occurs, in ascending order.
    '''
    positions = []
    for index, token in enumerate(lst):
        if token == term:
            positions.append(index)
    return positions

def position_in_sentences(lst, term):
    '''
    Find the sentences that contain a term.

    lst - list of sentences, each sentence being a list of tokens
    term - the token to look for
    Return the 0-based numbers of the sentences in which term is found.
    '''
    sentence_numbers = []
    for number, sentence in enumerate(lst):
        if term in sentence:
            sentence_numbers.append(number)
    return sentence_numbers

In [None]:
def idf(inverted_index, number_of_docs=None):
    '''
    Return the inverse document frequency of every term in inverted_index.

    Input: inverted_index - mapping term -> posting list (one entry per
                            document that contains the term);
           number_of_docs - size of the collection. When omitted, the row
                            count of the notebook-level `docs` DataFrame is
                            used, which is what this function always did.
    Output: {term: log(number_of_docs / len(posting list))}
    '''
    if number_of_docs is None:
        number_of_docs = docs.shape[0]
    return {term: np.log(number_of_docs / len(postings))
            for term, postings in inverted_index.items()}

def tf(document, title_weight=5):
    '''
    Return the length-normalized, log-scaled term frequency of one posting.

    Input: document - posting taken from the inverted index; it must expose
                      position_text and position_title (lists of positions of
                      the term) and doc_len (number of tokens in the document);
           title_weight - how many text occurrences one title occurrence is
                      worth (default 5, the value that used to be hard-coded).
    Output: log(1 + (text_hits + title_weight * title_hits) / doc_len)
    '''
    text_hits = len(document.position_text)
    title_hits = len(document.position_title)
    return np.log(1 + (text_hits + title_weight * title_hits) / document.doc_len)


def normalized_tf_idf_docs(inverted_index, docs, idf):
    '''
    Build, for every document, the L2-normalized tf-idf weights of the terms
    that appear in it.

    Input: inverted_index - mapping term -> list of postings (each posting
                            exposes .id and whatever tf() reads);
           docs - DataFrame with an `id` column listing the documents;
           idf - dictionary with the idf of every term.
    Output: {
            docID1: {term1: tf_idf_docID1_term1, term2: tf_idf_docID1_term2, ...},
            docID2: {term1: tf_idf_docID2_term1, term3: tf_idf_docID2_term3, ...},
            ...,
            docIDn: {...}
            }
    Document ids are converted to int for the keys of the result.
    A document whose weights are all zero keeps them as zeros: its norm is 0,
    so dividing would only produce NaN values (or a ZeroDivisionError).
    '''
    tf_idf = {int(docID): {} for docID in docs.id.unique()}

    for term, postings in inverted_index.items():
        for document in postings:
            tf_idf[document.id][term] = tf(document) * idf[term]

    # normalization:
    for docID, weights in tf_idf.items():
        norm = sum(weight ** 2 for weight in weights.values()) ** .5
        if norm == 0:
            # empty or all-zero vector: nothing to scale
            continue
        tf_idf[docID] = {term: weight / norm for term, weight in weights.items()}

    return tf_idf

In [None]:
def intersect_all(terms, inverted_index):
    '''
    Return the ids of the documents that contain all the given terms.

    terms - list of terms for which we want to intersect the sets of documents
    inverted_index - mapping term -> list of postings, each exposing .id

    Terms that have no posting list in inverted_index are ignored.
    If the posting lists of the remaining terms have no document in common,
    the documents of the rarest term (smallest document frequency, i.e. the
    most informative one) are returned instead.
    With no usable term at all an empty set is returned.
    '''
    known_terms = [term for term in terms if inverted_index.get(term) is not None]
    if not known_terms:
        return set()

    ans = None  # None = nothing intersected yet (an empty set is a real result)
    for term in known_terms:
        posting_ids = {d.id for d in inverted_index[term]}
        ans = posting_ids if ans is None else ans & posting_ids
        if not ans:
            # an empty intersection can never grow again
            break

    if ans:
        return ans

    # Posting lists don't intersect: fall back to the rarest term.
    # reversed() makes the last term win among equal document frequencies.
    rarest_term = min(reversed(known_terms),
                      key=lambda term: len(inverted_index[term]))
    return {d.id for d in inverted_index[rarest_term]}

In [None]:
class Document:
    """
    Posting of the inverted index: where one term occurs in one document.

    text_tokens and title_tokens are lists of sentences, each sentence being
    a list of tokens. Attributes set by the constructor:
        id                - document id as passed in
        count             - occurrence count as passed in
        position_text     - positions of the term in the flattened text
        position_title    - positions of the term in the flattened title
        title_flag        - True when some title sentence contains the term
        position_sentence - numbers of the text sentences containing the term
        doc_len           - number of tokens in text plus title
    """

    def __init__(self, id, count, term, text_tokens, title_tokens):
        self.id = id
        self.count = count
        self.position_text = self.position_in_doc(text_tokens, term)
        self.position_title = self.position_in_title(title_tokens, term)

        found_in_title = False
        for sentence in title_tokens:
            if term in sentence:
                found_in_title = True
                break
        self.title_flag = found_in_title

        self.position_sentence = self.position_in_sentence(text_tokens, term)
        self.doc_len = self.get_doc_len(text_tokens, title_tokens)

    def get_doc_len(self, text_tokens, title_tokens):
        """Return the total number of tokens in the text and the title."""
        text_length = sum(len(sentence) for sentence in text_tokens)
        title_length = sum(len(sentence) for sentence in title_tokens)
        return text_length + title_length

    def position_in_doc(self, text_tokens, term):
        """Return the positions of term in the text, counted over all sentences in a row."""
        positions = []
        offset = 0
        for sentence in text_tokens:
            for token in sentence:
                if token == term:
                    positions.append(offset)
                offset += 1
        return positions

    def position_in_title(self, title_tokens, term):
        """Return the positions of term in the title, counted over all sentences in a row."""
        positions = []
        offset = 0
        for sentence in title_tokens:
            for token in sentence:
                if token == term:
                    positions.append(offset)
                offset += 1
        return positions

    def position_in_sentence(self, text_tokens, term):
        """Return the 0-based numbers of the text sentences that contain term."""
        sentence_numbers = []
        for number, sentence in enumerate(text_tokens):
            if term in sentence:
                sentence_numbers.append(number)
        return sentence_numbers

In [None]:
# Module-level state shared by the Flask handlers defined further down.
# NOTE: running this cell rebinds inverted_index and docs to empty objects,
# discarding whatever earlier cells stored under these names.
inverted_index = dict()  # term -> list of Document postings, filled by add()
read_files = set()  # names of the data files add() has already picked up
docs = pd.DataFrame()  # documents accumulated by add()

# Service configuration; add() reads config['Data']['Path'] from it.
config = configparser.ConfigParser()
config.read('config.ini')

In [None]:
# Flask application object; the @app.route decorators below register the
# HTTP handlers on it and the __main__ guard at the end starts it.
app = Flask(__name__)

@app.route("/reverseindex", methods=["POST"])
def reverseindex():
    """
    Search endpoint: find, rank and return the documents matching a query.

    Expects a JSON body with
        data     - list of lists of already processed query terms
        max_docs - optional maximal number of documents to return
    Reads the module-level inverted_index and docs.

    Returns a JSON string with
        status         - "ok"
        got_data       - the `data` field exactly as received
        processed_data - records (id, title, text, url) of the found documents
        position       - {document id: numbers of the sentences that contain
                          query terms}
    A body that is not a JSON object with a `data` field is answered with
    HTTP 400 instead of crashing the handler.
    """
    # silent=True gives None for a missing / malformed JSON body instead of raising
    json_data = request.get_json(silent=True)
    if not isinstance(json_data, dict) or 'data' not in json_data:
        return json.dumps({"status": "error",
                           "message": "expected a JSON body with a 'data' field"}), 400

    words = json_data['data']
    print(words)
    # create flat list from list of lists
    words = [item for sublist in words for item in sublist]

    # reject words that are not in the inverted index
    words = [term for term in words if inverted_index.get(term) is not None]

    # intersect lists of documents for all processed words in query
    documents = list(intersect_all(words, inverted_index))

    # ranking if len(documents) is more than one
    print(documents)
    if len(documents) > 1:
        response_ranked = requests.post('http://127.0.0.1:13541/rank',
                                        json={'documents': documents,
                                              'words': words})
        parsed_ranked = json.loads(response_ranked.text)
        documents = parsed_ranked['ranked']

    # if maximal numbers of documents given, then select only the desired amount
    if json_data.get('max_docs'):
        documents = documents[:json_data.get('max_docs')]

    # need this because type(docs['id']) is string in this dataFrame
    documents = [str(x) for x in documents]

    # index of sentences(first/second/etc) in which there are words from query for every doc
    pos = {key: [] for key in documents}
    # postings carry integer ids; map them back to the string keys of pos so
    # that every posting list is walked only once per term
    key_by_id = {int(docID): docID for docID in documents}
    for term in words:
        for d in inverted_index.get(term):
            docID = key_by_id.get(d.id)
            if docID is not None:
                pos[docID] += d.position_sentence

    # get text of found documents, keeping the ranked order
    ranked_documents = []
    for document in documents:
        ranked_documents += docs.loc[docs['id'] == document,
                                     ['id', 'title', 'text', 'url']].to_dict('records')

    print(ranked_documents)
    return json.dumps({"status":"ok", "got_data":json_data['data'],
                       "processed_data": ranked_documents, "position": pos})


@app.route("/reverseindex/add", methods=['POST'])
def add():
    """
    Indexing endpoint: index the data files and refresh the ranking service.

    Steps:
      1. remember every file in config['Data']['Path'] whose name contains
         '-01-' in read_files and load all remembered files with read_docs();
      2. for each of the first 100 loaded documents that is not in the
         inverted index yet, send text and title to the analyze service
         (127.0.0.1:13533) and add one Document posting per distinct term;
      3. merge the loaded documents into the module-level docs (first
         occurrence per 'id_job' wins) and dump it to documents.csv;
      4. recompute idf / normalized tf-idf and POST them to the ranking
         service (127.0.0.1:13541).

    Returns the JSON string {"status": "ok"}.
    """
    global docs
    global inverted_index

    path = config['Data']['Path']
    files = {f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))}

    for file_name in files:
        if file_name not in read_files and '-01-' in file_name:
            read_files.add(file_name)

    docs_new = read_docs(path, read_files)

    docs_new = docs_new.iloc[:100, :]

    # read_files also holds the files of earlier calls, so docs_new contains
    # documents that were indexed before. Skip those, otherwise every call
    # would append a second posting for the same (term, document) pair.
    indexed_ids = {posting.id
                   for postings in inverted_index.values()
                   for posting in postings}

    for index, row in docs_new.iterrows():
        doc_id = int(row['id'])
        if doc_id in indexed_ids:
            continue
        indexed_ids.add(doc_id)

        response_analyze = requests.post('http://127.0.0.1:13533/analyze',
                                         json={'data' : row['text']})
        tokens_text = json.loads(response_analyze.text)['words']

        response_analyze = requests.post('http://127.0.0.1:13533/analyze',
                                         json={'data' : row['title']})
        tokens_title = json.loads(response_analyze.text)['words']

        # the analyzer answers with a list of sentences; flatten for counting
        tokens_text_flat = [item for sublist in tokens_text for item in sublist]
        tokens_title_flat = [item for sublist in tokens_title for item in sublist]

        number_of_occurrences = Counter(tokens_text_flat + tokens_title_flat)
        for term, occurrences in number_of_occurrences.items():
            inverted_index.setdefault(term, []).append(Document(doc_id,
                                                                occurrences,
                                                                term,
                                                                tokens_text,
                                                                tokens_title))

    # refresh the document table; DataFrame.append no longer exists in
    # pandas 2.x, pd.concat is the equivalent that works in every version
    docs = pd.concat([docs, docs_new], ignore_index=True)
    docs = docs.drop_duplicates(subset='id_job', keep='first')

    docs.to_csv('documents.csv', sep='\t')

    # create idf and normalized tf_idf from the updated inverted_index
    # and send them to the ranking service
    dict_idf = idf(inverted_index)
    dict_tf_idf = normalized_tf_idf_docs(inverted_index, docs, dict_idf)
    requests.post('http://127.0.0.1:13541/rank/idf', json={'idf' : dict_idf,
                                                           'tf_idf': dict_tf_idf})

    return json.dumps({"status":"ok"}, ensure_ascii=False)

        

# Start the Flask server on port 13538 when this code runs as the main module.
if __name__ == "__main__":
    # host='0.0.0.0' makes the server listen on all network interfaces.
    app.run(host='0.0.0.0', port=13538)