In [None]:
import gzip
import json
import os
import time
from json import JSONDecodeError
import pandas as pd
import numpy as np
import pickle
import requests

In [None]:
"use this function for creating inverted index"
# Build the inverted index and print how many seconds the build took.
# NOTE(review): create_inverted_index() is defined in a later cell and reads the
# global `docs`, so both have to exist in the kernel before this cell is run.
start = time.time()
inverted_index = create_inverted_index()
print(time.time() - start)

In [None]:
"create idf from saved inverted_index"
# POST the idf of every term in inverted_index to /rank/idf on 127.0.0.1:13541.
# NOTE(review): idf() takes the number of documents from the global `docs`, so
# `docs` should describe the same documents as `inverted_index` - confirm after
# loading the index from a pickle.
requests.post('http://127.0.0.1:13541/rank/idf', json={'data' : idf(inverted_index)})

In [None]:
"save inverted index for using at home"
# Serialize inverted_index into reverse_index.pickle (current working directory)
# with the highest pickle protocol available.
with open('reverse_index.pickle', 'wb') as handle:
    pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
"you can use saved inverted index at home"
# Restore inverted_index from reverse_index.pickle.
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only
# load a file that you have created yourself or otherwise trust.
with open('reverse_index.pickle', 'rb') as file:
    inverted_index = pickle.load(file)

In [None]:
# Collect the text records of the documents: every gzip-compressed JSON-lines
# file under ../Data/by whose name contains both 'text' and '-01-'.
docs_text = []
for root, dirs, files in os.walk("../Data/by"):
    for filename in files:
        if 'text' in filename and '-01-' in filename:
            # os.walk also descends into subdirectories, so the file has to be
            # opened relative to `root`, not relative to the top directory
            with gzip.open(os.path.join(root, filename), 'rb') as f:
                for line in f:
                    try:
                        docs_text.append(json.loads(line))
                    except JSONDecodeError:
                        # a broken line is reported and skipped, reading goes on
                        print("Can't read file " + filename)

In [None]:
# Collect the metadata records of the documents: every gzip-compressed
# JSON-lines file under ../Data/by whose name contains '-01-' but not 'text'.
meta_docs = []
for root, dirs, files in os.walk("../Data/by"):
    for filename in files:
        if 'text' not in filename and '-01-' in filename:
            # os.walk also descends into subdirectories, so the file has to be
            # opened relative to `root`, not relative to the top directory
            with gzip.open(os.path.join(root, filename), 'rb') as f:
                for line in f:
                    try:
                        meta_docs.append(json.loads(line))
                    except JSONDecodeError:
                        # a broken line is reported and skipped, reading goes on
                        print("Can't read file " + filename)

In [None]:
# Turn both record lists into DataFrames, keep the first row for every document
# id and inner-join metadata with text (metadata 'id' == text 'id_job').
meta_docs = pd.DataFrame(meta_docs).drop_duplicates(subset='id', keep='first')
docs_text = pd.DataFrame(docs_text).drop_duplicates(subset='id_job', keep='first')
docs = meta_docs.merge(docs_text, how='inner', left_on='id', right_on='id_job')

In [None]:
# Empty starting state: no postings, no processed files, no documents.
inverted_index, read_files, docs = {}, set(), pd.DataFrame()

In [None]:
import json
import requests
import numpy as np

from flask import Flask
from flask import request

# Flask application object; the @app.route decorators in this cell register
# the HTTP endpoints on it.
app = Flask(__name__)

@app.route("/reverseindex", methods=["POST"])
def reverseindex():
    '''
    Search endpoint: find the documents that match the analyzed query words.

    Expects JSON body with
        data     - list of lists of query terms
        max_docs - optional, maximal number of documents in the answer
    Return JSON string with
        got_data       - the query exactly as it was received
        processed_data - id, title, text and url of every found document
        position       - for every found document the sentence numbers that
                         contain words of the query
    A body that is not JSON or has no 'data' key is answered with status 400.
    '''
    global docs
    global inverted_index

    json_data = request.get_json(silent=True)
    if not isinstance(json_data, dict) or 'data' not in json_data:
        return json.dumps({"status": "error",
                           "message": "JSON body with 'data' is required"}), 400

    # create flat list from list of lists and
    # reject words that are not in the inverted index
    words = [term for sublist in json_data['data'] for term in sublist
             if inverted_index.get(term) is not None]

    # intersect lists of documents for all processed words in query
    documents = list(intersect_all(words, inverted_index))

    # ranking is needed only if more than one document was found
    if len(documents) > 1:
        index_slice = {term: inverted_index[term] for term in words}

        response_ranked = requests.post('http://127.0.0.1:13541/rank',
                                        json={'data': index_slice,
                                              'documents': documents,
                                              'words': words})
        documents = json.loads(response_ranked.text)['ranked']

    # if maximal numbers of documents given, then select only the desired amount
    max_docs = json_data.get('max_docs')
    if max_docs:
        documents = documents[:max_docs]

    # need this because type(docs['id']) is string in this dataFrame
    documents = [str(x) for x in documents]

    # index of sentences(first/second/etc) in which there are words from query for every doc;
    # each posting list is walked once, found documents are looked up by their integer id
    pos = {key: [] for key in documents}
    key_by_doc_id = {int(key): key for key in documents}
    for term in words:
        for posting in inverted_index[term]:
            key = key_by_doc_id.get(posting['docID'])
            if key is not None:
                pos[key] += posting['pos_sent']

    # get text of found documents, the order of `documents` is kept
    ranked_documents = []
    for document in documents:
        ranked_documents += docs.loc[docs['id'] == document,
                                     ['id', 'title', 'text', 'url']].to_dict('records')

    return json.dumps({"status":"ok", "got_data":json_data['data'],
                       "processed_data": ranked_documents, "position": pos})


@app.route("/reverseindex/add", methods=['POST'])
def add():
    '''
    Index endpoint: read documents from ../../Data/by/, add them to the global
    inverted_index and docs, then send fresh idf to the ranking service.

    Every file with '-01-' in its name is remembered in read_files and all
    remembered files are read on every call. Only rows 100..119 of the loaded
    documents are taken (hard-coded slice). Documents whose docID already has
    postings in the index are not indexed again, so calling the endpoint twice
    does not duplicate postings.

    Return JSON string with status and the documents that were newly indexed.
    '''
    global docs
    global inverted_index

    path = "../../Data/by/"
    files = {f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))}
    read_files.update(file_name for file_name in files if '-01-' in file_name)

    docs_new = read_docs(path, read_files)
    docs_new = docs_new.iloc[100:120, :]

    if docs_new.empty:
        docs_added = docs_new
    else:
        # docIDs that already have postings; indexing them again would append
        # duplicate postings and distort 'count' and idf
        indexed_ids = {posting['docID']
                       for postings in inverted_index.values()
                       for posting in postings}
        is_new = [int(doc_id) not in indexed_ids for doc_id in docs_new['id']]
        docs_added = docs_new.loc[is_new]

    for index, row in docs_added.iterrows():
        doc_id = int(row['id'])
        # analyzer answers with list of lists, where each nested list - separate sentence
        response_analyze = requests.post('http://127.0.0.1:13533/analyze',
                                         json={'data' : row['text']})
        text_searchable = json.loads(response_analyze.text)['words']
        response_analyze = requests.post('http://127.0.0.1:13533/analyze',
                                         json={'data' : row['title']})
        title_searchable = json.loads(response_analyze.text)['words']

        # create flat lists from lists of lists
        tokens_text = [item for sublist in text_searchable for item in sublist]
        tokens_title = [item for sublist in title_searchable for item in sublist]

        # values that are the same for every term of the document
        title_terms = set(tokens_title)
        doc_length = len(tokens_text) + len(tokens_title)

        number_of_occurrences = Counter(tokens_text + tokens_title)
        for term, count in number_of_occurrences.items():
            inverted_index.setdefault(term, []).append(
                {'docID': doc_id, 'count': count,
                 'pos': position_in_text(tokens_text, term),
                 'pos_title': position_in_text(tokens_title, term),
                 'title_flag': term in title_terms,
                 'pos_sent': position_in_sentences(text_searchable, term),
                 'doc_length': doc_length})

    # DataFrame.append does not exist in pandas >= 2.0, pd.concat works everywhere
    if not docs_new.empty:
        docs = pd.concat([docs, docs_new], ignore_index=True)
        docs = docs.drop_duplicates(subset='id_job', keep='first')

    # refresh idf and sent into service for ranking
    requests.post('http://127.0.0.1:13541/rank/idf', json={'data' : idf(inverted_index)})

    return json.dumps({"status":"ok", "added" : docs_added.to_dict()}, ensure_ascii=False)

        

# Start the Flask server on port 13538, bound to 0.0.0.0, when this code is
# executed as the main module.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=13538)

In [None]:
def read_docs(path, files):
    '''
    path - directory with gzip-compressed files, one JSON record per line
    files - names of files inside path; a name with 'text' in it holds texts of
            documents (key 'id_job'), any other name holds metadata (key 'id')
    Return DataFrame where metadata and text are inner-joined on id == id_job,
    for duplicated ids the first record is kept. If there are no metadata
    records or no text records, an empty DataFrame is returned.
    Lines that are not valid JSON are reported and skipped.
    '''
    meta_docs = []
    docs_text = []
    for file_name in files:
        print(file_name)
        records = docs_text if 'text' in file_name else meta_docs
        with gzip.open(os.path.join(path, file_name), 'rb') as f:
            for line in f:
                try:
                    records.append(json.loads(line))
                except JSONDecodeError:
                    # report the file that is being read right now
                    print("Can't read file " + file_name)

    # without records there are no 'id'/'id_job' columns to work with
    if not meta_docs or not docs_text:
        return pd.DataFrame()

    meta_docs = pd.DataFrame(meta_docs).drop_duplicates(subset='id', keep='first')
    docs_text = pd.DataFrame(docs_text).drop_duplicates(subset='id_job', keep='first')
    docs = pd.merge(meta_docs, docs_text, how='inner', left_on='id', right_on='id_job')

    return docs

In [None]:
import pandas as pd
from collections import Counter

def position_in_text(lst, term):
    '''
    lst - list of tokens
    term - term that you are looking for
    Return list with the index of every token equal to term (0 = first token),
    empty list if term does not occur
    '''
    positions = []
    for index, token in enumerate(lst):
        if token == term:
            positions.append(index)
    return positions

def position_in_sentences(lst, term):
    '''
    lst - list of lists, where each nested list - separate sentence
    term - term that you are looking for
    Return list with the number of every sentence that contains term
    (0 = first sentence), empty list if no sentence contains it
    '''
    sentence_numbers = []
    for number, sentence in enumerate(lst):
        if term in sentence:
            sentence_numbers.append(number)
    return sentence_numbers

def create_inverted_index(max_docs=100):
    '''
    max_docs - number of leading documents of the global docs that are indexed
               (None = all documents), 100 by default
    Return inverted index in list of dictionaries
    word -> [{documentID, count of occurrences in document, positions in doc_text
    position in doc_title, title_flag, position in sentences, docs length}, ...]

    Side effects: the global docs is cut down to the indexed documents (idf()
    takes the number of documents from it) and fresh idf is sent to the
    ranking service.
    '''
    global docs
    inverted_index = dict()
    docs = docs.iloc[:max_docs, :]
    for index, row in docs.iterrows():
        doc_id = int(row['id'])
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['text']})
        # text_searchable - list of lists, where each nested list - separate sentence
        text_searchable = json.loads(response_analyze.text)['words']
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['title']})
        # title_searchable - list of lists
        title_searchable = json.loads(response_analyze.text)['words']

        # create flat list from list of lists
        tokens_text = [item for sublist in text_searchable for item in sublist]
        tokens_title = [item for sublist in title_searchable for item in sublist]

        # values that are the same for every term of the document
        title_terms = set(tokens_title)
        doc_length = len(tokens_text) + len(tokens_title)

        number_of_occurrences = Counter(tokens_text + tokens_title)
        for term, count in number_of_occurrences.items():
            inverted_index.setdefault(term, []).append(
                {'docID': doc_id, 'count': count,
                 'pos': position_in_text(tokens_text, term),
                 'pos_title': position_in_text(tokens_title, term),
                 'title_flag': term in title_terms,
                 'pos_sent': position_in_sentences(text_searchable, term),
                 'doc_length': doc_length})

    # refresh idf and sent into service for ranking
    requests.post('http://127.0.0.1:13541/rank/idf', json={'data' : idf(inverted_index)})

    return inverted_index

In [None]:
def idf(inverted_index):
    '''
    inverted_index - fresh inverted index
    return dictionary term -> log(N / df), where N is the number of rows of the
    global docs and df is the length of the posting list of the term
    '''
    total_docs = docs.shape[0]
    print(total_docs)

    frequencies = {}
    for term, postings in inverted_index.items():
        frequencies[term] = np.log(total_docs / len(postings))

    return frequencies

In [None]:
def intersect_all(terms, inverted_index):
    '''
    terms - list of terms for which we want to intersect set of documents
    inverted_index - created inverted index
    Return set of docIDs of the documents that contain every term. If there is
    no such document, return the documents of the rarest term (the one with the
    shortest posting list, the last one of equally rare terms). Terms without
    postings are ignored; empty set if no term is left.
    '''
    known_terms = [term for term in terms if inverted_index.get(term)]
    if not known_terms:
        return set()

    # None means "nothing intersected yet"; an empty set is a real result and
    # must not be replaced by the posting list of the next term
    ans = None
    for term in known_terms:
        posting_list = {d['docID'] for d in inverted_index[term]}
        ans = posting_list if ans is None else ans & posting_list
        if not ans:
            break

    # if posting lists for terms don't intersect
    if not ans:
        # find doc frequency because we want to find the rarest (more informational) term
        term_with_min_df = min(reversed(known_terms),
                               key=lambda term: len(inverted_index[term]))
        ans = {d['docID'] for d in inverted_index[term_with_min_df]}

    return ans