In [3]:
import gzip
import json
import os
from json import JSONDecodeError
import pandas as pd
import pickle
import requests

In [119]:
# Load the evaluation texts (tab-separated) and keep the first row for every document id.
# sep must be the real tab character '\t': the two-character string '\\t' is treated by pandas
# as a regular expression, which forces the slow python parser engine and emits a ParserWarning.
docs = pd.read_csv('Data/eval_texts.csv', sep='\t', encoding='utf-8')
# Re-number the rows after dropping duplicates so that positional loops that address rows
# with docs.loc[i, ...] for i in range(len(docs)) do not hit missing index labels.
docs = docs.drop_duplicates(subset='id', keep='first').reset_index(drop=True)

  """Entry point for launching an IPython kernel.


In [120]:
# Restore a previously saved inverted index from disk.
# NOTE: pickle.load executes arbitrary code from the file - only load pickles you created yourself.
with open('reverse_index.pickle', 'rb') as index_file:
    inverted_index = pickle.load(index_file)

In [None]:
"use this function for creating inverted index"
# Build the index from the `docs` frame. create_inverted_index is defined in a later cell of
# this notebook, so that cell has to be executed first; it posts every document to the
# analyzer service at http://127.0.0.1:13533/analyze, which therefore must be running.
inverted_index = create_inverted_index()

In [None]:
"save inverted index for using at home"
# Persist the in-memory index so it can be reloaded later without re-running the analyzer.
with open('reverse_index.pickle', 'wb') as index_file:
    pickle.dump(inverted_index, index_file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
"you can use saved inverted index at home"
# Reload the index written by the pickle.dump cell.
# NOTE: pickle.load executes arbitrary code from the file - only load pickles you created yourself.
with open('reverse_index.pickle', 'rb') as index_file:
    inverted_index = pickle.load(index_file)

In [113]:
# Collect one record per JSON line from every gzip file under Data/by whose name contains 'text'.
docs_text = []
for root, dirs, files in os.walk("Data/by"):
    for filename in files:
        if 'text' not in filename:
            continue
        # os.walk also descends into sub-directories, so the file has to be opened relative
        # to the directory it was found in (`root`), not relative to the top-level folder.
        with gzip.open(os.path.join(root, filename), 'rb') as f:
            for line in f:
                try:
                    docs_text.append(json.loads(line))
                except JSONDecodeError:
                    # Best effort: report the malformed line's file and keep reading.
                    print("Can't read file " + filename)

In [114]:
# Collect one record per JSON line from every gzip file under Data/by whose name does NOT
# contain 'text' (the counterpart of the docs_text loader).
meta_docs = []
for root, dirs, files in os.walk("Data/by"):
    for filename in files:
        if 'text' in filename:
            continue
        # os.walk also descends into sub-directories, so the file has to be opened relative
        # to the directory it was found in (`root`), not relative to the top-level folder.
        with gzip.open(os.path.join(root, filename), 'rb') as f:
            for line in f:
                try:
                    meta_docs.append(json.loads(line))
                except JSONDecodeError:
                    # Best effort: report the malformed line's file and keep reading.
                    print("Can't read file " + filename)

In [115]:
# Turn both record lists into frames, keep the first row per key, then inner-join the
# metadata ('id') with the texts ('id_job') so only documents present in both remain.
meta_docs = pd.DataFrame(meta_docs).drop_duplicates(subset='id', keep='first')
docs_text = pd.DataFrame(docs_text).drop_duplicates(subset='id_job', keep='first')
docs = meta_docs.merge(docs_text, how='inner', left_on='id', right_on='id_job')

In [None]:
from flask import Flask
from flask import request
import json
import requests
import numpy as np

# Flask application object; the /reverseindex handler below is registered on it via @app.route.
app = Flask(__name__)

@app.route("/reverseindex", methods=["POST"])
def reverseindex():
    """
    Look up the documents that contain every word of a query.

    Expects a JSON body of the form {"data": [word, ...], "max_docs": N} where
    "max_docs" is optional. Returns a JSON string with the received words under
    "got_data" and the matching documents (records with 'id' and 'text') under
    "processed_data". A body without a "data" field is answered with HTTP 400
    instead of an unhandled KeyError.
    """
    json_data = request.json
    if not isinstance(json_data, dict) or 'data' not in json_data:
        return json.dumps({"status": "error", "message": "JSON body with a 'data' field is required"}), 400
    words = json_data['data']

    # intersect lists of documents for all processed words in query
    matching_ids = intersect_all(words, inverted_index)

    # if a maximal number of documents is given, then select only the desired amount;
    # sort first so the selection does not depend on arbitrary set ordering
    max_docs = json_data.get('max_docs')
    if max_docs:
        matching_ids = sorted(matching_ids)[:max_docs]

    # get text of found documents (vectorised membership test instead of a per-row lambda over a list)
    id_mask = docs['id'].isin(list(matching_ids))
    documents = docs.loc[id_mask, ['id', 'text']].to_dict('records')

    return json.dumps({"status":"ok", "got_data":words, "processed_data": documents})


'''@app.route("/reverseindex/add", methods=['POST'])
def add():
    TODO
    Add doc to reverse index
    Imagine we have new_texts.csv
    We have to create new service with:    
    docs = pd.read_csv('../../Data/new_texts.csv', sep='\t')
    docs.drop_duplicates(subset='id', keep='first', inplace=True)
    for i in range(docs.shape[0]):
        doc = dict()
        doc['docID'] = int(docs.loc[i, 'id'])
        response_analyze_title = requests.post('http://127.0.0.1:13533/analyze', json={'data' : docs.loc[i, 'text_title']})
        doc['title_searchable'] = json.loads(response_analyze_title.text)['words']
        response_analyze_text = requests.post('http://127.0.0.1:13533/analyze', json={'data' : docs.loc[i, 'text']})
        doc['text_searchable'] = json.loads(response_analyze_text.text)['words']
        response_add_index = requests.post('http://127.0.0.1:13538/reverseindex/add', json={'data' : doc})
        
        Also we need to join 'eval_texts.csv' with 'new_texts.csv'. Where and how?
'''    

# Start the Flask server when this code runs as the main module.
# host='0.0.0.0' makes the server listen on all network interfaces, not only localhost;
# 13538 is the port the /reverseindex/add sketch in the string above posts to.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=13538)

In [116]:
import pandas as pd
from collections import Counter

def position_in_text(lst, term):
    '''
    Locate every occurrence of a term inside a token sequence.

    lst - list of tokens
    term - the token to search for
    Returns the list of zero-based positions at which term occurs
    (an empty list when it does not occur at all).
    '''
    positions = []
    for index, token in enumerate(lst):
        if token == term:
            positions.append(index)
    return positions

def _analyze_with_service(text):
    '''Post text to the analyzer service and return the token list it reports under "words".'''
    response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : text})
    return json.loads(response_analyze.text)['words']


def _positions_by_term(tokens):
    '''Map every distinct token to the list of its positions (0 = first token) in a single pass.'''
    positions = dict()
    for position, token in enumerate(tokens):
        positions.setdefault(token, []).append(position)
    return positions


def create_inverted_index(documents=None, analyze=None):
    '''
    Build the inverted index for a frame of documents.

    documents - DataFrame with 'id', 'text' and 'title' columns; defaults to the
                notebook-level `docs` frame
    analyze   - callable turning a string into a list of tokens; defaults to a POST
                to the analyzer service at http://127.0.0.1:13533/analyze

    Return inverted index as a dictionary of posting lists
    word -> [{'docID', 'count' (occurrences in text + title), 'pos' (positions in text),
              'pos_title' (positions in title), 'title_flag'}, ...]
    Posting lists follow the row order of `documents`.
    '''
    if documents is None:
        documents = docs
    if analyze is None:
        analyze = _analyze_with_service

    inverted_index = dict()
    # Iterate over the column values directly: addressing rows with .loc[i] for i in range(n)
    # raises KeyError as soon as the frame's index has gaps (e.g. after drop_duplicates).
    for doc_id, text, title in zip(documents['id'], documents['text'], documents['title']):
        doc_id = int(doc_id)
        text_positions = _positions_by_term(analyze(text))
        title_positions = _positions_by_term(analyze(title))

        # dict.fromkeys de-duplicates the terms while keeping a deterministic order
        for term in dict.fromkeys(list(title_positions) + list(text_positions)):
            pos = text_positions.get(term, [])
            pos_title = title_positions.get(term, [])
            inverted_index.setdefault(term, []).append({'docID': doc_id,
                                                        'count': len(pos) + len(pos_title),
                                                        'pos': pos,
                                                        'pos_title': pos_title,
                                                        'title_flag': bool(pos_title)})
    return inverted_index

In [117]:
def intersect_all(terms, inverted_index):
    '''
    Return the set of docIDs of the documents that contain every one of the given terms.

    terms - list of terms for which we want to intersect the sets of documents
    inverted_index - created inverted index (term -> list of postings with a 'docID' key)

    Returns an empty set when `terms` is empty, when any term is missing from the
    index, or when no document contains all terms.
    '''
    common_ids = None  # None marks "no term processed yet"; an empty set is a real result
    for term in terms:
        postings = inverted_index.get(term)
        if not postings:
            return set()
        doc_ids = {posting['docID'] for posting in postings}
        # Checking emptiness to detect the first term is wrong: an empty intersection
        # would be overwritten by the next term's posting list.
        common_ids = doc_ids if common_ids is None else common_ids & doc_ids
        if not common_ids:
            return set()

    return common_ids if common_ids is not None else set()