In [3]:
import gzip
import json
import os
from json import JSONDecodeError
import pandas as pd
import pickle
import requests
import time

In [119]:
docs = pd.read_csv('Data/eval_texts.csv', sep='\\t', encoding='utf-8')
docs.drop_duplicates(subset='id', keep='first', inplace=True)

  """Entry point for launching an IPython kernel.


In [290]:
with open('reverse_index_new.pickle', 'rb') as file:
    inverted_index = pickle.load(file)

In [None]:
"use this function for creating inverted index"
start = time.time()
inverted_index = create_inverted_index()
print(time.time()-start)

In [240]:
"save inverted index for using at home"
with open('reverse_index_new.pickle', 'wb') as handle:
    pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
"you can use saved inverted index at home"
with open('reverse_index_new.pickle', 'rb') as file:
    inverted_index = pickle.load(file)

In [13]:
docs_text = []
for root, dirs, files in os.walk("Data/by"):  
    for filename in files:
        if 'text' in filename:
            with gzip.open('Data/by/' + filename, 'rb') as f:
                for line in f:
                    try: 
                        docs_text.append(json.loads(line))
                    except JSONDecodeError:
                        print("Can't read file " + filename)

In [14]:
meta_docs = []
for root, dirs, files in os.walk("Data/by"):  
    for filename in files:
        if 'text' not in filename:
            with gzip.open('Data/by/' + filename, 'rb') as f:
                for line in f:
                    try: 
                        meta_docs.append(json.loads(line))
                    except JSONDecodeError:
                        print("Can't read file " + filename)

In [16]:
meta_docs = pd.DataFrame(meta_docs)
meta_docs.drop_duplicates(subset='id', keep='first', inplace=True)
docs_text = pd.DataFrame(docs_text)
docs_text.drop_duplicates(subset='id_job', keep='first', inplace=True)
docs=pd.merge(meta_docs, docs_text, how='inner', left_on='id', right_on='id_job')

In [27]:
docs = docs.iloc[:1000, :]

In [None]:
from flask import Flask
from flask import request
import json
import requests
import numpy as np

app = Flask(__name__)

read_files = ['eval_texts.csv']

@app.route("/reverseindex", methods=["POST"])
def reverseindex():
    json_data = request.json
    words = json_data['data']
    
    #intersect lists of documents for all processed words in query
    documents = list(intersect_all(words, inverted_index))

    #if maximal numbers of documents given, then select only the desired amount
    if json_data.get('max_docs'):
        documents = documents[:json_data.get('max_docs')]
    
    #get text of found documentss
    documents = docs[docs['id'].apply(lambda x : int(x) in documents)].loc[:, ['id', 'text']].to_dict('records')
    return json.dumps({"status":"ok", "got_data":json_data['data'], "processed_data": documents})


@app.route("/reverseindex/add", methods=['POST'])
def add():
    global docs
    
    files_to_read = []
    mypath = "Data"
    files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
    for filename in files:
        if filename not in read_files:
            files_to_read.append(filename)
            read_files.append(filename)
                
    docs_new = pd.DataFrame()
    for file_name in files_to_read:
        print(file_name)
        docs_new = pd.read_csv("{}\{}".format(mypath, file_name), sep='\t')
        docs_new.drop_duplicates(subset='id', keep='first', inplace=True)
        docs = docs.append(docs_new, ignore_index=True)
    
    for index, row in docs_new.iterrows():
        doc = dict()
        doc['docID'] = int(row['id'])
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['text']})
        doc['text_searchable'] = json.loads(response_analyze.text)['words']
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['title']})
        doc['title_searchable'] = json.loads(response_analyze.text)['words']
        
        
        tokens_text = doc['text_searchable']
        tokens_title = doc['title_searchable']
        
        number_of_occurrences = Counter(tokens_text+tokens_title)         
        for term in set(tokens_title+tokens_text):
            title_flag = False
            if term in tokens_title:
                title_flag = True
                
            if term not in inverted_index:                
                inverted_index[term] = [ {'docID': doc['docID'], 'count': number_of_occurrences[term],
                                          'pos': position_in_text(tokens_text, term), 
                                          'pos_title': position_in_text(tokens_title, term),
                                          'title_flag': title_flag}] 
            else:
                inverted_index[term].append({'docID': doc['docID'], 'count': number_of_occurrences[term],
                                             'pos': position_in_text(tokens_text, term), 
                                             'pos_title': position_in_text(tokens_title, term),
                                             'title_flag': title_flag})
    return json.dumps({"status":"ok", "added" : docs_new.to_dict()}, ensure_ascii=False)

        

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=13538)

In [8]:
import pandas as pd
from collections import Counter

def position_in_text(lst, term):
    '''
    lst - list of tokens
    term - term that you are looking for
    Return all positions of term in the list of tokens(0 = first token) 
    '''
    return [i for i, x in enumerate(lst) if x == term]

def create_inverted_index():
    '''
    Return inverted index in list of dictionaries
    word -> [{documentID, count of occurencies in document, positions in document, title_flag}, ...]
    '''
    inverted_index = dict()
    for index, row in docs.iterrows():
        doc = dict()
        doc['docID'] = int(row['id'])
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['text']})
        doc['text_searchable'] = json.loads(response_analyze.text)['words']
        response_analyze = requests.post('http://127.0.0.1:13533/analyze', json={'data' : row['title']})
        doc['title_searchable'] = json.loads(response_analyze.text)['words']
        
        
        tokens_text = doc['text_searchable']
        tokens_title = doc['title_searchable']
        
        number_of_occurrences = Counter(tokens_text+tokens_title)         
        for term in set(tokens_title+tokens_text):
            title_flag = False
            if term in tokens_title:
                title_flag = True
                
            if term not in inverted_index:                
                inverted_index[term] = [ {'docID': doc['docID'], 'count': number_of_occurrences[term],
                                          'pos': position_in_text(tokens_text, term), 
                                          'pos_title': position_in_text(tokens_title, term),
                                          'title_flag': title_flag}] 
            else:
                inverted_index[term].append({'docID': doc['docID'], 'count': number_of_occurrences[term],
                                             'pos': position_in_text(tokens_text, term), 
                                             'pos_title': position_in_text(tokens_title, term),
                                             'title_flag': title_flag})
    return inverted_index

In [19]:
def intersect_all(terms, inverted_index):
    '''
    terms - list of terms for wich we want to intersect set of documents
    inverted_index - created inverted index
    '''
    ans = set()
    for term in terms:            
        p = inverted_index.get(term)
        posting_list=set()
        if not p:
            return set()
        for d in p:
            posting_list.add(d['docID'])
        if len(ans) == 0:
            ans = posting_list
        else:
            ans = ans & posting_list

    return ans