In [1]:
import json
import time
import os
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
from bert_serving.client import BertClient

In [2]:
## Config
INDEX_NAME = 'posts'
# Paths are joined component-wise so they work on every OS; a literal
# "data\index.json" is Windows-only and depends on "\i" / "\p" not being
# recognised escape sequences.
INDEX_FILE = os.path.join(os.getcwd(), "data", "index.json")
DATA_FILE = os.path.join(os.getcwd(), "data", "posts.json")
BATCH_SIZE = 1000  # documents embedded and sent per bulk request
SEARCH_SIZE = 5    # hits requested per search

# Client object for Elasticsearch
client = Elasticsearch(timeout=100)
# BERT server address; override with the BERT_SERVER_IP environment variable,
# otherwise the original hard-coded host is used.
bc = BertClient(ip=os.environ.get('BERT_SERVER_IP', '137.117.84.80'))

In [3]:
## Adding Data / Indexing Data

def indexData():
    """Recreate the ``INDEX_NAME`` index and load all questions from ``DATA_FILE``.

    Steps:
      1. Delete the existing index (a 404 response is ignored).
      2. Create the index using the body read from ``INDEX_FILE``.
      3. Read ``DATA_FILE`` as one JSON document per line, keep the documents
         whose ``type`` is ``'question'`` and index them in batches of
         ``BATCH_SIZE`` via ``index_batch``.
      4. Refresh the index.

    Blank lines in ``DATA_FILE`` are skipped. Progress is printed after every
    batch.
    """
    print('Creating Index .....')
    client.indices.delete(index=INDEX_NAME, ignore=[404])

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(INDEX_FILE, encoding="utf-8") as index_file:
        source = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=source)

    docs = []
    count = 0
    with open(DATA_FILE, encoding="utf-8") as data_file:
        for line in data_file:
            line = line.strip()
            if not line:
                # An empty line (e.g. a trailing newline) is not a JSON
                # document and would make json.loads raise.
                continue

            doc = json.loads(line)
            if doc['type'] != 'question':
                continue

            docs.append(doc)
            count += 1

            if count % BATCH_SIZE == 0:
                index_batch(docs)
                docs = []
                print("Indexed {} Documents".format(count))

        # Flush the final, partially filled batch.
        if docs:
            index_batch(docs)
            print("Indexed {} Documents".format(count))

    client.indices.refresh(index=INDEX_NAME)
    print("Done Indexing")
                
        
def index_batch(docs):
    """Embed the titles of ``docs`` and bulk-index them into ``INDEX_NAME``.

    Args:
        docs: list of dicts, each containing a ``'title'`` key. The dicts are
            left untouched; the bulk actions are built from shallow copies.

    An empty ``docs`` list is a no-op.
    """
    if not docs:
        return

    titles = [doc['title'] for doc in docs]
    title_vectors = embed_text(titles)

    # Build each bulk action from a copy of the document so the caller's dicts
    # are not polluted with bulk metadata and the embedding.
    requests = [
        {**doc, "_op_type": 'index', "_index": INDEX_NAME, 'title_vector': vector}
        for doc, vector in zip(docs, title_vectors)
    ]
    bulk(client, requests)
    
        
    

In [4]:
## Embedding Text 
def embed_text(text):
    """Encode ``text`` with the module-level BERT client ``bc``.

    Returns:
        A list with one entry per encoded vector, each converted with
        ``.tolist()``.
    """
    as_lists = []
    for encoded in bc.encode(text):
        as_lists.append(encoded.tolist())
    return as_lists

In [5]:
# Rebuild the index from scratch: indexData deletes the existing index,
# recreates it and loads the questions from DATA_FILE.
indexData()

Creating Index .....
Indexed 1000 Documents
Indexed 2000 Documents
Indexed 3000 Documents
Indexed 4000 Documents


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Indexed 5000 Documents
Indexed 6000 Documents
Indexed 7000 Documents
Indexed 8000 Documents
Indexed 9000 Documents
Indexed 10000 Documents
Indexed 11000 Documents
Indexed 12000 Documents
Indexed 13000 Documents
Indexed 14000 Documents
Indexed 15000 Documents
Indexed 16000 Documents
Indexed 17000 Documents
Indexed 18000 Documents
Indexed 18848 Documents
Done Indexing


In [6]:
#main
def query():
    """Run the interactive search prompt until the user stops it.

    Calls ``handle_query`` in a loop. A ``KeyboardInterrupt`` (kernel
    interrupt / Ctrl-C) or ``EOFError`` (input stream closed) ends the loop
    cleanly instead of surfacing a traceback.
    """
    while True:
        try:
            handle_query()
        except (KeyboardInterrupt, EOFError):
            break
        
## Searching               
def handle_query():
    """Prompt for one query and print the closest matches from ``INDEX_NAME``.

    The query is embedded with ``embed_text`` and every document is scored by
    the cosine similarity between that vector and its ``title_vector`` field.
    The top ``SEARCH_SIZE`` hits are printed with their id, score, title and
    body. Blank input is ignored.
    """
    query_text = input("Enter Query - ").strip()
    if not query_text:
        # Nothing to embed; do not send an empty string to the encoder.
        return
    query_vector = embed_text([query_text])[0]

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # + 1.0 shifts cosine similarity from [-1, 1] to [0, 2];
                # presumably because Elasticsearch rejects negative scores.
                "source": "cosineSimilarity(params.query_vector, doc['title_vector']) + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    response = client.search(
        index=INDEX_NAME,
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            "_source": {"includes": ["title", "body"]}
        }
    )
    print("{} total hits.".format(response["hits"]["total"]["value"]))
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()

In [None]:
# Start the interactive search prompt; query() keeps calling handle_query
# in a loop, so interrupt the kernel to stop it.
query()

Enter Query - onthology
10000 total hits.
id: 22lcsG0BEtnmCY9HQ-Os, score: 1.8241231
{'title': 'Genealogy Tree Control', 'body': "I've been tasked (by my wife) with creating a program to allow her to track the family trees on both sides of our family. Does anyone know of a cost effective (free) control to represent this type of information. What I'm looking for is a modified org-chart type chart/tree. The modification is that any node should have 2 parent nodes (E.G. a child should have a Mother/Father). The solution I've came up with so far is to have 2 trees, an ancestor tree and a descendants tree, with the individual being inspected as the root node for each tree. It works, but is sort of clunky. I'm working primarily in c# WinForms, so .Net type controls or source code is preferrable. "}

id: xGlfsG0BEtnmCY9HfP1K, score: 1.8233192
{'title': 'Asynchronous APIs', 'body': "When trying to implement an asynchronous API calls / Non-blocking calls, I know a little in a All Plain-C applic

In [None]:
!pip install -U bert-serving-client

In [None]:
## Flask API 


from flask import Flask, request, render_template
import flask
from flask_cors import CORS, cross_origin

app = Flask(__name__)
# flask-cors spells this option ``supports_credentials`` (with an "s"); a
# misspelled keyword is not a recognised option and has no effect.
# NOTE(review): no ``origins`` restriction is configured here - restrict the
# allowed origins before exposing credentialed CORS beyond local use.
CORS(app, supports_credentials=True)

@app.route('/')
def my_form():
    """Serve the rendered ``index.html`` template for the ``/`` route."""
    page = render_template('index.html')
    return page

@app.route('/', methods=['POST'])
def my_form_post():
    """Return the submitted ``text`` form field converted to upper case.

    The body is user-supplied, so the response is sent as ``text/plain``:
    the browser then displays it verbatim instead of interpreting any markup
    in it as HTML.
    """
    text = request.form['text']
    processed_text = text.upper()
    return flask.Response(processed_text, mimetype='text/plain')

if __name__ == '__main__':
    # No host or port is chosen here, so Flask falls back to its own defaults.
    app.run()