### Elasticsearch request API

In [2]:
import json
import logging

import requests


def elasticsearch_curl(uri="http://localhost:9200/", json_body="", verb="get"):
    """Send one HTTP request to Elasticsearch and return the decoded reply.

    Args:
        uri: Full URL of the endpoint to call.
        json_body: Request body, already serialized to a string.
        verb: "get", "post", "put", "del" (or "delete"); case-insensitive.

    Returns:
        The response body parsed from JSON, or the raw response text when
        the body is not valid JSON. ``None`` is returned for delete
        requests (their response is discarded), for an unsupported
        ``verb``, and when sending the request raises.
    """
    # Always send a JSON content type: a request with a body and no
    # Content-Type header triggers an error in Elasticsearch v6.0.
    headers = {"Content-Type": "application/json"}

    http_methods = {
        "get": "GET",
        "post": "POST",
        "put": "PUT",
        "del": "DELETE",
        "delete": "DELETE",
    }
    # str() keeps a non-string verb (e.g. None) on the "unsupported" path
    # instead of raising.
    method = http_methods.get(str(verb).lower())
    if method is None:
        logging.warning("elasticsearch_curl() unsupported verb: %s", verb)
        return None

    try:
        resp = requests.request(method, uri, headers=headers, data=json_body)
    except Exception as error:
        # Best-effort helper: report the failure and signal it with None.
        # Values go through %-style arguments so logging can format them.
        logging.warning("elasticsearch_curl() error: %s (uri: %s)", error, uri)
        return None

    # Delete requests have always returned None to callers; keep that contract.
    if method == "DELETE":
        return None

    # Prefer the decoded JSON document; fall back to the raw text.
    try:
        return json.loads(resp.text)
    except ValueError:
        return resp.text


def del_all_scroll():
    """Issue a delete request for every open scroll context.

    Returns:
        Whatever ``elasticsearch_curl`` returns for the delete request.
    """
    scroll_uri = "http://localhost:9200/_search/scroll/_all"
    return elasticsearch_curl(uri=scroll_uri, verb="del")


def del_pit(pit):
    """Issue a delete request to the ``_pit`` endpoint for the given id.

    Args:
        pit: Identifier sent as the ``"id"`` field of the request body.

    Returns:
        Whatever ``elasticsearch_curl`` returns for the delete request.
    """
    payload = json.dumps({"id": pit})
    return elasticsearch_curl(
        uri="http://localhost:9200/_pit",
        json_body=payload,
        verb="del",
    )


# Query the _count endpoint of the wikipedia_sentences index and show the reply.
response = elasticsearch_curl(
    uri="http://localhost:9200/wikipedia_sentences/_count"
)
print(response)

{'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


### Post a sentence to the index

In [3]:
def post_sentence(dict_object):
    """Post a single document to the ``wikipedia_sentences`` index.

    Args:
        dict_object: JSON-serializable document to index.

    Returns:
        ``None`` when ``dict_object`` is falsy (nothing is sent); otherwise
        whatever ``elasticsearch_curl`` returns for the POST request.
    """
    # Guard clause: nothing to send for an empty or missing document.
    if not dict_object:
        return None

    payload = json.dumps(dict_object)
    return elasticsearch_curl(
        uri="http://localhost:9200/wikipedia_sentences/_doc",
        json_body=payload,
        verb="post",
    )


def bulk_post(bulk_dict_data, index_name="wikipedia_sentences"):
    """Post several documents to an index with one ``_bulk`` request.

    Args:
        bulk_dict_data: Collection of JSON-serializable documents.
        index_name: Name of the index the documents are written to.

    Returns:
        ``None`` when ``bulk_dict_data`` is empty or ``None`` (nothing is
        sent); otherwise whatever ``elasticsearch_curl`` returns for the
        POST request.
    """
    # Truthiness covers both an empty collection and None, matching how
    # post_sentence() treats missing input.
    if not bulk_dict_data:
        return None

    # The same action line precedes every document.
    # NOTE(review): "_type" in bulk metadata is reportedly deprecated in
    # Elasticsearch 7 and rejected in 8 -- confirm against the target version.
    meta_json = json.dumps({"index": {"_index": index_name, "_type": "_doc"}})

    # Body layout: action line, document line, repeated, newline-terminated.
    data_to_post = (
        "\n".join(meta_json + "\n" + json.dumps(d) for d in bulk_dict_data) + "\n"
    )
    return elasticsearch_curl(
        "http://localhost:9200/_bulk", verb="post", json_body=data_to_post,
    )


# test_data_dict = [{"doc": "This is a test text."} for i in range(500000)]
# # response = post_sentence(test_data_dict)
# # print(response)
# response = bulk_post(test_data_dict)
# print(response)

### Insert sentences into the Elasticsearch index

In [4]:
from tqdm.auto import tqdm

# Stream the sentence file into the wikipedia_sentences index in fixed-size
# batches, printing the index's document count after every batch.
file_path = "../data/wikipedia/english_wiki.txt"
bulk_size = 80000
count = 0  # number of lines read from the file so far
data_dict_list = []

with open(file_path) as f:
    # Iterate the file object lazily instead of f.readlines(), which would
    # load every line into memory first. The progress bar therefore shows a
    # running count without a total.
    for count, line in enumerate(tqdm(f), start=1):
        data_dict_list.append({"doc": line})
        # Batch on the size of the pending list. Seeding a counter from the
        # index's existing document count would shorten the first batch or,
        # on a larger index, prevent any batch from being flushed.
        if len(data_dict_list) >= bulk_size:
            bulk_post(data_dict_list)
            data_dict_list = []
            response = elasticsearch_curl(
                "http://localhost:9200/wikipedia_sentences/_count"
            )
            print("current count:", response["count"])

# Put the final (possibly partial) batch into the index; bulk_post() sends
# nothing when the list is empty.
bulk_post(data_dict_list)
response = elasticsearch_curl(
    "http://localhost:9200/wikipedia_sentences/_count"
)
print("current count:", response["count"])

HBox(children=(FloatProgress(value=0.0, max=126899104.0), HTML(value='')))

current count: 80000
current count: 160000
current count: 240000
current count: 320000
current count: 400000
current count: 480000
current count: 560000
current count: 640000
current count: 720000
current count: 800000
current count: 880000
current count: 960000
current count: 1040000
current count: 1120000
current count: 1200000
current count: 1280000
current count: 1360000
current count: 1440000
current count: 1520000
current count: 1600000
current count: 1680000
current count: 1760000
current count: 1840000
current count: 1920000
current count: 2000000
current count: 2080000
current count: 2160000
current count: 2240000
current count: 2320000
current count: 2400000
current count: 2480000
current count: 2560000
current count: 2640000
current count: 2720000
current count: 2800000
current count: 2880000
current count: 2960000
current count: 3040000
current count: 3120000
current count: 3200000
current count: 3280000
current count: 3360000
current count: 3440000
current count: 3520000
c