### Elasticsearch request API

In [1]:
import json
import logging

import requests


def elasticsearch_curl(uri="http://localhost:9200/", json_body="", verb="get"):
    """Send one HTTP request to Elasticsearch and return the decoded reply.

    Args:
        uri: Full URL of the endpoint to call.
        json_body: Request body, already serialized to a string.
        verb: HTTP method, case-insensitive: "get", "post", "put" or "del"
            ("delete" is accepted as an alias of "del").

    Returns:
        The response body parsed from JSON when it is valid JSON, otherwise
        the raw response text. ``None`` is returned for delete requests
        (their response is discarded) and whenever the request fails or the
        verb is not supported; failures are logged as warnings, not raised.
    """
    # Always pass a Content-Type header so that requests carrying a body
    # do not trigger the Content-Type error in Elasticsearch v6.0.
    headers = {
        "Content-Type": "application/json",
    }
    resp = None
    try:
        # The verb is matched case-insensitively.
        method = verb.lower()
        if method == "get":
            resp = requests.get(uri, headers=headers, data=json_body)
        elif method == "post":
            resp = requests.post(uri, headers=headers, data=json_body)
        elif method == "put":
            resp = requests.put(uri, headers=headers, data=json_body)
        elif method in ("del", "delete"):
            resp = requests.delete(uri, headers=headers, data=json_body)
            # Callers of the delete verb get no payload back.
            return None
        else:
            # Handled by the except clause below: logged, then None returned.
            raise ValueError(f"unsupported HTTP verb: {verb!r}")

        # Prefer the decoded JSON document; fall back to the raw text when
        # the body is not JSON.
        try:
            resp_text = json.loads(resp.text)
        except (TypeError, ValueError):
            resp_text = resp.text

    except Exception as error:
        # Use %s placeholders: logging formats lazily with `msg % args`, so
        # extra positional arguments without placeholders break the record.
        logging.warning("resp: %s", resp)
        logging.warning("uri: %s", uri)
        logging.warning("elasticsearch_curl() error: %s", error)
        resp_text = None

    return resp_text


def del_all_scroll():
    """Issue a "del" request against the ``_search/scroll/_all`` endpoint.

    Returns:
        Whatever ``elasticsearch_curl`` returns for that request.
    """
    all_scrolls_uri = "http://localhost:9200/_search/scroll/_all"
    return elasticsearch_curl(uri=all_scrolls_uri, verb="del")


def del_pit(pit):
    """Issue a "del" request against the ``_pit`` endpoint for one id.

    Args:
        pit: Identifier sent as the ``"id"`` field of the JSON request body.

    Returns:
        Whatever ``elasticsearch_curl`` returns for that request.
    """
    payload = json.dumps({"id": pit})
    return elasticsearch_curl(
        uri="http://localhost:9200/_pit",
        json_body=payload,
        verb="del",
    )


# Sanity check: query the `_count` endpoint of the wikipedia_sentences index
# and print the reply.
response = elasticsearch_curl("http://localhost:9200/wikipedia_sentences/_count")
print(response)

{'count': 126899105, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


### Post a sentence to the index

In [34]:
def post_sentence(dict_object):
    """Post a single document to the ``wikipedia_sentences`` index.

    Args:
        dict_object: JSON-serializable document to post.

    Returns:
        ``None`` when ``dict_object`` is falsy (nothing is sent); otherwise
        whatever ``elasticsearch_curl`` returns for the POST request.
    """
    # Nothing to post: bail out before touching the network.
    if not dict_object:
        return None

    return elasticsearch_curl(
        "http://localhost:9200/wikipedia_sentences/_doc",
        verb="post",
        json_body=json.dumps(dict_object),
    )


def query_sentence(phrase, size=100):
    """Search ``wikipedia_sentences`` for sentences containing ``phrase``.

    The query requires a ``match_phrase`` hit on ``phrase`` in the ``doc``
    field and adds optional ``should`` clauses for a fixed set of relation
    phrases ("is a", "have a", "is capable of" and their negations).

    Args:
        phrase: Phrase that every returned sentence must match.
        size: Maximum number of hits requested from Elasticsearch.

    Returns:
        A list of ``(doc, score)`` tuples in the order Elasticsearch returned
        them. An empty list when the request failed or the reply carries no
        hits (for example an error payload).
    """
    relation_phrases = [
        "is a",
        "is not a",
        "have a",
        "does not have a",
        "is capable of",
        "is not capable of",
    ]
    dict_object = {
        "query": {
            "bool": {
                "must": [{"match_phrase": {"doc": phrase}}],
                "should": [
                    {"match_phrase": {"doc": relation}}
                    for relation in relation_phrases
                ],
            },
        },
        "size": size,
    }

    response = elasticsearch_curl(
        "http://localhost:9200/wikipedia_sentences/_search",
        verb="post",
        json_body=json.dumps(dict_object),
    )

    # elasticsearch_curl() returns None on failure and may return raw text or
    # an error document; none of those carry a usable "hits" section.
    if not isinstance(response, dict) or "hits" not in response:
        logging.warning("query_sentence(): no hits in response: %s", response)
        return []

    return [
        (item["_source"]["doc"], item["_score"])
        for item in response["hits"].get("hits", [])
    ]


def query_sentence_match(phrase, size=1):
    """Run a plain ``match`` query on the ``doc`` field of ``wikipedia_sentences``.

    Args:
        phrase: Text to match against the ``doc`` field.
        size: Maximum number of hits requested; defaults to the single best
            hit.

    Returns:
        A list of ``(doc, score)`` tuples in the order Elasticsearch returned
        them. An empty list when the request failed or the reply carries no
        hits (for example an error payload).
    """
    dict_object = {"query": {"match": {"doc": phrase}}, "size": size}
    response = elasticsearch_curl(
        "http://localhost:9200/wikipedia_sentences/_search",
        verb="post",
        json_body=json.dumps(dict_object),
    )

    # elasticsearch_curl() returns None on failure and may return raw text or
    # an error document; none of those carry a usable "hits" section.
    if not isinstance(response, dict) or "hits" not in response:
        logging.warning("query_sentence_match(): no hits in response: %s", response)
        return []

    return [
        (item["_source"]["doc"], item["_score"])
        for item in response["hits"].get("hits", [])
    ]


def bulk_post(bulk_dict_data, index_name="wikipedia_sentences"):
    """Index several documents with a single ``_bulk`` request.

    Args:
        bulk_dict_data: Sequence of JSON-serializable documents.
        index_name: Name of the index that receives the documents.

    Returns:
        ``None`` when there is nothing to post (``None`` or an empty
        sequence); otherwise whatever ``elasticsearch_curl`` returns for the
        POST request.
    """
    if bulk_dict_data is None or len(bulk_dict_data) < 1:
        return None

    # Every document is preceded by the same action/metadata line. `_type`
    # is intentionally omitted: mapping types are deprecated in the bulk
    # metadata since Elasticsearch 7.0 and no longer accepted in 8.x.
    meta_json = json.dumps({"index": {"_index": index_name}})

    body_lines = []
    for document in bulk_dict_data:
        body_lines.append(meta_json)
        body_lines.append(json.dumps(document))
    # The bulk body is newline-delimited and must end with a newline.
    data_to_post = "\n".join(body_lines) + "\n"

    return elasticsearch_curl(
        "http://localhost:9200/_bulk",
        verb="post",
        json_body=data_to_post,
    )


# test_data_dict = [{"doc": "This is a test text."} for i in range(500000)]
# response = post_sentence(test_data_dict)
# # print(response)
# response = bulk_post(test_data_dict)
# print(response)

# Smoke test: fetch the sentences matching the phrase "animal has a" and
# display the (doc, score) pairs as the cell output.
response = query_sentence("animal has a")
response

[('Each animal has a name.\n', 16.058651),
 ('Each animal has a specific name.\n', 15.583131),
 ('The animal has a broad foot.\n', 15.583131),
 ('The animal has a long siphon.\n', 15.583131),
 ('The animal has a long siphon.\n', 15.583131),
 ('The animal has a uniform light colour.\n', 15.134964),
 ('The living animal has a general orange coloration.\n', 14.711854),
 ('The animal has a distinct head and tail.\n', 14.711854),
 ('Each animal has a subdermal tracking microchip implant.\n', 14.711854),
 ('The living animal has a general orange color.\n', 14.711854),
 ('This animal has a large, uniform, and sutureless braincase.\n', 14.311757),
 ('The population of C. fulvus has not been quantified but the animal has a wide range and is presumed to have a large total population.\n',
  14.114789),
 ('The animal has a conspicuously large, long-haired tail, measuring .\n',
  13.932846),
 ('The animal has a tendency to place everything in its mouth.\n', 13.573481),
 ('The animal has a large hea

### Insert sentences into elastic index

In [3]:
# Read the train, test and dev splits (in that order) into one list of raw
# lines, printing the running total after each file.
data = []
for jsonfilename in (
    "../data/LeapOfThought/mask-filling/lot_train.txt",
    "../data/LeapOfThought/mask-filling/lot_test.txt",
    "../data/LeapOfThought/mask-filling/lot_dev.txt",
):
    with open(jsonfilename) as f:
        # Iterating the file yields its lines, trailing newline included.
        data.extend(f)
    print(len(data))

9793
10158
10510


In [42]:
import random
import string

from tqdm.auto import tqdm

def count(l1, l2):
    """Return how many items of ``l1`` are contained in ``l2``.

    Items of ``l1`` are counted with multiplicity; ``l2`` only needs to
    support the ``in`` operator.
    """
    return sum(1 for x in l1 if x in l2)


def count_punc(s):
    """Return the number of characters of ``s`` found in ``string.punctuation``."""
    punctuation_marks = set(string.punctuation)
    return count(s, punctuation_marks)


def count_keyword(s, key):
    """Count the space-separated words of ``s`` equal to ``key``, ignoring case.

    Periods and newlines are removed from ``s`` before it is split on single
    spaces; other punctuation stays attached to the words.

    Args:
        s: Sentence to scan.
        key: Word to look for.

    Returns:
        Number of words in ``s`` that equal ``key`` case-insensitively.
    """
    # Lower-case both sides: lower-casing only the words would make any key
    # that contains an uppercase letter unmatchable.
    target = key.lower()
    words = s.replace(".", "").replace("\n", "").split(" ")
    return sum(1 for word in words if word.lower() == target)


# Number of candidate sentences fetched from the index per source word.
TOP_K_CANDIDATES = 10
# Penalty scores assigned instead of a real similarity score. The lowest score
# wins, so penalized candidates are only picked when every candidate is
# penalized.
NOISY_CANDIDATE_SCORE = 100
LENGTH_MISMATCH_SCORE = 200


def _find_source_word(sens):
    """Find the word that was replaced by ``<MASK>`` in the last sentence.

    ``sens`` is split on ". "; each earlier sentence is compared, word by
    word, against the last one: a word is the source when replacing it with
    ``<MASK>`` (and appending ".") reproduces the last sentence.

    Returns:
        ``(source_word, source_sentence)``, or ``(None, None)`` when no
        sentence/word pair reproduces the masked sentence.
    """
    sen_li = sens.split(". ")
    source, source_sent = None, None
    for sen in sen_li[:-1]:
        for w in sen.split(" "):
            if sen.replace(w, "<MASK>") + "." == sen_li[-1]:
                source, source_sent = w, sen
                break
    return source, source_sent


def _score_candidate(candidate, source, obj, source_sent):
    """Score how plausible ``candidate`` stays once ``source`` becomes ``obj``.

    Lower is better: a low score means the rewritten sentence has no close
    match in the index, i.e. the source word is unlikely to be replaceable by
    the object word in that sentence.
    """
    query_replace_sen = candidate.replace(source, obj)

    # Reject candidates whose rewritten form is shorter than four words or
    # whose word count differs from the source sentence by a relative factor
    # above 2.
    n_replaced = len(query_replace_sen.split(" "))
    n_source = len(source_sent.split(" "))
    diff_len = abs((n_replaced - n_source) / min(n_replaced, n_source))
    if diff_len > 2 or len(query_replace_sen.replace("  ", " ").split(" ")) < 4:
        return LENGTH_MISMATCH_SCORE

    # Penalize candidates with more than one punctuation character or more
    # than one occurrence of the source word.
    if count_punc(candidate) > 1 or count_keyword(candidate, source) > 1:
        return NOISY_CANDIDATE_SCORE

    top_hits = query_sentence_match(query_replace_sen)
    if not top_hits:
        # NOTE(review): no top-1 hit means the candidate could not be scored;
        # it is treated like a noisy candidate here — confirm this is the
        # desired fallback.
        return NOISY_CANDIDATE_SCORE
    return top_hits[0][1]


# Maps each source word to the neutral sentence selected for it, so every
# source word is resolved against the index only once.
source_neural_sent_dic = {}
outfilename = "../data/LeapOfThought/mask-filling/lot_neutral.txt"
with open(outfilename, "w") as f:
    for item in tqdm(data):
        # Each input line is "<sentences>\t<object word>".
        sens, obj = item.split("\t")
        obj = obj.strip()

        # Resolve the source word for THIS item only; never reuse the value
        # left over from a previous iteration.
        source, source_sent = _find_source_word(sens)
        if source is None:
            logging.warning("no masked source word found, skipping: %r", item)
            continue

        if source not in source_neural_sent_dic:
            # Fetch the top candidates for the source word, rewrite each one
            # with the object word and keep the candidate whose rewritten
            # form has the lowest top-1 similarity score.
            top_query_sent = query_sentence(source, TOP_K_CANDIDATES)
            if not top_query_sent:
                logging.warning("no sentence found for source %r, skipping", source)
                continue
            query_replace_score = [
                _score_candidate(sent, source, obj, source_sent)
                for sent, _ in top_query_sent
            ]
            best_sen_index = query_replace_score.index(min(query_replace_score))
            source_neural_sent_dic[source] = top_query_sent[best_sen_index][0]

        out_sentence = item.strip() + "\t" + source_neural_sent_dic[source]
        # Keep one record per line even when the selected sentence does not
        # end with a newline of its own.
        if not out_sentence.endswith("\n"):
            out_sentence += "\n"
        f.write(out_sentence)

HBox(children=(FloatProgress(value=0.0, max=10510.0), HTML(value='')))




In [43]:
# Number of input lines vs. number of distinct source words resolved.
len(data), len(source_neural_sent_dic)

(10510, 326)

In [44]:
# Display the source word -> selected sentence mapping.
source_neural_sent_dic

{'energy': 'A single wave or photon does not have a center of momentum frame where its energy must be at minimal value.\n',
 'animal': 'The animal is capable of complete retraction within the shell.\n',
 'fish': 'The fish does not have a lateral line.\n',
 'person': 'A person who died a natural death does not have a malevolent ifrit.\n',
 'element': 'The CCG does not have a "reserve" element.\n',
 'covering': 'Victoria does not have a dedicated industry statute covering the taxi industry.\n',
 'drug': 'There is not a single drug that is a standard among treatment.\n',
 'illumination': 'He is a storyboard artist for Illumination Entertainment.\n',
 'appearance': 'The image does not have a unified appearance.\n',
 'leader': 'The party does not have a traditional leader.\n',
 'individual': 'The individual does not have a foreign accent.\n',
 'chemical': 'NOTT-202 is a two-part chemical compound that is capable of selectively absorbing carbon dioxide.\n',
 'mammal': 'Max is a mammal.\n',
 