In [2]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.


In [3]:
from elasticsearch import Elasticsearch, exceptions, helpers
from urllib.request import urlopen
from getpass import getpass
import json
import time
from PyPDF2 import PdfReader

In [22]:
# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id
ELASTIC_CLOUD_ID = "xxx"

# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key
ELASTIC_API_KEY = "xxx"

#a0c3Zi01RUJvZ2pPMVNfeFlwWUQ6d1JQNTBtXzlSejJ5c1hpaEpYMEFDUQ==
# Create the client instance
client = Elasticsearch(
    # For local development
    # hosts=["http://localhost:9200"]
    cloud_id=ELASTIC_CLOUD_ID,
    api_key=ELASTIC_API_KEY,
)

In [5]:
print(client.info())

{'name': 'instance-0000000001', 'cluster_name': '892d6a53e3134bedb6ea5d7a2ed48e9b', 'cluster_uuid': 'YcQ_87FzQrO2_X_yEypK_g', 'version': {'number': '8.15.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051', 'build_date': '2024-09-02T22:04:47.310170297Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [68]:
try:
    client.inference.delete(inference_id="my-elser-endpoint")
except exceptions.NotFoundError:
    # Inference endpoint does not exist
    pass

try:
    client.options(
        request_timeout=60, max_retries=3, retry_on_timeout=True
    ).inference.put(
        task_type="sparse_embedding",
        inference_id="my-elser-endpoint",
        body={
            "service": "elser",
            "service_settings": {"num_allocations": 4, "num_threads": 1},
        },
    )
    print("Inference endpoint created successfully")
except exceptions.BadRequestError as e:
    if e.error == "resource_already_exists_exception":
        print("Inference endpoint created successfully")
    else:
        raise e

Inference endpoint created successfully


In [69]:
inference_endpoint_info = client.inference.get(inference_id="my-elser-endpoint")
model_id = inference_endpoint_info["endpoints"][0]["service_settings"]["model_id"]

while True:
    status = client.ml.get_trained_models_stats(
        model_id=model_id,
    )

    deployment_stats = status["trained_model_stats"][0].get("deployment_stats")
    if deployment_stats is None:
        print("ELSER Model is currently being deployed.")
        time.sleep(5)
        continue

    nodes = deployment_stats.get("nodes")
    if nodes is not None and len(nodes) > 0:
        print("ELSER Model has been successfully deployed.")
        break
    else:
        print("ELSER Model is currently being deployed.")
    time.sleep(5)

ELSER Model has been successfully deployed.


In [70]:
client.indices.delete(index="semantic-text-novel", ignore_unavailable=True)
client.indices.create(
    index="semantic-text-novel",
    mappings={
        "properties": {
            "page": {"type": "text"},
            "content": {"type": "text", "copy_to": "content_semantic"},
            "content_semantic": {
                "type": "semantic_text",
                "inference_id": "my-elser-endpoint",
            },
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic-text-novel'})

In [71]:
reader = PdfReader("/Users/amrutaghate/Downloads/aiw.pdf")
number_of_pages = len(reader.pages)
operations = []


In [72]:
print(number_of_pages)

69


In [73]:
for i in range(0, number_of_pages):
    page = reader.pages[i]
    text = page.extract_text()
    temp_json = {"page":i+1, "content":text}
    operations.append({"index": {"_index": "semantic-text-novel"}})
    operations.append(temp_json)


In [75]:
client.bulk(index="semantic-text-novel", operations=operations, refresh=True)

ObjectApiResponse({'errors': False, 'took': 1000, 'items': [{'index': {'_index': 'semantic-text-novel', '_id': 'pkQ0_JEB-GuaxYmbAoUg', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 69, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'semantic-text-novel', '_id': 'p0Q0_JEB-GuaxYmbAoUh', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 70, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'semantic-text-novel', '_id': 'qEQ0_JEB-GuaxYmbAoUh', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 71, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'semantic-text-novel', '_id': 'qUQ0_JEB-GuaxYmbAoUh', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 72, '_primary

In [62]:
def pretty_search_response(response):
    if len(response["hits"]["hits"]) == 0:
        print("Your search returned no results.")
    else:
        for hit in response["hits"]["hits"]:
            id = hit["_id"]
            score = hit["_score"]
            page = hit["_source"]["page"]
            content = hit["_source"]["content"]

            pretty_output = f"\nID: {id}\nScore: {score}\nPage: {page}\nContent: {content}"

            print(pretty_output)

In [66]:
response = client.search(
    index="semantic-text-novel",
    query={
        "bool": {
            "must": {
                "multi_match": {
                    "fields": ["content"],
                    "query": "caterpillar smoking",
                    "boost": 1.5,
                }
            },
            "should": {
                "semantic": {
                    "field": "content_semantic",
                    "query": "caterpillar smoking",
                    "boost": 3.0,
                }
            },
        }
    },
)

pretty_search_response(response)


ID: KUQi_JEB-GuaxYmbg4Vb
Score: 63.279594
Page: 21
Content:   
“But I’m not used to it!” pleaded poor Alice in a piteous tone. And she  
thought of herself, “I wish the creatures wouldn’t be so easily  
offended!”  
  
“You’ll get used to it in time,” said the Caterpillar; and it put the  
hookah into its mouth and began smoking again.  
  
This time Alice waited patiently until it chose to speak again. In a  
minute or two the Caterpillar took the hookah out of its mouth and  
yawned once or twice, and shook itself. Then it got down off the  
mushroom, and crawled away in the grass, merely remarking as it went,  
“One side will make you grow taller, and the other side will make you  
grow shorter.”  
  
“One side of _what?_ The other side of _what?_” thought Alice to  
herself.  
  
“Of the mushroom,” said the Caterpillar, just as if she had asked it  
aloud; and in another moment it was out of sight.  
  
Alice remained looking thoughtfully at the mushroom for a minute,  
trying to 