# Connect to Elasticsearch


In [None]:
# https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/connect-python

In [2]:
import json
from pprint import pprint
import os
import time

from elasticsearch import Elasticsearch


# Prefer the ELASTIC_PASSWORD environment variable so the credential does not
# have to live in the notebook. The literal is only the fallback for the local
# Docker tutorial cluster and preserves the previous value when the variable
# is unset.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "paper_chat")


class Search:
    """Thin wrapper that owns the Elasticsearch client used by this notebook."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info.

        The connection targets ``https://es01:9200`` as user ``elastic`` with
        the module-level ``ELASTIC_PASSWORD``; ``verify_certs`` is ``False``.
        """
        connection_options = {
            "hosts": "https://es01:9200",
            "basic_auth": ("elastic", ELASTIC_PASSWORD),
            "verify_certs": False,
        }
        self.es = Elasticsearch(**connection_options)
        # info() is the first request sent through the new client; its body is
        # echoed after the confirmation message.
        server_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(server_info.body)

In [3]:
# Instantiate the wrapper (its constructor connects and prints the cluster info)
# and keep a direct handle on the underlying client for the raw API calls in
# the following cells.
search = Search()
es = search.es

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'I9n8oe4ASbeUOowA6_KKjw',
 'name': 'es01',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-07-07T22:04:49.882652950Z',
             'build_flavor': 'default',
             'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.10.0',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.14.3'}}


  _transport = transport_class(


In [1]:
# https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/create-index

# Create the Index


In [9]:
# Create the "my_documents" index; the API response is shown as the cell output.
es.indices.create(index="my_documents")



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_documents'})

In [10]:
# Delete the "my_documents" index again; the API response is shown as the cell output.
es.indices.delete(index="my_documents")



ObjectApiResponse({'acknowledged': True})

In [11]:
class Search:
    """Elasticsearch client wrapper with a helper that resets the tutorial index."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info."""
        connection_options = {
            "hosts": "https://es01:9200",
            "basic_auth": ("elastic", ELASTIC_PASSWORD),
            "verify_certs": False,
        }
        self.es = Elasticsearch(**connection_options)
        server_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(server_info.body)

    def create_index(self):
        """Delete ``my_documents`` (with ``ignore_unavailable=True``) and create it again."""
        indices = self.es.indices
        indices.delete(index="my_documents", ignore_unavailable=True)
        indices.create(index="my_documents")

# Add Documents to the Index


In [12]:
# A single example document (despite the plural variable name).
documents = {
    "title": "Work From Home Policy",
    "contents": "The purpose of this full-time work-from-home policy is...",
    "created_on": "2023-11-02",
}

# Send it to the "my_documents" index and print the `_id` field of the response.
response = es.index(index="my_documents", body=documents)
print(response["_id"])



fRj5yJABWFu0icyGQgiG


In [23]:
class Search:
    """Elasticsearch client wrapper: index reset plus single and bulk inserts."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info."""
        connection_options = {
            "hosts": "https://es01:9200",
            "basic_auth": ("elastic", ELASTIC_PASSWORD),
            "verify_certs": False,
        }
        self.es = Elasticsearch(**connection_options)
        server_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(server_info.body)

    def create_index(self):
        """Delete ``my_documents`` (with ``ignore_unavailable=True``) and create it again."""
        indices = self.es.indices
        indices.delete(index="my_documents", ignore_unavailable=True)
        indices.create(index="my_documents")

    def insert_document(self, document):
        """Index one document into ``my_documents`` and return the client's response."""
        response = self.es.index(index="my_documents", body=document)
        return response

    def insert_documents(self, documents):
        """Send all ``documents`` to ``my_documents`` in one bulk call; return its response."""
        # NOTE: see details in https://elasticsearch-py.readthedocs.io/en/stable/api.html#elasticsearch.Elasticsearch.bulk
        # The bulk payload is a flat list that alternates, per document:
        #   1. the action line ("index" into my_documents), and
        #   2. the document itself.
        operations = [
            entry
            for document in documents
            for entry in ({"index": {"_index": "my_documents"}}, document)
        ]
        return self.es.bulk(operations=operations)

# Ingesting Documents from a JSON File


In [22]:
import json

# NOTE: `es` is rebound here from the raw client (bound earlier in the notebook)
# to a Search wrapper instance.
es = Search()

# Decode the JSON file as UTF-8 explicitly instead of relying on the platform's
# default text encoding, and let json.load read straight from the file object.
with open("data.json", "rt", encoding="utf-8") as f:
    documents = json.load(f)

# One index request per document.
for document in documents:
    es.insert_document(document)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'I9n8oe4ASbeUOowA6_KKjw',
 'name': 'es01',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-07-07T22:04:49.882652950Z',
             'build_flavor': 'default',
             'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.10.0',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.14.3'}}


  _transport = transport_class(


# Regenerating the Index


In [None]:
class Search:
    """Elasticsearch client wrapper: index reset, inserts and a full reindex."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info."""
        self.es = Elasticsearch(
            hosts="https://es01:9200",
            basic_auth=("elastic", ELASTIC_PASSWORD),
            verify_certs=False,
        )
        client_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(client_info.body)

    def create_index(self):
        """Delete ``my_documents`` (with ``ignore_unavailable=True``) and create it again."""
        self.es.indices.delete(index="my_documents", ignore_unavailable=True)
        self.es.indices.create(index="my_documents")

    def insert_document(self, document):
        """Index one document into ``my_documents`` and return the client's response."""
        return self.es.index(index="my_documents", body=document)

    def insert_documents(self, documents):
        """Send all ``documents`` to ``my_documents`` in one bulk call; return its response."""
        # NOTE: see details in https://elasticsearch-py.readthedocs.io/en/stable/api.html#elasticsearch.Elasticsearch.bulk
        operations = []
        for document in documents:
            # For each document, two entries are added to the operations list:
            #   1. A description of what operation to perform, set to index, with the name of the index given as an argument.
            #   2. The actual data of the document
            operations.append({"index": {"_index": "my_documents"}})
            operations.append(document)
        return self.es.bulk(operations=operations)

    def reindex(self):
        """Recreate the index and bulk-load every document from ``data.json``.

        Returns:
            The response of the bulk call made by ``insert_documents``.
        """
        self.create_index()
        # Decode as UTF-8 explicitly rather than with the platform's default
        # text encoding, so non-ASCII content loads the same everywhere.
        with open("data.json", "rt", encoding="utf-8") as f:
            documents = json.load(f)
        return self.insert_documents(documents)

# Search Basics


In [24]:
# https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/search-basics

# Elasticsearch Queries

The Elasticsearch service uses a Query DSL (Domain Specific Language) based on the JSON format to define queries.


In [30]:
class Search:
    """Elasticsearch client wrapper: index reset, inserts, reindex and search."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info."""
        self.es = Elasticsearch(
            hosts="https://es01:9200",
            basic_auth=("elastic", ELASTIC_PASSWORD),
            verify_certs=False,
        )
        client_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(client_info.body)

    def create_index(self):
        """Delete ``my_documents`` (with ``ignore_unavailable=True``) and create it again."""
        self.es.indices.delete(index="my_documents", ignore_unavailable=True)
        self.es.indices.create(index="my_documents")

    def insert_document(self, document):
        """Index one document into ``my_documents`` and return the client's response."""
        return self.es.index(index="my_documents", body=document)

    def insert_documents(self, documents):
        """Send all ``documents`` to ``my_documents`` in one bulk call; return its response."""
        # NOTE: see details in https://elasticsearch-py.readthedocs.io/en/stable/api.html#elasticsearch.Elasticsearch.bulk
        operations = []
        for document in documents:
            # For each document, two entries are added to the operations list:
            #   1. A description of what operation to perform, set to index, with the name of the index given as an argument.
            #   2. The actual data of the document
            operations.append({"index": {"_index": "my_documents"}})
            operations.append(document)
        return self.es.bulk(operations=operations)

    def reindex(self):
        """Recreate the index and bulk-load every document from ``data.json``.

        Returns:
            The response of the bulk call made by ``insert_documents``.
        """
        self.create_index()
        # Decode as UTF-8 explicitly rather than with the platform's default
        # text encoding, so non-ASCII content loads the same everywhere.
        with open("data.json", "rt", encoding="utf-8") as f:
            documents = json.load(f)
        return self.insert_documents(documents)

    def search(self, **query_args):
        """Forward ``query_args`` to the client's ``search`` on ``my_documents``."""
        return self.es.search(index="my_documents", **query_args)


# Rebind `es` to an instance of the Search class defined in this cell so the
# following cells can call its search() helper.
es = Search()

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'I9n8oe4ASbeUOowA6_KKjw',
 'name': 'es01',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-07-07T22:04:49.882652950Z',
             'build_flavor': 'default',
             'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.10.0',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.14.3'}}


  _transport = transport_class(


# Match Queries

```python
GET /_search
{
  "query": {
    "match": {
      "name": {
        "query": "search text here"
      }
    }
  }
}
```


Scores in Elasticsearch are calculated using the [Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm.


In [95]:
# Match query on the "name" field, sent through Search.search().
response = es.search(query={"match": {"name": {"query": "John Doe"}}})
# Print the hit count, then leave the list of hits as the cell's displayed value.
print(response["hits"]["total"])
response["hits"]["hits"]

{'value': 2, 'relation': 'eq'}




[{'_index': 'my_documents',
  '_id': 'fhgByZABWFu0icyGrAhq',
  '_score': 0.36464313,
  '_ignored': ['content.keyword'],
  '_source': {'name': 'John Doe',
   'url': 'https://www.johndoe.com',
   'summary': 'John Doe is a software engineer and a blogger.',
   'content': 'John Doe is a software engineer and a blogger. He is the author of the popular blog johndoe.com where he writes about software engineering, programming, and technology. John has been working in the software industry for over 10 years and has a passion for building great software products. In his free time, John enjoys playing video games, reading books, and spending time with his family.',
   'created_on': '2021-01-01',
   'updated_at': '2021-01-01',
   'category': 'blog',
   'rolePermissions': {'read': True, 'write': False}}},
 {'_index': 'my_documents',
  '_id': 'fxgByZABWFu0icyGrQga',
  '_score': 0.36464313,
  '_ignored': ['content.keyword'],
  '_source': {'name': 'John Doe',
   'url': 'https://www.johndoe.com',
   's

# Retrieving Individual Results


In [105]:
class Search:
    """Elasticsearch client wrapper: index reset, inserts, reindex, search and get."""

    def __init__(self):
        """Create the client, store it on ``self.es`` and print the cluster info."""
        self.es = Elasticsearch(
            hosts="https://es01:9200",
            basic_auth=("elastic", ELASTIC_PASSWORD),
            verify_certs=False,
        )
        client_info = self.es.info()
        print("Connected to Elasticsearch!")
        pprint(client_info.body)

    def create_index(self):
        """Delete ``my_documents`` (with ``ignore_unavailable=True``) and create it again."""
        self.es.indices.delete(index="my_documents", ignore_unavailable=True)
        self.es.indices.create(index="my_documents")

    def insert_document(self, document):
        """Index one document into ``my_documents`` and return the client's response."""
        return self.es.index(index="my_documents", body=document)

    def insert_documents(self, documents):
        """Send all ``documents`` to ``my_documents`` in one bulk call; return its response."""
        # NOTE: see details in https://elasticsearch-py.readthedocs.io/en/stable/api.html#elasticsearch.Elasticsearch.bulk
        operations = []
        for document in documents:
            # For each document, two entries are added to the operations list:
            #   1. A description of what operation to perform, set to index, with the name of the index given as an argument.
            #   2. The actual data of the document
            operations.append({"index": {"_index": "my_documents"}})
            operations.append(document)
        return self.es.bulk(operations=operations)

    def reindex(self):
        """Recreate the index and bulk-load every document from ``data.json``.

        Returns:
            The response of the bulk call made by ``insert_documents``.
        """
        self.create_index()
        # Decode as UTF-8 explicitly rather than with the platform's default
        # text encoding, so non-ASCII content loads the same everywhere.
        with open("data.json", "rt", encoding="utf-8") as f:
            documents = json.load(f)
        return self.insert_documents(documents)

    def search(self, **query_args):
        """Search ``my_documents``, print the hit total and each hit's ``_source``.

        Returns:
            The unmodified response from the client's ``search`` call.
        """
        response = self.es.search(index="my_documents", **query_args)
        print(response["hits"]["total"])
        for hit in response["hits"]["hits"]:
            print(hit["_source"])
        return response

    def retrieve_document(self, id):
        """Fetch one document from ``my_documents`` by its ``id``."""
        return self.es.get(index="my_documents", id=id)


# Rebind `es` to an instance of the Search class defined in this cell, whose
# search() prints the hit total and each hit's source.
es = Search()

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'I9n8oe4ASbeUOowA6_KKjw',
 'name': 'es01',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-07-07T22:04:49.882652950Z',
             'build_flavor': 'default',
             'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.10.0',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.14.3'}}


  _transport = transport_class(


# Searching Multiple Fields

- Details: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html

```python
GET /_search
{
  "query": {
    "multi_match" : {
      "query":    "this is a test",
      "fields": [ "subject", "message" ]
    }
  }
}
```


In [99]:
# multi_match query: look for "policy" in both the "name" and "title" fields.
es.search(query={"multi_match": {"query": "policy", "fields": ["name", "title"]}})

{'value': 1, 'relation': 'eq'}
{'title': 'Work From Home Policy', 'contents': 'The purpose of this full-time work-from-home policy is...', 'created_on': '2023-11-02'}




# Pagination

https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/pagination


In [106]:
# First page: the same multi_match query over four fields, with size=1 so only
# one hit is returned per request.
results = es.search(
    query={
        "multi_match": {
            "query": "John",
            "fields": ["name", "title", "summary", "content"],
        }
    },
    size=1,
)

{'value': 2, 'relation': 'eq'}
{'name': 'John Doe', 'url': 'https://www.johndoe.com', 'summary': 'John Doe is a software engineer and a blogger.', 'content': 'John Doe is a software engineer and a blogger. He is the author of the popular blog johndoe.com where he writes about software engineering, programming, and technology. John has been working in the software industry for over 10 years and has a passion for building great software products. In his free time, John enjoys playing video games, reading books, and spending time with his family.', 'created_on': '2021-01-01', 'updated_at': '2021-01-01', 'category': 'blog', 'rolePermissions': {'read': True, 'write': False}}




In [108]:
# Second page: identical query and page size, with from_=1 as the offset into
# the result list.
results = es.search(
    query={
        "multi_match": {
            "query": "John",
            "fields": ["name", "title", "summary", "content"],
        }
    },
    size=1,
    from_=1,
)

{'value': 2, 'relation': 'eq'}
{'name': 'John Doe', 'url': 'https://www.johndoe.com', 'summary': 'John Doe is a software engineer and a blogger.', 'content': 'John Doe is a software engineer and a blogger. He is the author of the popular blog johndoe.com where he writes about software engineering, programming, and technology. John has been working in the software industry for over 10 years and has a passion for building great software products. In his free time, John enjoys playing video games, reading books, and spending time with his family.', 'created_on': '2021-01-01', 'updated_at': '2021-01-01', 'category': 'blog', 'rolePermissions': {'read': True, 'write': False}}




# Filters


# Introduction to Boolean Queries

A compound query allows an application to combine two or more individual queries, so that they execute together, and if appropriate, return a combined set of results. \
The standard way to create compound queries in Elasticsearch is to use a [Boolean query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html).

- Compound queries: https://www.elastic.co/guide/en/elasticsearch/reference/current/compound-queries.html

There are four different ways to combine queries:

1. bool.must: the clause must match. If multiple clauses are given, all must match (similar to an AND logical operation).
2. bool.should: when used without must, at least one clause should match (similar to an OR logical operation). When combined with must, each matching clause boosts the relevance score of the document.
3. bool.filter: only documents that match the clause(s) are considered search result candidates.
4. bool.must_not: only documents that do not match the clause(s) are considered search result candidates.


# Adding a Filter to a Query

1. The `bool.must` clause is usually the place where the base query is defined.
2. The filtering is implemented in a `bool.filter` section
   - Using a `match` or `multi_match` query for a filter is not a good idea, because these are full-text search queries.
   - The query must return an absolute true or false answer for each document and not a relevance score like the match queries do.
   - `term`: performs an exact search for a value in a given field.
     - https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
     - More appropriate for `keyword` (`category.keyword`) type, not default type `text`


In [110]:
# Boolean query: the full-text multi_match goes in `must`, the exact-value
# `term` clause on `category.keyword` goes in `filter`.
# "category to filter" is a placeholder value; replace it with a real category
# (the sample documents shown earlier use "blog").
query = {
    "bool": {
        "must": [
            {
                "multi_match": {
                    "query": "John",
                    "fields": ["name", "title", "summary", "content"],
                }
            }
        ],
        "filter": [{"term": {"category.keyword": {"value": "category to filter"}}}],
    }
}
es.search(query=query)

{'value': 0, 'relation': 'eq'}




ObjectApiResponse({'took': 6, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}})

In [None]:
# NOTE(review): this cell repeats the preceding bool/filter query verbatim,
# including the placeholder "category to filter" value.
query = {
    "bool": {
        "must": [
            {
                "multi_match": {
                    "query": "John",
                    "fields": ["name", "title", "summary", "content"],
                }
            }
        ],
        "filter": [{"term": {"category.keyword": {"value": "category to filter"}}}],
    }
}
es.search(query=query)

{'value': 0, 'relation': 'eq'}




ObjectApiResponse({'took': 6, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}})

# Chatbot

- Chunking: https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/document-chunking/tokenization.ipynb


In [11]:
from elasticsearch import Elasticsearch, NotFoundError

from langchain import hub
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.tools.retriever import create_retriever_tool
from langchain_core.messages import HumanMessage, BaseMessage
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)
from langchain_elasticsearch import ElasticsearchStore

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.sqlite import SqliteSaver

from paper_chat.core.timer import T
from paper_chat.core.llm import CHAT_LLM, EMBEDDINGS


# Load the arXiv PDF through PyPDFLoader, then split the loaded documents with
# chunk_size=1000 and chunk_overlap=200.
arxiv_url = "https://arxiv.org/pdf/2004.07606"
loader = PyPDFLoader(arxiv_url)
_docs = loader.load()
_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
_splits = _text_splitter.split_documents(_docs)

In [12]:
import os

# Same node and user as the Search wrapper earlier in the notebook. The
# password is taken from the ELASTIC_PASSWORD environment variable when set;
# the literal is only the fallback for the local Docker tutorial cluster.
client = Elasticsearch(
    hosts="https://es01:9200",
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "paper_chat")),
    verify_certs=False,
)
# First request through the new client; the result is kept for inspection.
client_info = client.info()

In [14]:
# Drop any existing "papers" index (ignore_unavailable=True), then build the
# store from the PDF chunks over the existing client connection.
# NOTE(review): presumably from_documents embeds each chunk with EMBEDDINGS and
# writes it to INDEX; confirm against the langchain_elasticsearch docs.
INDEX = "papers"
client.indices.delete(index=INDEX, ignore_unavailable=True)
vectorstore = ElasticsearchStore.from_documents(
    _splits, EMBEDDINGS, es_connection=client, index_name=INDEX
)