# Document processing



In [1]:
!pip install PyMuPDF
!pip install langchain
!pip install nltk

Collecting PyMuPDF
  Obtaining dependency information for PyMuPDF from https://files.pythonhosted.org/packages/41/4a/530017aaf0a554aa6d9abd547932a02c0188962d12122fe611bf7a6d0c26/PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl.metadata
  Downloading PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.5 (from PyMuPDF)
  Obtaining dependency information for PyMuPDFb==1.23.5 from https://files.pythonhosted.org/packages/cf/14/de59687368ad2c047b038b5b9b04e40bd5d486d5b36c6aef42c18c35ea2c/PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata
  Downloading PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading PyMuPDF-1.23.5-cp310-none-manylinux2014_x86_64.whl (4.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.23.5-py3-none-manylinux2014_x86_64.manylinux_2_

Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Downloading langchain-0.0.316-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hUsing cached async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Downloading langsmith-0.0.44-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading SQLAlchemy-2.0.22-cp310-cp310-manylinux_2_

In [2]:
import os
import boto3
import fitz  # PyMuPDF
import nltk
from langchain.text_splitter import NLTKTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
nltk.download('punkt')
from io import BytesIO
import re


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [81]:
# S3 location of the PDF to process: bucket name and object key.
source_bucket = "gen-ai-fargate-docu-gpt-document-bucket"
# Alternative document key, kept for quick switching:
# source_key = "HumanResourceDirectives.pdf"
source_key = "Project Management Directives.pdf"

In [82]:
# Get the PDF object from the source bucket.
# pdf_to_text() reads the response's 'Body' entry.
# NOTE(review): the 'Body' stream is presumably single-use — re-run this cell
# before re-running the extraction cell; confirm against the boto3 docs.
s3_client = boto3.client('s3')
pdf_object = s3_client.get_object(Bucket=source_bucket, Key=source_key)

## PDF to text for Chunking across pages


In [84]:
def pdf_to_text(input_pdf_object):
    """
    Convert a PDF document into text and record where each page's text begins.

    The PDF bytes are read from ``input_pdf_object['Body']`` and opened with
    PyMuPDF (``fitz``). The text of every page is extracted once and
    concatenated in page order.

    Args:
        input_pdf_object (dict): A dictionary-like object containing a 'Body' field whose
                                ``read()`` returns the binary content of the PDF document.

    Returns:
        tuple: A tuple containing:
            - text (str): The extracted plain text of all pages, concatenated.
            - page_start_indices (list): Cumulative character offsets into ``text``.
              The list has ``page_count + 1`` entries: entry 0 is always 0, entry
              ``k`` is the offset at which page ``k`` ends (and page ``k + 1``
              starts), and the final entry equals ``len(text)``.
    """
    pdf_bytes = input_pdf_object['Body'].read()
    pdf_document = fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf")

    page_texts = []
    page_start_indices = [0]
    try:
        for page_num in range(pdf_document.page_count):
            # Extract each page exactly once; the same string feeds both the
            # concatenated text and the offset bookkeeping.
            page_text = pdf_document.load_page(page_num).get_text()
            page_texts.append(page_text)
            page_start_indices.append(page_start_indices[-1] + len(page_text))
    finally:
        # Close the PDF document even if extraction of a page fails.
        pdf_document.close()
    return "".join(page_texts), page_start_indices

In [85]:
# Extract the document text and the per-page offsets later used to map chunks to pages.
text,page_start_indices = pdf_to_text(pdf_object)

In [86]:
def remove_extra_newlines(text):
    """
    Collapse every run of one or more newline characters into a single space.

    Args:
        text (str): The input text.

    Returns:
        str: The text with each run of newlines replaced by one space character.
    """
    newline_run = re.compile(r'\n+')
    return newline_run.sub(' ', text)

In [87]:
def split_text_chunks(text, page_start_indices, source_key, chunk_size=1500, chunk_overlap=40):
    """
    Split a large text into smaller chunks while preserving page-level metadata.

    The text is split with NLTKTextSplitter. Each chunk is then assigned to a
    page by comparing a running estimate of the chunk's start offset against
    ``page_start_indices``. The offset is an estimate (chunk lengths minus a
    fixed allowance for overlap), so page numbers are approximate.

    Every chunk produced by the splitter is returned. If the estimated offset
    runs past the last page boundary, the chunk is attributed to the last page
    instead of being dropped.

    Args:
        text (str): The large text to be split into chunks.
        page_start_indices (list): Cumulative character offsets of page boundaries within
                                  the text, starting with 0 (as returned by pdf_to_text).
        source_key (str): Document name stored on every chunk as "document_name".
        chunk_size (int): Maximum chunk size passed to NLTKTextSplitter. Defaults to 1500.
        chunk_overlap (int): Chunk overlap passed to NLTKTextSplitter. Defaults to 40.

    Returns:
        list: A list of dictionaries, where each dictionary contains:
            - "page_number" (int): Page number (1-indexed) associated with the chunk.
            - "document_name" (str): Name of the document the chunk is extracted from.
            - "text" (str): The extracted chunk of text with extra newlines removed.
    """
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)

    # Page used when the estimated offset is beyond every boundary. Index k of
    # page_start_indices is the end of page k, so the last index is the last page.
    last_page = max(len(page_start_indices) - 1, 1)

    text_with_metadata = []
    overall_length = 0
    for chunk in chunks:
        # Because page_start_indices[0] == 0, the index of the first boundary
        # strictly greater than the offset is already the 1-indexed page number.
        page_number = next(
            (i for i, boundary in enumerate(page_start_indices) if boundary > overall_length),
            last_page,
        )
        text_with_metadata.append({
            "page_number": page_number,
            "document_name": source_key,
            "text": remove_extra_newlines(chunk),
        })
        # 5 taken as compensation for overlap. Clamp at 0 so a very short chunk
        # cannot push the offset negative, which would yield page 0.
        overall_length += max(len(chunk) - 5, 0)
    return text_with_metadata

        
    

In [88]:
# Chunk the extracted text; each entry carries "page_number", "document_name" and "text".
chunked_content = split_text_chunks(text,page_start_indices, source_key)

In [23]:
!pip install opensearch-py

Collecting opensearch-py
  Obtaining dependency information for opensearch-py from https://files.pythonhosted.org/packages/03/e3/f1fa3cca5a568bdd0eb67c33a72fb7db1667bcfd4828cfef20f1c0e913bf/opensearch_py-2.3.2-py2.py3-none-any.whl.metadata
  Downloading opensearch_py-2.3.2-py2.py3-none-any.whl.metadata (6.9 kB)
Downloading opensearch_py-2.3.2-py2.py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.3/327.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: opensearch-py
Successfully installed opensearch-py-2.3.2


In [24]:
# For embedding
!pip install sentence_transformers
!pip install InstructorEmbedding


Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.6.0 from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata
  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.5 MB/s

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=c5f047e5bb97168fe6dbe6a8f0f0b3ccc7bb0f4a16da9cc097ef98a164d79d24
  Stored in directory: /home/ec2-user/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentencepiece, safetensors, huggingface-hub, tokenizers, transformers, sentence_transformers
Successfully installed huggingface-hub-0.17.3 safetensors-0.4.0 sentence_transformers-2.2.2 sentencepiece-0.1.99 tokenizers-0.14.1 transformers-4.34.0
Collecting InstructorEmbedding
  Obtaining dependency information for InstructorEmbedding from https://fi

In [25]:
from InstructorEmbedding import INSTRUCTOR


  from tqdm.autonotebook import trange


In [26]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers

## OpenSearch Indexing

In [89]:
# This code connects to the cluster.
import getpass  # local to this cell: only needed for the interactive password prompt

host = 'https://search-gen-ai-fargate-docu-gpt-os-bx2z3gdknkwpujlbnrnqedh3pe.us-west-2.es.amazonaws.com' # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com
port = 443
region = 'us-west-2' # e.g. us-west-1

# SECURITY: never store the username/password in the notebook. They are read from
# the OPENSEARCH_USER / OPENSEARCH_PASSWORD environment variables, with an
# interactive prompt as fallback. Any password previously committed in this
# notebook should be treated as leaked and rotated.
credentials = (
    os.environ.get("OPENSEARCH_USER") or input("OpenSearch user: "),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass.getpass("OpenSearch password: "),
)
# Alternative: sign requests with the IAM credentials of the current session.
# credentials = boto3.Session().get_credentials()
# auth = AWSV4SignerAuth(credentials, region)


client = OpenSearch(
    hosts = [f'{host}:{port}'],
    http_auth = credentials,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)


In [90]:
# Round-trip to the cluster; fails here if the endpoint or credentials are wrong.
info = client.info()

In [91]:
# #defining index mapping with a k-nn vector
# knn_index = {
#     'settings': {
#         'index.knn': True,
#         'index.knn.space_type': 'cosinesimil',
#     },
#     'mappings': {
#         'properties': {
#             'embedding': {  # k-NN vector field
#                 'type': 'knn_vector',
#                 'dimension': 768  # Dimension of the vector
#             },
#             'chunk_text': {  # Instead of 'passage'
#                 'type': 'text'
#             },
#             'page_number': {
#                 'type': 'long'
#             },
#             'document_name': {
#                 'type': 'keyword'
#             }
#         }
#     }
# }

In [94]:
# Index definition with an APPROXIMATE k-NN vector field.
# Settings enable k-NN on the index; the mapping declares the embedding vector
# plus the chunk text and its metadata fields.
knn_index = {
    "settings": {
        "index.knn": True,
        "index.knn.algo_param.ef_search": 100,
    },
    "mappings": {
        "properties": {
            # k-NN vector field; the dimension must equal the embedding length.
            "embedding": {
                "type": "knn_vector",
                "dimension": 768,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {"ef_construction": 128, "m": 24},
                },
            },
            # Chunk text (instead of 'passage').
            "chunk_text": {"type": "text"},
            "page_number": {"type": "long"},
            "document_name": {"type": "keyword"},
            "date": {"type": "date"},
        }
    },
}

In [95]:
index_name = 'amit-pdf-date-index'
index_body = knn_index

# Creating an index that already exists raises an error, so check first to keep
# this cell re-runnable. An existing index keeps whatever mapping it was created
# with; delete it first if `knn_index` has changed.
if client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    response = client.indices.create(
      index_name,
      body=index_body
    )
    print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'amit-pdf-date-index'}


In [96]:
import random
from datetime import datetime, timedelta

def random_date(start_date, end_date):
    """
    Pick a random calendar day between two dates, both ends included.

    Arguments:
    start_date (str): Start date in 'YYYY-MM-DD' format
    end_date (str): End date in 'YYYY-MM-DD' format
    Returns:
    str: Random date in 'YYYY-MM-DD' format
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')

    # Whole days between the two ends; the random offset is drawn from 0..span.
    span_in_days = (last_day - first_day).days
    offset = timedelta(days=random.randint(0, span_in_days))
    picked_day = first_day + offset
    return picked_day.strftime('%Y-%m-%d')

# Example usage
# NOTE: embed_chunks() reads these two module-level names when it stamps each
# chunk with a random date, so this cell must run before the embedding cell.
start_date = '2023-01-01'
end_date = '2023-12-31'



In [97]:
# Smoke-test random_date(); no random seed is set, so the printed date differs between runs.
random_date_generated = random_date(start_date, end_date)
print("Random Date:", random_date_generated)

Random Date: 2023-07-15


In [98]:
# Load the embedding model used for both chunks and queries.
# NOTE(review): the index mapping declares 768-dimensional vectors; this assumes
# the model's embeddings have that length — TODO confirm.
model = INSTRUCTOR('hkunlp/instructor-base')

load INSTRUCTOR_Transformer
max_seq_length  512


In [99]:
def embed_chunks(chunked_content, model):
    """
    Build one index-ready record per text chunk, including its embedding.

    For every chunk the text is encoded with ``model.encode`` and the result is
    stored next to the chunk's metadata. Each record also gets a "date" produced
    by ``random_date(start_date, end_date)``; ``start_date`` and ``end_date``
    are read from module-level names, not passed in.

    Args:
        chunked_content (list): A list of dictionaries, where each dictionary contains:
            - "page_number" (int): Page number associated with the chunk.
            - "document_name" (str): Name of the document the chunk is extracted from.
            - "text" (str): The text content of the chunk.

        model: The embedding model used to encode the text content.

    Returns:
        list: A list of new dictionaries with the keys "page_number",
        "document_name", "chunk_text" (the chunk's "text"), "embedding" and "date".

    Note:
        The `model` should have a method called `encode` that takes text as input and returns its embedding.
    """
    return [
        {
            "page_number": chunk["page_number"],
            "document_name": chunk["document_name"],
            "chunk_text": chunk["text"],
            "embedding": model.encode(chunk["text"]),
            "date": random_date(start_date, end_date),
        }
        for chunk in chunked_content
    ]

In [100]:
# Encode every chunk; this calls model.encode once per chunk.
docs = embed_chunks(chunked_content,model)

In [101]:
def prepare_data_for_indexing(docs, index):
    """
    Tag every document with the index it should be written to.

    Each dictionary in ``docs`` is modified in place: its "_index" key is set to
    ``index``. The same list object is returned.

    Args:
        docs (list): A list of dictionaries, where each dictionary represents a document.
        index (str): The name of the index in the search engine where the documents will be indexed.

    Returns:
        list: The input list, whose dictionaries now carry the added "_index" field.
    """
    for doc in docs:
        doc["_index"] = index
    return docs

In [102]:
# Tag each record with the target index; the records are modified in place.
docs = prepare_data_for_indexing(docs, index_name)

In [103]:
# %%time
# docs = [
#     { "_index": "words", "_id": "word1", word: "foo" },
#     { "_index": "words", "_id": "word2", word: "bar" },
#     { "_index": "words", "_id": "word3", word: "baz" },
# ]



# Send all records to the cluster in bulk; the printed response for the recorded run is (16, []).
# NOTE(review): the records carry no "_id", so re-running this cell presumably
# indexes duplicates (the count cell below shows 32 hits after 16 were indexed) — confirm.
response = helpers.bulk(client, docs, max_retries=3)
print(response)

(16, [])


## Count the documents in the index

In [73]:
# Count the documents in the index named by `index_name`.
# "size": 0 avoids fetching any hits (each one carries a full embedding), and
# "track_total_hits": True makes the reported total exact rather than capped.
response = client.search(
    index=index_name,
    body={"size": 0, "track_total_hits": True},
)

# extract the count of hits from the response
hits_count = response['hits']['total']['value']


# print the count of hits
print("Total Hits: ", hits_count)

Total Hits:  32


In [32]:
# # define the query
# query = {
#     "query": {
#         "match": {
#             "title": "dark knight"
#         }
#     }
# }

# # search for documents in the 'movies' index with the given query
# response = client.search(index='movies', body=query)

# # extract the hits from the response
# hits = response['hits']['hits']

# # print the hits
# for hit in hits:
#     print(hit)

## Query

In [104]:
def get_user_query_embeddings(query, model):
    """
    Turn a user's query text into an embedding.

    This is a thin wrapper: the query is passed to ``model.encode`` and whatever
    that call returns is handed back unchanged.

    Args:
        query (str): The user's query text to be encoded into an embedding.
        model: The embedding model used to encode the query text.

    Returns:
        The value returned by ``model.encode(query)``.

    Note:
        The `model` should have a method called `encode` that takes text as input and returns its embedding.
    """
    query_vector = model.encode(query)
    return query_vector


In [105]:
# The question to search for, and its embedding from the same model used for the chunks.
query_question = 'who are the involved stakeholders for the first process in development stage?'
query_embedding =  get_user_query_embeddings(query_question, model)

In [128]:
# Plain k-NN query: ask for the 5 nearest chunks to the query embedding.
# "_source": False with "fields" returns only the listed fields, so the stored
# embedding vectors are left out of the response.
query={
    "size": 5,
    "query": {
        "knn": {
            "embedding":{
                "vector":query_embedding,
                "k":5
            }
        }
    }, "_source": False,
    "fields": ["page_number","document_name","chunk_text","date"],
    
    
}



In [129]:
# Run the query and keep only the list of hits; the last line displays them.
result = client.search(index=index_name, body=query)
search_results = result['hits']['hits']
search_results

[{'_index': 'amit-pdf-date-index',
  '_id': 'Q0Y6QYsBknONgm9nPZC8',
  '_score': 0.82543784,
  'fields': {'date': ['2023-01-14T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [2],
   'chunk_text': ["The project scope will be documented in the project scope statement, which will provide a clear understanding of the project's boundaries and deliverables. Scope of Work List the scope Must have Should have Could have Will not have Date: Remarks:(if any) Fig: Project Scope Template 3. Identify Project Stakeholders: The PMO office will identify the key stakeholders who are involved in the project and establish a communication plan to ensure that stakeholders are kept informed of project progress. The communication plan should include the frequency, format, and content of project status reports. Here is the list of involved stakeholders for the first process in development stage:: a. Product Owner b. Business Development Officer c. Project Manager 

## Sorting the top n search results

In [111]:
def sort_json_by_date(json_data):
    """
    Return the search hits ordered from newest to oldest.

    Each hit is ordered by its ``hit['fields']["date"]`` value. The input is not
    modified; a new list is returned.

    Args:
        json_data (list): Search hits, each with a 'fields' mapping that contains "date".

    Returns:
        list: The same hits in descending date order.
    """
    def _hit_date(hit):
        return hit['fields']["date"]

    newest_first = list(json_data)
    newest_first.sort(key=_hit_date, reverse=True)
    return newest_first

In [112]:
# Re-order the hits fetched above by date, newest first; this happens client-side after retrieval.
sort_json_by_date(search_results)

[{'_index': 'amit-pdf-date-index',
  '_id': 'TEY6QYsBknONgm9nPZC8',
  '_score': 0.7959017,
  'fields': {'date': ['2023-10-30T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [9],
   'chunk_text': ['2. Use Standard Issue Types: The PMO office will use standard JIRA issue types, such as Bug, Story, Task, and Epic. This ensures consistency and facilitates reporting and analysis. ● Product Backlog: Product Backlog is defined in JIRA SN Product backlog 1. Product backlog 1 ● Project Dependencies The PMO office shall be able to decide upon the dependencies as to decide which task to prioritize firstly ● Milestone: Examples of milestones Include: i. Figma Design approved ii. Project Charter approved iii. CRUD operation for user completed 3. Use Standard Workflows: The PMO will use standard JIRA workflows, such as To Do, In Progress, and Done. This helps to ensure that the team is following a consistent process and that issues are being tracked and 

## Simply sorting by date

In [130]:
# Same k-NN query, but the hits are returned ordered by the "date" field, newest first.
# The recorded output for this query shows '_score': None on the hits.
query={
    "size": 5,
    "query": {
        "knn": {
            "embedding":{
                "vector":query_embedding,
                "k":5
            }
        }
    }, "_source": False,
    "sort": [
        {
            "date": {
                "order": "desc"
            }
        }
    ],
    "fields": ["page_number","document_name","chunk_text","date"],
    
    
}

In [131]:
# Run the date-sorted query and display its hits.
result = client.search(index=index_name, body=query)
search_results = result['hits']['hits']
search_results

[{'_index': 'amit-pdf-date-index',
  '_id': 'SEY6QYsBknONgm9nPZC8',
  '_score': None,
  'fields': {'date': ['2023-12-14T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [6],
   'chunk_text': ["Note: The format and content of the worklog can vary depending on the client's requirements and the needs of the project. It is important to establish clear guidelines for the worklog to ensure consistency and accuracy. Financial Viability Financial viability is the process of evaluating the financial performance of support resources to ensure that they are contributing positively to the company's bottom line. The goal of financial viability is to ensure that the company is allocating its resources effectively and efficiently, and that its operations are financially sustainable. The following are the key steps involved in financial viability: 1. Identify Support Resources: The PMO office will identify the support resources that are being used for the p

## Multiply score and date

In [140]:
# k-NN query wrapped in function_score, with the "date" field value used as a score factor.
# NOTE(review): "size" is 16 while "k" is 5 — confirm which result count is intended.
# NOTE(review): "score_mode" presumably only controls how several functions are
# combined with each other; how the factor combines with the k-NN score would be
# governed by "boost_mode" (not set here) — confirm against the function_score docs.
query = {
    "size": 16,
    "query": {
        "function_score": {
            "query": {
                "knn": {
                    "embedding": {
                        "vector": query_embedding,
                        "k": 5
                    }
                }
            },
            "functions": [
                {
                    "field_value_factor": {
                        "field": "date",
                        "factor": 1,
                        "missing": 0
                    }
                }
            ],
            "score_mode": "multiply"
        }
    },
    "_source": False,
    "fields": ["page_number", "document_name", "chunk_text", "date"]
}


In [None]:
# Note: the date value used as the factor is a Unix timestamp in milliseconds, so the scores become very large.

In [141]:
# Run the date-factor query and display its hits.
result = client.search(index=index_name, body=query)
search_results = result['hits']['hits']
search_results

[{'_index': 'amit-pdf-date-index',
  '_id': 'QkY6QYsBknONgm9nPZC8',
  '_score': 138573.23,
  'fields': {'date': ['2023-05-13T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [1],
   'chunk_text': ["It is expected that all PMO office personnel will follow the procedures and guidelines outlined in this SOP to ensure that all projects are managed in a consistent and efficient manner. The PMO office will also continuously review and improve its processes to ensure that they are aligned with the needs of the company and its clients. Project Initiation & Planning The first step in managing a project is project initiation, which is the process of defining the project's scope, timeline, and budget. The goal of project initiation is to establish a clear understanding of the project's objectives, stakeholders, and constraints, and to develop a plan for how the project will be executed. The following are the key steps involved in project initiation: 1.

## Exponentially reduce the importance of documents by date

In [136]:
# k-NN query wrapped in function_score with an exponential decay on "date",
# measured from the current time ("origin": "now") with scale "30d" and decay 0.5.
# NOTE(review): with a single function, "score_mode": "sum" presumably has no
# effect; if the intent is to ADD the decay to the k-NN score, that would be
# "boost_mode" (not set here) — confirm against the function_score docs.
query={
    "size": 5,
    "query": {
        "function_score": {
            "query": {
                "knn": {
                    "embedding": {
                        "vector": query_embedding,
                        "k": 5
                    }
                }
            },
            "functions": [
                {
                    "exp": {
                        "date": {
                            "origin": "now",
                            "scale": "30d", 
                            "decay": 0.5 
                        }
                    }
                }
            ],
            "score_mode": "sum"
        }
    },
    "_source": False,
    "fields": ["page_number", "document_name", "chunk_text", "date"]
}

In [137]:
# Run the decay query (origin "now") and display its hits.
result = client.search(index=index_name, body=query)
search_results = result['hits']['hits']
search_results

[{'_index': 'amit-pdf-date-index',
  '_id': 'TEY6QYsBknONgm9nPZC8',
  '_score': 0.60705316,
  'fields': {'date': ['2023-10-30T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [9],
   'chunk_text': ['2. Use Standard Issue Types: The PMO office will use standard JIRA issue types, such as Bug, Story, Task, and Epic. This ensures consistency and facilitates reporting and analysis. ● Product Backlog: Product Backlog is defined in JIRA SN Product backlog 1. Product backlog 1 ● Project Dependencies The PMO office shall be able to decide upon the dependencies as to decide which task to prioritize firstly ● Milestone: Examples of milestones Include: i. Figma Design approved ii. Project Charter approved iii. CRUD operation for user completed 3. Use Standard Workflows: The PMO will use standard JIRA workflows, such as To Do, In Progress, and Done. This helps to ensure that the team is following a consistent process and that issues are being tracked and

In [142]:
# Same exponential-decay query, but measured from a fixed origin date
# ("2023-12-30") instead of the current time, so results do not depend on when
# the cell is run.
# NOTE(review): with a single function, "score_mode": "sum" presumably has no
# effect; combining with the k-NN score would be governed by "boost_mode" — confirm.
query={
    "size": 5,
    "query": {
        "function_score": {
            "query": {
                "knn": {
                    "embedding": {
                        "vector": query_embedding,
                        "k": 5
                    }
                }
            },
            "functions": [
                {
                    "exp": {
                        "date": {
                            "origin": "2023-12-30",
                            "scale": "30d", 
                            "decay": 0.5 
                        }
                    }
                }
            ],
            "score_mode": "sum"
        }
    },
    "_source": False,
    "fields": ["page_number", "document_name", "chunk_text", "date"]
}

In [143]:
# Run the decay query (fixed origin) and display its hits.
result = client.search(index=index_name, body=query)
search_results = result['hits']['hits']
search_results

[{'_index': 'amit-pdf-date-index',
  '_id': 'SEY6QYsBknONgm9nPZC8',
  '_score': 0.53103054,
  'fields': {'date': ['2023-12-14T00:00:00.000Z'],
   'document_name': ['Project Management Directives.pdf'],
   'page_number': [6],
   'chunk_text': ["Note: The format and content of the worklog can vary depending on the client's requirements and the needs of the project. It is important to establish clear guidelines for the worklog to ensure consistency and accuracy. Financial Viability Financial viability is the process of evaluating the financial performance of support resources to ensure that they are contributing positively to the company's bottom line. The goal of financial viability is to ensure that the company is allocating its resources effectively and efficiently, and that its operations are financially sustainable. The following are the key steps involved in financial viability: 1. Identify Support Resources: The PMO office will identify the support resources that are being used for

## Delete By Query

In [51]:
# Select every chunk belonging to one document by its "document_name".
# NOTE(review): "document_name" is mapped as a keyword field, so a "term" query
# may be the more explicit choice for an exact match — confirm.
query = {
  "query": {
    "match": {
      "document_name": "Document_name_Test.pdf"
    }
  }
}

In [52]:
# Delete all documents matching the query above; the response reports how many were deleted.
client.delete_by_query(index=index_name, body=query)

{'took': 219,
 'timed_out': False,
 'total': 16,
 'deleted': 16,
 'batches': 1,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

## Delete Index

In [78]:
# Remove the whole index named by `index_name`, including all indexed chunks.
response = client.indices.delete(
    index = index_name
)

print(response)

{'acknowledged': True}
