# 2 - Populate summary and full text indices from markdown, pdf or docx files

#### Install dependencies

In [None]:
%pip install langchain-text-splitters opensearch-py pyPDF2 python-docx

#### Import dependencies

In [None]:
import boto3
from index_documents_helper import (
    get_s3_key_list, 
    summarize_documents, 
    index_opensearch_summary_payload, 
    split_and_index_full_text,
    delete_summary_index,
    delete_full_text_index
)

#### Retrieve stored parameters

In [None]:
%store -r region_name
%store -r host
%store -r summary_index_name
%store -r full_text_index_name
%store -r pipeline_id
%store -r model_id
print("Region is:", region_name)
print("OpenSearch endpoint", host)
print("Summary index name", summary_index_name)
print("Full Text index name", full_text_index_name)
print("Semantic search pipeline ID", pipeline_id)
print("Model ID", model_id)

#### Get the name of the data bucket created by the CloudFormation stack

In [None]:
# Name of the CloudFormation stack whose outputs are inspected below.
stack_name = "chatbot-demo"

# NOTE(review): the client is created with the default session region, not the
# stored `region_name` - confirm the stack lives in that default region.
cf_client = boto3.client('cloudformation')
response = cf_client.describe_stacks(StackName=stack_name)

# Tolerate a stack description without an "Outputs" entry so the explicit
# error below is raised instead of a bare KeyError.
outputs = response["Stacks"][0].get("Outputs", [])

# Take the value of the first output whose key is 'DataBucket'; None if absent.
bucket_name = next(
    (
        output["OutputValue"]
        for output in outputs
        if output["OutputKey"] == "DataBucket"
    ),
    None,
)
if bucket_name is None:
    # Fail with a descriptive message rather than an opaque IndexError.
    raise LookupError(
        f"CloudFormation stack '{stack_name}' has no 'DataBucket' output"
    )

print("The name of the data bucket is:", bucket_name)

#### Get a list of the files in the S3 bucket under the document prefix

In [None]:
# Selection criteria handed to the helper: key prefix, accepted file
# extensions, and a size limit (presumably in bytes - confirm in the helper).
s3_prefix = ""
file_extensions = (".md", ".pdf", ".docx")
max_file_size = 3_000_000

# Collect the S3 keys from the data bucket that match the criteria above.
key_list = get_s3_key_list(
    bucket_name=bucket_name,
    s3_prefix=s3_prefix,
    file_extensions=file_extensions,
    max_file_size=max_file_size,
)

print("Found", len(key_list), "documents.")

In [None]:
#key_list

## Part 1 - Summarize the documents and populate the document summary OpenSearch index

#### Delete any existing OpenSearch summary index records for the key list

In [None]:
# Clear out summary-index records for the keys about to be (re)processed.
delete_summary_index(
    region_name=region_name,
    opensearch_host=host,
    key_list=key_list,
    summary_index_name=summary_index_name,
)

#### Summarize the documents in the S3 key list and return an OpenSearch payload
This can take a long time, depending on the number and size of documents.

In [None]:
# Upper bound passed to the summarizer (unit is defined by the helper -
# presumably characters; confirm).
max_summary_length = 5_000

# Summarize every document in the key list and collect the resulting records.
opensearch_payload = summarize_documents(
    region_name=region_name,
    bucket_name=bucket_name,
    key_list=key_list,
    max_summary_length=max_summary_length,
)

print("OpenSearch payload has", len(opensearch_payload), "records")

#### Show a sample of the OpenSearch summary payload records

In [None]:
# Preview at most the first five payload records (fewer if the payload is shorter).
sample_size = min(len(opensearch_payload), 5)

# Print each sampled record followed by a blank line.
for i in range(sample_size):
    print(opensearch_payload[i], end="\n\n")

#### Index the summary records into OpenSearch
This can take a long time, depending on the number and size of documents.

In [None]:
# Send the summary payload to the summary index and keep the helper's result.
summary_indexing_result = index_opensearch_summary_payload(
    region_name=region_name,
    opensearch_host=host,
    opensearch_payload=opensearch_payload,
    summary_index_name=summary_index_name,
)

# Last expression of the cell: displays the result in the notebook output.
summary_indexing_result

## Part 2 - Populate the Full Text OpenSearch index

#### Delete any existing OpenSearch full text index records for the key list

In [None]:
# Clear out full-text-index records for the keys about to be (re)processed.
delete_full_text_index(
    region_name=region_name,
    opensearch_host=host,
    key_list=key_list,
    full_text_index_name=full_text_index_name,
)

#### Iterate through the list of markdown, pdf and docx files, split each into sections and add them to the OpenSearch full text index
For markdown files, adds a heading field into the payload based on the first line of text in each section that begins with a hash character.
<br>For pdf files, adds page number for each page.
<br>This can take a long time depending on the number and size of documents.

In [None]:
# Split each document in the key list and index the pieces into the full text
# index, keeping the helper's result for inspection.
full_text_indexing_result = split_and_index_full_text(
    region_name=region_name,
    opensearch_host=host,
    bucket_name=bucket_name,
    key_list=key_list,
    full_text_index_name=full_text_index_name,
)

# Last expression of the cell: displays the result in the notebook output.
full_text_indexing_result

#### Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#### SPDX-License-Identifier: MIT-0