# Create Amazon OpenSearch index

This notebook demonstrates how to create an Amazon OpenSearch index from a PDF document, generating the embeddings with the GPT-J 6B FP-16 model deployed in the previous notebook, [02-Deploy-GPT-J-Embeddings](./02-Deploy-GPT-J-Embeddings.ipynb).

**SageMaker Studio Kernel**: Data Science 3.0

In this exercise you will:
 - Use Amazon Textract to generate text files from a PDF
 - Create an Amazon OpenSearch index and index documents by using embeddings generated by GPT-J 6B FP-16
 - Test the end-to-end solution by querying documents and generating a response by using the two ML models

***

# Step 1 - Import Modules

Here we’ll import some libraries and define some variables.

In [None]:
import boto3
import sagemaker.session

In [None]:
# boto3 clients for the AWS services referenced in this notebook.
s3_client = boto3.client("s3")
# Runtime client used further down to invoke the deployed embedding endpoint.
sagemaker_runtime = boto3.client('sagemaker-runtime')
textract_client = boto3.client("textract")

Create a SageMaker session and save the default bucket, the region, and the execution role in Python variables.

In [None]:
# Session object used in the next cell to look up the default S3 bucket.
sagemaker_session = sagemaker.Session()

In [None]:
# Default S3 bucket of the SageMaker session.
bucket_name = sagemaker_session.default_bucket()
# AWS region configured for a fresh boto3 session.
region = boto3.session.Session().region_name
# Execution role as reported by the SageMaker SDK.
role = sagemaker.get_execution_role()

***

# Step 2 - Create txt files from unstructured documents

Here we convert different document types, such as PDF and DOCX files, into `.txt` files.

In [None]:
# Path-like prefixes for input and output data.
# NOTE(review): presumably S3 key prefixes inside `bucket_name`; they are not
# referenced by the cells visible here -- confirm where they are used.
s3_input_file_path = "gen-ai-qa/data/input"
s3_output_file_path = "gen-ai-qa/data/output"

## Use Amazon Textract for extracting text from PDF

In this example, we are using the public Amazon Shareholder letter PDF file

In [None]:
import os

In [None]:
# Local directory that is scanned for .txt files in the next cell.
input_file_path = "./data/input"

In [None]:
# Collect the .txt files that sit directly inside the input directory
# (sub-directories are not descended into).
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make files[0] -- picked in the next cell -- reproducible across runs.
files = sorted(
    f
    for f in os.listdir(input_file_path)
    if f.endswith(".txt") and os.path.isfile(os.path.join(input_file_path, f))
)

In [None]:
# Fail with an explicit message rather than a bare IndexError when the input
# directory contains no .txt files.
if not files:
    raise FileNotFoundError(f"No .txt files found in {input_file_path!r}")

# Work with the first file of the list.
file_name = files[0]

# Bare expression so the notebook displays the selected file name.
file_name

# Step 3 - Index documents in Amazon OpenSearch

Starting from the text files, we are going to index the documents in OpenSearch

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import re
from tqdm import tqdm

In [None]:
# Upper bound on chunk length handed to the text splitter (its `chunk_size`).
CHUNK_SIZE = 768
# Local directory whose files are read and split into passages below.
output_file_path = "./data/output"

In [None]:
def doc_iterator(dir_path: str):
    """Yield the name, page label and text of every file under ``dir_path``.

    The directory tree is walked recursively with ``os.walk``.

    Args:
        dir_path: Root directory to scan.

    Yields:
        Tuples ``(filename, page, file_contents)`` where ``filename`` is the
        base name of the file, ``page`` is the text after the last ``_`` in
        the part of the name that precedes the first ``.`` (for example
        ``"letter_12.txt"`` gives ``"12"``), and ``file_contents`` is the
        complete text of the file.
    """
    for root, _, filenames in os.walk(dir_path):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # os.walk can list entries that are not regular files (e.g.
            # broken symlinks); skip them before doing any other work.
            if not os.path.isfile(file_path):
                continue

            page = filename.split(".")[0].split("_")[-1]

            # Decode explicitly as UTF-8 so the text does not depend on the
            # locale of the machine running the notebook.
            # NOTE(review): assumes the .txt files were written as UTF-8.
            with open(file_path, "r", encoding="utf-8") as file:
                file_contents = file.read()

            # Yield only after the file is closed, so no handle stays open
            # while the consumer is processing the document.
            yield filename, page, file_contents

In [None]:
# Split every document found under `output_file_path` into overlapping
# passages and collect them, with their source file and page, in `chunks`.
chunks = []
total_passages = 0

# The splitter configuration is the same for every document, so it is built
# once here instead of once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_overlap=133,
)

for doc_name, page, doc in tqdm(doc_iterator(output_file_path)):
    # Clean up line breaks before splitting:
    doc = re.sub(r"(\w)-\n(\w)", r"\1\2", doc)  # re-join words hyphenated across a line break
    doc = re.sub(r"(?<!\n)\n(?!\n)", " ", doc)  # a lone newline becomes a space
    doc = re.sub(r"\n{2,}", "\n", doc)  # runs of newlines collapse to one

    tmp_chunks = text_splitter.split_text(doc)
    n_passages = len(tmp_chunks)

    chunks.extend(
        {
            "file_name": doc_name,
            "page": page,
            "passage": chunk,
        }
        for chunk in tmp_chunks
    )
    total_passages += n_passages

    print(f'Document segmented into {n_passages} passages')

print(f'Total passages to index: {total_passages}')

## Create OpenSearch index

In [None]:
import json
import requests
from requests.auth import HTTPBasicAuth
import time

In [None]:
# Connection settings for the OpenSearch domain. Fill these in before running
# the cells below, and avoid saving real credentials with the notebook.
es_username = ""
es_password = ""

# NOTE(review): presumably the full domain URL including the scheme
# (e.g. "https://..."), since it is passed to `requests` as-is -- confirm.
domain_endpoint = ""
domain_index = "genai-qa-bpistone"

# Drop any trailing slash so the index URL never contains "//".
URL = f'{domain_endpoint.rstrip("/")}/{domain_index}'

print(es_username)
# Never echo the password itself: cell output is stored inside the notebook.
print("<hidden>" if es_password else "<not set>")
print(URL)

In [None]:
# Index-level settings: enable k-NN search for this index.
index_settings = {"index": {"knn": True}}

# k-NN vector field that stores one embedding per passage.
# NOTE(review): `dimension` presumably has to match the length of the vectors
# returned by the embedding endpoint -- confirm.
# NOTE(review): `similarity` does not look like a documented parameter of the
# OpenSearch `knn_vector` type (the distance is normally chosen through
# `method.space_type`) -- confirm against the target OpenSearch version.
embedding_field = {
    "type": "knn_vector",
    "dimension": 4096,
    "similarity": "cosine",
}

# Request body used to create the index: the vector field plus three plain
# text fields carrying the passage and where it came from.
mapping = {
    "settings": index_settings,
    "mappings": {
        "properties": {
            "embedding": embedding_field,
            **{
                field_name: {"type": "text"}
                for field_name in ("file_name", "page", "passage")
            },
        },
    },
}

In [None]:
# (Re)create the index: delete it when it already exists, then create it with
# the mapping defined above.
auth = HTTPBasicAuth(es_username, es_password)

response = requests.head(URL, auth=auth)

if response.status_code == 200:
    # The index is present: drop it so it can be rebuilt from scratch.
    print('Index already exists!')
    response = requests.delete(URL, auth=auth)

    print(response.text)
    response.raise_for_status()
elif response.status_code != 404:
    # Only 404 means "index missing". Any other status (authentication
    # failure, server error, redirect, ...) must not be mistaken for an
    # existing index and trigger a DELETE, so stop here instead.
    response.raise_for_status()
    raise RuntimeError(
        f'Unexpected status {response.status_code} while checking index {URL}'
    )

response = requests.put(URL, auth=auth, json=mapping)
if not response.ok:
    # Show the server's explanation before raising, and do not claim success.
    print(response.text)
    response.raise_for_status()
print(f'Index created: {response.text}')

### Encode passages (chunks) using JumpStart's GPT-J text embedding model and ingest to OpenSearch

In [None]:
# Name of the SageMaker endpoint invoked below to embed each passage.
endpoint_name_gpt_j = "gpt-j-qa-endpoint"

In [None]:
%%time

i = 1
for chunk in chunks:
    payload = {'text_inputs': [chunk["passage"]]}
    payload = json.dumps(payload).encode('utf-8')

    response = sagemaker_runtime.invoke_endpoint(EndpointName=endpoint_name_gpt_j,
                                                ContentType='application/json',
                                                Body=payload)

    model_predictions = json.loads(response['Body'].read())
    embedding = model_predictions['embedding'][0]

    document = {
        'embedding': embedding,
        'file_name': chunk["file_name"],
        'page': chunk["page"],
        "passage": chunk["passage"]
    }

    response = requests.post(f'{URL}/_doc/{i}', auth=HTTPBasicAuth(es_username, es_password), json=document)
    i += 1

    print(response.text)

    if response.status_code not in [200, 201]:
        print(response.status_code)
        print(response.text)
        break