## Shows converting a document into embeddings and storing them in the Amazon OpenSearch Serverless Vector Engine
### Used for semantic search

In [None]:
!pip -q install langchain 
!pip -q install PyPDF2
!pip -q install requests
!pip -q install opensearch-py

### Use the links below to create an OpenSearch Serverless collection and a vector index
https://aws.amazon.com/blogs/big-data/introducing-the-vector-engine-for-amazon-opensearch-serverless-now-in-preview/
https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless-vector-search.html

#### For this sample code the following configuration is used
##### vector index name = "myvectorindex" 
##### vector field name = "myvector" 
##### vector dimension = 4096 (because we use the "GPT-J 6B Embedding" FM to generate embeddings, and it produces embeddings of 4096 dimensions)


### Creating a client for OpenSearch
##### Update the collection id in the host name

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3

In [None]:
# Connection settings for the OpenSearch Serverless collection.
# NOTE(review): the value is passed as the host of OpenSearch(hosts=[...]), so it
# presumably has to be the collection's full endpoint host name (no "https://"),
# not just the bare id — confirm against the collection's endpoint.
host = '<Opensearch Collection Id'  # Update collection id
service = 'aoss'  # service name used when signing the requests
myindex = "myvectorindex"  # vector index name from the configuration listed above

# Resolve region and credentials from a single boto3 session so that both
# always come from the same resolved AWS configuration.
boto_session = boto3.Session()
region = boto_session.region_name
credentials = boto_session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

In [None]:
# OpenSearch client for the collection: requests are signed with `auth` and
# sent over TLS (port 443) with certificate verification enabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    # authentication
    http_auth=auth,
    # transport
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
    pool_maxsize=20,
)

### Read the document file and load its text into a variable

In [None]:
from PyPDF2 import PdfReader

In [None]:
# Use one PDF file, stored in the local ./content folder, as the source document
reader = PdfReader('./content/impromptu-rh.pdf')

In [None]:
# Extract the text from the PDF document.
# Pages whose extract_text() result is empty are skipped; the remaining page
# texts are concatenated in page order without any separator between pages.
raw_text = ''.join(
    text
    for text in (page.extract_text() for page in reader.pages)
    if text
)

In [None]:
# Sanity check: number of characters extracted from the PDF
print(len(raw_text))

## Create chunks of the text. A text embedding is generated for each chunk

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter configuration: chunk_size and chunk_overlap are measured with
# `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [None]:
# Split the extracted text into chunks; each chunk is embedded and indexed separately below
textchunks = text_splitter.split_text(raw_text)
# Number of chunks produced (shown as the cell output)
len(textchunks)

### Using gpt-j-6b FM model from SageMaker JumpStart for embedding
### Deploy gpt-j-6b FM model from SageMaker JumpStart
### Update the SageMaker endpoint name below
### Store each embedding along with its text in the OpenSearch vector index

In [None]:
import sagemaker,json

In [None]:
# Name of the SageMaker endpoint invoked to generate embeddings — replace the placeholder
modelendpoint = "<SageMaker Endpoint Name>"

In [None]:
# Separator appended after each retrieved text when the search hits are combined
thenewline = "\n"

# Lazily created SageMaker runtime client, shared by all get_emedding calls
_sagemaker_runtime_client = None


def _get_sagemaker_runtime_client():
    """Return the shared SageMaker runtime client, creating it on first use."""
    global _sagemaker_runtime_client
    if _sagemaker_runtime_client is None:
        _sagemaker_runtime_client = boto3.client("runtime.sagemaker")
    return _sagemaker_runtime_client


## Call the SageMaker endpoint to get the embedding for the text
def get_emedding(inputtext, runtime_client=None):
    """Return the embedding the SageMaker endpoint produces for ``inputtext``.

    Args:
        inputtext: Text to embed; sent UTF-8 encoded with content type
            ``application/x-text``.
        runtime_client: Optional object exposing ``invoke_endpoint`` (for
            example a boto3 SageMaker runtime client). When omitted, a single
            shared client is created on first use and reused afterwards, so
            embedding many chunks does not build a new client per call.

    Returns:
        The first element of the ``"embedding"`` field of the endpoint's JSON
        response.

    Raises:
        KeyError: If the JSON response has no ``"embedding"`` field.
    """
    # Named runtime_client (not `client`) so it cannot be confused with the
    # module-level OpenSearch `client`.
    if runtime_client is None:
        runtime_client = _get_sagemaker_runtime_client()
    response = runtime_client.invoke_endpoint(
        EndpointName=modelendpoint,
        ContentType="application/x-text",
        Body=inputtext.encode("utf-8"),
    )
    model_predictions = json.loads(response["Body"].read())
    # "embedding" is indexed with [0]: presumably the endpoint returns one
    # embedding per input text — confirm against the model's response format.
    return model_predictions["embedding"][0]

## Build the record for the vector index and store it
def create_document(vec,txt):
    """Index one record holding an embedding and the text it was computed from.

    Args:
        vec: Embedding stored under the "myvector" field.
        txt: Text stored under the "mytext" field.
    """
    record = dict(myvector=vec, mytext=txt)
    client.index(index=myindex, body=record)

In [None]:
## Go over all the text chunks and insert each one into the vector index
# NOTE(review): create_document passes no document id, so re-running this cell
# presumably indexes every chunk a second time — confirm before re-running.
for txt in textchunks:
    vec = get_emedding(txt)
    create_document(vec,txt)

### k-NN search against the OpenSearch vector index

In [None]:
# Embed the query text with the same endpoint used for the documents,
# then run a k-NN query (k = 3) against the "myvector" field.
query = "Professor Mintz, frankly, excelled as a professor long before the development of LLMs."
vquery = get_emedding(query)
search_query = {
    "query": {
        "knn": {
            "myvector": {
                "vector": vquery,
                "k": 3,
            },
        },
    },
}
response = client.search(body=search_query, index=myindex)

In [None]:
# Show the stored text of the n-th hit (n = 0 is the first hit returned)
n = 0
print(response["hits"]["hits"][n]["_source"]["mytext"])

In [None]:
# Combine the stored text of every hit into one context string,
# with a newline appended after each hit's text.
fullcontext = "".join(
    hit["_source"]["mytext"] + thenewline
    for hit in response["hits"]["hits"]
)
print(fullcontext)