## Steps to convert a document (PDF) into text embeddings
### Can easily be adapted to other document types and to multiple documents

In [None]:
!pip -q install langchain 
!pip -q install PyPDF2

### Read the document file and load its text into a variable

In [None]:
from PyPDF2 import PdfReader

In [None]:
# Open the source PDF; the path is relative to the current working directory.
reader = PdfReader('./content/impromptu-rh.pdf')

In [None]:
# Extract the text from the PDF document.
# Pages for which extract_text() yields nothing (None or '') contribute an
# empty string, so they are skipped exactly as before. A single join avoids
# rebuilding the growing string once per page.
raw_text = ''.join(page.extract_text() or '' for page in reader.pages)

In [None]:
# Total number of characters extracted from the document.
print(len(raw_text))

### Create chunks of the text.
#### A text embedding is generated for each chunk and stored in a vector database

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter configuration: chunk size 1000 with an overlap of 200 between
# neighbouring chunks, both measured with len() (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [None]:
# Split the extracted text into chunks and show how many were produced.
textchunks = text_splitter.split_text(raw_text)
len(textchunks)

In [None]:
# Inspect one sample chunk. The index 12 is hard-coded, so this fails if the
# document yields fewer than 13 chunks.
textchunks[12]

### Call a JumpStart model, or any other model that creates text embeddings

In [None]:
import sagemaker, boto3, json, numpy

In [None]:
# Console formatting helpers: a newline, plus the ANSI escape sequences that
# turn bold text on and reset formatting.
thenewline = "\n"
bold = "\033[1m"
unbold = "\033[0m"

def query_endpoint(encoded_text, endpoint_name):
    """Invoke a SageMaker endpoint with a text payload and return the raw response.

    Args:
        encoded_text: Request payload, sent as the ``Body`` of the call with
            content type ``application/x-text``. The caller in this notebook
            passes UTF-8 encoded bytes.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The response returned by ``invoke_endpoint``, unmodified. Callers in
        this notebook read the payload from its ``"Body"`` entry.
    """
    # A new runtime client is created on every call.
    runtime = boto3.client("runtime.sagemaker")
    request = {
        "EndpointName": endpoint_name,
        "ContentType": "application/x-text",
        "Body": encoded_text,
    }
    return runtime.invoke_endpoint(**request)


def parse_response(query_response):
    """Return the ``"generated_text"`` field of an endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry exposes ``read()``
            and yields a JSON document.

    Returns:
        The value stored under ``"generated_text"`` in the decoded JSON.

    Raises:
        KeyError: If the decoded JSON has no ``"generated_text"`` key.

    NOTE(review): the embedding cell in this notebook reads the
    ``"embedding"`` key directly and does not call this helper.
    """
    payload = query_response["Body"].read()
    return json.loads(payload)["generated_text"]

In [None]:
# Embed a single sample chunk (same hard-coded index as the preview above).
inputtext = textchunks[12]
# Placeholder endpoint name; replace with the name of the deployed endpoint.
modelendpoint = "jumpstart-XXXXXXXXX-textembedding-gpt-j-6b"

# The endpoint is sent the chunk as UTF-8 encoded bytes.
query_response = query_endpoint(inputtext.encode("utf-8"), endpoint_name=modelendpoint)
# Decode the JSON body and take its "embedding" field. parse_response() is
# not used here because it reads "generated_text" instead.
model_predictions = json.loads(query_response["Body"].read())
text_embedding = model_predictions["embedding"]

In [None]:
# Length of the first entry of the returned embedding, presumably the vector
# dimension; confirm against the model's response format.
len(text_embedding[0])

In [None]:
# Print the full embedding returned by the endpoint.
print(text_embedding)