# Splitting and Embedding Text Using LangChain

In [None]:
pip install -r ./requirements.txt -q

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the whole source document into memory. The encoding is passed explicitly
# so decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is stored as UTF-8 — adjust if it is not.
with open('./docs/fhir-endpoint-for-base-urls.txt', encoding='utf-8') as f:
  fhir_endpoint = f.read()

# Splitter configuration:
#   chunk_size      - upper bound on a chunk's length as measured by length_function;
#                     100 is deliberately small for this demo and would normally be higher
#   chunk_overlap   - amount of text neighbouring chunks may share
#   length_function - len, so lengths are counted in characters
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=100,
  chunk_overlap=20,
  length_function=len,
)

In [None]:
# Turn the loaded text into a list of Document chunks using the configured splitter.
chunks = text_splitter.create_documents(texts=[fhir_endpoint])
# Uncomment to inspect the text of a single chunk:
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)} chunks.')

## Embedding Cost

In [None]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
  """Print the token count of `texts` and the estimated cost of embedding them.

  Args:
    texts: Iterable of objects exposing a `page_content` string (for example
      the document chunks produced by the text splitter).
    model: Model name used to select the tiktoken encoding.
    price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps the
      rate this notebook was written with.
      NOTE(review): provider pricing changes over time — confirm the rate
      against the current price list before relying on the estimate.

  Returns:
    None. The totals are printed to stdout.
  """
  # Imported here so the dependency is only needed when the estimate is run.
  import tiktoken

  enc = tiktoken.encoding_for_model(model)
  # A generator avoids building an intermediate list of per-chunk token counts.
  total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
  print(f'Total Tokens: {total_tokens}')
  print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')


print_embedding_cost(chunks)

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings
# Embeddings client built with library defaults: no model or API key is passed here.
# NOTE(review): presumably the API key comes from the environment populated by
# load_dotenv — confirm before running.
embedding = OpenAIEmbeddings()

In [None]:
# Embed the first chunk to see what a raw embedding looks like.
# Only one request is made: embedding a throwaway string first would be
# overwritten immediately and would just cost an extra API call.
vector = embedding.embed_query(chunks[0].page_content)

# Prints the full list of floats returned for the chunk.
print(vector)

# Inserting the Embeddings into a Pinecone Index

In [None]:
import pinecone
from langchain_community.vectorstores import Pinecone
# Two similarly named objects are in play: `pinecone.Pinecone` is the client from
# the pinecone package, while the imported `Pinecone` is the LangChain vector store.
# The client is created with no explicit arguments.
# NOTE(review): presumably the API key is read from the environment — confirm.
pc = pinecone.Pinecone()

In [None]:
# Destructive cleanup: delete every index the client currently lists.
# The progress messages are printed once per index.
existing_index_names = pc.list_indexes().names()
for existing_name in existing_index_names:
  print('deleting all indexes')
  pc.delete_index(existing_name)
  print('Done')

In [None]:
# Name of the index that will hold the document embeddings.
index_name = 'fhir-endpoint-for-base-urls'

# Create the index only when the client does not already list it.
if index_name not in pc.list_indexes().names():
  print(f'Creating index {index_name}')
  starter_spec = pinecone.PodSpec(environment='gcp-starter')
  # dimension must equal the length of the vectors that will be stored.
  pc.create_index(name=index_name, dimension=1536, metric='cosine', spec=starter_spec)
  print('Done')

In [None]:
# Build the vector store from the document chunks:
#   documents  - the chunks created from the source text by the text splitter
#   embedding  - the embeddings object in use (here an OpenAIEmbeddings instance)
#   index_name - the index name chosen for this document
vector_store = Pinecone.from_documents(
  documents=chunks,
  embedding=embedding,
  index_name=index_name,
)

In [None]:
# Alternative to building from documents: attach to an index that already exists.
# This rebinds vector_store.
vector_store = Pinecone.from_existing_index(
  embedding=embedding,
  index_name='fhir-endpoint-for-base-urls',
)

# Asking Questions (Similarity Search)

In [None]:
# Fetch the stored chunks most similar to the question.
# No result count is passed, so the library default applies.
query = 'What is the deadline for this requirement?'
result = vector_store.similarity_search(query=query)
# Uncomment to dump the raw result objects:
# print(result)

In [None]:
# Show the text of each retrieved chunk, followed by a 50-dash divider line.
for doc in result:
  print(doc.page_content, '-' * 50, sep='\n')

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai.chat_models import ChatOpenAI
# Chat model that writes the final answer.
llm = ChatOpenAI(temperature=1, model='gpt-3.5-turbo')

# Retriever over the vector store: similarity search limited to 3 results.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm, chain_type='stuff')

In [None]:
# Alternative questions to try (swap one in for the active query below):
# query = 'What is the deadline for this requirement?'
# query = 'What FHIR Resources are used in this rule?'
# query = 'Give me a detailed explanation of what needs to be implemented as a requirement for Healthcare API Developers in this rule?'
query = 'What are the "must support" elements in this rule?'
# Chain.run is deprecated in LangChain 0.1+, so the chain is called via invoke.
# RetrievalQA takes the question under the 'query' key and returns a dict whose
# 'result' entry holds the answer text.
answer = chain.invoke({'query': query})['result']
print(answer)