# Splitting and Embedding Text Using LangChain

In [1]:
pip install -r ./requirements.txt -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the .env file located by find_dotenv() into the process environment.
# override=True is passed so that values from the file take precedence over
# variables that are already set.
# NOTE(review): the OpenAI and Pinecone clients created later are constructed
# without explicit keys, so they presumably read them from these variables --
# confirm against your .env.
load_dotenv(find_dotenv(), override=True)

True

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the source document. The encoding is stated explicitly so the read does
# not depend on the platform's default locale; the text contains non-ASCII
# characters (for example the section sign).
with open('./fhir-endpoint-for-base-urls.txt', encoding='utf-8') as f:
  fhir_endpoint = f.read()

# chunk_size is the maximum size of a chunk and chunk_overlap the amount
# shared between neighbouring chunks, both measured with length_function
# (len, i.e. number of characters).
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=100,  # deliberately small for this demo; would normally be higher
  chunk_overlap=20,
  length_function=len
)

In [19]:
# Split the raw text into Document chunks with the splitter configured earlier.
chunks = text_splitter.create_documents([fhir_endpoint])
# Uncomment to inspect the text of a single chunk:
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)} chunks.')

Now you have 466 chunks.


## Embedding Cost

In [20]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
  """Print the token count of `texts` and the estimated cost of embedding them.

  Args:
    texts: iterable of objects exposing a `page_content` string (for example
      the documents produced by a text splitter).
    model: name of the embedding model whose tokenizer is used for counting.
    price_per_1k_tokens: price in USD per 1,000 tokens. The default is the
      value this notebook was written with; check it against the provider's
      current price list before relying on the estimate.

  Returns:
    None. The totals are printed to stdout.
  """
  # Imported here so the dependency is only required when a cost estimate is
  # actually requested.
  import tiktoken
  enc = tiktoken.encoding_for_model(model)
  # Generator expression: no intermediate list of per-document counts is built.
  total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
  cost_usd = total_tokens / 1000 * price_per_1k_tokens
  print(f'Total Tokens: {total_tokens}')
  print(f'Embedding Cost in USD: {cost_usd:.6f}')

# Estimate the cost of embedding every chunk before any embedding request is sent.
print_embedding_cost(chunks)

Total Tokens: 8048
Embedding Cost in USD: 0.003219


In [21]:
from langchain_openai.embeddings import OpenAIEmbeddings
# The model is pinned explicitly so it stays consistent with the tokenizer used
# for the cost estimate and with the 1536-dimension Pinecone index created
# later, even if the library's default embedding model changes.
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')

In [None]:
# Embed a single chunk as a smoke test. One request is enough: an extra call
# whose result is immediately overwritten only costs an additional API request.
vector = embedding.embed_query(chunks[0].page_content)

# Show the size and the first few values instead of dumping the whole vector.
print(f'Embedding dimension: {len(vector)}')
print(vector[:5])

# Inserting the Embeddings into a Pinecone Index

In [15]:
import pinecone
from langchain_community.vectorstores import Pinecone
# Create the Pinecone client used for index management below.
# NOTE(review): no API key is passed here, so it is presumably taken from the
# environment (e.g. a PINECONE_API_KEY entry loaded from .env) -- confirm.
pc = pinecone.Pinecone()

  from tqdm.autonotebook import tqdm


In [16]:
# WARNING: destructive. Every index the client can list is deleted, so the
# notebook starts from a clean slate.
existing_indexes = pc.list_indexes().names()
for existing_index in existing_indexes:
  print('deleting all indexes')
  pc.delete_index(existing_index)
  print('Done')

deleting all indexes
Done


In [18]:
# Create the index only when it does not exist yet, so the cell can be re-run.
index_name = 'fhir-endpoint-for-base-urls'
known_indexes = pc.list_indexes().names()
if index_name not in known_indexes:
  print(f'Creating index {index_name}')
  # NOTE(review): dimension has to match the length of the vectors produced by
  # the embedding model in use -- confirm if the model is changed.
  pod_spec = pinecone.PodSpec(environment='gcp-starter')
  pc.create_index(name=index_name, dimension=1536, metric='cosine', spec=pod_spec)
  print('Done')

Creating index fhir-endpoint-for-base-urls
Done


In [22]:
# Build the vector store from the document chunks.
#   chunks     - the documents produced by the text splitter
#   embedding  - the embeddings object to use (here: OpenAIEmbeddings)
#   index_name - name of the Pinecone index chosen above
vector_store = Pinecone.from_documents(
  chunks,
  embedding,
  index_name=index_name,
)

In [None]:
# Alternative to building from documents: attach to an index that already
# holds the embeddings. The index name is spelled out so this cell can be run
# on its own.
vector_store = Pinecone.from_existing_index(
  index_name='fhir-endpoint-for-base-urls',
  embedding=embedding,
)

# Asking Questions (Similarity Search)

In [23]:
# Retrieve the chunks most similar to the question. No result count is passed,
# so the vector store's default is used (the recorded run returned 4 chunks).
query = 'What is the deadline for this requirement?'
result = vector_store.similarity_search(query)
# Uncomment to dump the raw result objects:
# print(result);

In [24]:
# Print the text of each retrieved chunk, followed by a divider line.
divider = '-' * 50
for doc in result:
  print(doc.page_content)
  print(divider)

We proposed that Certified API Developers publish these standardized details by December 31, 2024.
--------------------------------------------------
90-days would be considered in violation of this proposed requirement.
--------------------------------------------------
that for the time period between when this final rule is effective and December 31, 2024, that
--------------------------------------------------
Rule to provide industry an opportunity to coalesce on specifications. We finalized § 170.404(b)(2)
--------------------------------------------------


In [25]:
from langchain.chains import RetrievalQA
from langchain_openai.chat_models import ChatOpenAI
# temperature=0 keeps the answers as repeatable as the API allows, so
# re-running the notebook gives comparable output for the same question.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

# Retriever over the vector store that returns the 3 most similar chunks.
retriever = vector_store.as_retriever(
  search_type='similarity',
  search_kwargs={
    'k': 3
  }
)

# Question-answering chain that feeds the retrieved chunks to the LLM.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [32]:
# Alternative questions to try:
# query = 'What is the deadline for this requirement?'
# query = 'What FHIR Resources are used in this rule?'
# query = 'Give me a detailed explanation of what needs to be implemented as a requirement for Healthcare API Developers in this rule?'
query = 'Do I need to add address to Organization resource'
# Chain.run is deprecated in favour of invoke. RetrievalQA takes the question
# under the 'query' key and returns a dict whose 'result' entry is the answer.
answer = chain.invoke({'query': query})['result']
print(answer)

Based on the provided context, there is no specific mention of an address being required in the Organization resource. However, the organization details that should be used by app developers include endpoints. If address information is not specified as a required element in the Organization resource in your specific context, then it may not be necessary to add it. It would be best to refer to the complete documentation for clear guidance on which elements are required for the Organization resource.
