In [3]:
# !pip install faiss-cpu
# !pip install -U sentence-transformers
# !pip install langchain-community

In [4]:
# "/content/drive/MyDrive/Colab Notebooks/GenAi/Data/icd10cm 1.pdf"

import os
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# this
# Load the PDF document
pdf_path = "/content/drive/MyDrive/Colab Notebooks/GenAi/Data/icd10cm 1.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split the documents into smaller chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create embeddings using SentenceTransformer
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
#Get the page content of each text chunk for embedding
page_contents = [text.page_content for text in texts]
embeddings = embedding_model.embed_documents(page_contents)

# Create text-embedding pairs using page contents instead of Document objects
text_embedding_pairs = zip(page_contents, embeddings)

# Build FAISS index
# !pip install faiss-cpu
from langchain_community.vectorstores import FAISS
faiss = FAISS.from_embeddings(text_embedding_pairs, embedding_model)

# Save the vector store for later use
# faiss.save()
# Save the FAISS index to a local directory
index_path = 'icd10cm_faiss_index' # Specify the directory to save the index
faiss.save_local(index_path)



In [6]:
# loading saved index

from langchain_community.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings

# Specify the path to the saved index
index_path = "/content/drive/MyDrive/Colab Notebooks/GenAi/icd10cm_faiss_index"

# Load the FAISS index
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
loaded_faiss = FAISS.load_local(index_path, embeddings=embedding_model, allow_dangerous_deserialization=True)

RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:67: Error: 'f' failed: could not open /content/drive/MyDrive/Colab Notebooks/GenAi/icd10cm_faiss_index/index.faiss for reading: No such file or directory

In [None]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
# Define your query
query = "What is ICD code for neoplasm"

# Perform similarity search
# docs_and_scores = loaded_faiss.similarity_search_with_score(query, k=5)  # Get top 5 similar documents with scores

#This is safer as it explicitly handles the possible cases of having either the score or metadata returned
results = loaded_faiss.similarity_search_with_score(query, k=5)

# Print the results
for doc, score in results:
    print(f"Document: {doc.page_content}\nScore: {score}\n")

Document: Code First/Use Additional Code notes (etiology/manifestation paired codes)
Certain conditions have both an underlying etiology and multiple body system manifestations due to the underlying
etiology. For such conditions the ICD-10-CM has a coding convention that requires the underlying condition be sequenced
first followed by the manifestation. Wherever such a combination exists there is a 'use additional code' note at the etiology
code, and a 'code first' note at the manifestation code. These instructional notes indicate the proper sequencing order of
the codes, etiology followed by manifestation.
In most cases the manifestation codes will have in the code title, 'in diseases classified elsewhere.' Codes with this title
are a component of the etiology/ manifestation convention. The code title indicates that it is a manifestation code. 'In
diseases classified elsewhere' codes are never permitted to be used as first listed or principal diagnosis codes. They must
Score: 0.986278

In [None]:
import os
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader


In [None]:
from langchain.llms import AzureOpenAI
from langchain.chains import RetrievalQA
import dotenv

# dotenv.load_dotenv()
dotenv.load_dotenv(os.path.join(os.path.dirname(__file__), './.env'))


False

In [None]:
# Azure OpenAI Configuration
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"  # Update with your desired API version
os.environ["OPENAI_API_BASE"] = "YOUR_AZURE_OPENAI_ENDPOINT"  # Replace with your endpoint
os.environ["OPENAI_API_KEY"] = "YOUR_AZURE_OPENAI_API_KEY"  # Replace with your API key

# Load the FAISS index (assuming it's already loaded as 'loaded_faiss')
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Initialize Azure OpenAI LLM
llm = AzureOpenAI(deployment_name="YOUR_DEPLOYMENT_NAME")  # Replace with your deployment name

# Create RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=loaded_faiss.as_retriever(),
    return_source_documents=True,  # Include source documents in the response
)

# Define your query
query = "What is ICD code for neoplasm"

# Run the query
result = qa_chain({"query": query})

# Print the result
print(f"Answer: {result['result']}")
print(f"Source Documents: {result['source_documents']}")

In [None]:
# Function to classify a patient's description
def classify_patient_description(description):
    # Generate embedding for the patient description
    description_embedding = embedding_model.embed_query(description)

    # Perform a similarity search in the FAISS index
    results = faiss_index.similarity_search(description_embedding, k=5)  # Get top 5 matches
    return results

# Example patient description
patient_description = ("The patient reports feeling unusually fatigued and thirsty over the past month, "
                       "with increased urination, especially at night. The patient has also noticed "
                       "unexplained weight loss despite eating regularly. There is a family history of diabetes, "
                       "and the patient has not been physically active in recent years. No prior diagnosis of "
                       "diabetes, but blood sugar levels were borderline high during a previous checkup.")

# Classify the patient's description
classification_results = classify_patient_description(patient_description)

# Output the classification results
for result in classification_results:
    print(f"ICD Code: {result.metadata['icd_code']}, Description: {result.metadata['description']}")
