In [1]:
!pip install -U langchain langchain-ollama langchain-community langchain-huggingface faiss-cpu huggingface_hub pandas python-dotenv




In [3]:
# Import necessary libraries
import os
import pandas as pd
import json
import logging
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate  # Check if 'langchain_core' is valid or needs correction
from langchain_ollama import OllamaLLM  # Corrected import for OllamaLLM
from langchain.vectorstores import FAISS  # Correct import for FAISS
from langchain.embeddings import HuggingFaceEmbeddings  # Corrected import path
import time
import atexit

# Load environment variables
load_dotenv()

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Define Ollama generation arguments
generation_kwargs = {
    "max_tokens": 500,
    "temperature": 0.5
}


In [5]:
# Install remaining packages
!pip install tqdm tenacity tiktoken

# Import necessary libraries
import pandas as pd
import os
from dotenv import load_dotenv
import traceback
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
from huggingface_hub import login
import tiktoken

# Add a huggingface token to .env file
load_dotenv()

# Use the Hugging Face access token for authentication
hf_access_token = os.getenv('HF_TOKEN')

# PATH_TO_DATA = "hf://datasets/zhengyun21/PMC-Patients/PMC-Patients.csv"
PATH_TO_DATA = "hf://datasets/ncbi/Open-Patients/Open-Patients.jsonl"
BATCH_SIZE = 10  # Adjust based on your needs
PATH_TO_VECTORDB = "./db/faiss_index"
TOKEN_LIMIT = 125  # Maximum number of tokens per chunk

# Initialize tokenizer (assuming LLaMA tokenizer, you can replace with a specific tokenizer you use)
tokenizer = tiktoken.get_encoding("cl100k_base")  # Replace with the correct tokenizer for your model if different

# Create the vector DB directory if it doesn't exist
if not os.path.exists(PATH_TO_VECTORDB):
    os.makedirs(PATH_TO_VECTORDB)

# Define retry decorator for API calls
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(min=1, max=60), 
       retry=retry_if_exception_type(Exception))
def load_data_and_process():
    try:
        # Load the dataset (example with JSONL)
        dataset = pd.read_json(PATH_TO_DATA, lines=True)
        print(f"Loaded {len(dataset)} records from {PATH_TO_DATA}")
        
        # You can process your data here (e.g., tokenizing, embedding, etc.)
        # For example, split data into batches and process
        for i in tqdm(range(0, len(dataset), BATCH_SIZE)):
            batch = dataset.iloc[i:i+BATCH_SIZE]
            # Process each batch here (e.g., embeddings, vectorization, etc.)
            # Using FAISS, HuggingFace embeddings, or your desired method.
            
        print("Data processed successfully.")
    except Exception as e:
        logger.error(f"Error processing data: {e}")
        traceback.print_exc()

# Call the data processing function
load_data_and_process()


Loaded 180142 records from hf://datasets/ncbi/Open-Patients/Open-Patients.jsonl


100%|██████████| 18015/18015 [00:00<00:00, 64844.67it/s]


Data processed successfully.


In [6]:
import os
import pandas as pd
import traceback
import logging
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, wait_random_exponential, retry_if_exception_type
import tiktoken

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()
hf_access_token = os.getenv('HF_TOKEN')

# Define constants
PATH_TO_DATA = "hf://datasets/ncbi/Open-Patients/Open-Patients.jsonl"
BATCH_SIZE = 10
PATH_TO_VECTORDB = "./db/faiss_index"
TOKEN_LIMIT = 125  # Maximum number of tokens per chunk

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

# Function to chunk text into smaller pieces
def chunk_text(text, token_limit):
    tokens = tokenizer.encode(text)
    chunks = []
    for i in range(0, len(tokens), token_limit):
        chunk_tokens = tokens[i:i + token_limit]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

# Function to create documents from rows
def create_documents(row):
    if not row['description']:  # Skip rows with empty descriptions
        return []
    chunks = chunk_text(row['description'], TOKEN_LIMIT)
    return [
        Document(
            page_content=chunk,
            metadata={"_id": row['_id']}
        )
        for chunk in chunks
    ]

# Retry function for embedding
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), retry=retry_if_exception_type(Exception))
def embed_with_backoff(embeddings, texts):
    return embeddings.embed_documents(texts)

# Process each batch
def process_batch(batch, embeddings):
    try:
        return embed_with_backoff(embeddings, batch)
    except Exception as e:
        logger.error(f"Error embedding batch: {str(e)}")
        return None

# Main function to build the vector database
def main():
    if os.path.exists(PATH_TO_VECTORDB):
        logger.info('Vector database already exists')
    else:
        try:
            logger.info('Loading data')
            df = pd.read_json(PATH_TO_DATA, lines=True)  # Assuming the JSON is in "lines" format
            logger.info(f"Loaded {len(df)} rows of data.")

            df = df.head(10)  # Limiting to first 10 rows for testing
            total_rows = len(df)

            logger.info('Creating documents')
            documents = []
            for _, row in tqdm(df.iterrows(), total=total_rows, desc="Creating documents"):
                documents.extend(create_documents(row))

            logger.info('Building vector database...')
            embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1", model_kwargs={"trust_remote_code": True})

            batches = [documents[i:i + BATCH_SIZE] for i in range(0, len(documents), BATCH_SIZE)]

            all_embeddings = []
            with ThreadPoolExecutor() as executor:
                futures = [executor.submit(process_batch, [doc.page_content for doc in batch], embeddings) for batch in batches]
                for future in tqdm(as_completed(futures), total=len(futures), desc="Processing batches"):
                    result = future.result()
                    if result is not None:
                        all_embeddings.extend(result)

            valid_docs = [doc for doc, emb in zip(documents, all_embeddings) if emb is not None]
            valid_embeddings = [emb for emb in all_embeddings if emb is not None]

            logger.info(f"Successfully embedded {len(valid_docs)} out of {len(documents)} documents")

            vector_store = FAISS.from_embeddings(
                text_embeddings=list(zip([doc.page_content for doc in valid_docs], valid_embeddings)),
                embedding=embeddings,
                metadatas=[doc.metadata for doc in valid_docs]
            )

            retriever = vector_store.as_retriever()

            logger.info('Saving vector database locally...')
            vector_store.save_local(PATH_TO_VECTORDB)

            logger.info('Done')
            return vector_store, retriever
        except Exception as e:
            logger.error(f"An error occurred: {str(e)}")
            logger.error("Traceback:")
            logger.error(traceback.format_exc())
            return "Error occurred", "Error occurred", [], []

if __name__ == "__main__":
    main()


2025-04-08 03:11:24,217 - INFO - Vector database already exists


In [9]:
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import ChatPromptTemplate
import json
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize Ollama model once (reuse across calls)
model = OllamaLLM(model="llama3.2:3b")

def query_ollama(prompt, generation_kwargs, output_format="text"):
    """
    Sends a prompt to the Ollama model using langchain-ollama integration.
    
    Args:
        prompt (str): The generated prompt to send to the model.
        generation_kwargs (dict): Generation settings for the model.
        output_format (str): The expected format of the output, either "text" or "json".
    
    Returns:
        str or dict: The model's response, either as text or a parsed JSON object.
    """
    try:
        # Create prompt template
        chat_prompt = ChatPromptTemplate.from_template(prompt)
        chain = chat_prompt | model
        
        # Invoke the chain with the prompt
        response = chain.invoke({"question": prompt})
        
        # Check output format
        if output_format == "json":
            try:
                result = json.loads(response)
                return result
            except json.JSONDecodeError:
                logger.error(f"Failed to decode JSON response: {response}")
                return {
                    "error": "Error in generating response or unexpected response format."
                }
        else:
            return response

    except Exception as e:
        logger.error(f"An error occurred during the Ollama query: {str(e)}")
        return {"error": "An error occurred during the query."}


In [11]:
def generate_summary(article_text, query_ollama, generation_kwargs):
    """
    Generates a concise summary of the article text, focusing on relevant phenotypes, diagnoses, 
    and other attributes of the patient.

    Args:
    - article_text (str): The full text of the article that needs to be summarized.
    - query_ollama (function): The function to query the Ollama model.
    - generation_kwargs (dict): Additional generation arguments for the model.
    
    Returns:
    - str: The summary of the article text.
    """
    prompt = f"""
    Summarize the following patient case in a single paragraph, focusing on relevant phenotypes, diagnoses, and other attributes of the patient.
    The summary should be concise and retain important scientific or clinical terminology.
    ### Article Text:
    {article_text}
    """.strip()

    response = query_ollama(prompt, generation_kwargs, output_format="text")
    return response


def generate_potential_causes(patient_case, article_summaries, query_ollama, generation_kwargs):
    """
    Generates a prompt to query Ollama for potential causes based on a patient's case and article summaries.

    Args:
    - patient_case (str): The user's provided detailed patient case.
    - article_summaries (list): List of summaries from similar patient cases in the literature.
    - query_ollama (function): The function to query the Ollama model.
    - generation_kwargs (dict): Additional generation arguments for the model.

    Returns:
    - str: The generated prompt to query Ollama.
    """
    examples = "\n\n".join([f"### Example Case {i+1}:\n{summary}" for i, summary in enumerate(article_summaries)])

    prompt = f"""
    A user has provided a detailed patient's case with specific symptoms and medical history.
    Below are some examples of similar patient cases from medical literature.

    Your task is to identify potential causes for the symptoms described in the user's case, using the examples as references.

    ### User-Provided Patient Case:
    {patient_case}

    {examples}

    Please provide:
    - A numbered list of potential causes based on the information.
    - A brief explanation for each cause.
    """
    
    # Query Ollama for potential causes
    response = query_ollama(prompt, generation_kwargs, output_format="text")
    return response


In [14]:
import time
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def ask_me_potential_causes(patient_narrative, max_retries=10, delay=2, k=4):
    """
    Asks for potential causes of a patient's condition based on a provided narrative and retrieves similar cases.
    
    Args:
    - patient_narrative (str): A detailed description of the patient's case.
    - max_retries (int): The maximum number of retry attempts in case of failure.
    - delay (int): The time delay (in seconds) between retry attempts.
    - k (int): The number of similar cases to retrieve from the vector store.
    
    Returns:
    - str: The potential causes for the patient case, or a failure message if retries are exceeded.
    """
    retries = 0
    while retries < max_retries:
        try:
            logger.info(f"Attempt {retries+1}/{max_retries} - Processing Patient Case: {patient_narrative}")
            
            # Retrieve similar cases from FAISS vector store
            docs = vector_store.similarity_search_with_score(patient_narrative, k=k)
            article_summaries = [doc.page_content for doc, score in docs]

            # Check if summaries are retrieved successfully
            if not article_summaries:
                logger.error("No similar cases found.")
                return "No similar cases found."

            # Save related cases in human-readable format to a .txt file
            output_path_cases_txt = "related_patient_cases.txt"
            try:
                with open(output_path_cases_txt, 'w') as txt_file:
                    txt_file.write("Related Patient Cases (Double-Spaced):\n\n")
                    for i, summary in enumerate(article_summaries):
                        txt_file.write(f"Case {i+1}:\n{summary}\n\n")
                logger.info(f"Related patient cases saved to {output_path_cases_txt} in human-readable format.")
            except IOError as e:
                logger.error(f"Error writing related cases to file: {e}")
                return "Error saving related patient cases."

            # Generate a prompt to find potential causes
            prompt = generate_potential_causes(patient_narrative, article_summaries)
            potential_causes = query_ollama(prompt, generation_kwargs, output_format="text")

            # Check if potential causes were generated
            if not potential_causes:
                logger.error("No potential causes generated.")
                return "No potential causes generated."

            # Save potential causes to a .txt file
            output_path_causes_txt = "potential_causes.txt"
            try:
                with open(output_path_causes_txt, 'w') as txt_file:
                    txt_file.write("Potential Causes for Patient Case:\n\n")
                    txt_file.write(potential_causes)
                logger.info(f"Potential causes saved to {output_path_causes_txt}.")
            except IOError as e:
                logger.error(f"Error writing potential causes to file: {e}")
                return "Error saving potential causes."

            return potential_causes

        except Exception as e:
            retries += 1
            logger.error(f"Error occurred: {e}. Retrying {retries}/{max_retries}...")
            time.sleep(delay)

    logger.warning("Max retries reached, could not complete the request.")
    return "Max retries reached, could not complete the request."


# Example usage
if __name__ == "__main__":
    patient_narrative = "64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c"
    result = ask_me_potential_causes(patient_narrative)
    logger.info("Potential causes identified and saved.")


2025-04-08 03:17:06,639 - INFO - Attempt 1/10 - Processing Patient Case: 64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c
2025-04-08 03:17:06,641 - ERROR - Error occurred: name 'vector_store' is not defined. Retrying 1/10...
2025-04-08 03:17:08,644 - INFO - Attempt 2/10 - Processing Patient Case: 64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c
2025-04-08 03:17:08,646 - ERROR - Error occurred: name 'vector_store' is not defined. Retrying 2/10...
2025-04-08 03:17:10,648 - INFO - Attempt 3/10 - Processing Patient Case: 64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c
2025-04-08 03:17:10,649 - ERROR - Error occurred: name 'vector_store' is not defined. Retrying 3/10...
2025-04-08 03:17:12,652 - INFO - Attempt 4/10 - Processing Patient Case: 64-year-old obese female with diagnosis of diabetes mellitus and persistently elevated HbA1c
2025-04-08 03:17:12,652 - E