In [6]:
import re
from typing import List, Dict

def chunk_actionplan_by_topic_gene(text: str) -> List[List[Dict]]:
    """Split an ActionPlan document into age-range chunks with parallel metadata.

    The text is cut into sections at every ``Topic:`` or ``Gene:`` marker. The
    most recently seen topic and gene are carried forward from section to
    section, and every ``Min Age: N`` / ``Max Age: N`` block found inside a
    section becomes one chunk.

    Args:
        text: Raw document text containing ``Topic:``, ``Gene:``, ``Min Age:``
            and ``Max Age:`` lines.

    Returns:
        A two-element list ``[chunks, metadata]``:

        * ``chunks`` - list of ``{"text": <stripped block>}`` dicts.
        * ``metadata`` - list of dicts, one per chunk and in the same order,
          with keys ``id`` (1-based counter rendered as a string), ``topic``,
          ``gene``, ``min_age`` and ``max_age`` (ints). ``topic`` / ``gene``
          are ``"Unknown"`` when the latest header is missing or empty.
    """
    chunks: List[Dict] = []
    metadata: List[Dict] = []
    current_topic = None
    current_gene = None

    # Zero-width split: each "Topic:" / "Gene:" header stays at the start of
    # the section it introduces.
    for section in re.split(r"(?=Topic:|Gene:)", text):
        # Only spaces/tabs may separate the label from its value. Using \s here
        # would skip over newlines and capture the next non-empty line (for
        # example "Min Age: 20") as the value of an empty header.
        topic_match = re.search(r"Topic:[ \t]*(.*)", section)
        if topic_match:
            current_topic = topic_match.group(1).strip()

        gene_match = re.search(r"Gene:[ \t]*(.*)", section)
        if gene_match:
            current_gene = gene_match.group(1).strip()

        # Group 1 is the whole block, groups 2 and 3 are the age bounds from its
        # two leading lines. A block runs lazily up to the next "Min Age:" line
        # or the end of the section.
        age_blocks = re.finditer(
            r"(Min Age: (\d+)\nMax Age: (\d+)\n(?:.|\n)*?)(?=\nMin Age: \d+|(?=Gene:|Topic:|$))",
            section,
        )

        for block in age_blocks:
            metadata.append({
                "id": str(len(metadata) + 1),
                "topic": current_topic or "Unknown",
                "gene": current_gene or "Unknown",
                "min_age": int(block.group(2)),
                "max_age": int(block.group(3)),
            })
            chunks.append({"text": block.group(1).strip()})

    return [chunks, metadata]


In [7]:
import json

from langchain_community.document_loaders import TextLoader
from helper import get_env

# Path of the document to chunk (PRO).
# For a TEST run, read "KNOWLEDGE_BASE_PATH_TEST" instead.
KNOWLEDGE_BASE_PATH = get_env.retreive_value("KNOWLEDGE_BASE_PATH")

KNOWLEDGE_BASE_VECTOR_PATH = get_env.retreive_value("KNOWLEDGE_BASE_VECTOR_PATH_OPENAI")

# Step 1: Load the source file
loader = TextLoader(file_path=KNOWLEDGE_BASE_PATH, encoding="utf-8")
text = loader.load()
print(text)

# Step 2: Chunk it (only the first loaded document's page_content is used)
chunk_metadata = chunk_actionplan_by_topic_gene(text[0].page_content)
chunks, metadata = chunk_metadata

# Step 3: Export chunk texts and their metadata to two separate JSON files
with open("chunked_output.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2, ensure_ascii=False)

with open("metadata_output.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"✅ Exported {len(chunks)} chunks")

[Document(metadata={'source': './resource/actionplan - whole.txt'}, page_content='Topic: Welcome to your ActionPlan\n\nGene: BRCA1\n\nMin Age: 20\nMax Age: 29\n\nHeading: Your cancer risk\nIf you were assigned female at birth:\n-   At your current age, your risk of developing breast and ovarian cancer is low .\n-   Unless you have a family member with a young-age breast or ovarian cancer diagnosis, you are not recommended to begin cancer risk management until age 30 years.\n-   This risk will change over time, and you can use this website to see how your risk management recommendations will change in the future.If you were assigned male at birth:\n- Your risk of developing breast cancer in your lifetime is increased, but remains low.\n- For your cancer risks and management guidelines, please see the section "Less common cancers".\n\nMin Age: 30\nMax Age: 39\n\nHeading: Your cancer risk\nIf you were assigned female at birth:\n- At your current age, your risk of developing breast cancer 

In [8]:
# Show the chunk texts, then the metadata records, one list per line.
print(chunks, metadata, sep="\n")

[{'text': 'Min Age: 20\nMax Age: 29\n\nHeading: Your cancer risk\nIf you were assigned female at birth:\n-   At your current age, your risk of developing breast and ovarian cancer is low .\n-   Unless you have a family member with a young-age breast or ovarian cancer diagnosis, you are not recommended to begin cancer risk management until age 30 years.\n-   This risk will change over time, and you can use this website to see how your risk management recommendations will change in the future.If you were assigned male at birth:\n- Your risk of developing breast cancer in your lifetime is increased, but remains low.\n- For your cancer risks and management guidelines, please see the section "Less common cancers".'}, {'text': 'Min Age: 30\nMax Age: 39\n\nHeading: Your cancer risk\nIf you were assigned female at birth:\n- At your current age, your risk of developing breast cancer is low, but higher than the general population.\n- Before the age of 35, your risk of developing ovarian cancer i

In [9]:
from langchain.schema import Document 
# Pair each chunk's text with its metadata record, position by position.
document = [
    Document(page_content=chunk["text"], metadata=chunk_meta)
    for chunk, chunk_meta in zip(chunks, metadata)
]

print(document)

[Document(metadata={'id': '1', 'topic': 'Welcome to your ActionPlan', 'gene': 'BRCA1', 'min_age': 20, 'max_age': 29}, page_content='Min Age: 20\nMax Age: 29\n\nHeading: Your cancer risk\nIf you were assigned female at birth:\n-   At your current age, your risk of developing breast and ovarian cancer is low .\n-   Unless you have a family member with a young-age breast or ovarian cancer diagnosis, you are not recommended to begin cancer risk management until age 30 years.\n-   This risk will change over time, and you can use this website to see how your risk management recommendations will change in the future.If you were assigned male at birth:\n- Your risk of developing breast cancer in your lifetime is increased, but remains low.\n- For your cancer risks and management guidelines, please see the section "Less common cancers".'), Document(metadata={'id': '2', 'topic': 'Welcome to your ActionPlan', 'gene': 'BRCA1', 'min_age': 30, 'max_age': 39}, page_content='Min Age: 30\nMax Age: 39\n

In [10]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_mistralai import MistralAIEmbeddings


MISTRAL_API_KEY = get_env.retreive_value("MISTRAL_API_KEY")
mistral_embeddings = MistralAIEmbeddings(mistral_api_key=MISTRAL_API_KEY)

# Mistral is the active embedding backend; OpenAIEmbeddings() is the alternative.
embedding = mistral_embeddings

# The collection is persisted in "vector_db_2". Without explicit ids, each run
# stores the documents under freshly generated ids, so re-running this cell
# adds the same chunks again and retrieval returns duplicates. Reusing the
# chunk ids from the metadata lets a re-run overwrite instead of append.
# NOTE(review): duplicates already stored by earlier runs are not removed by
# this; clear the persist directory once if it has been populated before.
vectordb = Chroma.from_documents(
    documents=document,
    embedding=embedding,
    ids=[doc.metadata["id"] for doc in document],
    persist_directory="vector_db_2",
)





In [18]:
age_input = 25
gene_input = "BRCA1"
question_input = "I am a man with BRCA1 and have not had screening, is there any imaging I should be doing?"

# Restrict the search to chunks whose "gene" metadata equals the requested gene.
# NOTE(review): age_input is not applied to the search, so chunks for other age
# ranges can be returned - confirm whether min_age / max_age should be filtered.
metadata_filter = {"gene": gene_input}

retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5, "filter": metadata_filter},
)

results = retriever.invoke(question_input)

# One summary line per hit: id, gene, topic and age range.
print("-" * 40)
for i, doc in enumerate(results, 1):
    print(
        f"🔹 Document {i} --> ",
        doc.metadata["id"],
        doc.metadata["gene"],
        " **",
        doc.metadata["topic"],
        " **",
        doc.metadata["min_age"],
        "-",
        doc.metadata["max_age"],
    )

          

----------------------------------------
🔹 Document 1 -->  16 BRCA1  ** Breast Cancer Early Detection  ** 20 - 29
🔹 Document 2 -->  16 BRCA1  ** Breast Cancer Early Detection  ** 20 - 29
🔹 Document 3 -->  18 BRCA1  ** Breast Cancer Early Detection  ** 40 - 49
🔹 Document 4 -->  18 BRCA1  ** Breast Cancer Early Detection  ** 40 - 49
🔹 Document 5 -->  19 BRCA1  ** Breast Cancer Early Detection  ** 50 - 100


In [12]:
# Dump the raw list of retrieved documents.
print(results)

[]


In [None]:
from openai import OpenAI
# Do not pass api_key here: an explicit empty string overrides the client's
# default lookup and every request is rejected. With no argument the client
# reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

response = client.responses.create(
    model="gpt-4.1",
    input="Write a one-sentence bedtime story about a unicorn.",
)

print(response.output_text)

: 