In [2]:
# --- STEP ZERO: NETWORK CONFIGURATION FOR WSL ---
# GRPC_DNS_RESOLVER is set here, ahead of the AI client library imports further
# down, so the value is already in the environment when those libraries load.
# NOTE(review): "native" presumably tells gRPC to use the operating system's
# DNS resolver instead of its bundled one, working around name-resolution
# failures under WSL — confirm against the gRPC environment-variable docs.
import os
os.environ["GRPC_DNS_RESOLVER"] = "native"
# --- END OF NETWORK CONFIGURATION ---

from dotenv import load_dotenv
import time
import langchain

# Pull the variables defined in the local .env file into the environment.
load_dotenv()

# Report whether the Google API key is now visible to this process.
# This is only a status message: execution continues in both cases.
if 'GOOGLE_API_KEY' in os.environ:
    print("Google API Key loaded successfully.")
else:
    print("Error: Google API Key not found. Check your .env file.")

# --- Imports for AI libraries ---
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# --- STEP 3.3.3: CREATE OR LOAD THE VECTOR STORE ---
# Directory where the FAISS index is persisted between runs.
FAISS_INDEX_PATH = "faiss_index"

# Embedding model settings. The same model must be used both when the index
# is built and when it is loaded again, so it is configured once here.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = dict(device='cpu')  # Switch to 'cuda' if a GPU is configured
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Reuse the vector store saved by a previous run when one exists; otherwise
# build it from the PDFs in ./knowledge_base/ and save it for next time.
if os.path.exists(FAISS_INDEX_PATH):
    print("Loading existing vector store from disk...")
    # SECURITY NOTE: allow_dangerous_deserialization=True is required to load a
    # local FAISS index because part of it is stored with pickle. Unpickling
    # can execute arbitrary code, so only load an index directory that this
    # script created itself, never one obtained from an untrusted source.
    vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
    print("Vector store loaded successfully!")
else:
    print("No existing vector store found. Creating a new one...")
    # --- STEP 3.3.1: LOAD THE KNOWLEDGE BASE ---
    loader = DirectoryLoader('./knowledge_base/', glob="./*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    print(f"Total documents loaded: {len(documents)}")
    # Fail early with an actionable message: building a FAISS index from an
    # empty list otherwise crashes deep inside the library with an obscure error.
    if not documents:
        raise ValueError(
            "No PDF documents were found in './knowledge_base/'. "
            "Add at least one PDF before building the vector store."
        )

    # --- STEP 3.3.2: SPLIT DOCUMENTS INTO CHUNKS ---
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    texts = text_splitter.split_documents(documents)
    print(f"Total text chunks created: {len(texts)}")
    # PDFs without extractable text (e.g. scanned images) can yield zero
    # chunks, which would hit the same empty-input failure as above.
    if not texts:
        raise ValueError(
            "The loaded PDFs produced no text chunks. "
            "Check that the files in './knowledge_base/' contain extractable text."
        )

    # Create the FAISS vector store from the text chunks and the embedding model.
    print("\nCreating embeddings and vector store... This might take a moment.")
    vectorstore = FAISS.from_documents(texts, embeddings)
    print("Vector store created successfully!")

    # Save the vector store to disk so later runs take the fast loading path.
    vectorstore.save_local(FAISS_INDEX_PATH)
    print(f"Vector store saved to disk at: {FAISS_INDEX_PATH}")


# --- STEP 3.3.4: CONFIGURE THE LLM AND THE RETRIEVAL CHAIN ---
# Model that writes the final answers.
llm = GoogleGenerativeAI(temperature=0.3, model="gemini-1.5-flash")

# Instructions for the LLM: answer from the supplied context only, and use a
# fixed fallback sentence when the context does not contain the answer.
prompt_template = """
Use the following context to answer the question in a detailed and precise manner.
If the answer is not in the context, say "The information is not available in my knowledge base."

Context:
{context}

Question:
{question}

Answer:
"""
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# Retrieval question-answering chain: the retriever supplies the 3 most
# relevant chunks for each query, and the source documents are returned
# alongside the generated answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs=dict(k=3)),
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)

# --- STEP 3.3.5: TESTING THE SYSTEM ---
print("\n--- Testing the AI Assistant ---")

# Smoke-test questions sent through the retrieval chain, in order.
questions = [
    "What are the primary challenges in diagnosing MASH, and how do non-invasive tests (NITs) like the FIB-4 score address these challenges?",
    "According to the provided Brazilian guidelines, what is the recommended screening and follow-up protocol for MASLD in overweight or obese individuals?",
    "What is the global prevalence of MASH, and how does it differ in Latin America compared to the rest of the world?",
    "What lifestyle modifications are recommended for the management of MASLD, and what is the evidence for their effectiveness?",
    "What are the key recommendations for the pharmacological treatment of MASH, including the use of resmetirom?",
]

# Full chain outputs, in the same order as `questions`. The generated answer
# of each one is stored under the 'result' key.
responses = []
for number, question in enumerate(questions, start=1):
    # Only the first question is preceded by a blank line; every answer already
    # ends with one, which separates it from the next question.
    lead = "\n" if number == 1 else ""
    print(f"{lead}Question {number}: {question}")
    response = qa_chain.invoke({"query": question})
    responses.append(response)
    print(f"Answer {number}: {response['result']}\n")

# Keep the original numbered names available for any code that refers to them.
# These unpackings expect exactly five entries; update them together with
# `questions` if the list changes length.
question1, question2, question3, question4, question5 = questions
response1, response2, response3, response4, response5 = responses

Google API Key loaded successfully.
Loading existing vector store from disk...
Vector store loaded successfully!

--- Testing the AI Assistant ---

Question 1: What are the primary challenges in diagnosing MASH, and how do non-invasive tests (NITs) like the FIB-4 score address these challenges?
Answer 1: The primary challenges in diagnosing MASH are:  1) its silent nature, leading to a high percentage (90%) of undiagnosed cases even in at-risk individuals; 2) low medical familiarity with diagnostic methods like FIB-4 and hepatic elastography, resulting in underutilization of non-invasive testing options; 3) a lack of widely disseminated clinical guidelines for screening and management, creating uncertainty among healthcare professionals about when and how to investigate; and 4) structural and access barriers hindering testing.

Non-invasive tests (NITs) such as the FIB-4 score directly address these challenges.  The FIB-4 score, a first-line NIT, is simple to calculate using readily av