In [None]:
# test_pipeline_ollama.ipynb

import os
import pickle
import faiss
import json
import requests  # <-- New import
from sentence_transformers import SentenceTransformer
import time
import pprint

# RAGRetriever class is unchanged
class RAGRetriever:
    """Dense retriever: embeds a query and looks up the nearest chunk in a FAISS index."""

    def __init__(self, index_path="faiss_index.bin", chunks_path="clean_chunks.pkl"):
        """Load the FAISS index, the pickled chunks and the embedding model.

        Args:
            index_path: Path of the serialized FAISS index.
            chunks_path: Path of the pickled chunk collection. It is indexed
                with the ids that the FAISS search returns.
        """
        print("Initializing the RAG Retriever...")
        self.index = faiss.read_index(index_path)
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # chunk files that were produced by a trusted source.
        with open(chunks_path, "rb") as f:
            self.chunks = pickle.load(f)
        self.model = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')
        print("Retriever initialized successfully.")

    def retrieve(self, query: str, k: int = 1) -> dict:
        """Return the single best-matching chunk for ``query``.

        Args:
            query: Text to embed and search for.
            k: Number of neighbours requested from the index. Only the
                top-ranked hit is returned, whatever the value of ``k``.

        Returns:
            The entry of ``self.chunks`` at the id of the best hit.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            LookupError: If the index reports no neighbour for the query.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        query_embedding = self.model.encode(query, normalize_embeddings=True)
        # The index is searched with a batch of one query vector.
        query_embedding = query_embedding.reshape(1, -1)
        _, indices = self.index.search(query_embedding, k)

        best_index = int(indices[0][0])
        # FAISS pads missing neighbours with -1. Used as a Python index that
        # would silently select the *last* chunk, so reject it explicitly.
        if best_index < 0:
            raise LookupError("The FAISS index returned no neighbour for the query.")
        return self.chunks[best_index]

# --- THIS IS THE NEW OLLAMA-BASED GENERATOR ---
class RAGGenerator:
    """Asks a local Ollama model whether a statement is supported by a context chunk."""

    def __init__(self, model_name="mistral", ollama_host="http://localhost:11434", timeout=120):
        """Store the connection settings; no request is sent here.

        Args:
            model_name: Name of the Ollama model to query.
            ollama_host: Base URL of the Ollama server.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        print("Initializing the RAG Generator (Ollama)...")
        self.model_name = model_name
        # Tolerate a trailing slash in the host so the URL has no "//".
        self.ollama_api_url = f"{ollama_host.rstrip('/')}/api/generate"
        self.timeout = timeout
        print("Generator initialized. Make sure the Ollama server is running.")

    @staticmethod
    def _failed_prediction() -> dict:
        """Sentinel returned whenever no usable prediction could be obtained."""
        return {"statement_is_true": -1, "statement_topic": -1}

    def generate(self, statement: str, context_chunk: dict) -> dict:
        """Classify ``statement`` using ``context_chunk`` as the only evidence.

        Args:
            statement: The statement to verify.
            context_chunk: Mapping that provides the keys ``section_title``,
                ``content`` and ``topic_id``.

        Returns:
            The decoded model answer, a dict containing at least the keys
            ``statement_is_true`` and ``statement_topic``. If the request
            fails, or the answer is not such a dict, both keys are ``-1``.

        Raises:
            KeyError: If ``context_chunk`` lacks one of the required keys.
        """
        context_text = f"Section: {context_chunk['section_title']}\n\n{context_chunk['content']}"
        topic_id = context_chunk['topic_id']

        # NOTE: the indentation inside the triple-quoted string is sent to the
        # model verbatim. It is kept unchanged so results stay comparable.
        prompt = f"""Context:
        ---
        {context_text}
        ---
        Statement: "{statement}"

        Task: Based ONLY on the provided context, respond with a single, raw JSON object with two keys: "statement_is_true" (1 for true, 0 for false) and "statement_topic" (the integer topic ID, which is {topic_id}). Do not add any explanation or markdown.
        """

        # Data payload for the Ollama API
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "format": "json",  # Ask Ollama to constrain the output to JSON
            "stream": False,
            "options": {
                "temperature": 0.0,
                "num_predict": 60  # Max tokens to generate
            }
        }

        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.post(self.ollama_api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()  # Raise an exception for bad status codes

            # The HTTP body is JSON whose "response" field holds the model
            # output as a JSON-encoded string, so it is decoded a second time.
            prediction = json.loads(response.json()['response'])
        except requests.exceptions.RequestException as e:
            print(f"Error connecting to Ollama server: {e}")
            return self._failed_prediction()
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # KeyError/TypeError: the body had no usable "response" field.
            print(f"Error parsing JSON from Ollama: {e}")
            print(f"Raw response was: {response.text}")
            return self._failed_prediction()

        # Valid JSON is not necessarily the object that was asked for.
        required_keys = {"statement_is_true", "statement_topic"}
        if not isinstance(prediction, dict) or not prediction.keys() >= required_keys:
            print(f"Unexpected prediction format from Ollama: {prediction!r}")
            return self._failed_prediction()
        return prediction


# --- Main Test Function (unchanged) ---
def run_local_test():
    """Smoke-test the pipeline on one hard-coded statement and print the outcome."""
    print("--- Initializing RAG Pipeline ---")
    rag_retriever = RAGRetriever()
    rag_generator = RAGGenerator()
    print("\n--- Pipeline Initialized ---")

    # Joined with single spaces this is one continuous sentence.
    statement = " ".join([
        "In cases of abdominal gunshot wounds, the liver and intraabdominal",
        "vasculature are commonly injured, with involvement rates of 40% and",
        "30% respectively.",
    ])

    print(f"\nEvaluating Statement:\n\"{statement}\"")
    started_at = time.time()

    print("\nStep 1: Retrieving relevant context...")
    context = rag_retriever.retrieve(statement)
    print("Context retrieved.")
    pprint.pprint(context)

    print("\nStep 2: Generating answer with LLM...")
    answer = rag_generator.generate(statement, context)
    # Timing covers retrieval and generation, including the prints in between.
    elapsed = time.time() - started_at

    print("\n--- Final Result ---")
    pprint.pprint(answer)
    print(f"\nTotal evaluation time: {elapsed:.2f} seconds.")

# --- Run the test ---
# Runs immediately when the cell executes: builds the retriever and generator,
# then sends one request to the Ollama server.
run_local_test()

--- Initializing RAG Pipeline ---
Initializing the RAG Retriever...
Retriever initialized successfully.
Initializing the RAG Generator (Ollama)...
Generator initialized. Make sure the Ollama server is running.

--- Pipeline Initialized ---

Evaluating Statement:
"In cases of abdominal gunshot wounds, the liver and intraabdominal vasculature are commonly injured, with involvement rates of 40% and 30% respectively."

Step 1: Retrieving relevant context...
Context retrieved.
{'content': 'Traumatic injuries to the abdomen can result from a wide range of '
            'etiologies and can lead to life-threatening injuries, multi-organ '
            'system dysfunction, and death. Gunshot wounds in the abdominal '
            'region can range from minor wounds to severe traumatic injuries '
            'depending on the anatomical structures the bullet penetrates. '
            'While the leading cause of blunt abdominal trauma-related deaths '
            'in the United States in adults age

  return forward_call(*args, **kwargs)



--- Final Result ---
{'statement_is_true': 1, 'statement_topic': 0}

Total evaluation time: 1.39 seconds.


In [2]:
# Authenticate against the Hugging Face Hub. login() is called without
# arguments, so the token is not hard-coded here; it is presumably entered
# through the widget this cell renders. Clear the cell outputs before sharing
# the notebook so no token is stored in it.
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# test_pipeline.py

import time
import pprint
def run_local_test():
    """
    End-to-end smoke test of the RAG pipeline: retrieve context for one sample
    statement, ask the generator for a verdict, then print the verdict and the
    elapsed time.
    """
    # --- Set up both pipeline stages ---
    print("--- Initializing RAG Pipeline ---")
    context_retriever = RAGRetriever()
    answer_generator = RAGGenerator()
    print("\n--- Pipeline Initialized ---")

    # --- Sample input: a specific, factual claim ---
    sample_statement = (
        "In cases of abdominal gunshot wounds, the liver and intraabdominal "
        "vasculature are commonly injured, with involvement rates of 40% and "
        "30% respectively."
    )
    print('\nEvaluating Statement:\n"' + sample_statement + '"')

    # --- Timed section: retrieval followed by generation ---
    timer_start = time.time()

    print("\nStep 1: Retrieving relevant context...")
    best_chunk = context_retriever.retrieve(sample_statement)
    print("Context retrieved.")
    pprint.pprint(best_chunk)

    print("\nStep 2: Generating answer with LLM...")
    verdict = answer_generator.generate(sample_statement, best_chunk)

    timer_stop = time.time()

    # --- Report ---
    print("\n--- Final Result ---")
    pprint.pprint(verdict)
    duration = timer_stop - timer_start
    print(f"\nTotal evaluation time: {duration:.2f} seconds.")

# Runs the run_local_test defined in this cell, which replaces the earlier
# definition of the same name.
run_local_test()

--- Initializing RAG Pipeline ---
Initializing the RAG Retriever...
Retriever initialized successfully.
Initializing the RAG Generator (Ollama)...
Generator initialized. Make sure the Ollama server is running.

--- Pipeline Initialized ---

Evaluating Statement:
"In cases of abdominal gunshot wounds, the liver and intraabdominal vasculature are commonly injured, with involvement rates of 40% and 30% respectively."

Step 1: Retrieving relevant context...
Context retrieved.
{'content': 'Traumatic injuries to the abdomen can result from a wide range of '
            'etiologies and can lead to life-threatening injuries, multi-organ '
            'system dysfunction, and death. Gunshot wounds in the abdominal '
            'region can range from minor wounds to severe traumatic injuries '
            'depending on the anatomical structures the bullet penetrates. '
            'While the leading cause of blunt abdominal trauma-related deaths '
            'in the United States in adults age

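[REDACTED: a Hugging Face access token was printed in this output. Revoke the token and clear the cell output before sharing the notebook.]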

In [21]:
from tqdm import tqdm 
import pathlib

def run_evaluation(statements_dir: str, ground_truth_dir: str):
    """Score the RAG pipeline on a directory of statements and print accuracies.

    Each ``<id>.txt`` file in ``statements_dir`` is paired with ``<id>.json``
    in ``ground_truth_dir``. Statements without a ground-truth file are
    skipped and do not count towards the accuracy figures.

    Args:
        statements_dir: Directory containing the statement ``.txt`` files.
        ground_truth_dir: Directory containing the ground-truth ``.json``
            files, each with the keys ``statement_is_true`` and
            ``statement_topic``.

    Returns:
        None. The results are printed.
    """
    retriever = RAGRetriever()
    generator = RAGGenerator()
    statement_files = sorted(pathlib.Path(statements_dir).glob("*.txt"))

    if not statement_files:
        print(f"Error: No statement .txt files found in '{statements_dir}'")
        return

    ground_truth_path = pathlib.Path(ground_truth_dir)
    evaluated = 0
    skipped = 0
    correct_truth_predictions = 0
    correct_topic_predictions = 0

    print(f"\nStarting evaluation of {len(statement_files)} statements...")

    for statement_file in tqdm(statement_files, desc="Evaluating Statements"):
        statement_id = statement_file.stem
        ground_truth_file = ground_truth_path / f"{statement_id}.json"

        try:
            with open(ground_truth_file, 'r', encoding='utf-8') as f:
                ground_truth = json.load(f)
        except FileNotFoundError:
            print(f"\n[Warning] Ground truth file not found for {statement_id}, skipping.")
            skipped += 1
            continue

        with open(statement_file, 'r', encoding='utf-8') as f:
            statement_text = f.read().strip()

        retrieved_chunk = retriever.retrieve(statement_text)
        prediction = generator.generate(statement_text, retrieved_chunk)
        # A malformed model answer counts as wrong instead of raising KeyError.
        if not isinstance(prediction, dict):
            prediction = {}

        evaluated += 1
        if prediction.get("statement_is_true") == ground_truth["statement_is_true"]:
            correct_truth_predictions += 1

        if prediction.get("statement_topic") == ground_truth["statement_topic"]:
            correct_topic_predictions += 1

    if evaluated == 0:
        print("Error: No statement had a ground truth file; nothing was evaluated.")
        return

    # Divide by the statements actually scored. Skipped files must not be
    # counted as wrong answers.
    truth_accuracy = (correct_truth_predictions / evaluated) * 100
    topic_accuracy = (correct_topic_predictions / evaluated) * 100

    print("\n--- Evaluation Complete ---")
    print(f"Total Statements Evaluated: {evaluated}")
    if skipped:
        print(f"Statements Skipped (no ground truth): {skipped}")
    print("\n--- Accuracy Scores ---")
    print(f"Statement Truth (True/False) Accuracy: {truth_accuracy:.2f}%")
    print(f"Statement Topic Accuracy:               {topic_accuracy:.2f}%")
    print("-----------------------")


In [22]:
# Both directories live under the same machine-specific root. Note that the
# "validation" data is read from the train split.
_TRAIN_SPLIT_DIR = "/home/torf/NM-i-AI-2025-Neural-Networks-Enjoyers/emergency-healthcare-rag/data/train/"
VALIDATION_STATEMENTS_DIR = _TRAIN_SPLIT_DIR + "statements/"
VALIDATION_TRUTH_DIR = _TRAIN_SPLIT_DIR + "answers/"

run_evaluation(statements_dir=VALIDATION_STATEMENTS_DIR, ground_truth_dir=VALIDATION_TRUTH_DIR)

Initializing the RAG Retriever...
Retriever initialized successfully.
Initializing the RAG Generator (Ollama)...
Generator initialized. Make sure the Ollama server is running.

Starting evaluation of 200 statements...


Evaluating Statements: 100%|██████████| 200/200 [00:44<00:00,  4.51it/s]


--- Evaluation Complete ---
Total Statements Evaluated: 200

--- Accuracy Scores ---
Statement Truth (True/False) Accuracy: 72.00%
Statement Topic Accuracy:               70.50%
-----------------------





In [None]:
# evaluate.py (Final Version with Model Switching and VRAM Monitoring)

import json
import pathlib
import pickle
import time
import warnings

import faiss
import pynvml  # <-- New import for VRAM monitoring
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- VRAM Monitoring Setup ---
# `handle` stays None whenever NVML cannot be set up; code that reports VRAM
# checks it before querying the device.
handle = None
try:
    pynvml.nvmlInit()
    # Device index 0, i.e. the first GPU.
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
except Exception as e:
    print(f"Warning: Could not initialize NVML for VRAM monitoring. VRAM usage will not be shown. Error: {e}")
else:
    print("Successfully initialized NVML for VRAM monitoring.")
# -----------------------------


# Keep the cell output readable: hide UserWarnings raised from transformers.
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# RAGRetriever remains the same
class RAGRetriever:
    """Dense retriever: embeds a query and looks up the nearest chunk in a FAISS index."""

    def __init__(self, index_path="faiss_index.bin", chunks_path="clean_chunks.pkl"):
        """Load the FAISS index, the pickled chunks and the embedding model.

        Args:
            index_path: Path of the serialized FAISS index.
            chunks_path: Path of the pickled chunk collection. It is indexed
                with the ids that the FAISS search returns.
        """
        print("Initializing the RAG Retriever...")
        self.index = faiss.read_index(index_path)
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # chunk files that were produced by a trusted source.
        with open(chunks_path, "rb") as f:
            self.chunks = pickle.load(f)
        self.model = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')
        print("Retriever initialized successfully.")

    def retrieve(self, query: str, k: int = 1) -> dict:
        """
        Retrieves the single most relevant chunk for a given query.

        Args:
            query: Text to embed and search for.
            k: Number of neighbours requested from the index. Only the
                top-ranked hit is returned, whatever the value of ``k``.

        Returns:
            The entry of ``self.chunks`` at the id of the best hit.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            LookupError: If the index reports no neighbour for the query.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        query_embedding = self.model.encode(query, normalize_embeddings=True)
        # The index is searched with a batch of one query vector.
        query_embedding = query_embedding.reshape(1, -1)
        _, indices = self.index.search(query_embedding, k)

        # We only need the single best chunk for this pipeline
        best_index = int(indices[0][0])
        # FAISS pads missing neighbours with -1. Used as a Python index that
        # would silently select the *last* chunk, so reject it explicitly.
        if best_index < 0:
            raise LookupError("The FAISS index returned no neighbour for the query.")
        return self.chunks[best_index]

# RAGGenerator is updated to accept a model name
class RAGGenerator:
    """Asks a selectable local Ollama model whether a statement is supported by a context chunk."""

    def __init__(self, model_name: str, ollama_host="http://localhost:11434", timeout=120):
        """Store the connection settings and send one probe request to the server.

        Args:
            model_name: Name of the Ollama model to query.
            ollama_host: Base URL of the Ollama server.
            timeout: Seconds to wait for each HTTP request before giving up.

        Raises:
            SystemExit: With exit status 1 if the server cannot be reached.
        """
        print(f"Initializing RAG Generator with Ollama model: '{model_name}'...")
        self.model_name = model_name
        # Tolerate a trailing slash in the host so the URL has no "//".
        self.ollama_api_url = f"{ollama_host.rstrip('/')}/api/generate"
        self.timeout = timeout

        # Probe the server with a tiny prompt to check it is reachable and
        # that it accepts the requested model.
        try:
            probe = requests.post(
                self.ollama_api_url,
                json={"model": model_name, "prompt": "Hi", "stream": False},
                timeout=timeout,
            )
        except requests.exceptions.RequestException as e:
            print(f"\n---FATAL ERROR---\nCould not connect to Ollama server at {ollama_host}.\nPlease ensure the Ollama application is running.")
            # Raised explicitly: the bare exit() helper is only injected by the
            # `site` module and would report status 0 for a fatal error.
            raise SystemExit(1) from e

        if probe.ok:
            print("Generator initialized successfully.")
        else:
            # The server answered but rejected the request.
            print(
                f"Warning: Ollama answered with HTTP {probe.status_code} for model "
                f"'{model_name}'. Check that the model has been pulled."
            )

    @staticmethod
    def _failed_prediction() -> dict:
        """Sentinel returned whenever no usable prediction could be obtained."""
        return {"statement_is_true": -1, "statement_topic": -1}

    def generate(self, statement: str, context_chunk: dict) -> dict:
        """Classify ``statement`` using ``context_chunk`` as the only evidence.

        Args:
            statement: The statement to verify.
            context_chunk: Mapping that provides the keys ``section_title``,
                ``content`` and ``topic_id``.

        Returns:
            The decoded model answer, a dict containing at least the keys
            ``statement_is_true`` and ``statement_topic``. If the request
            fails, or the answer is not such a dict, both keys are ``-1``.

        Raises:
            KeyError: If ``context_chunk`` lacks one of the required keys.
        """
        context_text = f"Section: {context_chunk['section_title']}\n\n{context_chunk['content']}"
        topic_id = context_chunk['topic_id']

        prompt = f"""Context:
---
{context_text}
---
Statement: "{statement}"

Task: Based ONLY on the provided context, respond with a single, raw JSON object with two keys: "statement_is_true" (1 for true, 0 for false) and "statement_topic" (the integer topic ID, which is {topic_id}). Do not add any explanation or markdown."""

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "format": "json",
            "stream": False,
            "options": {"temperature": 0.0, "num_predict": 60}
        }

        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.post(self.ollama_api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            # The HTTP body is JSON whose "response" field holds the model
            # output as a JSON-encoded string, so it is decoded a second time.
            prediction = json.loads(response.json()['response'])
        except (requests.exceptions.RequestException, json.JSONDecodeError, KeyError, TypeError) as e:
            # KeyError/TypeError: the body had no usable "response" field.
            print(f"\nError during generation: {e}")
            return self._failed_prediction()

        # Valid JSON is not necessarily the object that was asked for.
        required_keys = {"statement_is_true", "statement_topic"}
        if not isinstance(prediction, dict) or not prediction.keys() >= required_keys:
            print(f"\nError during generation: unexpected prediction format {prediction!r}")
            return self._failed_prediction()
        return prediction

def get_vram_usage_gb(device_handle):
    """Return the memory in use on the given NVML device, in units of 1024**3 bytes.

    Returns None when ``device_handle`` is falsy (monitoring unavailable).
    """
    if device_handle:
        memory = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
        return memory.used / (1024**3)
    return None

def run_evaluation(model_to_test: str, statements_dir: str, ground_truth_dir: str):
    """Score the RAG pipeline for one Ollama model and print accuracies.

    Each ``<id>.txt`` file in ``statements_dir`` is paired with ``<id>.json``
    in ``ground_truth_dir``. Statements without a ground-truth file are
    skipped and do not count towards the accuracy figures. VRAM usage is
    printed every 20 statements when the module-level ``handle`` is set.

    Args:
        model_to_test: Name of the Ollama model handed to ``RAGGenerator``.
        statements_dir: Directory containing the statement ``.txt`` files.
        ground_truth_dir: Directory containing the ground-truth ``.json``
            files, each with the keys ``statement_is_true`` and
            ``statement_topic``.

    Returns:
        None. The results are printed.
    """
    print(f"--- Starting Evaluation for Model: {model_to_test} ---")

    retriever = RAGRetriever()
    generator = RAGGenerator(model_name=model_to_test)

    statement_files = sorted(pathlib.Path(statements_dir).glob("*.txt"))
    # Guard against an empty directory, which would otherwise end in a
    # division by zero when the accuracies are computed.
    if not statement_files:
        print(f"Error: No statement .txt files found in '{statements_dir}'")
        return

    ground_truth_path = pathlib.Path(ground_truth_dir)
    evaluated = 0
    skipped = 0
    correct_truth = 0
    correct_topic = 0

    progress = tqdm(statement_files, desc=f"Evaluating {model_to_test}")
    for position, statement_file in enumerate(progress, start=1):
        statement_id = statement_file.stem
        ground_truth_file = ground_truth_path / f"{statement_id}.json"

        try:
            with open(ground_truth_file, 'r', encoding='utf-8') as f:
                ground_truth = json.load(f)
        except FileNotFoundError:
            print(f"\n[Warning] Ground truth file not found for {statement_id}, skipping.")
            skipped += 1
        else:
            with open(statement_file, 'r', encoding='utf-8') as f:
                statement_text = f.read().strip()

            retrieved_chunk = retriever.retrieve(statement_text)
            prediction = generator.generate(statement_text, retrieved_chunk)
            # A malformed model answer counts as wrong instead of raising KeyError.
            if not isinstance(prediction, dict):
                prediction = {}

            evaluated += 1
            if prediction.get("statement_is_true") == ground_truth["statement_is_true"]:
                correct_truth += 1

            if prediction.get("statement_topic") == ground_truth["statement_topic"]:
                correct_topic += 1

        # Print VRAM usage every 20 statements
        if position % 20 == 0 and handle:
            vram = get_vram_usage_gb(handle)
            print(f" | VRAM after statement {position}: {vram:.2f} GB")

    if evaluated == 0:
        print("Error: No statement had a ground truth file; nothing was evaluated.")
        return

    # Divide by the statements actually scored. Skipped files must not be
    # counted as wrong answers.
    truth_accuracy = (correct_truth / evaluated) * 100
    topic_accuracy = (correct_topic / evaluated) * 100

    print("\n--- Evaluation Complete ---")
    print(f"Model Tested: {model_to_test}")
    print(f"Total Statements: {evaluated}")
    if skipped:
        print(f"Statements Skipped (no ground truth): {skipped}")
    print("\n--- Accuracy Scores ---")
    print(f"Statement Truth Accuracy: {truth_accuracy:.2f}%")
    print(f"Statement Topic Accuracy: {topic_accuracy:.2f}%")
    print("-----------------------")



# Candidate models; each has to be pulled before it can be selected below:
#   ollama pull mistral
#   ollama pull llama3
#   ollama pull gemma:7b
MODEL_NAME = "mistral"

# Both directories live under the same machine-specific root. Note that the
# "validation" data is read from the train split.
_TRAIN_SPLIT_DIR = "/home/torf/NM-i-AI-2025-Neural-Networks-Enjoyers/emergency-healthcare-rag/data/train/"
VALIDATION_STATEMENTS_DIR = _TRAIN_SPLIT_DIR + "statements/"
VALIDATION_TRUTH_DIR = _TRAIN_SPLIT_DIR + "answers/"

run_evaluation(model_to_test=MODEL_NAME, statements_dir=VALIDATION_STATEMENTS_DIR, ground_truth_dir=VALIDATION_TRUTH_DIR)

Successfully initialized NVML for VRAM monitoring.
--- Starting Evaluation for Model: mistral ---
Initializing the RAG Retriever...
Retriever initialized successfully.
Initializing RAG Generator with Ollama model: 'mistral'...
Generator initialized successfully.


  return forward_call(*args, **kwargs)
Evaluating mistral:  10%|█         | 21/200 [00:05<00:34,  5.15it/s]

 | VRAM after statement 20: 29.12 GB


Evaluating mistral:  20%|██        | 41/200 [00:10<00:35,  4.47it/s]

 | VRAM after statement 40: 29.12 GB


Evaluating mistral:  30%|███       | 60/200 [00:14<00:28,  4.90it/s]

 | VRAM after statement 60: 29.13 GB


Evaluating mistral:  40%|████      | 81/200 [00:18<00:22,  5.25it/s]

 | VRAM after statement 80: 29.17 GB


Evaluating mistral:  50%|█████     | 101/200 [00:23<00:22,  4.35it/s]

 | VRAM after statement 100: 29.19 GB


Evaluating mistral:  60%|██████    | 120/200 [00:27<00:14,  5.42it/s]

 | VRAM after statement 120: 29.18 GB


Evaluating mistral:  70%|███████   | 140/200 [00:31<00:11,  5.01it/s]

 | VRAM after statement 140: 29.20 GB


Evaluating mistral:  80%|████████  | 161/200 [00:36<00:08,  4.48it/s]

 | VRAM after statement 160: 29.15 GB


Evaluating mistral:  90%|█████████ | 180/200 [00:40<00:05,  3.77it/s]

 | VRAM after statement 180: 29.11 GB


Evaluating mistral: 100%|██████████| 200/200 [00:45<00:00,  4.44it/s]

 | VRAM after statement 200: 29.11 GB

--- Evaluation Complete ---
Model Tested: mistral
Total Statements: 200

--- Accuracy Scores ---
Statement Truth Accuracy: 72.50%
Statement Topic Accuracy: 70.50%
-----------------------





In [None]:
import pickle
import faiss
import json
import requests  # <-- New import
from sentence_transformers import SentenceTransformer
import time
import pprint

# RAGRetriever class is unchanged
class RAGRetriever:
    """Dense retriever: embeds a query and looks up the nearest chunk in a FAISS index."""

    def __init__(self, index_path="faiss_index.bin", chunks_path="clean_chunks.pkl"):
        """Load the FAISS index, the pickled chunks and the embedding model.

        Args:
            index_path: Path of the serialized FAISS index.
            chunks_path: Path of the pickled chunk collection. It is indexed
                with the ids that the FAISS search returns.
        """
        print("Initializing the RAG Retriever...")
        self.index = faiss.read_index(index_path)
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # chunk files that were produced by a trusted source.
        with open(chunks_path, "rb") as f:
            self.chunks = pickle.load(f)
        self.model = SentenceTransformer('BAAI/bge-small-en-v1.5', device='cpu')
        print("Retriever initialized successfully.")

    def retrieve(self, query: str, k: int = 1) -> dict:
        """Return the single best-matching chunk for ``query``.

        Args:
            query: Text to embed and search for.
            k: Number of neighbours requested from the index. Only the
                top-ranked hit is returned, whatever the value of ``k``.

        Returns:
            The entry of ``self.chunks`` at the id of the best hit.

        Raises:
            ValueError: If ``k`` is smaller than 1.
            LookupError: If the index reports no neighbour for the query.
        """
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        query_embedding = self.model.encode(query, normalize_embeddings=True)
        # The index is searched with a batch of one query vector.
        query_embedding = query_embedding.reshape(1, -1)
        _, indices = self.index.search(query_embedding, k)

        best_index = int(indices[0][0])
        # FAISS pads missing neighbours with -1. Used as a Python index that
        # would silently select the *last* chunk, so reject it explicitly.
        if best_index < 0:
            raise LookupError("The FAISS index returned no neighbour for the query.")
        return self.chunks[best_index]

# --- THIS IS THE NEW OLLAMA-BASED GENERATOR ---
class RAGGenerator:
    """Asks a local Ollama model whether a statement is supported by a context chunk."""

    def __init__(self, model_name="mistral", ollama_host="http://localhost:11434", timeout=120):
        """Store the connection settings; no request is sent here.

        Args:
            model_name: Name of the Ollama model to query.
            ollama_host: Base URL of the Ollama server.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        print("Initializing the RAG Generator (Ollama)...")
        self.model_name = model_name
        # Tolerate a trailing slash in the host so the URL has no "//".
        self.ollama_api_url = f"{ollama_host.rstrip('/')}/api/generate"
        self.timeout = timeout
        print("Generator initialized. Make sure the Ollama server is running.")

    @staticmethod
    def _failed_prediction() -> dict:
        """Sentinel returned whenever no usable prediction could be obtained."""
        return {"statement_is_true": -1, "statement_topic": -1}

    def generate(self, statement: str, context_chunk: dict) -> dict:
        """Classify ``statement`` using ``context_chunk`` as the only evidence.

        Args:
            statement: The statement to verify.
            context_chunk: Mapping that provides the keys ``section_title``,
                ``content`` and ``topic_id``.

        Returns:
            The decoded model answer, a dict containing at least the keys
            ``statement_is_true`` and ``statement_topic``. If the request
            fails, or the answer is not such a dict, both keys are ``-1``.

        Raises:
            KeyError: If ``context_chunk`` lacks one of the required keys.
        """
        context_text = f"Section: {context_chunk['section_title']}\n\n{context_chunk['content']}"
        topic_id = context_chunk['topic_id']

        # NOTE: the indentation inside the triple-quoted string is sent to the
        # model verbatim. It is kept unchanged so results stay comparable.
        prompt = f"""Context:
        ---
        {context_text}
        ---
        Statement: "{statement}"

        Task: Based ONLY on the provided context, respond with a single, raw JSON object with two keys: "statement_is_true" (1 for true, 0 for false) and "statement_topic" (the integer topic ID, which is {topic_id}). Do not add any explanation or markdown.
        """

        # Data payload for the Ollama API
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "format": "json",  # Ask Ollama to constrain the output to JSON
            "stream": False,
            "options": {
                "temperature": 0.0,
                "num_predict": 60  # Max tokens to generate
            }
        }

        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.post(self.ollama_api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()  # Raise an exception for bad status codes

            # The HTTP body is JSON whose "response" field holds the model
            # output as a JSON-encoded string, so it is decoded a second time.
            prediction = json.loads(response.json()['response'])
        except requests.exceptions.RequestException as e:
            print(f"Error connecting to Ollama server: {e}")
            return self._failed_prediction()
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # KeyError/TypeError: the body had no usable "response" field.
            print(f"Error parsing JSON from Ollama: {e}")
            print(f"Raw response was: {response.text}")
            return self._failed_prediction()

        # Valid JSON is not necessarily the object that was asked for.
        required_keys = {"statement_is_true", "statement_topic"}
        if not isinstance(prediction, dict) or not prediction.keys() >= required_keys:
            print(f"Unexpected prediction format from Ollama: {prediction!r}")
            return self._failed_prediction()
        return prediction