# **HYBRID RAG SYSTEM for Sparse Municipal Environments:** 
This notebook contains the base code for our thesis. It covers the full workflow: data preparation, embedding creation, knowledge graph connection, running the hybrid system, and evaluating the Hybrid RAG system's performance by answering the competency questions.

In [None]:
pip install -r requirements.txt

In [None]:
# ==============================================================================
# STEP 1: SETUP 
# ==============================================================================
import os
import sys
import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# 1. ROOT_DIR: the parent of the current working directory
current_dir = os.getcwd()
ROOT_DIR = os.path.abspath(os.path.join(current_dir, '..'))

# 2. Add ROOT_DIR to the module search path
sys.path.append(ROOT_DIR)

# 3. Dynamic Folder Paths (all derived from ROOT_DIR)
DATA_PATH = os.path.join(ROOT_DIR, "data", "municipal_pdfs")
ASSETS_DIR = os.path.join(ROOT_DIR, "assets")
FONT_PATH = os.path.join(ASSETS_DIR, "Roboto-Regular.ttf")

# Echo the resolved paths so a wrong working directory is easy to spot.
print("-" * 40)
print(f" ROOT_DIR: {ROOT_DIR}")
print(f" PDF Path: {DATA_PATH}")
print(f" Font Path: {FONT_PATH}")
print("-" * 40)

# **DATA PREPARATION PART:**
In municipal domains, data is mostly unstructured and scattered across various web portals. To address this lack of structured datasets and to enable effective indexing within our Vector Database, a custom data pipeline was established.

The source has 14 primary institutional websites (Parent Links). A recursive web scraping algorithm was developed to traverse these domains and their associated sub-pages, resulting in a total of 53 processed URLs.

Data extraction was executed using the **BeautifulSoup4** library. A rigorous data cleaning phase followed, aimed at removing noise and unnecessary web elements—such as cookie consent banners, social media widgets, and navigation menus—to ensure high-quality textual input.

The cleaned data was subsequently converted into standardized PDF documents using the implementation provided below:

In [None]:
# ==============================================================================
# STEP 2: DATA INGESTION & PDF GENERATION
# DESCRIPTION: Scrapes target municipal websites, cleans the textual content,
# and converts structured data into standardized PDF documents for RAG ingestion.
# ==============================================================================

import os
import shutil
import time
import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from fpdf import FPDF

# ------------------------------------------------------------------------------
# 1. CONFIGURATION & PATHS
# ------------------------------------------------------------------------------
# Paths are derived dynamically from the ROOT_DIR set in the previous cell.
# If ROOT_DIR is not defined (e.g. this cell is run on its own), it falls back
# to the parent of the current working directory.
if 'ROOT_DIR' not in locals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# PDF_STORAGE_PATH is built from the same components as DATA_PATH in Step 1,
# so the scraper writes to the folder that the embedding step reads from.
PDF_STORAGE_PATH = os.path.join(ROOT_DIR, "data", "municipal_pdfs")
ASSETS_DIR = os.path.join(ROOT_DIR, "assets")
FONT_PATH = os.path.join(ASSETS_DIR, "Roboto-Regular.ttf")

# Ensure directories exist
os.makedirs(PDF_STORAGE_PATH, exist_ok=True)
os.makedirs(ASSETS_DIR, exist_ok=True)

# ------------------------------------------------------------------------------
# 2. TARGET URLs DEFINITION
# ------------------------------------------------------------------------------
# Each entry is one page to scrape:
#   "name" - institution key; it becomes the prefix of the generated PDF
#            filename, which the embedding step matches against to tag chunks
#            with their Knowledge Graph entity.
#   "url"  - page to fetch. The "leihkatalog_essbare_stadt" URL is additionally
#            expanded at run time with every linked "itemcat" page.
TARGETS = [
    # --- DIVERSITY MEDIA ---
    {"name": "DIVERSITY_MEDIA", "url": "https://diversitymedia.info/media-workspace"},
    {"name": "DIVERSITY_MEDIA", "url": "https://diversitymedia.info/orga"},
    {"name": "DIVERSITY_MEDIA", "url": "https://diversitymedia.info/kontakt"},

    # --- KUNSTKULTURQUARTIER ---
    {"name": "KUNSTKULTUR", "url": "https://www.kunstkulturquartier.de/werkstaetten"},
    {"name": "KUNSTKULTUR", "url": "https://www.kunstkulturquartier.de/kuenstlerhaus/haus/kontakt-1"},

    # --- HEIZHAUS ---
    {"name": "HEIZHAUS", "url": "https://www.heizhaus.org/das-haus"},
    {"name": "HEIZHAUS", "url": "https://www.heizhaus.org/kontakt"},
    {"name": "HEIZHAUS", "url": "https://www.heizhaus.org/"},

    # --- LEIHLA (Bluepingu) ---
    {"name": "LEIHLA", "url": "https://leihla.bluepingu.de/"},
    {"name": "LEIHLA", "url": "https://leihla.bluepingu.de/nutzungsbedingungen/"},
    {"name": "LEIHLA", "url": "https://leihla.bluepingu.de/cb_itemgallery/?itemcat=werkzeug-allgemein"},
    {"name": "LEIHLA", "url": "https://leihla.bluepingu.de/cb_itemgallery/?itemcat=technik"},
    {"name": "LEIHLA", "url": "https://leihla.bluepingu.de/cb_itemgallery/?itemcat=werkzeug-textil"},

    # --- ESSBARE STADT ---
    {"name": "ESSBARE_STADT", "url": "https://essbare-stadt-nuernberg.de/fair-share/"},
    {"name": "ESSBARE_STADT", "url": "https://leihbar.bluepingu.de/leihkatalog_essbare_stadt/"},
    {"name": "ESSBARE_STADT", "url": "https://essbare-stadt-nuernberg.de/#kontakt"},

    # --- FABLAB NÜRNBERG ---
    {"name": "FABLAB_NBG", "url": "https://fablab-nuernberg.de/"},
    {"name": "FABLAB_NBG", "url": "https://fablab-nuernberg.de/ueber-uns/der-verein"},
    {"name": "FABLAB_NBG", "url": "https://fablab-nuernberg.de/ueber-uns/raeumlichkeiten"},
    {"name": "FABLAB_NBG", "url": "https://fablab-nuernberg.de/ueber-uns/geraete"},

    # --- HOLZWERKSTATT GOSTENHOF ---
    {"name": "HOLZWERKSTATT", "url": "http://holzwerkstatt-gostenhof.de/"},
    {"name": "HOLZWERKSTATT", "url": "http://holzwerkstatt-gostenhof.de/maschinen/"},
    {"name": "HOLZWERKSTATT", "url": "http://holzwerkstatt-gostenhof.de/mitgliedschaft/"},
    {"name": "HOLZWERKSTATT", "url": "http://holzwerkstatt-gostenhof.de/faq/"},

    # --- LEONARDO ---
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/labs/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/labs/makerspace-werkstatt/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/labs/ar-vr-labor-studio/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/labs/miracl-soundlabor-tonstudio/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/labs/eventspace-co-working-space/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/kontakt/"},
    {"name": "LEONARDO", "url": "https://leonardo-zentrum.de/ueber-uns/"},

    # --- FABLAB NÜLAND ---
    {"name": "FABLAB_NUELAND", "url": "https://fablab.nueland.de/"},
    {"name": "FABLAB_NUELAND", "url": "https://fablab.nueland.de/index.php/wir-ueber-uns"},
    {"name": "FABLAB_NUELAND", "url": "https://fablab.nueland.de/index.php/das-fablab"},
    {"name": "FABLAB_NUELAND", "url": "https://fablab.nueland.de/index.php/mach-mit"},
    {"name": "FABLAB_NUELAND", "url": "https://fablab.nueland.de/index.php/kontakt"},

    # --- KOLEO ---
    {"name": "KOLEO", "url": "https://www.iska-nuernberg.de/koleo/"},
    {"name": "KOLEO", "url": "https://www.iska-nuernberg.de/koleo/kontakt.html"},

    # --- KLARA ---
    {"name": "KLARA", "url": "https://www.nuernberg.de/internet/nuernberg_engagiert/klara.html"},

    # --- OHM LAB ---
    {"name": "OHM_LAB", "url": "https://www.th-nuernberg.de/einrichtungen-gesamt/administration-und-service/lehr-und-kompetenzentwicklung/lehr-und-lernraeume/ohmlab-maker-und-coworking-space/"},

    # --- FABLAB FAU ---
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/lasercutter/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/3d-drucker/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/schneideplotter/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/zerspanung/cnc-fraese/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/zerspanung/cnc-drehbank/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/elektrowerkzeuge/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/handwerkzeuge/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/textilbearbeitung/naehmaschine/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/textilbearbeitung/stickmaschine/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/textilbearbeitung/textilpresse/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/tool/multifunktionstisch/"},
    {"name": "FABLAB_FAU", "url": "https://fablab.fau.de/kontakt/"},

    # --- ODL ---
    {"name": "ODL_TOLLWERK", "url": "https://odl-nbg.de/de/"}
]

# ------------------------------------------------------------------------------
# 3. HELPER FUNCTIONS: CLEANING & UTILS
# ------------------------------------------------------------------------------
def get_junk_list():
    """
    Build the list of boilerplate phrases that the scraper drops.

    The phrases are lower-case; the list is assembled from thematic groups
    in a fixed order and a new list object is returned on every call.
    """
    navigation = (
        "home", "startseite", "menu", "menü", "hauptmenü", "untermenü",
        "navigation", "breadcrumb", "you are here", "sie sind hier",
        "suche", "search", "suchen", "lupe", "leiste öffnen",
        "zum inhalt springen", "skip to content", "zur hauptnavigation",
        "navigation ausklappen", "top of page", "bottom of page",
    )
    footer_and_legal = (
        "impressum", "datenschutz", "privacy", "disclaimer", "haftungsausschluss",
        "agb", "nutzungsbedingungen", "copyright", "alle rechte vorbehalten",
        "powered by", "theme by", "wordpress", "secured by miniorange",
    )
    cookie_consent = (
        "gdpr", "cookie", "cookies", "unbedingt notwendige cookies",
        "einstellungen speichern", "alle aktivieren", "deaktiviert", "aktiviert",
        "cookie-informationen", "cookie-einstellungen",
    )
    actions_and_auth = (
        "login", "anmelden", "register", "registrieren", "logout", "abmelden",
        "warenkorb", "cart", "kasse", "checkout", "mein konto",
        "passwort vergessen", "remember me", "mehr erfahren", "weiterlesen",
    )
    social_media = (
        "instagram", "facebook", "youtube", "twitter", "linkedin", "rss", "feed",
        "envelope", "google+", "xing",
    )
    accessibility = (
        "barrierefreiheit", "text vergrößern", "graustufen", "kontrast",
        "hoher kontrast", "heller modus", "links unterstreichen", "lesbare schriftart",
        "nach oben", "top", "reset", "text verkleinern", "schriftgröße",
    )

    junk_phrases = []
    for group in (
        navigation,
        footer_and_legal,
        cookie_consent,
        actions_and_auth,
        social_media,
        accessibility,
    ):
        junk_phrases.extend(group)
    return junk_phrases

def clean_text(text: str, url: str) -> str:
    """
    Filter scraped page text line by line and return the surviving lines.

    Each line is stripped, then dropped if it is empty, exactly equals a
    phrase from get_junk_list(), or matches one of the generic or
    site-specific filters below. The filters run in a fixed order and the
    first match discards the line.

    Args:
        text: Raw text extracted from the page.
        url: URL the text came from; used only for substring checks that
            switch the site-specific filters on.

    Returns:
        The kept lines (stripped) joined with newline characters.
    """
    lines = text.splitlines()
    cleaned_lines = []
    junk_exact = get_junk_list()

    # Specific sidebar content for FAU FabLab
    fau_sidebar = [
        "was ist ein fablab", "wie werde ich fablab-betreuer:in?", "termine",
        "ausstattung", "maschinen im überblick", "preise", "bilder", "projekte",
        "projekte unserer besucher", "forschungs- und abschlussarbeiten",
        "project group diybio - build your own biotech lab", "english"
    ]

    for line in lines:
        original = line.strip()
        lower_line = original.lower()

        if not original: continue
        # Exact (case-insensitive) match only; longer lines that merely
        # contain a junk phrase are kept.
        if lower_line in junk_exact: continue

        # Fix specific glitch in Bluepingu/Leihla site
        if "leihla" in url:
            # Runs of 10 or more digits are blanked out of the line; what
            # remains is dropped if it is empty or just one of two place names.
            if re.search(r'\d{10,}', original):
                original = re.sub(r'\d{10,}', ' ', original).strip()
                lower_line = original.lower()
                if not original or original in ["Fürth", "Marktplatz"]:
                    continue

        # Filter out file markers and pagination
        if re.match(r'^\d+$', original): continue
        if re.search(r'\(PDF, \d+ KB\)', original): continue
        if original.startswith("<") and original.endswith(">"): continue

        # Partial matching filters
        if "cookie" in lower_line and ("verwend" in lower_line or "einstellung" in lower_line): continue
        if "gdpr" in lower_line: continue
        if "instagram.com" in lower_line or "facebook.com" in lower_line: continue
        if "source:" in lower_line: continue
        if "internetverbindung abgebrochen" in lower_line: continue
        if "spambots geschützt" in lower_line: continue

        # Filter FabLab Nürnberg Navigation
        if "fablab-nuernberg" in url:
            # Only short lines (under 25 characters) are treated as menu entries.
            if any(x in lower_line for x in ["openlab", "kidslab", "repaircafé", "textilelab"]) and len(original) < 25:
                continue

        # Filter FAU FabLab Sidebar
        if "fablab.fau" in url:
            if lower_line in fau_sidebar: continue
            # Short lines (under 40 characters) naming a tool category are
            # dropped unless the page URL is about that same category.
            if len(original) < 40 and any(x in lower_line for x in ["3d-drucker", "lasercutter", "schneideplotter", "elektronik", "textilbearbeitung", "zerspanung"]):
                # Context check: preserve if the page itself is about that topic
                is_current_topic = False
                if "lasercutter" in lower_line and "lasercutter" in url: is_current_topic = True
                if "3d-drucker" in lower_line and "3d-drucker" in url: is_current_topic = True
                if "schneideplotter" in lower_line and "schneideplotter" in url: is_current_topic = True
                if "elektronik" in lower_line and "elektro" in url: is_current_topic = True
                if "textil" in lower_line and "textil" in url: is_current_topic = True
                if "zerspanung" in lower_line and "zerspanung" in url: is_current_topic = True

                if not is_current_topic:
                    continue

        cleaned_lines.append(original)

    return '\n'.join(cleaned_lines)

def get_soup(url):
    """
    Download *url* and parse the response body as HTML.

    Returns a BeautifulSoup object on success. Any exception raised while
    fetching or parsing is reported on stdout and None is returned instead.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        page = requests.get(url, headers=request_headers, timeout=15)
        # Treat 4xx/5xx responses as failures instead of parsing error pages.
        page.raise_for_status()
        html = page.content
        return BeautifulSoup(html, 'html.parser')
    except Exception as err:
        print(f"[ERROR] Network issue with {url}: {err}")
        return None

def create_pdf(text, url, filename):
    """
    Write *text* to a PDF at *filename*, headed by a "Source: <url>" line.

    The PDF is rendered with the Roboto font at FONT_PATH. If the font file is
    missing it is downloaded first. The file is written only after a
    successful HTTP response, so an error page is never saved as the font.

    Args:
        text: Cleaned page text to render.
        url: Source URL, printed in small grey type at the top of the page.
        filename: Destination path of the PDF.

    Returns:
        None. Failures (font download or PDF generation) are reported on
        stdout and leave no PDF at *filename*.
    """
    # Ensure font exists, download if necessary
    if not os.path.exists(FONT_PATH):
        print(f"[INFO] Roboto font not found at {FONT_PATH}. Downloading...")
        font_url = "https://github.com/google/fonts/raw/main/ofl/roboto/Roboto-Regular.ttf"
        try:
            # A timeout keeps the pipeline from hanging; raise_for_status
            # rejects 4xx/5xx responses before anything is written to disk.
            r = requests.get(font_url, allow_redirects=True, timeout=30)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"[ERROR] Could not download font from {font_url}: {e}")
            return
        with open(FONT_PATH, 'wb') as f:
            f.write(r.content)
        print("[SUCCESS] Font downloaded.")

    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.add_font('Roboto', '', FONT_PATH)

        # Header line: source URL in small grey type.
        pdf.set_font('Roboto', '', 8)
        pdf.set_text_color(100, 100, 100)
        pdf.cell(0, 10, f"Source: {url}", new_x="LMARGIN", new_y="NEXT")  # Updated for FPDF2
        pdf.ln(5)

        # Body text in black.
        pdf.set_font('Roboto', '', 11)
        pdf.set_text_color(0, 0, 0)

        # Round-trip through UTF-8 with 'replace' so characters that cannot be
        # encoded (e.g. lone surrogates) become '?' instead of raising later.
        safe_text = text.encode('utf-8', 'replace').decode('utf-8')
        pdf.multi_cell(0, 6, safe_text)
        pdf.output(filename)
    except Exception as e:
        print(f"[ERROR] Failed to generate PDF for {url}: {e}")

# ------------------------------------------------------------------------------
# 4. EXECUTION PIPELINE
# ------------------------------------------------------------------------------
def execute_data_ingestion():
    """
    Scrape every target URL, clean the text and write one PDF per page.

    Steps:
      1. Delete and recreate PDF_STORAGE_PATH so the output is always fresh.
      2. Extend TARGETS with every "itemcat" link found on the Essbare Stadt
         lending-catalogue page.
      3. For each target (deduplicated by URL without fragment): fetch, clean,
         skip pages with fewer than 20 characters of text, and write the PDF.

    Only PDFs that actually exist on disk afterwards are counted, and a
    filename that is already taken gets a numeric suffix instead of
    overwriting the earlier document.
    """
    print(f"Saving PDFs to: {PDF_STORAGE_PATH}")

    # Clean up existing PDF directory to ensure fresh data
    if os.path.exists(PDF_STORAGE_PATH):
        shutil.rmtree(PDF_STORAGE_PATH)
    os.makedirs(PDF_STORAGE_PATH)

    processed_urls = set()
    count = 0

    # Expand targets dynamically (e.g., Essbare Stadt catalogs)
    expanded_targets = list(TARGETS)
    for item in TARGETS:
        if "leihkatalog_essbare_stadt" not in item['url']:
            continue
        print("Expanding Essbare Stadt Catalog...")
        soup = get_soup(item['url'])
        if not soup:
            continue
        for a in soup.find_all('a', href=True):
            href = a['href']
            if "itemcat" in href:
                full_url = urljoin(item['url'], href)
                expanded_targets.append({"name": "ESSBARE_STADT", "url": full_url})

    # Process all targets
    for item in expanded_targets:
        url = item['url']
        institute_name = item['name']
        # The fragment does not change the fetched document, so it is ignored
        # when checking for duplicates.
        base_url = url.split('#')[0]

        if base_url in processed_urls:
            continue

        print(f"{institute_name}: {url}")
        soup = get_soup(url)
        if not soup:
            continue

        raw_text = soup.get_text()
        final_text = clean_text(raw_text, url)

        if len(final_text) < 20:
            print("Content too short (possibly empty or protected).")
            continue

        # Create safe filename
        parsed_url = urlparse(url)
        path_slug = parsed_url.path.strip("/").replace("/", "_") or "main"
        query_slug = parsed_url.query.replace("=", "-").replace("&", "_")
        safe_slug = f"{path_slug}_{query_slug}".strip("_") or "main"

        full_name = f"{institute_name}_{safe_slug}"[:60]
        filename = os.path.join(PDF_STORAGE_PATH, f"{full_name}.pdf")

        # Truncating to 60 characters can map two URLs to the same name; the
        # directory was emptied above, so an existing file means a collision.
        suffix = 2
        while os.path.exists(filename):
            filename = os.path.join(PDF_STORAGE_PATH, f"{full_name}_{suffix}.pdf")
            suffix += 1

        create_pdf(final_text, url, filename)
        processed_urls.add(base_url)

        # create_pdf reports failures on stdout rather than raising, so check
        # the file system before counting the document as created.
        if os.path.exists(filename):
            count += 1
        else:
            print(f"[WARNING] No PDF was written for {url}")

        time.sleep(0.5)  # Polite delay

    print(f"\nFinished. Created {count} PDFs in {PDF_STORAGE_PATH}")

# Run the pipeline
# Note: The PDFs already exist under '<ROOT_DIR>/data/municipal_pdfs', so the
# call below is left commented out. Uncomment it only to re-scrape; doing so
# deletes and rebuilds that folder.
# execute_data_ingestion()

Web scraping and data cleaning are complete; the PDFs have been created and are available under `data/municipal_pdfs`.


After the PDFs have been created, the next step is to create the embeddings for the Vector DB and the Knowledge Graph, and to tag the chunks with metadata.

In [None]:
# ==============================================================================
# STEP 3: VECTOR DATABASE CREATION & PDF EMBEDDING
# ==============================================================================
# DESCRIPTION: Reads processed PDFs, assigns Knowledge Graph entity tags (E74),
# chunks the text, and creates embeddings in ChromaDB.
#
# CONFIGURATION NOTE:
# This script is configured by default to use L2 (Euclidean) distance.
# To use Cosine Similarity, uncomment the specific metadata configuration 
# in the 'Initialize Collection' section below.

import os
import chromadb
from chromadb.utils import embedding_functions
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ------------------------------------------------------------------------------
# 1. PATH CONFIGURATION
# ------------------------------------------------------------------------------
# If either path is missing from the current namespace, all three values are
# (re)derived from the parent of the current working directory.
if 'DATA_PATH' not in locals() or 'CHROMA_PATH' not in locals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    DATA_PATH = os.path.join(ROOT_DIR, "data", "municipal_pdfs")
    CHROMA_PATH = os.path.join(ROOT_DIR, "chroma_db")

print(f"[CONFIG] PDF Source: {DATA_PATH}")
print(f"[CONFIG] Vector DB Target: {CHROMA_PATH}")

# ------------------------------------------------------------------------------
# 2. ENTITY MAPPING (PDF FILENAME -> KNOWLEDGE GRAPH GROUP)
# ------------------------------------------------------------------------------
# Maps the institution prefix of a PDF filename to the name of the matching
# E74 group entity in the Knowledge Graph.
PDF_TO_E74 = {
    "DIVERSITY_MEDIA": "Diversity_Media",
    "ESSBARE_STADT": "essbare_Stadt_Nürnberg_e.V.",
    "FABLAB_FAU": "FAU_FabLab",
    "FABLAB_NBG": "FabLab_Nürnberg",
    "FABLAB_NUELAND": "FabLab_Nüland",
    "HEIZHAUS": "Heizhaus_Nürnberg",
    "HOLZWERKSTATT": "Holzwerkstatt_Gostenhof_e.V.",
    "KLARA": "KLARA",
    "KOLEO": "KOLEO",
    "KUNSTKULTUR": "KunstKultur_Quartier_Werkstatten",
    "LEIHLA": "Leihla_Nürnberg",
    "LEONARDO": "Leonardo_Zentrum",
    "ODL_TOLLWERK": "tollwerk_GmbH",
    "OHM_LAB": "TH_Nürnberg",
}


def infer_e74_group(filename: str) -> str:
    """
    Return the E74 group for a PDF filename, or "UNKNOWN".

    A filename matches when it starts with a PDF_TO_E74 key followed by an
    underscore; the first matching key in dictionary order wins.
    """
    matches = (
        group
        for key, group in PDF_TO_E74.items()
        if filename.startswith(f"{key}_")
    )
    return next(matches, "UNKNOWN")

# ------------------------------------------------------------------------------
# 3. INITIALIZE CHROMA CLIENT & COLLECTION
# ------------------------------------------------------------------------------
client = chromadb.PersistentClient(path=CHROMA_PATH)

# --- CONFIGURATION SWITCH ---
# Option A: Standard L2 Distance (Default)
COLLECTION_NAME = "municipal_pdfs_rag"

# Option B: Cosine Similarity (Uncomment to use)
# COLLECTION_NAME = "municipal_pdfs_cosine"

# Start from an empty collection on every run. Catch Exception rather than
# using a bare except so KeyboardInterrupt and SystemExit still propagate.
try:
    client.delete_collection(name=COLLECTION_NAME)
    print(f"[INFO] Deleted existing collection: {COLLECTION_NAME}")
except Exception:
    print(f"[INFO] Collection {COLLECTION_NAME} did not exist. Creating new.")

# Embedding Model
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Create Collection
# ---------------------------------------------------------
# OPTION A: Standard Creation (L2 Distance) - ACTIVE
collection = client.create_collection(
    name=COLLECTION_NAME,
    embedding_function=ef
)

# OPTION B: Cosine Similarity Configuration - INACTIVE
# Uncomment the block below to enable Cosine Similarity
# collection = client.create_collection(
#     name=COLLECTION_NAME,
#     embedding_function=ef,
#     metadata={"hnsw:space": "cosine"} # Critical for Cosine Similarity
# )
# ---------------------------------------------------------

# ------------------------------------------------------------------------------
# 4. CHUNKING CONFIGURATION
# ------------------------------------------------------------------------------
# Chunks of up to 400 characters with 100 characters of overlap; separators
# are tried in the listed order (paragraph, line, sentence, word, character).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# ------------------------------------------------------------------------------
# 5. EXECUTION: PROCESS & EMBED
# ------------------------------------------------------------------------------
def build_vector_database():
    """
    Chunk every PDF in DATA_PATH and upsert the chunks into the collection.

    Each chunk is stored with the source filename, the inferred E74 group and
    its position within the document. A file that fails to parse is reported
    and skipped; files without extractable text are skipped with a warning.
    """
    if not os.path.exists(DATA_PATH):
        print(f"[ERROR] PDF directory not found: {DATA_PATH}")
        return

    pdf_files = [name for name in os.listdir(DATA_PATH) if name.endswith(".pdf")]
    print(f"[INFO] Found {len(pdf_files)} PDFs. Processing...")

    all_chunks, all_metas, all_ids = [], [], []

    # Sorted so documents are processed in a stable order.
    for filename in sorted(pdf_files):
        file_path = os.path.join(DATA_PATH, filename)
        try:
            page_texts = []
            for page in PdfReader(file_path).pages:
                page_texts.append(page.extract_text() or "")
            full_text = "\n".join(page_texts)

            if not full_text.strip():
                print(f"[WARNING] Skipping empty file: {filename}")
                continue

            pieces = text_splitter.split_text(full_text)
            e74_group = infer_e74_group(filename)

            all_chunks.extend(pieces)
            all_metas.extend(
                {
                    "source_pdf": filename,
                    "e74_group": e74_group,
                    "chunk_index": position,
                }
                for position in range(len(pieces))
            )
            all_ids.extend(
                f"{filename}_chunk_{position}" for position in range(len(pieces))
            )

            print(f"[INFO] Processed: {filename}")

        except Exception as e:
            print(f"[ERROR] Failed to process {filename}: {e}")

    if not all_chunks:
        print("[WARNING] No chunks found to insert.")
        return

    # Upsert to DB in fixed-size batches
    print(f"[INFO] Upserting {len(all_chunks)} chunks into '{COLLECTION_NAME}'...")

    batch_size = 5000
    total = len(all_chunks)
    start = 0
    while start < total:
        stop = min(start + batch_size, total)
        collection.upsert(
            documents=all_chunks[start:stop],
            metadatas=all_metas[start:stop],
            ids=all_ids[start:stop]
        )
        start += batch_size

    print(f"[SUCCESS] Vector Database populated. Total chunks: {collection.count()}")

# Uncomment to run the build process
# build_vector_database()

In [None]:
# ==============================================================================
# STEP 4: KNOWLEDGE GRAPH EMBEDDING (STRUCTURAL SERIALIZATION)
# ==============================================================================
# DESCRIPTION: Connects to Neo4j, serializes nodes and relationships into 
# natural language text (Structural Embedding), and stores them in a 
# separate ChromaDB collection with CRITICAL METADATA.
#
# CONFIGURATION NOTE:
# This script is configured by default to use L2 (Euclidean) distance.
# To use Cosine Similarity, uncomment the specific metadata configuration 
# in the 'Initialize Collection' section below.

import os
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

# ------------------------------------------------------------------------------
# 1. CONFIGURATION & CREDENTIALS
# ------------------------------------------------------------------------------
# Reuse CHROMA_PATH from an earlier cell when present; otherwise derive it from
# the parent of the current working directory.
if 'CHROMA_PATH' not in locals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    CHROMA_PATH = os.path.join(ROOT_DIR, "chroma_db")

# Neo4j credentials are read from environment variables.
# NOTE(review): os.getenv does not read a .env file by itself, and no loader
# call is visible in this part of the notebook. Confirm the variables are
# exported (or a .env loader runs) before this cell is executed.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Fail fast: the URI and password have no default, the user name does.
if not NEO4J_URI or not NEO4J_PASSWORD:
    raise ValueError("[ERROR] Neo4j credentials missing in .env file.")

# --- CONFIGURATION SWITCH ---
# Option A: Standard L2 Distance (Default)
COLLECTION_NAME = "kg_structural_rag"

# Option B: Cosine Similarity (Uncomment to use)
# COLLECTION_NAME = "kg_structural_cosine"

# Embedding model for the serialized graph text. This differs from the
# multilingual model used for the PDF collection in Step 3, so each
# collection must be queried with its own model.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

print(f"[CONFIG] KG Vector DB Target: {CHROMA_PATH}")
print(f"[CONFIG] Embedding Model: {MODEL_NAME}")

# ------------------------------------------------------------------------------
# 2. HELPER CLASSES AND FUNCTIONS
# ------------------------------------------------------------------------------

class StructuralEmbeddingFunction(EmbeddingFunction):
    """
    ChromaDB embedding function backed by a SentenceTransformer model.

    The model is loaded once when the instance is created; calling the
    instance encodes a batch of documents and returns plain Python lists.
    """

    def __init__(self, model_name):
        print(f"[INIT] Loading embedding model: {model_name}...")
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: Documents) -> Embeddings:
        # Encode to a NumPy array first, then convert to nested lists.
        vectors = self.model.encode(input, convert_to_numpy=True)
        return vectors.tolist()

def safe_str(value):
    """
    Return *value* as a string suitable for metadata storage.

    Lists are flattened to their items' string forms joined by ", ";
    every other value is passed through str().
    """
    if not isinstance(value, list):
        return str(value)
    return ", ".join(map(str, value))

def get_node_name(node):
    """
    Return the display name of a graph node.

    Preference order: the 'name' property, then the 'title' property, then
    the node's element_id.
    """
    properties = dict(node)
    fallback_id = node.element_id
    title_or_id = properties.get('title', fallback_id)
    return properties.get('name', title_or_id)

# ------------------------------------------------------------------------------
# 3. MAIN PIPELINE
# ------------------------------------------------------------------------------

def generate_kg_embeddings():
    """
    Serialize the Neo4j graph to text and embed it into a ChromaDB collection.

    Steps:
      1. Connect to Neo4j; on failure report the error and return.
      2. Delete and recreate the collection COLLECTION_NAME.
      3. Turn every node into an "Entity: ... Type: ..." sentence with its
         properties, and every relationship into an
         "<source> is connected to <target> via relation <type>." sentence,
         each with metadata describing what it represents.
      4. Add the documents to the collection in batches of 500.

    The Neo4j driver is closed on every path, including when the fetch or the
    collection setup raises.
    """
    # 1. Initialize Connections
    print(f"[CONN] Connecting to Neo4j at {NEO4J_URI}...")
    driver = None
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        driver.verify_connectivity()
        print("[SUCCESS] Connected to Neo4j.")
    except Exception as e:
        print(f"[ERROR] Connection Failed: {e}")
        # The driver object may exist even though connectivity failed.
        if driver is not None:
            driver.close()
        return

    documents = []
    metadatas = []
    ids = []

    try:
        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
        embedding_func = StructuralEmbeddingFunction(MODEL_NAME)

        # 2. Reset Collection
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate, and say what happened.
        try:
            chroma_client.delete_collection(name=COLLECTION_NAME)
            print(f"[INFO] Deleted existing collection: {COLLECTION_NAME}")
        except Exception:
            print(f"[INFO] Collection {COLLECTION_NAME} did not exist. Creating new.")

        # Create Collection
        # ---------------------------------------------------------
        # OPTION A: Standard Creation (L2 Distance) - ACTIVE
        collection = chroma_client.create_collection(
            name=COLLECTION_NAME,
            embedding_function=embedding_func
        )

        # OPTION B: Cosine Similarity Configuration - INACTIVE
        # Uncomment the block below to enable Cosine Similarity
        # collection = chroma_client.create_collection(
        #     name=COLLECTION_NAME,
        #     embedding_function=embedding_func,
        #     metadata={"hnsw:space": "cosine"} # Critical for Cosine Similarity
        # )
        # ---------------------------------------------------------

        print("[PROCESS] Fetching data from Neo4j and serializing...")

        with driver.session() as session:
            # ---------------------------------------------------------
            # PHASE A: PROCESS NODES (Entity Serialization)
            # ---------------------------------------------------------
            result_nodes = session.run("MATCH (n) RETURN n")

            for record in result_nodes:
                node = record["n"]
                props = dict(node)
                # Labels come as an unordered set; sorting makes the chosen
                # label stable across runs for nodes with several labels.
                labels = sorted(node.labels)
                label_str = labels[0] if labels else "Unknown Entity"
                name = get_node_name(node)

                # 1. Text Serialization
                parts = [f"Entity: {name}. Type: {label_str}."]
                for key, value in props.items():
                    if key in ('name', 'title', 'uri', 'element_id'):
                        continue
                    # Falsy values (None, "", 0, False, empty lists) are
                    # left out of the serialized text.
                    if value:
                        parts.append(f" {key.replace('_', ' ')}: {value}.")
                text_rep = "".join(parts)

                # 2. Metadata (CRITICAL FOR HYBRID LINKING)
                meta = {
                    "kind": "entity",
                    "entity_name": safe_str(name),
                    "labels": safe_str(label_str),
                    "source": "neo4j"
                }

                documents.append(text_rep)
                metadatas.append(meta)
                ids.append(f"node_{node.element_id}")

            # ---------------------------------------------------------
            # PHASE B: PROCESS RELATIONSHIPS (Structural Sentences)
            # ---------------------------------------------------------
            result_rels = session.run("MATCH (n)-[r]->(m) RETURN n, r, m")

            for record in result_rels:
                source = record["n"]
                rel = record["r"]
                target = record["m"]

                s_name = get_node_name(source)
                t_name = get_node_name(target)

                # 1. Text Serialization
                text_rep = f"{s_name} is connected to {t_name} via relation {rel.type}."

                # 2. Metadata (CRITICAL FOR GRAPH CONTEXT)
                meta = {
                    "kind": "relationship",
                    "source_node": safe_str(s_name),
                    "target_node": safe_str(t_name),
                    "relation_type": safe_str(rel.type),
                    "source": "neo4j"
                }

                documents.append(text_rep)
                metadatas.append(meta)
                ids.append(f"rel_{rel.element_id}")
    finally:
        # Release the connection pool even if setup or fetching failed.
        driver.close()

    # 3. Batch Ingestion
    total_docs = len(documents)
    if total_docs > 0:
        print(f"[INFO] Ingesting {total_docs} vectors into ChromaDB...")
        batch_size = 500

        for i in range(0, total_docs, batch_size):
            end_idx = min(i + batch_size, total_docs)
            collection.add(
                documents=documents[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

        print("[SUCCESS] Knowledge Graph Embedding Complete.")
        print(f"[STATUS] Total Records in '{COLLECTION_NAME}': {collection.count()}")
    else:
        print("[WARNING] No data found to process.")

# Uncomment to run the build process
# generate_kg_embeddings()

In [None]:
# ==============================================================================
# STEP 5: DUAL-CHANNEL RETRIEVAL TEST (PDF + KG)
# ==============================================================================
# DESCRIPTION: Executes a hybrid search query across both Vector Databases
# (Unstructured PDF Text + Structured Knowledge Graph) to verify retrieval
# quality before passing context to the LLM.

import os
import chromadb
from chromadb.utils import embedding_functions
from pprint import pprint
from typing import List, Dict, Any

# ------------------------------------------------------------------------------
# 1. CONFIGURATION & PATHS
# ------------------------------------------------------------------------------
# Reuse CHROMA_PATH when an earlier cell already defined it; otherwise fall
# back to "<parent of the current working directory>/chroma_db".
if 'CHROMA_PATH' not in locals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    CHROMA_PATH = os.path.join(ROOT_DIR, "chroma_db")

# --- COLLECTION CONFIGURATION (L2 vs COSINE SWITCH) ---
# Exactly one name per channel should be active; swap the commented line in
# to test the cosine-metric variant of a collection.

# 1. PDF Collection Settings
PDF_COLLECTION_NAME = "municipal_pdfs_rag"       # Default (L2)
# PDF_COLLECTION_NAME = "municipal_pdfs_cosine"  # Option (Cosine)

# 2. KG Collection Settings
KG_COLLECTION_NAME = "kg_structural_rag"         # Default (L2)
# KG_COLLECTION_NAME = "kg_structural_cosine"    # Option (Cosine)

# Number of nearest documents requested from EACH collection per query.
TOP_K_BOTH = 5

# Echo the active settings so the cell output records what was queried.
print(f"[CONFIG] Retrieval Path: {CHROMA_PATH}")
print(f"[CONFIG] PDF Collection: {PDF_COLLECTION_NAME}")
print(f"[CONFIG] KG Collection:  {KG_COLLECTION_NAME}")

# ------------------------------------------------------------------------------
# 2. RETRIEVAL FUNCTION
# ------------------------------------------------------------------------------
def run_dual_channel_query(question: str) -> Dict[str, Any]:
    """
    Run one question against both vector stores and return the tagged context.

    The PDF collection and the KG collection are each queried for the
    TOP_K_BOTH nearest documents. Every hit is prefixed with a source tag
    (``[SOURCE: PDF | GROUP: ...]`` or ``[SOURCE: KG | ENTITY: ...]``) so the
    origin of each passage stays visible. PDF hits come first in the merged
    list, followed by KG hits.

    Args:
        question: Natural-language query; each collection embeds it with its
            own embedding function.

    Returns:
        ``{"Combined_Context": [...], "Total_Documents": n}`` on success, or
        ``{"error": "<message>"}`` when the ChromaDB client or one of the
        collections cannot be opened.
    """
    print("\n=======================================================================")
    print(f"QUERY: {question}")
    print("=======================================================================")

    try:
        client = chromadb.PersistentClient(path=CHROMA_PATH)

        # Define Embedding Functions to match what was used during ingestion
        # PDF Model: paraphrase-multilingual-mpnet-base-v2
        ef_pdf = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        )

        # KG Model: all-MiniLM-L6-v2
        ef_kg = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Connect to collections
        pdf_collection = client.get_collection(name=PDF_COLLECTION_NAME, embedding_function=ef_pdf)
        kg_collection = client.get_collection(name=KG_COLLECTION_NAME, embedding_function=ef_kg)

    except Exception as e:
        print(f"[ERROR] Connection to ChromaDB failed: {e}")
        return {"error": str(e)}

    # --- STEP 1: RETRIEVAL FROM PDF CORPUS ---
    print(f"[1/2] Searching PDF Corpus (Unstructured Text, K={TOP_K_BOTH})...")

    pdf_results = pdf_collection.query(
        query_texts=[question],
        n_results=TOP_K_BOTH,
        include=['documents', 'metadatas']
    )

    # 1. Label and Format PDF Documents
    pdf_context_list = []
    if pdf_results['documents'] and pdf_results['documents'][0]:
        for doc, meta in zip(pdf_results['documents'][0], pdf_results['metadatas'][0]):
            # A hit stored without metadata would otherwise crash on .get().
            meta = meta or {}
            entity_name = meta.get('e74_group', 'UNKNOWN_ENTITY')
            # Prepend the source tag directly to the document text
            pdf_context_list.append(f"[SOURCE: PDF | GROUP: {entity_name}] {doc}")
    else:
        print("   -> No results found in PDF corpus.")

    # --- STEP 2: RETRIEVAL FROM KG CORPUS ---
    print(f"[2/2] Searching KG Corpus (Structural Facts, K={TOP_K_BOTH})...")

    kg_results = kg_collection.query(
        query_texts=[question],
        n_results=TOP_K_BOTH,
        include=['documents', 'metadatas']
    )

    # 2. Label and Format KG Documents
    kg_context_list = []
    if kg_results['documents'] and kg_results['documents'][0]:
        for doc, meta in zip(kg_results['documents'][0], kg_results['metadatas'][0]):
            meta = meta or {}
            # Node records carry 'entity_name'; relationship records carry
            # 'source_node' instead, so fall back to it.
            entity_name = meta.get('entity_name') or meta.get('source_node', 'N/A')
            # Prepend the source tag directly to the document text
            kg_context_list.append(f"[SOURCE: KG | ENTITY: {entity_name}] {doc}")
    else:
        print("   -> No results found in KG corpus.")

    # --- STEP 3: MERGE CONTEXT ---
    final_context_list = pdf_context_list + kg_context_list

    compiled_context = {
        "Combined_Context": final_context_list,
        "Total_Documents": len(final_context_list)
    }

    print("\n[3/3] Final Compiled Context Structure (TAGGED Output):")
    # Pretty print just a snippet to avoid flooding the console
    if final_context_list:
        pprint(final_context_list[:2])
        # Only report a remainder when there is one; with a single hit the
        # old code printed "... and -1 more items."
        remaining = len(final_context_list) - 2
        if remaining > 0:
            print(f"... and {remaining} more items.")
    else:
        print("No context retrieved.")

    return compiled_context

# ----------------------------------------------------------------------
# TEST SUITE
# ----------------------------------------------------------------------

def test_suite(questions: Dict[str, str]):
    for name, q in questions.items():
        run_dual_channel_query(q)

# Sample competency questions for the retrieval check. The guard keeps them
# from being defined when this code is imported as a module.
if __name__ == "__main__":
    # Keys are short labels; values are the query texts handed to test_suite.
    questions = {
        "q1": "Which groups provide workshops or tools that support woodworking and fabrication activities? Is there any restriction for using these workshop areas or tools?",
        "q2": "Which publicly accessible spaces are suitable for hosting small civic workshops or dialogue sessions?",
        "q3": "Which places support digitizing personal stories and materials, and producing media (like short films or podcasts)?",
        "q4": "Which municipal facilities are suitable for a 'Repair' or 'Maintenance'?",
        "q5": "Which facilities or groups provide 3D printers or 3D printing workshops? What are their specific types, descriptions, and access restrictions?"
    }
    
    # Run the test suite (left disabled; uncomment to execute the retrieval check)
    # test_suite(questions)

In [None]:
# ==============================================================================
# STEP 6: LOAD LLM (Meta-Llama-3.1-8B-Instruct)
# ==============================================================================
# DESCRIPTION: Initializes the Llama-3.1-8B-Instruct model using 4-bit 
# quantization for memory efficiency. Authenticates using the environment 
# variable. Configured for GREEDY SEARCH (Deterministic/Temp 0) for RAG.

import os
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from huggingface_hub import login

# ------------------------------------------------------------------------------
# 1. AUTHENTICATION & ENVIRONMENT SETUP
# ------------------------------------------------------------------------------
# Load environment variables (presumably from a local .env file) before
# reading the token below.
load_dotenv()

HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Stop immediately when no token is configured; the model download below
# passes this token explicitly.
if not HF_TOKEN:
    raise ValueError("[ERROR] HUGGINGFACEHUB_API_TOKEN not found in environment variables.")

# Log in to the Hugging Face Hub; a failure is reported and then re-raised so
# the cell does not continue unauthenticated.
try:
    login(token=HF_TOKEN)
    print("[INFO] Successfully authenticated with Hugging Face.")
except Exception as e:
    print(f"[ERROR] Authentication failed: {e}")
    raise e

# The model is loaded onto GPU 0 further down, so a CUDA device is mandatory.
if not torch.cuda.is_available():
    raise RuntimeError("[ERROR] GPU not detected. CUDA is required.")

print(f"[INFO] GPU Detected: {torch.cuda.get_device_name(0)}")

# ------------------------------------------------------------------------------
# 2. QUANTIZATION CONFIGURATION (4-bit)
# ------------------------------------------------------------------------------
# Hub identifier of the instruction-tuned model used for both keyword
# extraction and answer generation.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# 4-bit NF4 quantization with double quantization enabled; computations are
# carried out in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# ------------------------------------------------------------------------------
# 3. MODEL & TOKENIZER LOADING
# ------------------------------------------------------------------------------
print(f"[INFO] Loading model: {model_id}...")

# Any loading failure is printed and re-raised so later cells never run
# against a missing model or tokenizer.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    # Reuse the end-of-sequence token for padding
    # (presumably because this tokenizer defines no pad token — TODO confirm).
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map={"": 0},  # place the whole model on GPU 0
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        token=HF_TOKEN
    )
    print("[SUCCESS] Model loaded successfully.")

except Exception as e:
    print(f"[ERROR] Failed to load model: {e}")
    raise e

# ------------------------------------------------------------------------------
# 4. PIPELINE INITIALIZATION (Deterministic)
# ------------------------------------------------------------------------------
# Setting do_sample=False selects greedy decoding, so the same prompt yields
# the same output (the practical equivalent of temperature 0). This keeps RAG
# answers reproducible between runs.
#
# max_new_tokens=600 is only the default: the pipeline's callers in the final
# cell pass their own max_new_tokens per call (20 for keyword extraction,
# 1000 for the final answer).
# return_full_text=False asks the pipeline for the completion only, without
# echoing the prompt.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=600,
    return_full_text=False,
    do_sample=False  # ENABLE GREEDY SEARCH (Temperature = 0)
)

print("[SUCCESS] Text generation pipeline initialized (Greedy Search Mode).")

In [None]:
# ==============================================================================
# FINAL STEP: HYBRID RAG PIPELINE (MAIN EXECUTION)
# ==============================================================================
# DESCRIPTION: The core execution engine of the project. It integrates:
# 1. LLM-Based Query Understanding, Intent Analysis & Keyword Extraction
# 2. Semantic Vector Retrieval (ChromaDB - PDF & KG)
# 3. Structured Graph Retrieval (Neo4j Cypher)
# 4. Context-Aware Response Generation (Llama 3.1)
#
# CONFIGURATION NOTE:
# Default settings use L2 Distance. To switch to Cosine Similarity, 
# uncomment the respective collection names AND the threshold values below.

import torch
import gc
import re
import os
from neo4j import GraphDatabase
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ------------------------------------------------------------------------------
# 1. CONFIGURATION & DATABASE INITIALIZATION
# ------------------------------------------------------------------------------
# Define Paths
# Reuse CHROMA_PATH from an earlier cell when present; otherwise default to
# "<parent of the current working directory>/chroma_db".
if 'CHROMA_PATH' not in locals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
    CHROMA_PATH = os.path.join(ROOT_DIR, "chroma_db")

# Define Credentials (from Environment)
# The username falls back to "neo4j"; the URI and password have no default.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD"))

# Abort early when the URI or the password (NEO4J_AUTH[1]) is missing.
if not NEO4J_URI or not NEO4J_AUTH[1]:
    raise ValueError("[ERROR] Neo4j credentials missing in environment variables.")

# --- METRIC & COLLECTION CONFIGURATION (L2 vs COSINE SWITCH) ---

# OPTION A: L2 Distance (Default - Euclidean)
# Lower score is better.
# NOTE(review): Chroma documents its "l2" space as *squared* L2, which for
# unit-length vectors spans [0, 4] rather than [0, 2]. The embedding functions
# in this cell are also created with only a model name, i.e. without an
# explicit normalization request. Confirm the real distance range of these
# collections before tuning the threshold.
PDF_COLLECTION_NAME = "municipal_pdfs_rag"
KG_COLLECTION_NAME = "kg_structural_rag"
# Hits whose distance exceeds this value are dropped in retrieve_all_embeddings.
SIMILARITY_THRESHOLD = 1.6  # High tolerance for L2

# OPTION B: Cosine Similarity (Uncomment to use)
# Distance = 1 - CosineSimilarity. Lower is better. Range is [0, 2] in
# general, and [0, 1] only while the similarity stays non-negative.
# PDF_COLLECTION_NAME = "municipal_pdfs_cosine"
# KG_COLLECTION_NAME = "kg_structural_cosine"
# SIMILARITY_THRESHOLD = 0.4  # Stricter tolerance for Cosine Distance (approx 0.6 similarity)

# Echo the active configuration so the cell output documents the run.
print(f"[INIT] Connecting to Vector DB at: {CHROMA_PATH}")
print(f"[INIT] Connecting to Graph DB at: {NEO4J_URI}")
print(f"[CONFIG] PDF Collection: {PDF_COLLECTION_NAME}")
print(f"[CONFIG] KG Collection:  {KG_COLLECTION_NAME}")
print(f"[CONFIG] Distance Threshold: {SIMILARITY_THRESHOLD}")

# Open the vector store, both collections and the Neo4j driver. These names
# (pdf_collection, kg_vec_collection, driver) are used by the retrieval
# functions defined further down in this cell.
try:
    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

    # Define Embedding Functions (Must match ingestion models)
    ef_pdf = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    ef_kg = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Initialize Collections
    pdf_collection = chroma_client.get_collection(name=PDF_COLLECTION_NAME, embedding_function=ef_pdf)
    kg_vec_collection = chroma_client.get_collection(name=KG_COLLECTION_NAME, embedding_function=ef_kg)

    driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
    driver.verify_connectivity()
    print("[SUCCESS] All Databases Connected. Narrative Flow Active.")
except Exception as e:
    print(f"[ERROR] Database Connection Failed: {e}")
    # Fail fast, like the authentication and model-loading steps do. If this
    # error were only printed, the rest of the cell would run with the
    # collections and/or driver undefined, and every question would end in a
    # misleading "no relevant facilities" answer.
    raise

# ------------------------------------------------------------------------------
# 2. HELPER FUNCTIONS
# ------------------------------------------------------------------------------
def clean_ontology_text(text):
    """Turn a raw ontology identifier into a human-readable label.

    Steps, in order: drop a leading class code such as ``E53_``, drop a
    trailing numeric suffix such as ``-12``, turn underscores into spaces,
    and remove one trailing generic word (Facility, Area, Place, Object,
    Model, type, Group; case-insensitive) when it follows whitespace.
    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if not text:
        return ""

    without_class_code = re.sub(r'^[A-Z]\d+_', '', text)
    without_numeric_tail = re.sub(r'-\d+$', '', without_class_code)
    spaced = without_numeric_tail.replace('_', ' ')
    # The generic word must be preceded by whitespace, so a bare "Place"
    # is left untouched.
    without_generic_word = re.sub(
        r'(?i)\s(Facility|Area|Place|Object|Model|type|Group)$', '', spaced
    )
    return without_generic_word.strip()

# ------------------------------------------------------------------------------
# 3. DYNAMIC KEYWORD GENERATOR (LLM-BASED INTENT ANALYSIS WITH FEW SHOT LEARNING)
# ------------------------------------------------------------------------------
def generate_dynamic_search_pattern(question: str):
    """
    Ask the LLM for technical keywords and return them as a pipe-joined string.

    A few-shot prompt (examples deliberately taken from unrelated domains)
    asks the model for singular keywords separated by ``|``. The reply is
    lower-cased and split on every non-word character, so each keyword is
    checked on its own: tokens of one or two characters and generic or
    question words are dropped, duplicates are removed, and at most five
    keywords are kept.

    Args:
        question: The user's natural-language question.

    Returns:
        Keywords joined with ``|`` (an empty string when none survive the
        filter), or ``"facility|equipment"`` when the LLM call fails.
    """
    extraction_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a search engine backend. Extract core technical keywords in singular form. 
    Return ONLY keywords separated by pipes (|). Do not include any other text.
    STRICT RULE: Avoid generic words like 'access', 'rules', 'organization', 'policy', 'facility'.
    
    Q: Where is the nearest room with an MRI scanner?
    K: mri_scanner|radiology|room
    
    Q: I need a large venue for a corporate surgery simulation.
    K: surgery|simulation|unit|medical|theatre|hospital

    Q: Which airlock is available for EVA suits?
    K: airlock|eva_suit|pressure_hatch

    Q: I am looking for a specialized hangar for satellite maintenance.
    K: satellite_maintenance|hangar|space|engineering|data

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Q: {question}
    K:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    forbidden = {
        "facility", "facilities", "group", "groups", "workshop", "workshops",
        "based", "find", "looking", "where", "which"
    }

    try:
        output = text_generation_pipeline(extraction_prompt, max_new_tokens=20, do_sample=False)[0]['generated_text']
        raw = output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip().lower()

        # Split on pipes as well as whitespace/punctuation. The previous
        # cleanup kept '|' and then split on whitespace only, so an answer
        # such as "workshop|wood|lathe" stayed a single token and neither the
        # forbidden-word filter nor the five-keyword cap applied to it.
        # '\w' includes the underscore, so compounds like "3d_printer" survive.
        candidates = re.split(r"\W+", raw)

        # dict.fromkeys removes repeats while keeping the model's ordering.
        keywords = list(dict.fromkeys(
            w for w in candidates if len(w) > 2 and w not in forbidden
        ))

        return "|".join(keywords[:5])
    except Exception as e:
        print(f"[WARNING] Keyword extraction failed: {e}")
        return "facility|equipment"

# ------------------------------------------------------------------------------
# 4. MULTI-SOURCE VECTOR RETRIEVAL
# ------------------------------------------------------------------------------
def retrieve_all_embeddings(question, top_k=5):
    """
    Retrieve semantic context from the PDF and KG vector stores.

    Each store is asked for its ``top_k`` nearest documents; hits whose
    distance exceeds SIMILARITY_THRESHOLD are discarded. The two stores are
    queried independently, so a failure in one is reported as a warning and
    leaves the other store's results intact.

    Args:
        question: Query text, embedded by each collection's own embedding
            function.
        top_k: Number of neighbours requested from each collection.

    Returns:
        ``{"pdfs": [...], "kg_vectors": [...]}``. PDF entries are formatted
        as ``"[<e74_group>]: <text>"``; KG entries are the stripped document
        text. Either list may be empty.
    """
    context_data = {"pdfs": [], "kg_vectors": []}

    # --- PDF COLLECTION QUERY ---
    try:
        res_pdf = pdf_collection.query(
            query_texts=[question],
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )

        if res_pdf['documents']:
            for doc, meta, dist in zip(res_pdf['documents'][0], res_pdf['metadatas'][0], res_pdf['distances'][0]):
                if dist > SIMILARITY_THRESHOLD:
                    continue
                # A chunk stored without metadata must not abort the loop.
                group = (meta or {}).get('e74_group', 'General Doc')
                context_data["pdfs"].append(f"[{group}]: {doc.strip()}")
    except Exception as e:
        print(f"[WARNING] Vector Retrieval Error (PDF): {e}")

    # --- KG COLLECTION QUERY ---
    # Separate try block: previously a PDF-side failure skipped this query.
    try:
        res_kg = kg_vec_collection.query(
            query_texts=[question],
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )

        if res_kg['documents']:
            for doc, dist in zip(res_kg['documents'][0], res_kg['distances'][0]):
                if dist > SIMILARITY_THRESHOLD:
                    continue
                context_data["kg_vectors"].append(doc.strip())
    except Exception as e:
        print(f"[WARNING] Vector Retrieval Error (KG): {e}")

    return context_data

# ------------------------------------------------------------------------------
# 5. SEMANTIC HIERARCHY CYPHER SEARCH
# ------------------------------------------------------------------------------
def fetch_kg_facts(regex_pattern: str):
    """
    Query Neo4j for inventory facts that match the extracted keywords.

    The pipe-separated pattern is split into lower-case terms. When any term
    contains a "specific" indicator (wood, 3d, media, meeting, ...), generic
    terms such as "tool" or "equipment" are dropped from the search, unless
    that would leave nothing. The remaining terms are matched by a
    three-branch Cypher UNION:

      1. type hierarchy: E55_Type names containing a term, widened by up to
         two P127_has_broader_term hops;
      2. model chain: the same types reached through an E99_Product_Type;
      3. text: entity names or P3_has_note values containing a term.

    Rows are grouped per (organisation, location, access rule), ordered by
    inventory size and limited to 20 groups.

    Args:
        regex_pattern: Keywords joined with ``|``.

    Returns:
        One English sentence per group describing what the organisation
        provides and under which access rule. Empty when nothing matches,
        when no usable term is given, or when the query fails (the error is
        printed, not raised).
    """
    # Strip BEFORE the length check so padded fragments such as " a " are not
    # accepted as a three-character term.
    raw_terms = [
        term
        for term in (part.strip().lower() for part in (regex_pattern or "").split('|'))
        if len(term) > 2
    ]

    # Specificity Logic
    # All entries are lower case because they are compared against
    # lower-cased terms (the former "3d_Printer" entry could never match).
    specific_indicators = [
        "wood", "woodworking", "carpentry", "lathe", "saw", "drill", "workshop",
        "garden", "gardening", "agriculture", "nature", "outdoor", "farm",
        "textile", "sew", "tailor",
        "3d_printer", "3d", "laser", "cnc", "metal", "milling", "fabrication",
        "audio", "video", "media", "sound", "lab", "studio", "record", "acoustic", "production",
        "meeting", "conference", "seminar", "event", "hall", "social", "civic",
        "community", "gathering", "room", "venue", "lecture", "exhibition",
        "gallery", "stage", "performance"
    ]

    # NOTE(review): "workshop" is listed both as a specific indicator and as
    # a generic term, so it marks a query as specific and is then filtered
    # out again. Kept as-is; confirm this is intended.
    generic_terms = ["tool", "equipment", "machine", "device", "facility", "workshop", "space"]

    # Substring test: an indicator anywhere inside a term counts.
    is_specific = any(s in term for term in raw_terms for s in specific_indicators)

    if is_specific:
        search_terms = [t for t in raw_terms if not any(g in t for g in generic_terms)]
        if not search_terms:
            search_terms = raw_terms
    else:
        search_terms = raw_terms

    print(f"[INFO] Active KG Search Terms: {search_terms}")

    # With no terms every `any(word IN $terms ...)` predicate is false, so
    # the query cannot return rows; skip the database round trip.
    if not search_terms:
        return []

    access_logic = """
    OPTIONAL MATCH (entity)-[:P55_has_current_location]->(direct_loc)
    OPTIONAL MATCH (owner)-[:P52i_is_current_owner_of]->(fac:E53_Place)
    WITH entity, owner, final_type, coalesce(direct_loc.name, fac.name, "General Facility") AS LocName
    OPTIONAL MATCH (restriction:E30_Right)-[:P105_right_held_by]->(owner)
    WITH entity, owner, final_type, LocName,
          coalesce(restriction.name, "Public Access") AS AccessStatus
    """

    return_part = """
    RETURN owner.name AS Org,
           LocName AS Loc,
           entity.name AS Item,
           coalesce(final_type.name, "Equipment") AS Cat,
           coalesce(entity.P3_has_note, "") AS Note,
           AccessStatus AS Access
    """

    cypher = f"""
    // BRANCH 1: HIERARCHY CHAIN
    MATCH (t:E55_Type)
    WHERE any(word IN $terms WHERE toLower(t.name) CONTAINS word)
    MATCH (child_type:E55_Type)-[:P127_has_broader_term*0..2]->(t)
    MATCH (entity)-[:P2_has_type]->(final_type)
    WHERE final_type = child_type
    AND (entity:`E22_Human-Made_Object` OR entity:`E24_Physical_Human-Made_Thing` OR entity:`E25_Human-Made_Feature`)
    MATCH (owner:E74_Group)-[:P52i_is_current_owner_of]->(entity)
    {access_logic}
    {return_part}

    UNION

    // BRANCH 2: MODEL CHAIN
    MATCH (t:E55_Type)
    WHERE any(word IN $terms WHERE toLower(t.name) CONTAINS word)
    MATCH (child_type:E55_Type)-[:P127_has_broader_term*0..2]->(t)
    MATCH (entity)-[:P2_has_type]->(:E99_Product_Type)-[:P2_has_type]->(final_type)
    WHERE final_type = child_type
    MATCH (owner:E74_Group)-[:P52i_is_current_owner_of]->(entity)
    {access_logic}
    {return_part}

    UNION

    // BRANCH 3: TEXT CHAIN
    MATCH (entity)
    WHERE (any(word IN $terms WHERE toLower(entity.name) CONTAINS word)
        OR any(word IN $terms WHERE toLower(entity.P3_has_note) CONTAINS word))
    AND (entity:`E22_Human-Made_Object` OR entity:`E24_Physical_Human-Made_Thing` OR entity:`E25_Human-Made_Feature`)
    OPTIONAL MATCH (entity)-[:P2_has_type*1..2]->(final_type:E55_Type)
    MATCH (owner:E74_Group)-[:P52i_is_current_owner_of]->(entity)
    {access_logic}
    {return_part}
    """

    # Wrap the UNION in a subquery so its rows can be grouped per
    # organisation/location/access rule and ranked by inventory size.
    full_query = f"""
    CALL () {{
        {cypher}
    }}
    WITH Org, Loc, Access, Item, Cat, Note
    WITH Org, Loc, replace(Access, '_', ' ') AS CleanAccess, Item, replace(Cat, '_', ' ') AS CleanCat, Note

    ORDER BY Item
    RETURN Org, Loc, CleanAccess, collect(distinct {{Item: Item, Category: CleanCat, Note: Note}}) as Inventory
    ORDER BY size(Inventory) DESC
    LIMIT 20
    """

    results = []
    try:
        with driver.session() as session:
            # Terms are passed as a query parameter, never interpolated.
            r = session.run(full_query, terms=search_terms)
            for row in r:
                org = clean_ontology_text(row["Org"])
                loc = clean_ontology_text(row["Loc"])
                access = row["CleanAccess"]

                # NOTE: each inventory entry also carries a 'Note' value that
                # is currently not included in the generated sentence.
                items_with_type = []
                for item in row["Inventory"]:
                    i_name = clean_ontology_text(item['Item'])
                    i_cat = clean_ontology_text(item['Category'])

                    # Skip the "(type: ...)" suffix when the item name
                    # already contains its category.
                    if i_cat.lower() in i_name.lower():
                        items_with_type.append(i_name)
                    else:
                        items_with_type.append(f"{i_name} (type: {i_cat})")

                resources_str = ", ".join(items_with_type)

                # Compare names without case or spaces to detect a location
                # that merely repeats the organisation name.
                clean_org = org.lower().replace(" ", "")
                clean_loc = loc.lower().replace(" ", "")

                if clean_loc in clean_org or clean_org in clean_loc:
                    segment = f"The group '{org}' provides the following: {resources_str}. Access rule: {access}."
                else:
                    segment = f"The group '{org}' (located at {loc}) provides the following: {resources_str}. Access rule: {access}."
                results.append(segment)
    except Exception as e:
        # Best effort: report the failure and return whatever was collected.
        print(f"[ERROR] Neo4j Query Failed: {e}")

    return results

# ------------------------------------------------------------------------------
# 6. RESPONSE GENERATION (CONTEXT-AWARE NARRATIVE)
# ------------------------------------------------------------------------------
def generate_full_response(question, pdf_context, kg_vec_context, kg_facts):
    """
    Compose the final narrative answer from all retrieved context.

    When none of the three context sources holds anything, a fixed
    "nothing found" sentence is returned without calling the LLM. Otherwise
    the graph facts (source 1) and the PDF/KG vector passages (source 2) are
    placed into a user prompt, paired with the rule-based system instruction,
    rendered through the tokenizer's chat template and sent to the
    text-generation pipeline with a 1000-token budget for this call.

    Args:
        question: The user's question.
        pdf_context: Passages retrieved from the PDF vector store.
        kg_vec_context: Passages retrieved from the KG vector store.
        kg_facts: Sentences produced by the Cypher search.

    Returns:
        The generated answer with header and end-of-turn markers removed.
    """
    # Guard clause: skip generation entirely when every source is empty.
    if not (pdf_context or kg_vec_context or kg_facts):
        return "Based on the available municipal data, no relevant facilities or workshops were found matching your specific request."

    if kg_facts:
        inventory_block = "\n".join(kg_facts)
    else:
        inventory_block = "NO DIRECT INVENTORY MATCH FOUND IN KG."

    # PDF passages first, then KG vector passages, separated by a newline.
    narrative_block = "\n".join(["\n".join(pdf_context), "\n".join(kg_vec_context)])

    system_instruction = """You are the Nuremberg Municipal Expert. Answer using the provided Context Sources.

YOUR TASK: Analyze the user's INTENT and map it to the Inventory using the following GENERAL LOGIC RULES.

    --- PART 1: LOGICAL REASONING (THE BRAIN) ---
    
    1. EVIDENCE VERIFICATION (The "Specificity" Rule):
       - IF the user asks for a SPECIFIC TOOL or OBJECT: You must find an EXPLICIT mention of that exact object in the source text.
       - A general facility category (e.g., "Workshop", "Makerspace") is NOT sufficient proof that they possess a specific device.
       - CONSEQUENCE: If the text does not explicitly list the requested item, DISCARD THE FACILITY immediately. Do not mention it.
       
    2. FUNCTIONAL COMPATIBILITY (The "Context" Rule):
       - Analyze the nature of the requested activity (e.g., Social Gathering vs. Industrial Production).
       - Ensure the recommended facility's primary environment matches this activity.
       - EXCLUSION: Do not recommend noise-heavy or industrial environments for social/quiet activities unless they explicitly list a dedicated event space.

    3. RELEVANCE FILTERING (The "Focus" Rule):
       - IF the user asks for a specific category (e.g., "Avionics or Flight Instruments"), ONLY mention items within that technical domain.
       - DO NOT mention unrelated subsystems such as landing gear, engine components, or cabin interior, even if they are located in the same hangar or facility.
       - SILENTLY OMIT all unrelated inventory parts to maintain strict focus on the user's intent.
       - If a facility offers a diverse inventory, ONLY extract and mention the items relevant to the user's current specific question.
       - SILENTLY OMIT unrelated departments or tools to keep the answer focused.
       - NEVER mention about additional information about unrelated fields. If user asks something about flight, you should only strict to the question. 

    --- PART 2: FORMATTING & STYLE (THE MOUTH) ---

    4. NO LISTS: 
       - Do NOT use bullet points. Write in continuous, flowing, natural paragraphs.

    5. NO REDUNDANCY: 
       - Avoid repetitive phrasing regarding locations. 
       - If the Organization Name is identical or highly similar to the Location Name, do not state the location separately.

    6. ENTITY CONSOLIDATION:
       - If the source data contains multiple segments referring to the same organization, SYNTHESIZE them into a single, coherent description. 
       - Do not write separate sentences or paragraphs for the same entity.

    7. ACCESS TRANSLATION:
       - Convert raw access tags into natural language statements (e.g., "It is open to the public" instead of "Public Access").

    8. NO RAW METADATA:
       - Identify and remove internal tags (e.g., [Type: ...], [Model: ...]). 
       - Incorporate the information naturally into the sentence structure without using brackets.
       
    9. MANDATORY OWNERSHIP STRUCTURE:
       - You must ALWAYS present the data in a Parent-Child hierarchy.
       - Every specific location, facility, or room must be explicitly linked to its owning Organization or Group.
       - NEVER mention a sub-facility in isolation.
       - Preferred phrasing: "[Organization Name]'s [Facility Name]" or "The [Facility Name] provided by [Organization Name]".
    """

    user_prompt = f"""
    === SOURCE 1: VERIFIED KG INVENTORY ===
    {inventory_block}

    === SOURCE 2: CONTEXT (PDFs & Vectors) ===
    {narrative_block}

    === USER QUESTION ===
    {question}

    Answer in flowing paragraphs based on the RULES. DO NOT use single or double quotation marks around organization names, facilities, or tools. Treat them as proper nouns within the flow. NO BULLET POINTS. NO SUMMARIES."""

    chat = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": user_prompt},
    ]

    # Render the chat into a single prompt string that ends with the
    # assistant header, so the model continues with its answer.
    rendered_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    generations = text_generation_pipeline(rendered_prompt, max_new_tokens=1000)
    generated = generations[0]['generated_text']

    # Keep only the text after the last header marker and drop end-of-turn
    # tokens, in case any special tokens appear in the output.
    answer_tail = generated.split("<|end_header_id|>")[-1]
    return answer_tail.replace("<|eot_id|>", "").strip()

# ------------------------------------------------------------------------------
# 7. MAIN EXECUTION LOOP
# ------------------------------------------------------------------------------
# Competency questions for the end-to-end run; keys are labels used in the log.
questions = {
    "Q1": "Which groups provide workshops or tools that support woodworking activities?",
    "Q2": "Which publicly accessible spaces are suitable for hosting small public meetings?",
    "Q3": "Which places support digitizing personal stories and materials, and producing media (like short films or podcasts)?",
    "Q4": "Which municipal facilities are suitable for a 'Repair' or 'Maintenance' project ?",
    "Q5": "Which facilities or groups provide 3D printers or 3D printing workshops?"
}

print("\n" + "="*80)
print("STARTING HYBRID RAG EXECUTION CYCLE")
print("="*80)

# Each question runs the full pipeline: keywords -> vector retrieval ->
# graph retrieval -> answer generation.
for q_id, q_text in questions.items():
    print(f"\n{'-'*80}")
    print(f"[QUERY] {q_id}: {q_text}")
    print(f"{'-'*80}")
    
    # Release Python garbage and cached CUDA memory before each question.
    gc.collect()
    torch.cuda.empty_cache()

    # 1. Intent Analysis & Keyword Extraction
    # `regex` is a pipe-separated keyword string, e.g. "wood|lathe".
    regex = generate_dynamic_search_pattern(q_text)
    print(f"[INFO] Search Pattern: [{regex}]")

    # 2. Vector Retrieval
    # Returns {"pdfs": [...], "kg_vectors": [...]}.
    embeddings_data = retrieve_all_embeddings(q_text, top_k=5)

    # 3. Graph Retrieval
    # One sentence per organisation/location group with matching inventory.
    structured_facts = fetch_kg_facts(regex)
    print(f"[INFO] Graph Found: {len(structured_facts)} organizations with relevant inventory.")

    # 4. Generation
    answer = generate_full_response(
        q_text,
        embeddings_data['pdfs'],
        embeddings_data['kg_vectors'],
        structured_facts
    )
    
    print(f"\n[RESULT] Final Answer:\n{answer}")

print("\n" + "="*80)
print("EXECUTION COMPLETE")
print("="*80)