In [None]:
import os 


In [28]:
from llama_index.core.settings import Settings
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.retrievers import VectorCypherRetriever
from llama_index.llms.google_genai import GoogleGenAI
from pydantic import BaseModel, HttpUrl,Field
from typing import List, Optional
from llama_index.core.program import LLMTextCompletionProgram

In [5]:
llm_gemini = GoogleGenAI(model="gemini-1.5-pro")

In [6]:
import asyncio
from neo4j import AsyncGraphDatabase

NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="password"
NEO4J_URI="bolt://localhost:7687"

driver = AsyncGraphDatabase.driver(
    NEO4J_URI, 
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
)

## Helper Function 

In [50]:
import math

from llama_index.embeddings.fastembed import FastEmbedEmbedding

embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

def get_embedding(text: str) -> list[float]:
    if not text or not text.strip():
        return []

    embedding = embed_model.get_text_embedding(text)

    if not isinstance(embedding, list):
        try:
            embedding = embedding.tolist()
        except Exception as e:
            return []

    if not all(isinstance(x, (float, int)) and math.isfinite(x) for x in embedding):
        return []

    return embedding


In [51]:
embedded = get_embedding("text")
print(len(embedded))

384


In [66]:
async def list_all_vector_indexses():
    async with driver.session() as session:
        result = await session.run("SHOW VECTOR INDEXES YIELD name RETURN name")
        index_names = [record["name"] async for record in result]
    return index_names

In [8]:
async def drop_all_vector_indexes():
    async with driver.session() as session:
        result = await session.run("SHOW VECTOR INDEXES YIELD name RETURN name")
        index_names = [record["name"] async for record in result]

        for name in index_names:
            print(f"Dropping index: {name}")
            await session.run(f"DROP INDEX {name}")


In [42]:
await drop_all_vector_indexes()

Dropping index: class_embedding_description_index
Dropping index: class_embedding_docstring_index
Dropping index: class_embedding_namecontent_index
Dropping index: file_embedding_content_index
Dropping index: file_embedding_description_index
Dropping index: file_embedding_name_index
Dropping index: file_embedding_summary_index
Dropping index: folder_embedding_content_index
Dropping index: folder_embedding_name_index
Dropping index: method_embedding_description_index
Dropping index: method_embedding_docstring_index
Dropping index: method_embedding_namecontent_index
Dropping index: repository_embedding_content_index
Dropping index: script_embedding_content_index
Dropping index: script_embedding_description_index
Dropping index: script_embedding_name_index


## Graph Rag 

### Step 3: **Route Based on Intent Category**

Here‚Äôs how your system can **automate intent recognition** and query routing based on question structure:

---

## üß† Updated Question Type Categories with Matching Logic

| Category                | Example Question                                  | Extract ‚Üí Match Label         | Action                                      |
|-------------------------|---------------------------------------------------|-------------------------------|---------------------------------------------|
| **File/Folder Location**| Where is `utils.py`?                              | `File`, `Folder`              | Return matched node(s)                      |
| **Function Purpose**    | What does `sendEmail()` do?                       | `Function`                    | Return `.description` or `.content`         |
| **File Purpose**        | What is the purpose of `Makefile`?                | `File`                        | Return `.description` or `.content`         |
| **Import Relations**    | What files does `main.py` import?                 | `File`                        | Traverse `[:IMPORTS]`                       |
| **File Hierarchy**      | What files are in `services` folder?              | `Folder`                      | Traverse `[:CONTAINS]`                      |
| **Usage Context**       | How is `AuthService` used?                        | `Class`, `Function`           | Trace `[:CALLS]`, `[:USES]`                 |
| **Similarity Search**   | Which file is similar to `auth.js`?               | `File`                        | Use embedding + vector similarity           |
| **Modification History**| Who last changed `LoginController`?               | `File`, `Class`               | (Requires Git metadata node, if available)  |

---

## Tool (1) -  Extract Entity 

In [29]:

class EntitySchema(BaseModel):
    File: Optional[List[str]] = Field(default_factory=list)
    Folder: Optional[List[str]] = Field(default_factory=list)
    Method: Optional[List[str]] = Field(default_factory=list)
    Class: Optional[List[str]] = Field(default_factory=list)

entity_prompt = """
You are a smart assistant helping to extract meaningful codebase entities from user questions.

Extract all **code-related entities** from the user's question and classify them under the correct category based on context.

üè∑Ô∏è Categories to identify:
- **File**: A file in the codebase (e.g., `main.py`, `makefile`, `userService.js`)
- **Folder**: A directory or module (e.g., `utils`, `services`, `auth`)
- **Class**: A class name used in the codebase (e.g., `UserService`, `AuthManager`)
- **Method**: A method or function (e.g., `validateUser`, `getToken`, `send_email`)

üß† Interpretation Notes:
- Classify based on naming clues and usage context.
- A `method` inside a file should **not** be classified as a file.

‚ú≥Ô∏è Output JSON schema format:
{
  "File": [],
  "Folder": [],
  "Method": [],
  "Class": [],
}

‚û°Ô∏è Only return real or likely code entities ‚Äî don‚Äôt guess. If uncertain, leave the field empty.

User Qusetion : 
{input_text}
"""


entity_extractor = LLMTextCompletionProgram.from_defaults(
    output_cls=EntitySchema,
    prompt_template_str=entity_prompt,
    llm=llm_gemini,
)


In [None]:
def extract_entities(text:str):
    return entity_extractor(input_text=text).model_dump()

In [None]:

entities = extract_entities(text="What does the send email method in the notification folder do?")
print(entities)


File=[] Folder=['notification'] Method=['send email'] Class=[]


In [33]:
entities = extract_entities(text="what the purpose for makefile in backend folder?")
print(entities)

File=['makefile'] Folder=['backend'] Method=[] Class=[]


In [34]:
entities = extract_entities(text="what service define in docker compose file ?")
print(entities)

File=['docker-compose.yml'] Folder=[] Method=[] Class=[]


## Tool (2) Search using Keyword 

In [39]:
async def fuzzy_match_by_label(session, label: str, target_name: str, max_distance: int = 4):
    cypher = f"""
    MATCH (n:{label})
    WITH n, apoc.text.levenshteinDistance(n.name, $target_name) AS distance
    WHERE distance <= $max_distance
    RETURN  n.name AS name, 
            n.description AS description,
            n.content As content, distance
    ORDER BY distance ASC
    LIMIT 5
    """
    result = await session.run(cypher, {
        "target_name": target_name,
        "max_distance": max_distance
    })
    return [record async for record in result]


In [36]:
# Match File node names
session  = driver.session()
files = await fuzzy_match_by_label(session, "File", "Makefile")
print(files)



[<Record name='Makefile' path='delivery-ocr/Makefile' description=None content='config ?= compile\n\ndev:\n\tdocker-compose -f docker-compose.yaml up --build --remove-orphans\n\n\nstop:\n\tdocker-compose stop\n\nlogs:\n\tdocker-compose logs -f --tail 50\n\ntest:\n\tdocker-compose run --rm app pytest\n\n' distance=0>, <Record name='Dockerfile' path='delivery-ocr/api/Dockerfile' description=None content='# Use an official Python runtime as a base image\nFROM python:3.12-slim\n\n# Install curl and necessary packages, including libgit2-dev for pygit2\nRUN apt-get update && \\\n    apt-get install -y curl build-essential libpq-dev libgit2-dev\n\n## install minio client (mc)\nRUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \\\n    --create-dirs \\\n    -o /usr/local/bin/mc && \\\n    chmod +x /usr/local/bin/mc\n\n# Install Poetry using the official installer and add it to PATH\nRUN curl -sSL https://install.python-poetry.org | python3 -\n    ENV PATH="/root/.local/bin:${PATH}"\n\

In [60]:

# Match Folder node names
folders = await fuzzy_match_by_label(session, "Folder", "api")
print(folders)



[<Record name='api' description=None content=None distance=0>, <Record name='src' description=None content=None distance=3>, <Record name='core' description=None content=None distance=4>, <Record name='docs' description=None content=None distance=4>]


In [40]:
# Match method node names
folders = await fuzzy_match_by_label(session, "Method", "save file")
print(folders)



[<Record name='save_file' description='Saves an uploaded file to the specified directory.' content='def save_file(file: UploadFile) -> str:\n    """save file in the specified directory."""\n    \n    file_path = os.path.join(config.MAIN_DIR, file.filename)\n    with open(file_path, "wb") as buffer:\n        shutil.copyfileobj(file.file, buffer)\n    \n    return file_path' distance=1>]


## Tool ( 3 ) - Key word name Similarity Match 

In [47]:
async def similar_by_name(session, label: str, target_name: str = None, top_k: int = 5):
    name_embedding = get_embedding(target_name)  

    cypher = f"""
    CALL db.index.vector.queryNodes('{label.lower()}_embedding_name_index', $top_k, $embedding)
    YIELD node, score
    {f"WHERE '{label}' IN labels(node)" if label else ""}
    RETURN node.name AS name,node.description AS description,node.content AS content, score
    ORDER BY score DESC
    """

    result = await session.run(cypher, {
        "embedding": name_embedding,
        "top_k": top_k
    })
    return [record async for record in result]


In [52]:
# Match File node names
session  = driver.session()
files = await similar_by_name(session, "File", "Makefile")
print(files)

[<Record name='Makefile' description=None content='config ?= compile\n\ndev:\n\tdocker-compose -f docker-compose.yaml up --build --remove-orphans\n\n\nstop:\n\tdocker-compose stop\n\nlogs:\n\tdocker-compose logs -f --tail 50\n\ntest:\n\tdocker-compose run --rm app pytest\n\n' score=0.9995803833007812>, <Record name='Dockerfile' description=None content='# Use an official Python runtime as a base image\nFROM python:3.12-slim\n\n# Install curl and necessary packages, including libgit2-dev for pygit2\nRUN apt-get update && \\\n    apt-get install -y curl build-essential libpq-dev libgit2-dev\n\n## install minio client (mc)\nRUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \\\n    --create-dirs \\\n    -o /usr/local/bin/mc && \\\n    chmod +x /usr/local/bin/mc\n\n# Install Poetry using the official installer and add it to PATH\nRUN curl -sSL https://install.python-poetry.org | python3 -\n    ENV PATH="/root/.local/bin:${PATH}"\n\n## Create and set the working directory\nWORKDIR 

In [56]:

# Match Folder node names
folders = await similar_by_name(session, "Folder", "backend")
print(folders)

[<Record name='api' description=None content=None score=0.8481512069702148>, <Record name='main_dir' description=None content=None score=0.8387393951416016>, <Record name='core' description=None content=None score=0.8382654190063477>, <Record name='docs' description=None content=None score=0.814784049987793>, <Record name='service' description=None content=None score=0.8140244483947754>]


In [58]:
# Match method node names
folders = await similar_by_name(session, "Method", "file saved")
print(folders)

[<Record name='save_file' description='Saves an uploaded file to the specified directory.' content='def save_file(file: UploadFile) -> str:\n    """save file in the specified directory."""\n    \n    file_path = os.path.join(config.MAIN_DIR, file.filename)\n    with open(file_path, "wb") as buffer:\n        shutil.copyfileobj(file.file, buffer)\n    \n    return file_path' score=0.9465250968933105>, <Record name='save_image' description='Loads an image from image_path and saves it to output_path.' content='def save_image(image_path, output_path):\n    image = load_image(image_path)\n    image.save(output_path)\n    return output_path' score=0.9046821594238281>, <Record name='save_file_async' description='Asynchronously saves an uploaded file to the MAIN_DIR.' content='async def save_file_async(file: UploadFile) -> str:\n    """Save uploaded file asynchronously to MAIN_DIR."""\n    os.makedirs(config.MAIN_DIR, exist_ok=True)\n    path = os.path.join(config.MAIN_DIR, file.filename)\n\n  

## Tool (4) - Similiraty Search File 

In [100]:
async def rag_query(session, label, query: str, top_k: int = 5):
    embedding = get_embedding(query)

    indexes = [
        f"{label.lower()}_embedding_description_index",
        f"{label.lower()}_embedding_content_index"
    ]

    combined_results = []

    for index in indexes:
        cypher = """
        CALL db.index.vector.queryNodes($index_name, $top_k, $embedding)
        YIELD node, score
        RETURN node.name AS name,
               node.description AS description,
               node.content AS content,
               labels(node) AS labels,
               score
        ORDER BY score DESC
        """

        result = await session.run(cypher, {
            "index_name": index,
            "embedding": embedding,
            "top_k": top_k
        })

        combined_results.extend([record async for record in result])

    # Optional: deduplicate by file name or pathseen = set()
    seen=set()
    deduped = []
    for record in sorted(combined_results, key=lambda r: r["score"], reverse=True):
        key = record.get("path") or record.get("name")
        if key and key not in seen:
            seen.add(key)
            deduped.append(record)

    return deduped[:top_k]


## Rag Over File

In [None]:
session  = driver.session()
await rag_query(session=session, label="File", query="How is logging handled?")

[<Record name='logger_config.py' description=None content='import logging.config\n\nLOGGING_CONFIG = {\n    "version": 1,\n    "disable_existing_loggers": False,\n    "formatters": {\n        "default": {\n            "format": "[%(asctime)s] %(levelname)s in %(module)s: %(message)s",\n        },\n        "detailed": {\n            "format": "[%(asctime)s] %(levelname)s [%(name)s:%(lineno)d] %(message)s",\n        },\n    },\n    "handlers": {\n        "console": {\n            "class": "logging.StreamHandler",\n            "formatter": "default",\n            "level": "INFO",\n        },\n        "file": {\n            "class": "logging.FileHandler",\n            "filename": "app.log",\n            "formatter": "detailed",\n            "level": "INFO",\n        },\n    },\n    "loggers": {\n        "": {  # root logger\n            "handlers": ["console", "file"],\n            "level": "INFO",\n        },\n        "uvicorn.error": {\n            "level": "INFO",\n            "handlers

In [102]:
await rag_query(session=session, label="File", query="Where do we use prompt templates?")

[<Record name='pyproject.toml' description=None content='[tool.poetry]\nname = "delivery-ocr-service"\nversion = "0.1.0"\ndescription = "A FastAPI service for delivery OCR using Google Gemini"\nauthors = ["Younis Bashir (AI Engineer)"]\nreadme = "README.md"\n\n[tool.poetry.dependencies]\npython = "^3.12"\npydantic = "^2.11.3"\nfastapi = "^0.115.12"\nuvicorn = "^0.34.0"\nasyncpg = "^0.30.0"\npydantic-settings = "^2.8.1"\nfsspec = "^2025.3.2"\npolygon = "^1.2.6"\ngoogle-generativeai = "^0.8.4"\naiofiles = "^24.1.0"\npython-multipart = "^0.0.20"\ngoogle = "^3.0.0"\npillow = "^11.2.1"\ngoogle-genai = "^1.10.0"\n[build-system]\nrequires = ["poetry-core"]\nbuild-backend = "poetry.core.masonry.api"\n' labels=['File'] score=0.836176872253418>,
 <Record name='prompt.py' description=None content='DELIVERY_PROMPT = """You are provided with an image of a delivery summary screen from a mobile application.\n\nExtract the following structured information from the image and return it as a JSON object:

In [104]:
await rag_query(session=session, label="File", query="where developer define helper or utiles   ")


[<Record name='docker-compose.yaml' description=None content='version: "3.9"\n\nservices:\n\n  app:\n    build:\n      context: ./api\n      dockerfile: Dockerfile\n    container_name: ocr-delivery\n    command: "./dev_start.sh"\n    ports:\n      - "8000:8000"\n    env_file:\n      - ./api/.env\n    volumes:\n      - ./api/src:/app/src\n      - ./main_dir:/app/main_dir \n\n\n  ' labels=['File'] score=0.8275012969970703>,
 <Record name='delivery_ocr.ipynb' description=None content='{\n "cells": [\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "# Delivery OCR"\n   ]\n  },\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Define Schema"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 38,\n   "metadata": {},\n   "outputs": [],\n   "source": [\n    "from pydantic import BaseModel\\n",\n    "from typing import List, Optional, Union\\n",\n    "\\n",\n    "\\n",\n    "class DeliveryItem(BaseModel):\\n",\n    "    delive

## RAG Class and Method

In [111]:
await rag_query(session=session, label="Method", query="how user upload image?")

[<Record name='process_delivery_image' description='Processes a delivery image by saving, loading, and extracting information using OCR.' content='async def process_delivery_image(file: UploadFile) -> DeliverySummary:\n    """Handles full OCR pipeline: save, load, extract."""\n    image_path = await save_file_async(file)\n    image = load_image(image_path)\n    response = genai_client.models.generate_content(\n        model = config.GENERATIVE_MODEL,\n        contents = [DELIVERY_PROMPT, image],\n        config={\n            \'response_schema\': DeliverySummary,\n            \'response_mime_type\': \'application/json\'\n        })\n    return response.parsed' labels=['Method'] score=0.8624238967895508>,
 <Record name='delivery_ocr' description='This asynchronous function handles the delivery OCR endpoint. It receives an image file, processes it to extract delivery information, and returns the extracted data as a JSON response. It also includes error handling for OCR processing failure

## Tool (6) - Dependency Tool 

In [121]:
async def get_depend(filename: str, direction: str = "out"):
        """
        direction: 'out' (default) ‚Üí what this file depends on
                   'in'            ‚Üí what files depend on this
        """
        if direction == "out":
            cypher = """
            MATCH (f:File {name: $filename})-[:RELATED_TO]->(dep:File)
            RETURN dep.name AS name, dep.path AS path
            """
        else:
            cypher = """
            MATCH (f:File {name: $filename})<-[:RELATED_TO]-(dependent:File)
            RETURN dependent.name AS name, dependent.path AS path
            """

        result = await session.run(cypher, {"filename": filename})
        return [record async for record in result]

In [122]:
# What files does main.py depend on?
depends_on = await get_depend("main.py", direction="out")
print(depends_on)

# What files depend on auth.py?
used_by = await get_depend("main.py", direction="in")
print(used_by)

[<Record name='utils.py' path='delivery-ocr/api/src/service/utils.py'>, <Record name='ocr_delivery.py' path='delivery-ocr/api/src/service/ocr_delivery.py'>, <Record name='logger_config.py' path='delivery-ocr/api/src/core/logger_config.py'>]
[]


## Tool (7) Path  Finding

In [123]:
async def find_path_between_nodes_by_label(
    session,
    start_label: str,
    start_name: str,
    end_label: str,
    end_name: str,
    max_depth: int = 5,
    relationship_filter: str = "*"
):
    """Finds the shortest path between two nodes of any label via any relationship."""
    cypher = f"""
    MATCH path = shortestPath(
        (start:{start_label} {{name: $start_name}})-[:{relationship_filter}..{max_depth}]-(end:{end_label} {{name: $end_name}})
    )
    RETURN nodes(path) AS nodes, relationships(path) AS relationships
    """

    result = await session.run(cypher, {
        "start_name": start_name,
        "end_name": end_name
    })

    paths = []
    async for record in result:
        node_path = [n["name"] for n in record["nodes"]]
        rel_path = [r.type for r in record["relationships"]]
        paths.append({
            "nodes": node_path,
            "relationships": rel_path
        })
    return paths


## Tool (8) 

In [124]:
async def get_node_relationships_by_label(
    session,
    label: str,
    name: str,
    direction: str = "both",        # 'out', 'in', or 'both'
    rel_type: str = None,           # e.g., 'RELATED_TO', 'IMPORTS'
    limit: int = 50
):
    """
    Fetches relationships of a node with the given label and name,
    optionally filtered by relationship type.

    Params:
    - label: Node label ('File', 'Class', 'Method', etc.)
    - name: Node name
    - direction: 'out', 'in', or 'both'
    - rel_type: Optional filter for relationship type
    - limit: Maximum number of records to return

    Returns:
    - List of relationships with direction, type, target name, target labels
    """

    rel_filter = f":{rel_type}" if rel_type else ""

    if direction == "out":
        cypher = f"""
        MATCH (n:{label} {{name: $name}})-[r{rel_filter}]->(m)
        RETURN type(r) AS rel_type, labels(m) AS target_labels, m.name AS target_name
        LIMIT $limit
        """
    elif direction == "in":
        cypher = f"""
        MATCH (m)-[r{rel_filter}]->(n:{label} {{name: $name}})
        RETURN type(r) AS rel_type, labels(m) AS target_labels, m.name AS target_name
        LIMIT $limit
        """
    else:  # both
        cypher = f"""
        MATCH (n:{label} {{name: $name}})
        OPTIONAL MATCH (n)-[r1{rel_filter}]->(m1)
        OPTIONAL MATCH (m2)-[r2{rel_filter}]->(n)
        RETURN 
            type(r1) AS out_rel, labels(m1) AS out_labels, m1.name AS out_name,
            type(r2) AS in_rel, labels(m2) AS in_labels, m2.name AS in_name
        LIMIT $limit
        """

    result = await session.run(cypher, {"name": name, "limit": limit})

    relationships = []

    async for record in result:
        if direction in ("out", "both") and record.get("out_rel"):
            relationships.append({
                "direction": "out",
                "relationship_type": record["out_rel"],
                "target_name": record["out_name"],
                "target_labels": record["out_labels"],
            })
        if direction in ("in", "both") and record.get("in_rel"):
            relationships.append({
                "direction": "in",
                "relationship_type": record["in_rel"],
                "target_name": record["in_name"],
                "target_labels": record["in_labels"],
            })
        if direction in ("out", "in") and record.get("rel_type"):  # for in/out only queries
            relationships.append({
                "direction": direction,
                "relationship_type": record["rel_type"],
                "target_name": record["target_name"],
                "target_labels": record["target_labels"],
            })

    return relationships


In [127]:
# Get all 'RELATED_TO' relationships for main.py
rels = await get_node_relationships_by_label(session, "File", "main.py", rel_type="RELATED_TO")
print(rels)

# Get all imports from main.py
imports = await get_node_relationships_by_label(session, "File", "main.py", direction="in", rel_type="RELATED_TO")
print(imports)

[{'direction': 'out', 'relationship_type': 'RELATED_TO', 'target_name': 'logger_config.py', 'target_labels': ['File']}, {'direction': 'out', 'relationship_type': 'RELATED_TO', 'target_name': 'ocr_delivery.py', 'target_labels': ['File']}, {'direction': 'out', 'relationship_type': 'RELATED_TO', 'target_name': 'utils.py', 'target_labels': ['File']}]
[]


# Agents 

## Dependency Agent

## Core Agent