In [1]:
import pandas as pd
from neo4j import GraphDatabase
import ast
import os
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.retrievers import Text2CypherRetriever
from neo4j_graphrag.retrievers import VectorCypherRetriever
from neo4j_graphrag.types import RetrieverResultItem
from sentence_transformers import SentenceTransformer

In [2]:
df = pd.read_csv('rules_chunks_with_clusters.csv')
df.head()

Unnamed: 0,chunk_id,text,rule_numbers,keywords,cluster_group_list
0,1,GR - GENERAL REGULATIONS GR.1 FORMULA SAE COMP...,"[""EV.5.2"", ""GR.1.2.3"", ""GR.1.3"", ""GR.1.4"", ""GR...","[""Build"", ""Competition"", ""Demonstration"", ""Des...","[""Cluster 447"", ""Cluster 214"", ""Cluster 12"", ""..."
1,2,GR.1.4.2 The vehicle should have high performa...,"[""GR.1.4.2"", ""GR.1.4.3"", ""GR.1.4.4"", ""GR.1.5"",...","[""Dynamic events"", ""Static events"", ""aesthetic...","[""Cluster 56"", ""Cluster 63"", ""Cluster 219"", ""C..."
2,3,GR.2.4 Restriction on Vehicle Use SAE Internat...,"[""GR.1.2.3"", ""GR.3.1"", ""GR.3.2"", ""GR.3.3"", ""GR...","[""Competition organizers"", ""Competition site"",...","[""Cluster 105"", ""Cluster 214"", ""Cluster 87"", ""..."
3,4,GR.3.5.2 If a team is not present and ready to...,"[""GR.3.5.2"", ""GR.3.5.3"", ""GR.4.1"", ""GR.4.2.1"",...","[""Competition"", ""Competition year"", ""Draft rul...","[""Cluster 214"", ""Cluster 65"", ""Cluster 87"", ""C..."
4,5,GR.4.4 Rules Compliance GR.4.4.1 All participa...,"[""GR.4.4"", ""GR.4.4.1"", ""GR.4.4.2"", ""GR.4.4.3"",...","[""FSAE Online Website"", ""Formula SAE Rules"", ""...","[""Cluster 231"", ""Cluster 87"", ""Cluster 48"", ""C..."


In [3]:
df['rule_numbers'] = df['rule_numbers'].apply(ast.literal_eval)
#df['keywords'] = df['keywords'].apply(ast.literal_eval)
df['cluster_group_list'] = df['cluster_group_list'].apply(ast.literal_eval)

In [4]:
# Load cluster text file into a dictionary
def parse_clusters(filepath):
    cluster_map = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                cluster_id, keywords = line.strip().split(":", 1)
                cluster_map[cluster_id.strip()] = keywords.strip()
    return cluster_map

cluster_dict = parse_clusters("keyword_clusters.txt")

In [5]:
# Neo4j connection
URI = "neo4j+s://199c0626.databases.neo4j.io"
AUTH = ("neo4j", "HtZx5WYb6ZmbFHKwzPbemoURqv3jsVdpdi988ZjoQns")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [6]:
# graph schema creation (with keyword embeddings)
def create_text_rule_cluster_graph(tx, text, rules, cluster_id, cluster_keywords, cluster_embedding):
    query = """
    MERGE (t:Text {content: $text})
    MERGE (c:Cluster {name: $cluster_id})
    SET c.keywords_text = $cluster_keywords,
        c.keywords_embedding = $cluster_embedding
    MERGE (t)-[:HAS_KEYWORDS_IN]->(c)
    WITH t
    UNWIND $rules AS rule_number
        MERGE (r:Rule {rule_number: rule_number})
        MERGE (t)-[:CONTAINS_RULE]->(r)
    """
    tx.run(query,
           text=text,
           rules=rules,
           cluster_id=cluster_id,
           cluster_keywords=cluster_keywords,
           cluster_embedding=cluster_embedding)

In [7]:
# Load embedding model (1024 dimension)
embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [43]:
# Create embedding dictionary for all clusters
cluster_embedding_dict = {}
for cluster_id, keywords_text in cluster_dict.items():
    emb = embedding_model.encode(keywords_text, normalize_embeddings=True)
    cluster_embedding_dict[cluster_id] = emb.tolist()  # Neo4j stores as list
    print(f"Cluster {cluster_id} embedding created.")
    print("keywords_text:", keywords_text)

Cluster Cluster 1 embedding created.
keywords_text: System Sealing, fuel systems, cooling system, Intake System Location, Tooling Cost, GLV System, GLV System Ground, intake runner system, Systems, Cooling system, venting systems, Fuel system, Air Intake System, fuel delivery system, Fuel System, Liquid Cooling Systems, glv
Cluster Cluster 2 embedding created.
keywords_text: Measurement accuracy, measurement accuracy, measurement tools, Insulation Measurement Test, Measurement
Cluster Cluster 3 embedding created.
keywords_text: Grounded Low Voltage, Voltage Limit, high voltage, High Voltage systems, Grounded Low Voltage Master Switch, Tractive System voltage, low voltage, High Voltage Present, Low Voltage, Voltage, Grounded Low Voltage System, Voltage Indicator, voltage, High Voltage, System Voltage
Cluster Cluster 4 embedding created.
keywords_text: waitlist
Cluster Cluster 5 embedding created.
keywords_text: Design Briefing, design, Design Event, Designated officials, Designated area

In [44]:
# graph creation
with driver.session() as session:
    for idx, row in df.iterrows():
        text = row['text']
        rules = row['rule_numbers']
        cluster_list = row['cluster_group_list']

        for cluster_id in cluster_list:
            #print(cluster_id)
            cluster_keywords = cluster_dict.get(cluster_id, "")
            cluster_embedding = cluster_embedding_dict.get(cluster_id, [])
            session.execute_write(
                create_text_rule_cluster_graph,
                text,
                rules,
                cluster_id,
                cluster_keywords,
                cluster_embedding
            )

print("✅ Neo4j graph with Cluster nodes and relationships successfully created.")
#driver.close()

Cluster 447
Cluster 214
Cluster 12
Cluster 5
Cluster 49
Cluster 59
Cluster 87
Cluster 239
Cluster 181
Cluster 253
Cluster 172
Cluster 16
Cluster 165
Cluster 72
Cluster 56
Cluster 63
Cluster 219
Cluster 135
Cluster 39
Cluster 394
Cluster 49
Cluster 239
Cluster 7
Cluster 197
Cluster 441
Cluster 447
Cluster 105
Cluster 214
Cluster 87
Cluster 131
Cluster 220
Cluster 65
Cluster 94
Cluster 173
Cluster 72
Cluster 214
Cluster 65
Cluster 87
Cluster 48
Cluster 105
Cluster 415
Cluster 149
Cluster 231
Cluster 87
Cluster 48
Cluster 214
Cluster 178
Cluster 112
Cluster 324
Cluster 184
Cluster 166
Cluster 172
Cluster 402
Cluster 323
Cluster 130
Cluster 392
Cluster 422
Cluster 322
Cluster 103
Cluster 25
Cluster 359
Cluster 201
Cluster 243
Cluster 64
Cluster 406
Cluster 366
Cluster 196
Cluster 214
Cluster 244
Cluster 5
Cluster 72
Cluster 245
Cluster 347
Cluster 306
Cluster 377
Cluster 243
Cluster 43
Cluster 173
Cluster 148
Cluster 387
Cluster 227
Cluster 87
Cluster 131
Cluster 82
Cluster 143
Cluster 112

## GraphRAG

In [51]:
# Connect to Neo4j database
URI = "neo4j+s://199c0626.databases.neo4j.io"
AUTH = ("neo4j", "HtZx5WYb6ZmbFHKwzPbemoURqv3jsVdpdi988ZjoQns")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

driver = GraphDatabase.driver(URI, auth=AUTH)

Rule extraction

In [50]:
# LLM
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

my_token = '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'
model_name = r"C:\Software\Model\llama3\Llama-3.2-11B-Vision-Instruct"  # use "Instruct" not "Vision" for text-only

model = AutoModelForCausalLM.from_pretrained(model_name, token=my_token, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=my_token)

hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

KeyboardInterrupt: 

In [53]:
# Add the required import at the top
from neo4j_graphrag.llm.types import LLMResponse

class HuggingFaceLLM:
    def __init__(self, pipeline, max_new_tokens=300):
        self.pipeline = pipeline
        self.max_new_tokens = max_new_tokens

    def invoke(self, prompt) -> LLMResponse:
        # Handle both string prompts and message formats
        if isinstance(prompt, str):
            formatted_prompt = prompt
        elif isinstance(prompt, list):
            # Convert messages to a single prompt
            formatted_prompt = ""
            for msg in prompt:
                if hasattr(msg, 'role') and hasattr(msg, 'content'):
                    role = msg.role.upper()
                    content = msg.content
                elif isinstance(msg, dict):
                    role = msg.get('role', 'USER').upper()
                    content = msg.get('content', str(msg))
                else:
                    role = "USER"
                    content = str(msg)
                formatted_prompt += f"{role}: {content}\n"
            formatted_prompt += "ASSISTANT:"
        else:
            # Handle other formats by converting to string
            formatted_prompt = str(prompt)
        
        result = self.pipeline(
            formatted_prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=self.pipeline.tokenizer.eos_token_id
        )[0]["generated_text"]
        
        # Extract only the new generated text
        if formatted_prompt in result:
            generated = result[len(formatted_prompt):].strip()
        else:
            generated = result.strip()
        
        # Return LLMResponse object instead of string
        return LLMResponse(content=generated)

    def generate(self, messages):
        """Alternative method name that some systems might expect"""
        response = self.invoke(messages)
        return response  # Already returns LLMResponse

In [54]:
from neo4j_graphrag.retrievers import Text2CypherRetriever
from neo4j_graphrag.generation import GraphRAG

llm = HuggingFaceLLM(hf_pipeline)

retriever_ext = Text2CypherRetriever(
    driver=driver,
    llm=llm,                 # our HF-backed LLM
    neo4j_schema=None,  # optional; can be None
    examples=None,              # optional; omit few-shot
)

In [56]:
#Sample
rag = GraphRAG(retriever=retriever_ext, llm=llm)

# ===========================
# 5) Run a sample QA query
# ===========================
query = (
    "We are a student engineering team designing a vehicle for the FSAE competition. "
    "Attached is the FSAE rules document. What does rule AD.6.2.3 state exactly? "
    "Answer with only the text of the rule and no other words."
)

response = rag.search(query_text=query, return_context=False)
print("💬 Final Answer:", response)

Text2CypherRetrievalError: Failed to get search result: Invalid input '!': expected 'ALTER', 'ORDER BY', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'FOREACH', 'GRANT', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'SKIP', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 1, column 1 (offset: 0))
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
 ^

In [37]:
with driver.session() as session:
    result = session.run("""
        MATCH (t:Text)-[:CONTAINS_RULE]->(r:Rule {rule_number: "EV.1.1"})
        RETURN t.content AS text
        LIMIT 1
    """)
    
    record = result.single()
    if record:
        text = record["text"]
        print("💬 EV.1.1 rule text:")
        print("=" * 50)
        print(text)
        print("=" * 50)
    else:
        print("❌ EV.1.1 not found")

💬 EV.1.1 rule text:
IC.9.2 Shutdown Circuit Operation IC.9.2.1 The Shutdown Circuit must Open upon operation of, or detection from any of the components listed in IC.9.1.1 IC.9.2.2 When the Shutdown Circuit Opens, it must: a. Stop the engine b. Disconnect power to the: • Fuel Pump(s) • Ignition • (ETC only) Electronic Throttle IC.4.1.1 IC.9.3 Primary Master Switch IC.9.3.1 Configuration and Location - The Primary Master Switch must meet T.9.3 IC.9.3.2 Function - the Primary Master Switch must: a. Disconnect power to ALL electrical circuits, including the battery, alternator, lights, fuel pump(s), ignition and electrical controls. All battery current must flow through this switch b. Be direct acting, not act through a relay or logic. IC.9.4 Cockpit Main Switch IC.9.4.1 Configuration - The Cockpit Main Switch must: a. Be a push-pull or push-rotate emergency switch (pushing the button is the OFF position) b. Have a diameter of 24 mm minimum IC.9.4.2 Location – The Cockpit Main Switch must

Rule compilation

In [40]:
# Rule compilation
retrieval_query_comp = """
    WHERE node:Cluster
    MATCH (node)<-[:HAS_KEYWORDS_IN]-(t:Text)-[:CONTAINS_RULE]->(r:Rule)
    RETURN DISTINCT r.rule_number AS result, score
    """

cluster_vector_retriever = VectorCypherRetriever(
    driver=driver,
    index_name="cluster_keywords_vector_index",
    retrieval_query=retrieval_query_comp
)

In [26]:
# Rule compilation: Test manual retrieval
query_text = "Please list all rules relevant to `Cockpit Opening`. Answer with only the rule numbers (i.e.: AA.1.1.1) separated by commas and no other words." \
"The rules relevant to `Cockpit Opening` are:"
embedding_vector = embedding_model.encode(query_text, normalize_embeddings=True).tolist()
#print(embedding_vector)

# Manually call retriever.get_search_results
retriever_result = cluster_vector_retriever.get_search_results(
    query_vector=embedding_vector,  # Needed but not directly used
    query_text=None,  # Needed for formatting
    top_k=2,
    query_params={"topK": 2, "embedding": embedding_vector}
)

# Now print the results nicely
for idx, item in enumerate(retriever_result.records, start=1):
    print(f"{idx}. Rule Number: {item.get('result', 'N/A')}")
    print(f"   Score: {item.get('score', 'N/A')}")
    print("-" * 40)

1. Rule Number: EV.5.2
   Score: 0.8684446811676025
----------------------------------------
2. Rule Number: GR.1.2.3
   Score: 0.8684446811676025
----------------------------------------
3. Rule Number: VE.2.5.1
   Score: 0.8684446811676025
----------------------------------------
4. Rule Number: AD.2.2.1
   Score: 0.8684446811676025
----------------------------------------
5. Rule Number: AD.5.2
   Score: 0.8684446811676025
----------------------------------------
6. Rule Number: AD.5.3
   Score: 0.8684446811676025
----------------------------------------
7. Rule Number: DR.3.3
   Score: 0.8684446811676025
----------------------------------------
8. Rule Number: DR.3.4.1
   Score: 0.8684446811676025
----------------------------------------
9. Rule Number: DR.3.4.3
   Score: 0.8684446811676025
----------------------------------------
10. Rule Number: EV.2.1
   Score: 0.8684446811676025
----------------------------------------
11. Rule Number: F.2.1
   Score: 0.8684446811676025
-------

In [38]:
llm = HuggingFaceLLM(pipeline)

# extracting keyword
KEYWORD_EXTRACTOR_SYSTEM = (
    "Extract the single target concept for retrieval from the user query.\n"
    "Rules:\n"
    "• If the query contains a phrase in backticks (`like this`), return exactly that phrase (without backticks).\n"
    "• Otherwise, infer a concise noun phrase (≤ 4 words) that best captures what rules should be retrieved.\n"
    "Output ONLY the phrase, no quotes, no punctuation, no extra words."
)

def llm_extract_keyword(llm, query_text: str) -> str:
    messages = [
        {"role": "system", "content": KEYWORD_EXTRACTOR_SYSTEM},
        {"role": "user", "content": query_text},
    ]
    resp = llm.generate(messages)
    keyword = getattr(resp, "content", "").strip()
    if not keyword:
        raise ValueError("Keyword extraction failed: empty LLM response.")
    return keyword

# final answer formatting
ANSWER_SYSTEM_PROMPT = (
    "Return ONLY a comma-separated list of rule numbers like 'AA.1.1.1, AB.2.3'. "
    "No extra words. If no rules are found, return an empty string."
)

rag = GraphRAG(retriever=cluster_vector_retriever, llm=llm)

In [39]:
# sample
raw_query = (
    "We are a student engineering team designing a vehicle for the FSAE competition. "
    "Attached is the FSAE rules document. Please list all rules relevant to `Tube/Tubing/Tubes`. "
    "Answer with only the rule numbers (i.e.: AA.1.1.1) separated by commas and no other words. "
    "The rules relevant to `Tube/Tubing/Tubes` are:"
)

keyword = llm_extract_keyword(llm, raw_query)

# Create embedding from the extracted keyword
embedding_vector = embedding_model.encode(keyword, normalize_embeddings=True).tolist()

# Keep the query text focused so vector search stays on-topic
focused_query_text = f"Retrieve rules relevant to: {keyword}"

# Retrieve + answer
response = rag.search(
    query_text=focused_query_text,
    retriever_config={"top_k": 2},
    query_vector=embedding_vector
)

print("Extracted keyword:", keyword)
print("Answer (rule numbers only):", response.answer)

AttributeError: 'function' object has no attribute 'tokenizer'