In [None]:
import pandas as pd
from neo4j import GraphDatabase
from dotenv import load_dotenv
import ast
import os
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.retrievers import VectorCypherRetriever
from neo4j_graphrag.types import RetrieverResultItem
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Neo4j connection settings; each is None when the variable is not set.
URI, USER, PASSWORD = (
    os.getenv(env_key)
    for env_key in ("NEO4J_URI", "NEO4J_USER", "NEO4J_PASSWORD")
)

# Hugging Face access token, passed to from_pretrained when the LLM is loaded.
my_token = os.getenv("HF_TOKEN")

In [6]:
# Load the chunked rules table from the working directory. The columns used
# later are chunk_id, text, rule_numbers and cluster_group_list.
df = pd.read_csv('rules_chunks_with_clusters.csv')
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,chunk_id,text,rule_numbers,keywords,cluster_group_list
0,1,GR - GENERAL REGULATIONS GR.1 FORMULA SAE COMP...,"[""EV.5.2"", ""GR.1.2.3"", ""GR.1.3"", ""GR.1.4"", ""GR...","[""Build"", ""Competition"", ""Demonstration"", ""Des...","[""Cluster 447"", ""Cluster 214"", ""Cluster 12"", ""..."
1,2,GR.1.4.2 The vehicle should have high performa...,"[""GR.1.4.2"", ""GR.1.4.3"", ""GR.1.4.4"", ""GR.1.5"",...","[""Dynamic events"", ""Static events"", ""aesthetic...","[""Cluster 56"", ""Cluster 63"", ""Cluster 219"", ""C..."
2,3,GR.2.4 Restriction on Vehicle Use SAE Internat...,"[""GR.1.2.3"", ""GR.3.1"", ""GR.3.2"", ""GR.3.3"", ""GR...","[""Competition organizers"", ""Competition site"",...","[""Cluster 105"", ""Cluster 214"", ""Cluster 87"", ""..."
3,4,GR.3.5.2 If a team is not present and ready to...,"[""GR.3.5.2"", ""GR.3.5.3"", ""GR.4.1"", ""GR.4.2.1"",...","[""Competition"", ""Competition year"", ""Draft rul...","[""Cluster 214"", ""Cluster 65"", ""Cluster 87"", ""C..."
4,5,GR.4.4 Rules Compliance GR.4.4.1 All participa...,"[""GR.4.4"", ""GR.4.4.1"", ""GR.4.4.2"", ""GR.4.4.3"",...","[""FSAE Online Website"", ""Formula SAE Rules"", ""...","[""Cluster 231"", ""Cluster 87"", ""Cluster 48"", ""C..."


In [None]:
def _as_list(cell):
    """Return ``cell`` as a Python list.

    Values that are already lists are passed through untouched; anything else
    is handed to ``ast.literal_eval``, which parses the stringified list stored
    in the CSV (and still raises on values it cannot parse).
    """
    return cell if isinstance(cell, list) else ast.literal_eval(cell)


# The CSV stores these columns as stringified lists. Parsing through _as_list
# keeps the cell re-runnable: a second execution sees real lists and leaves
# them alone instead of failing inside ast.literal_eval.
for _list_col in ('rule_numbers', 'cluster_group_list'):
    df[_list_col] = df[_list_col].apply(_as_list)

In [8]:
# Read the cluster definition file into a {cluster id: keywords} mapping
def parse_clusters(filepath):
    """Parse a ``<cluster id>: <keywords>`` text file into a dictionary.

    Args:
        filepath: Path of a UTF-8 text file with one cluster per line.

    Returns:
        dict mapping the stripped text before the first colon to the stripped
        text after it. Lines without a colon are ignored; when an id appears
        more than once, the last occurrence wins.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        # partition() splits on the first colon only, so colons inside the
        # keyword text are preserved.
        pieces = (raw.strip().partition(":") for raw in handle if ":" in raw)
        return {name.strip(): words.strip() for name, _, words in pieces}

# Parse the cluster file once; later cells look up keyword text by cluster id.
cluster_dict = parse_clusters("keyword_clusters.txt")

In [None]:
# Neo4j connection
AUTH = (USER, PASSWORD)

# Create one long-lived driver for the whole notebook. It is deliberately NOT
# opened in a `with` block: the context manager closes the driver on exit, and
# later cells call driver.session() and pass the driver to the retrievers.
# Call driver.close() once the notebook's work is finished.
driver = GraphDatabase.driver(URI, auth=AUTH)

# Test connectivity
try:
    driver.verify_connectivity()
except Exception:
    # Release the driver's resources before surfacing the connection error.
    driver.close()
    raise
print("✅ Connected to Neo4j successfully!")

In [49]:
# Transaction function that writes one (text chunk, cluster) pair and the chunk's rules
def create_text_rule_cluster_graph(tx, chunk_id, text, rules, cluster_id, cluster_keywords, cluster_embedding):
    """Merge a Text node, a Cluster node, and the chunk's Rule nodes.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        chunk_id: Value stored as ``Text.chunk_id`` (the merge key).
        text: Chunk body stored as ``Text.content``.
        rules: Iterable of rule numbers; each becomes a ``Rule`` node linked
            from the Text node via ``CONTAINS_RULE``.
        cluster_id: Value stored as ``Cluster.name`` (the merge key).
        cluster_keywords: Stored as ``Cluster.keywords_text``.
        cluster_embedding: Stored as ``Cluster.keywords_embedding``.

    Returns:
        None. The result of ``tx.run`` is not inspected.
    """
    params = {
        "chunk_id": chunk_id,
        "text": text,
        "rules": rules,
        "cluster_id": cluster_id,
        "cluster_keywords": cluster_keywords,
        "cluster_embedding": cluster_embedding,
    }
    query = """
    MERGE (t:Text {chunk_id: $chunk_id})
    SET t.content = $text
    MERGE (c:Cluster {name: $cluster_id})
    SET c.keywords_text = $cluster_keywords,
        c.keywords_embedding = $cluster_embedding
    MERGE (t)-[:HAS_KEYWORDS_IN]->(c)
    WITH t
    UNWIND $rules AS rule_number
        MERGE (r:Rule {rule_number: rule_number})
        MERGE (t)-[:CONTAINS_RULE]->(r)
    """
    # Parameters are passed as keyword arguments, never interpolated into the query.
    tx.run(query, **params)

In [16]:
# Load embedding model (1024 dimension)
# The same model encodes the stored cluster keywords and, later, the query
# text, so both sides of the vector search share one embedding space.
embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

In [51]:
# Build {cluster id: embedding} for every cluster. Each keyword string is
# encoded with normalisation enabled and converted to a plain Python list,
# which is the form written to the Neo4j property.
cluster_embedding_dict = {
    name: embedding_model.encode(keyword_blob, normalize_embeddings=True).tolist()
    for name, keyword_blob in cluster_dict.items()
}

In [52]:
# Populate the graph: one write transaction per (text chunk, cluster) pair.
with driver.session() as session:
    for _, row in df.iterrows():
        for cluster_id in row['cluster_group_list']:
            # Clusters missing from the lookup tables fall back to empty
            # keyword text and an empty embedding list.
            session.execute_write(
                create_text_rule_cluster_graph,
                row['chunk_id'],
                row['text'],
                row['rule_numbers'],
                cluster_id,
                cluster_dict.get(cluster_id, ""),
                cluster_embedding_dict.get(cluster_id, []),
            )

print("✅ Neo4j graph with Cluster nodes and relationships successfully created.")
# The driver is not closed here: later cells keep using it.

  with driver.session() as session:


✅ Neo4j graph with Cluster nodes and relationships successfully created.


## GraphRAG

In [None]:
# Test connection to neo4j database
def test_connection(tx):
  result = tx.run("RETURN 'Neo4j Connected' AS message")
  return result.single()["message"]

# Smoke test: run the probe in a managed read transaction and print the
# message it returns.
with driver.session() as session:
  message = session.execute_read(test_connection)
  print(message)

Neo4j Connected


In [None]:
# LLM integration
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Location of the local model snapshot. Set the LLM_MODEL_PATH environment
# variable (for example in the .env file) to run the notebook on another
# machine; the original workstation path remains the default.
model_name = os.getenv(
    "LLM_MODEL_PATH",
    r"C:\Users\pt8527\OneDrive - The University of Texas at Austin\Summer 2025\IDETC-CIE 2025\Hackathon2025\models\models--meta-llama--Llama-3.2-11B-Vision-Instruct",
)

# device_map="auto" and torch_dtype="auto" leave device placement and
# precision to the library; the recorded run was offloaded to CPU/disk.
model = AutoModelForCausalLM.from_pretrained(model_name, token=my_token, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=my_token)

# Text-generation pipeline wrapped by the adapter defined in the next cell.
hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 11.45it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.
Device set to use cpu


In [None]:
# Wraps a Hugging Face pipeline so callers receive an object exposing `.content`
class HFLLMAdapter:
    """Thin wrapper around a text-generation callable.

    Attributes are set in ``__init__``:
        pipe: Callable invoked as ``pipe(prompt, **generation_kwargs)`` and
            expected to return a sequence whose first item has a
            ``"generated_text"`` entry.
        max_new_tokens: Generation length cap forwarded on every call.
    """

    def __init__(self, pipe, max_new_tokens=64):
        self.max_new_tokens = max_new_tokens
        self.pipe = pipe

    def invoke(self, prompt: str):
        """Generate a completion for ``prompt``.

        Sampling is disabled so that generation is deterministic, and only
        the newly generated text (not the prompt) is requested.

        Returns:
            An instance of an ad-hoc ``LLMResult`` class whose ``content``
            attribute holds the stripped generated text.
        """
        generation_kwargs = dict(
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            return_full_text=False,
        )
        first_candidate = self.pipe(prompt, **generation_kwargs)[0]
        answer = first_candidate["generated_text"].strip()
        # Build the lightweight result type on the fly and instantiate it.
        response_cls = type("LLMResult", (object,), {"content": answer})
        return response_cls()

Rule extraction

In [None]:
# allow multiple chunk search
# Input QA file and the output file that will receive the predictions.
in_path = "rule_retrieval_qa_2.csv"
out_path = "rule_retrieval_qa_2_with_predictions_2.csv"

# load CSV
# The cells below read its `rule_number` column.
df_ext = pd.read_csv(in_path)

# Single-query lookup of the chunks that contain each requested rule
def batch_chunk_ids(tx, rule_numbers):
    """Map each rule number to the chunk ids of the Text nodes containing it.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)`` and
            returning an iterable of records indexable by column name.
        rule_numbers: List of rule numbers sent to the query as ``$rules``.

    Returns:
        dict of ``rule_number -> chunk_ids`` built from the returned records.
        Rule numbers that produce no record are absent from the dict.
    """
    query = """
    UNWIND $rules AS rn
    MATCH (r:Rule {rule_number: rn})<-[:CONTAINS_RULE]-(t:Text)
    WITH rn, collect(DISTINCT t.chunk_id) AS chunk_ids
    RETURN rn AS rule_number, chunk_ids
    """
    mapping = {}
    for record in tx.run(query, rules=rule_numbers):
        mapping[record["rule_number"]] = record["chunk_ids"]
    return mapping

# run batch lookup
with driver.session() as session:
    # Take the rules from the QA file (df_ext). The chunk table `df` has no
    # `rule_number` column, so reading it here fails on a fresh kernel.
    rule_list = df_ext["rule_number"].tolist()
    # get mapping: rule_number -> [chunk_ids...]
    rn_to_chunks = session.execute_read(batch_chunk_ids, rule_list)

# fill model_prediction column (comma-joined)
def join_chunks(rn):
    """Return the chunk ids found for rule ``rn`` as a comma-joined string.

    Rules without a match in ``rn_to_chunks`` yield an empty string.
    """
    chunks = rn_to_chunks.get(rn, [])
    # Joining an empty list already produces "", so no special case is needed.
    return ",".join(map(str, chunks))

df_ext["model_prediction"] = df_ext["rule_number"].apply(join_chunks)

# --- save updated CSV ---
df_ext.to_csv(out_path, index=False)
print(f"✅ Saved: {out_path}")

  with driver.session() as session:


✅ Saved: rule_retrieval_qa_2_with_predictions_2.csv


Rule compilation

In [None]:
# Cypher fragment for the rule-compilation task: keep only Cluster hits, walk
# from each cluster to the Text chunks linked to it and on to the rules those
# chunks contain, and return the distinct rule numbers with the hit's score.
# `node` and `score` are not defined in this fragment — presumably they are
# supplied by the retriever's vector-index search; confirm against the
# neo4j-graphrag documentation.
retrieval_query_comp = """
    WHERE node:Cluster
    MATCH (node)<-[:HAS_KEYWORDS_IN]-(t:Text)-[:CONTAINS_RULE]->(r:Rule)
    RETURN DISTINCT r.rule_number AS result, score
    """

# NOTE(review): no cell in view creates "cluster_keywords_vector_index";
# confirm the index exists in the database before running this cell.
cluster_vector_retriever = VectorCypherRetriever(
    driver=driver,
    index_name="cluster_keywords_vector_index",
    retrieval_query=retrieval_query_comp
)

In [None]:
# Test manual retrieval
query_text = (
    "Please list all rules relevant to `Firewall`. "
    "Answer with only the rule numbers (i.e.: AA.1.1.1) separated by commas and no other words. "
    "The rules relevant to `Firewall` are:"
)
embedding_vector = embedding_model.encode(query_text, normalize_embeddings=True).tolist()
# Report only the vector length; printing every component buries the results below.
print(f"Query embedding dimension: {len(embedding_vector)}")

# Manually call retriever.get_search_results
retriever_result = cluster_vector_retriever.get_search_results(
    query_vector=embedding_vector,  # pre-computed query embedding searched against the index
    query_text=None,  # left unset because the vector is passed directly
    top_k=2,
    # NOTE(review): retrieval_query_comp references neither $topK nor
    # $embedding, so these extra parameters look unused — confirm before removing.
    query_params={"topK": 2, "embedding": embedding_vector}
)

# Print the results nicely. The retrieval query expands every matched cluster
# to all of its rules, so more rows than top_k can be printed.
for idx, item in enumerate(retriever_result.records, start=1):
    print(f"{idx}. Rule Number: {item.get('result', 'N/A')}")
    print(f"   Score: {item.get('score', 'N/A')}")
    print("-" * 40)

1. Rule Number: GR.3.5.2
   Score: 0.8079407215118408
----------------------------------------
2. Rule Number: GR.3.5.3
   Score: 0.8079407215118408
----------------------------------------
3. Rule Number: GR.4.1
   Score: 0.8079407215118408
----------------------------------------
4. Rule Number: GR.4.2.1
   Score: 0.8079407215118408
----------------------------------------
5. Rule Number: GR.4.2.2
   Score: 0.8079407215118408
----------------------------------------
6. Rule Number: GR.4.2.3
   Score: 0.8079407215118408
----------------------------------------
7. Rule Number: GR.4.2.4
   Score: 0.8079407215118408
----------------------------------------
8. Rule Number: GR.4.3.1
   Score: 0.8079407215118408
----------------------------------------
9. Rule Number: GR.4.3.2
   Score: 0.8079407215118408
----------------------------------------
10. Rule Number: GR.4.3.3
   Score: 0.8079407215118408
----------------------------------------
11. Rule Number: T.6.2.2
   Score: 0.80794072151184

In [None]:
# Syntax: return CHUNK IDs tied to the matched cluster/text
# For every Cluster hit, follow HAS_KEYWORDS_IN back to the Text chunks and
# return each chunk's id (falling back to the internal node id when chunk_id
# is null), its content, and the hit's score, highest score first.
retrieval_query_chunks = """
    WHERE node:Cluster
    MATCH (node)<-[:HAS_KEYWORDS_IN]-(t:Text)
    RETURN DISTINCT coalesce(t.chunk_id, id(t)) AS chunk_id,
                    t.content AS text,
                    score
    ORDER BY score DESC
"""
# Note: this rebinds `cluster_vector_retriever`, replacing the rule-number
# retriever built in the earlier cell with one that returns chunk ids.
cluster_vector_retriever = VectorCypherRetriever(
    driver=driver,
    index_name="cluster_keywords_vector_index",
    retrieval_query=retrieval_query_chunks
)

def phrases_to_text(phrases):
    """Flatten a keyword-phrase cell into one query string.

    Args:
        phrases: ``None``, a float NaN (how pandas represents an empty CSV
            cell), a list of phrases, a string (optionally the text of a
            Python list literal), or any other object.

    Returns:
        ``""`` for ``None``/NaN; the non-blank list items stripped and joined
        with ``" ; "`` for a list (or a string that parses to one); the
        stripped string otherwise; ``str(phrases)`` for any other type.
    """
    import math  # local import keeps the helper self-contained

    if phrases is None:
        return ""
    # A missing CSV cell arrives as NaN; without this guard it would be
    # turned into the literal query text "nan".
    if isinstance(phrases, float) and math.isnan(phrases):
        return ""
    if isinstance(phrases, list):
        vals = [str(p).strip() for p in phrases if str(p).strip()]
        return " ; ".join(vals)
    if isinstance(phrases, str):
        s = phrases.strip()
        if s.startswith('[') and s.endswith(']'):
            try:
                parsed = ast.literal_eval(s)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal: fall back to the raw string below.
                parsed = None
            if isinstance(parsed, list):
                return phrases_to_text(parsed)
        return s
    return str(phrases)

def find_chunk_ids(keyword_phrases_all, top_k_clusters=10):
    """Return the chunk ids retrieved for a set of keyword phrases.

    The phrases are flattened with ``phrases_to_text``, embedded with
    ``embedding_model`` and sent to ``cluster_vector_retriever``.

    Args:
        keyword_phrases_all: Phrase cell in any form ``phrases_to_text`` accepts.
        top_k_clusters: Value passed to the retriever as ``top_k``.

    Returns:
        The distinct chunk ids, in the order the retriever returned them,
        joined with ``", "``. An empty string when the flattened query text
        is empty.
    """
    query_text = phrases_to_text(keyword_phrases_all).strip()
    if query_text == "":
        return ""

    query_vector = embedding_model.encode(query_text, normalize_embeddings=True).tolist()
    search = cluster_vector_retriever.get_search_results(
        query_vector=query_vector,
        top_k=top_k_clusters,
        query_params={"topK": top_k_clusters, "embedding": query_vector},
    )

    # Skip records without a chunk id, stringify the rest, and rely on dict
    # key ordering to drop duplicates while keeping first-seen order.
    raw_ids = (record.get("chunk_id") for record in search.records)
    unique_ids = dict.fromkeys(str(cid) for cid in raw_ids if cid is not None)
    return ", ".join(unique_ids)

# Batch
# Run the chunk lookup for every row of the rule-compilation QA file and
# store the result in a new `chunk_id_list` column.
df_comp = pd.read_csv(r"rule_compilation_qa_2.csv")
df_comp["chunk_id_list"] = df_comp["keyword_phrases_all"].apply(find_chunk_ids)

# Written as UTF-8 with a byte-order mark ("utf-8-sig"), without the index column.
df_comp.to_csv(r"rule_compilation_qa_2_with_predictions_3.csv", index=False, encoding="utf-8-sig")
print("✅ Saved: rule_compilation_qa_2_with_predictions_3.csv")