In [None]:
import os, glob, json, hashlib, random
from collections import Counter
from IPython.display import display
from neo4j_viz import Node, Relationship, VisualizationGraph

# =========================
# USER SETTINGS (EDIT THIS)
# =========================
# Path to the knowledge-graph JSONL file. If this path is not found, the
# script falls back to a recursive search for *.jsonl* files under the
# current working directory.
JSONL_PATH = ""                             # Set your KG JSONL file path here
# Upper bound on the number of edges put into the visualization. The default
# is deliberately small; raise it gradually while rendering stays smooth.
MAX_EDGES_TO_LOAD = 100
# When True, an edge is skipped if the same unordered (A, B) pair with the
# same relation has already been added.
DEDUP = True
# Maximum number of characters kept from each edge's justification text;
# longer text is cut at this length and suffixed with "…".
TRUNCATE_JUSTIFICATION = 240
# When True, records whose relation is null/unrecognized still contribute
# their two nodes to the graph (no edge is drawn between them).
# NOTE(review): only the sequential (RANDOM_SAMPLE = False) path appears to
# consult this flag -- confirm that is intended.
INCLUDE_NULL_RELATIONS = True
# None shows full node names as captions; an integer truncates captions to
# that many characters followed by "...".
MAX_NODE_CAPTION_LENGTH = None
# True: shuffle all valid edges and take the first MAX_EDGES_TO_LOAD of them.
# False: read the file top to bottom until MAX_EDGES_TO_LOAD edges are kept.
RANDOM_SAMPLE = True

# =========================
# 1) Locate the file
# =========================
# Resolve JSONL_PATH to a regular file. When the configured path is not a
# file, fall back to the first *.jsonl* file found under the current working
# directory (raising FileNotFoundError if there is none).
print("CWD:", os.getcwd())
print("JSONL_PATH (as given):", JSONL_PATH)

# isfile() rather than exists(): a directory path (or the default "") must
# trigger the fallback search instead of failing later inside open().
if not os.path.isfile(JSONL_PATH):
    print("\nJSONL_PATH not found. Searching recursively for *.jsonl* files...")
    # The pattern can also match directories, so keep regular files only.
    # glob returns matches in arbitrary order; sorting makes the chosen
    # "first match" reproducible from run to run.
    hits = sorted(
        h for h in glob.glob("**/*.jsonl*", recursive=True) if os.path.isfile(h)
    )
    for h in hits[:30]:
        print(" -", h)
    if not hits:
        raise FileNotFoundError("Could not find any .jsonl files under the current folder. "
                                "Either move the file next to the notebook or set JSONL_PATH to an absolute path.")
    JSONL_PATH = hits[0]
    print("\nUsing first match:", JSONL_PATH)

print("\nExists?   ", os.path.exists(JSONL_PATH))
print("Size (MB) ", round(os.path.getsize(JSONL_PATH)/1e6, 2))

# =========================
# 2) Peek first 2 lines
# =========================
# Print the top-level keys and the A/B/relation fields of the first two lines
# as a quick sanity check of the file's schema. This is diagnostic only, so a
# malformed line is reported instead of aborting the run.
print("\n--- PEEK (first 2 lines) ---")
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for i in range(2):
        line = f.readline().strip()
        if not line:
            # readline() returns "" at EOF, so a short file also lands here.
            print(f"Line {i+1}: <empty>")
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as err:
            print(f"Line {i+1}: <bad JSON: {err}>")
            continue
        if not isinstance(obj, dict):
            # Valid JSON that is not an object has no keys to show.
            print(f"Line {i+1}: <not a JSON object: {type(obj).__name__}>")
            continue
        print(f"Line {i+1} keys:", list(obj.keys()))
        print("  A:", (obj.get("A") or {}).get("name"),
              "| B:", (obj.get("B") or {}).get("name"),
              "| rel:", obj.get("relation"))

# =========================
# 3) File stats (counts)
# =========================
# Normalized spelling -> canonical relation name.
_RELATION_ALIASES = {
    "depends_on": "depends_on",
    "dependson": "depends_on",
    "depends": "depends_on",
    "part_of": "part_of",
    "partof": "part_of",
}


def norm_rel(x: str):
    """Normalize a raw relation label to its canonical name.

    Matching is case-insensitive, ignores surrounding whitespace, and treats
    "-" and " " as "_", so "DEPENDS_ON", "depends-on" and "part of" are all
    recognized.

    Args:
        x: Raw relation value from a record. May be None or, in malformed
            data, a non-string JSON value.

    Returns:
        "depends_on" or "part_of", or None when the value is missing, is not
        a string, or is not one of the known variants.
    """
    # A non-string relation (number, list, bool, ...) cannot name a relation;
    # report it as unrecognized rather than failing on .strip().
    if not isinstance(x, str):
        return None
    key = x.strip().lower().replace("-", "_").replace(" ", "_")
    return _RELATION_ALIASES.get(key)

# Single pass over the file that classifies every line and collects the
# distinct node names and (A, B, relation) triples for the summary below.
stats = Counter()
unique_nodes = set()
unique_edges = set()
null_relation_pairs = []  # records with both endpoints but no recognized relation

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            stats["blank"] += 1
            continue
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError; RecursionError covers
            # pathologically nested input. Either way the line is unusable.
            stats["bad_json"] += 1
            continue
        if not isinstance(obj, dict):
            # Valid JSON but not an object (list, number, ...): it has no
            # A/B/relation fields, and .get() below would raise on it.
            stats["not_an_object"] += 1
            continue

        A = (obj.get("A") or {}).get("name")
        B = (obj.get("B") or {}).get("name")
        rel = norm_rel(obj.get("relation"))

        if not A or not B:
            stats["missing_A_or_B"] += 1
            continue

        # Both endpoints count as nodes whether or not the relation is usable.
        unique_nodes.update((A, B))

        if rel is None:
            stats["null_relation"] += 1
            null_relation_pairs.append(obj)
            continue

        stats["valid_edges"] += 1
        unique_edges.add((A, B, rel))

print("\n--- FILE STATS ---")
for k, v in stats.most_common():
    print(f"{k:16} {v}")
print("Unique nodes:", len(unique_nodes))
print("Unique edges:", len(unique_edges))
print("Null relation pairs:", len(null_relation_pairs))
print("Sample nodes:", sorted(unique_nodes)[:20])

# =========================
# 3.5) Load all valid edges for random sampling
# =========================
# When RANDOM_SAMPLE is on, collect every record that has both endpoint names
# and a recognized relation, then shuffle the list in place so that a prefix
# of it is a uniform random sample. Otherwise the list stays empty.
all_valid_edges = []
if RANDOM_SAMPLE:
    print("\n--- LOADING ALL EDGES FOR RANDOM SAMPLING ---")
    with open(JSONL_PATH, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                # Unparseable line (ValueError covers json.JSONDecodeError).
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not an object; .get() below would raise.
                continue

            A = (obj.get("A") or {}).get("name")
            B = (obj.get("B") or {}).get("name")
            rel = norm_rel(obj.get("relation"))

            if A and B and rel is not None:
                all_valid_edges.append(obj)

    print(f"Total valid edges available: {len(all_valid_edges)}")
    # Randomly shuffle the edges
    random.shuffle(all_valid_edges)
    print(f"Shuffled! Will take first {MAX_EDGES_TO_LOAD} edges.")

# =========================
# 4) Build local graph (HARD RESET included)
# =========================
def edge_key(a: str, b: str, rel: str) -> str:
    """Return a deduplication key for an edge, ignoring endpoint order.

    (a, b, rel) and (b, a, rel) produce the same key; a different relation
    or a different pair of names produces a different key.

    Args:
        a: Name of one endpoint.
        b: Name of the other endpoint.
        rel: Normalized relation name.

    Returns:
        A 40-character hexadecimal SHA-1 digest.
    """
    lo, hi = sorted((a, b))
    # Serialize the fields as a JSON array rather than joining them with a
    # separator: with "||"-joining, ("x||y", "z") and ("x", "y||z") produced
    # the same string and were wrongly treated as the same edge.
    payload = json.dumps([lo, hi, rel])
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()

# In-memory graph state, recreated from scratch every time this code runs.
nodes = []            # Node objects; a node's id is its index in this list
relationships = []    # Relationship objects, in the order they were added
name_to_id = {}       # node name -> id of the Node created for that name
seen_edges = set()    # edge_key() values already added (used when DEDUP is on)

def get_node_id(name: str, role: str):
    """Return the id of the node called *name*, creating it on first use.

    A new node is appended to ``nodes`` with the next free index as its id
    and registered in ``name_to_id``. Its caption is the name, shortened to
    MAX_NODE_CAPTION_LENGTH characters plus "..." when that limit is set and
    exceeded; the untruncated name is always kept in the "full_name"
    property.

    For a node that already exists, a truthy *role* other than "NA" replaces
    the stored "role" property.

    Args:
        name: Node name, used as the lookup key.
        role: Role label for the node; a falsy value is stored as "NA" when
            the node is created.

    Returns:
        The integer id of the node.
    """
    known_id = name_to_id.get(name)
    if known_id is not None:
        has_real_role = bool(role) and role != "NA"
        if has_real_role:
            nodes[known_id].properties["role"] = role
        return known_id

    new_id = len(nodes)
    name_to_id[name] = new_id

    caption = name
    if MAX_NODE_CAPTION_LENGTH is not None and len(name) > MAX_NODE_CAPTION_LENGTH:
        caption = name[:MAX_NODE_CAPTION_LENGTH] + "..."

    node_properties = {
        "role": role or "NA",
        "full_name": name,  # untruncated name, available for tooltips
    }
    nodes.append(Node(id=new_id, caption=caption, properties=node_properties))
    return new_id

# Counters reported in the summary: edges added, and records/lines examined.
kept = 0
scanned = 0


def _add_edge(obj, A, B, rel):
    """Add the relationship described by *obj* to the in-memory graph.

    Shared by the random-sample and sequential paths so that both build
    nodes and relationships in exactly the same way.

    Args:
        obj: Parsed JSONL record (a dict).
        A: Non-empty name of the source endpoint.
        B: Non-empty name of the target endpoint.
        rel: Normalized relation, "depends_on" or "part_of".

    Returns:
        True if a relationship was appended, False if DEDUP dropped the
        record as a duplicate of an edge that was already added.
    """
    if DEDUP:
        k = edge_key(A, B, rel)
        if k in seen_edges:
            return False
        seen_edges.add(k)

    a_role = (obj.get("A") or {}).get("role", "NA")
    b_role = (obj.get("B") or {}).get("role", "NA")
    a_id = get_node_id(A, a_role)
    b_id = get_node_id(B, b_role)

    just = obj.get("justification", "") or ""
    if len(just) > TRUNCATE_JUSTIFICATION:
        just = just[:TRUNCATE_JUSTIFICATION] + "…"

    relationships.append(Relationship(
        source=a_id,
        target=b_id,
        caption="DEPENDS_ON" if rel == "depends_on" else "PART_OF",
        properties={
            "relation": rel,
            "justification": just,
            # Keep at most three evidence chunks per edge.
            "evidence_chunks": (obj.get("evidence_chunks", []) or [])[:3],
        }
    ))
    return True


# Use pre-shuffled edges if RANDOM_SAMPLE is True, otherwise read sequentially
edge_source = all_valid_edges if RANDOM_SAMPLE else None

if RANDOM_SAMPLE:
    # Take the first MAX_EDGES_TO_LOAD records of the shuffled list.
    # NOTE(review): that list holds only records with a recognized relation,
    # so INCLUDE_NULL_RELATIONS has no effect on this path -- confirm intended.
    for obj in edge_source[:MAX_EDGES_TO_LOAD]:
        scanned += 1

        A = (obj.get("A") or {}).get("name")
        B = (obj.get("B") or {}).get("name")
        rel = norm_rel(obj.get("relation"))

        if not A or not B or rel is None:
            continue

        if _add_edge(obj, A, B, rel):
            kept += 1
else:
    # Sequential processing: read the file top to bottom until the cap is hit.
    with open(JSONL_PATH, "r", encoding="utf-8") as f:
        for line in f:
            # Check the cap before counting, so `scanned` reports only lines
            # that were actually examined.
            if kept >= MAX_EDGES_TO_LOAD:
                break
            scanned += 1
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                # Skip unparseable lines, as the earlier passes over the file
                # do, instead of aborting the whole build.
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not an object; .get() below would raise.
                continue

            A = (obj.get("A") or {}).get("name")
            B = (obj.get("B") or {}).get("name")
            if not A or not B:
                continue

            rel = norm_rel(obj.get("relation"))

            # Null/unrecognized relation: optionally add the nodes, no edge.
            if rel is None:
                if INCLUDE_NULL_RELATIONS:
                    a_role = (obj.get("A") or {}).get("role", "NA")
                    b_role = (obj.get("B") or {}).get("role", "NA")
                    get_node_id(A, a_role)
                    get_node_id(B, b_role)
                continue

            if _add_edge(obj, A, B, rel):
                kept += 1

# A node is "isolated" when its id never appears as the source or target of
# any relationship.
connected_nodes = {
    endpoint
    for rel_obj in relationships
    for endpoint in (rel_obj.source, rel_obj.target)
}
isolated_nodes = [node for node in nodes if node.id not in connected_nodes]

# Summary of what ended up in memory.
first_node_captions = [node.caption for node in nodes[:10]]
first_rel_captions = [rel_obj.caption for rel_obj in relationships[:10]]

print("\n--- IN MEMORY GRAPH ---")
print(f"Loaded edges: {len(relationships)} (cap={MAX_EDGES_TO_LOAD}) | nodes: {len(nodes)} | lines scanned: {scanned}")
print(f"Isolated nodes (no relationships): {len(isolated_nodes)}")
print("First 10 node captions:", first_node_captions)
print("First 10 rel captions:", first_rel_captions)
if len(isolated_nodes) > 0:
    sample_isolated = [node.caption for node in isolated_nodes[:10]]
    print("Sample isolated nodes:", sample_isolated)

# =========================
# 5) Render + save HTML
# =========================
# Build the visualization from the in-memory nodes/relationships, show it
# inline, and save the rendered markup next to the notebook.
VG = VisualizationGraph(nodes=nodes, relationships=relationships)
# Optional: VG.color_nodes(property="role")   # role is stored in Node.properties

render_options = {
    "width": "100%",
    "height": "900px",
    # Start zoomed out so more than a couple of nodes are visible at first.
    "initial_zoom": 0.35,
    # Supported options may vary with the neo4j_viz version, e.g.
    # "node_label_property": "full_name" (enable if supported).
}
html = VG.render(**render_options)
display(html)

out_path = os.path.abspath("kg_visualization.html")
with open(out_path, "w", encoding="utf-8") as out_file:
    out_file.write(html.data)
print("\nWrote HTML:", out_path)

# =========================
# 6) Fix truncated node labels in HTML
# =========================
# Add a <style> block to the rendered markup and save the result to
# kg_visualization.html, replacing the plain render written earlier.
out_path = os.path.abspath("kg_visualization.html")

# Work on the rendered markup in memory; writing it to disk only to read the
# same text straight back is unnecessary.
html_content = html.data

# Inject CSS to show full node labels
# NOTE(review): the selectors below assume the rendered page exposes
# .nvl-node / .nvl-node-label elements -- confirm against the installed
# neo4j_viz version.
css_fix = """
<style>
    /* Override node label truncation */
    .nvl-node-label {
        white-space: normal !important;
        word-wrap: break-word !important;
        max-width: 200px !important;
        text-overflow: clip !important;
        overflow: visible !important;
    }
    
    /* Increase node size to accommodate longer text */
    .nvl-node circle {
        r: 60 !important;
    }
    
    /* Center multi-line text */
    .nvl-node text {
        text-anchor: middle !important;
        dominant-baseline: middle !important;
    }
</style>
"""

# Insert the CSS exactly once: before the first </head>, else before the last
# </body>, else at the very top. An uncounted str.replace would inject it at
# every occurrence of the tag, including ones inside embedded scripts.
if "</head>" in html_content:
    html_content = html_content.replace("</head>", css_fix + "</head>", 1)
elif "</body>" in html_content:
    before_body_end, body_end, after_body_end = html_content.rpartition("</body>")
    html_content = before_body_end + css_fix + body_end + after_body_end
else:
    html_content = css_fix + html_content

with open(out_path, "w", encoding="utf-8") as f:
    f.write(html_content)

print("\nWrote HTML with full node labels:", out_path)

In [None]:
/Users/abdulrahmanalrabah/instructkg_artifact/adapters.py