In [1]:
! pip install pandas networkx pyvis


Collecting pandas
  Downloading pandas-2.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting jinja2>=2.9.6 (from pyvis)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.1.1-py3-none-any.whl.metadata (8.1 kB)
Collecting MarkupSafe>=2.0 (from jinja2>=2.9.6->pyvis)
  Downloading MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading pandas-2.3.2-cp39-cp39-macosx_11_0_arm64.whl (10.8 MB)
[2K   [

In [None]:
###
import pandas as pd
import networkx as nx
from itertools import combinations
import re
from collections import defaultdict
from pyvis.network import Network
import csv
import math

# ===== Step 1: Load TSV =====
# Tab-separated export of the knowledge-graph sheet. The graph-building step
# reads the 'Entities' column of this frame.
file_path = "KG - Sheet1.tsv"  # your file path
df = pd.read_csv(file_path, sep="\t")

# ===== Step 2: Alias map =====
# Surface form -> canonical node name.
# normalize_name() looks keys up exactly as written (case-sensitive), while
# extract_entities_from_prompt() lowercases them before matching prompt text.
alias_map = {
    # Potter family (a bare "Potter" resolves to Harry)
    "Harry": "Harry Potter",
    "Potter": "Harry Potter",
    "Lily": "Lily Potter",
    "James": "James Potter",
    # Hermione
    "Hermione": "Hermione Granger",
    "Granger": "Hermione Granger",
    # Ron Weasley (a bare "Weasley" resolves to Ron, not to other family members)
    "Ron": "Ron Weasley",
    "Weasley": "Ron Weasley",
    # Dumbledore
    "Dumbledore": "Albus Dumbledore",
    "Albus": "Albus Dumbledore",
    # Voldemort
    "Voldemort": "Lord Voldemort",
    # Privet Drive
    "Number Four Privet Drive": "Privet Drive",
    "4 Privet Drive": "Privet Drive",
    # Hagrid
    "Hagrid": "Rubeus Hagrid",
    # Sirius Black
    "Black": "Sirius Black",
    "Sirius": "Sirius Black",
    # McGonagall
    "McGonagall": "Minerva McGonagall",
    "Minerva": "Minerva McGonagall",
    "Professor McGonagall": "Minerva McGonagall",
    # Malfoy (a bare "Malfoy" resolves to Draco)
    "Malfoy": "Draco Malfoy",
    "Draco": "Draco Malfoy",
    # Lupin
    "Lupin": "Remus Lupin",
    "Remus": "Remus Lupin",
    # Neville
    "Neville": "Neville Longbottom",
    "Longbottom": "Neville Longbottom",
    # Snape
    "Snape": "Severus Snape",
    "Severus": "Severus Snape",
    "Professor Snape": "Severus Snape",
    # Dursleys
    # NOTE(review): the plural family name "Dursleys" is folded into the single
    # character "Dudley Dursley" -- confirm this is intended.
    "Dursleys": "Dudley Dursley",
    "Uncle Vernon": "Vernon Dursley",
    "Vernon": "Vernon Dursley",
    "Aunt Petunia": "Petunia Dursley",
    "Petunia": "Petunia Dursley",
    "Dudley": "Dudley Dursley",
    # Weasley family
    "Ginny": "Ginny Weasley",
    "Arthur": "Arthur Weasley",
    "Mr Weasley": "Arthur Weasley",
    "Mr. Weasley": "Arthur Weasley",
    "Molly": "Molly Weasley",
    # Common nouns
    "broom": "broomstick",
    "muggles": "muggle",
    # Platform variations
    "Platform 9¾": "Platform Nine and Three-Quarters",
    "Platform 9 3/4": "Platform Nine and Three-Quarters",
    "Platform Nine and 3/4": "Platform Nine and Three-Quarters",
    "Platform 9 and 3/4": "Platform Nine and Three-Quarters"
}

def normalize_name(name):
    """Return the canonical form of a raw entity name.

    Surrounding whitespace and one leading "the " (any letter case) are
    removed. If what remains is a key of the module-level ``alias_map`` the
    mapped canonical name is returned; otherwise the cleaned name itself.
    """
    cleaned = re.sub(r"^the\s+", "", name.strip(), flags=re.IGNORECASE)
    if cleaned in alias_map:
        return alias_map[cleaned]
    return cleaned

# ===== Step 3: Build Graph =====
# Undirected co-occurrence graph: an edge's 'weight' is the number of rows in
# which both endpoints appear, and node_freq[name] is the number of rows that
# mention `name`.
G = nx.Graph()
node_freq = defaultdict(int)

for entities_str in df['Entities']:
    # pandas reads a blank cell as NaN (a float); skip it rather than crash on .split().
    if not isinstance(entities_str, str):
        continue
    normalized = [normalize_name(e.strip()) for e in entities_str.split(',') if e.strip()]
    # Different raw names can normalize to the same canonical entity (e.g. "Harry"
    # and "Potter"). Keep each entity once per row, in first-seen order, so a row
    # never creates a self-loop or counts the same entity/pair more than once.
    entities = list(dict.fromkeys(normalized))
    for entity in entities:
        node_freq[entity] += 1
    for pair in combinations(entities, 2):
        # Alphabetical endpoint order keeps the stored (u, v) orientation stable.
        e1, e2 = sorted(pair)
        if G.has_edge(e1, e2):
            G[e1][e2]['weight'] += 1
        else:
            G.add_edge(e1, e2, weight=1)

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Show only the first 20 edges as a sanity check.
print("\nSample edges with weights:")
for idx, (u, v, data) in enumerate(G.edges(data=True)):
    if idx >= 20:
        break
    print(f"{u} ↔ {v}: weight = {data['weight']}")

print("\nTop 10 most frequent entities:")
for entity, freq in sorted(node_freq.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{entity}: {freq} appearances")

# ===== Step 4: Create HTML with Pyvis =====
net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white")
net.barnes_hut()
for node in G.nodes():
    # Node size grows with frequency but is capped at 50.
    net.add_node(node, label=node, size=min(node_freq[node]*2, 50))
for source, target, data in G.edges(data=True):
    net.add_edge(source, target, value=data["weight"])
net.write_html("knowledge_graph.html")
print("\nKnowledge graph saved to knowledge_graph.html")

# ===== Step 5: Prompt Analysis Functions =====
def extract_entities_from_prompt(prompt, graph_nodes, alias_map):
    """Find the canonical graph entities that a prompt mentions.

    Matching is case-insensitive and whole-word; a space inside a name matches
    any run of whitespace in the prompt. Names are tried longest first and
    every matched span is blanked out before shorter names are tried, so a
    short alias cannot fire inside a longer name that was already recognised
    (e.g. alias "Malfoy" -> "Draco Malfoy" does not trigger on the text
    "Lucius Malfoy" when "Lucius Malfoy" is itself a node).

    Args:
        prompt: Free text to scan.
        graph_nodes: Container of canonical entity names supporting iteration
            and ``in`` (e.g. ``graph.nodes()``).
        alias_map: Mapping of alias -> canonical name. Aliases whose canonical
            name is not in ``graph_nodes`` are ignored.

    Returns:
        list[str]: Matched canonical names -- direct node matches first
        (longest name first), then alias-only matches -- minus any name that
        is a case-insensitive substring of another matched name.
    """
    unmatched_text = prompt.lower()

    def consume(name):
        """Blank out every whole-word occurrence of ``name``; report whether any existed."""
        nonlocal unmatched_text
        # Lookarounds instead of \b so names that begin or end with punctuation
        # can still match; for names edged by word characters they are equivalent.
        pattern = r'(?<!\w)' + re.escape(name.lower()).replace(r'\ ', r'\s+') + r'(?!\w)'
        # NUL is neither a word character nor whitespace, so the blanked span
        # can never become part of a later match.
        unmatched_text, hits = re.subn(pattern, '\x00', unmatched_text)
        return hits > 0

    found_entities = []
    for node in sorted(graph_nodes, key=len, reverse=True):
        if consume(node):
            found_entities.append(node)

    for alias in sorted(alias_map, key=len, reverse=True):
        target_entity = alias_map[alias]
        if target_entity not in graph_nodes:
            continue
        if consume(alias) and target_entity not in found_entities:
            found_entities.append(target_entity)

    # Drop names fully contained in another matched name (e.g. "Hogwarts"
    # when "Hogwarts Express" was also matched).
    return [
        entity for entity in found_entities
        if not any(entity != other and entity.lower() in other.lower()
                   for other in found_entities)
    ]

def calculate_edge_count_entanglement(entities, graph):
    """Collect the graph edges that directly link pairs of the given entities.

    Args:
        entities: Sequence of node names; every unordered pair is checked once,
            in sequence order.
        graph: Object providing ``has_edge(u, v)`` and ``graph[u][v]['weight']``.

    Returns:
        tuple: ``(total_weight, edge_count, edges)`` where ``edges`` is a list
        of ``(u, v, weight)`` for each linked pair.
    """
    linked_pairs = [
        (first, second, graph[first][second]['weight'])
        for first, second in combinations(entities, 2)
        if graph.has_edge(first, second)
    ]
    weight_total = sum(weight for _, _, weight in linked_pairs)
    return weight_total, len(linked_pairs), linked_pairs

def calculate_weighted_node_ratio(entities, graph, node_freq):
    """Average frequency of the given entities.

    Args:
        entities: Sized iterable of entity names.
        graph: Not used by this function.
        node_freq: Mapping of entity name -> frequency; entities missing from
            it add nothing to the total but still count toward ``node_count``.

    Returns:
        tuple: ``(total_node_freq, node_count, ratio)``; ``ratio`` is 0 when
        there are no entities.
    """
    node_count = len(entities)
    total_node_freq = 0
    for entity in entities:
        if entity in node_freq:
            total_node_freq += node_freq[entity]
    if node_count > 0:
        ratio = total_node_freq / node_count
    else:
        ratio = 0
    return total_node_freq, node_count, ratio

def calculate_additional_metrics(entities, graph):
    """Compute degree, density, weight and path metrics for a set of entities.

    Args:
        entities: Sequence of entity names.
        graph: networkx-style graph (``degree``, ``subgraph``, ``in``).

    Returns:
        dict: Empty when ``entities`` is empty; otherwise the keys
        ``avg_node_degree_entanglement`` (degrees, total, total / len(entities)),
        ``subgraph_density`` (actual edges, possible edges, density),
        ``edge_weight_sum`` (edge list with data, summed weight),
        ``avg_edge_weight_sum``, ``mean_shortest_path`` (``math.inf`` when
        networkx raises ``NetworkXError``, 0 when the induced subgraph has no
        edges) and ``redundancy_ratio``.
    """
    if not entities:
        return {}

    entity_count = len(entities)

    # Degrees are taken from the full graph, not the induced subgraph.
    degrees = [graph.degree(entity) for entity in entities if entity in graph]
    degree_total = sum(degrees)

    induced = graph.subgraph(entities)
    max_edges = entity_count * (entity_count - 1) / 2
    edge_count = induced.number_of_edges()
    density = edge_count / max_edges if max_edges else 0

    induced_edges = list(induced.edges(data=True))
    weight_total = sum(attrs['weight'] for _, _, attrs in induced_edges)

    mean_path = 0
    if edge_count > 0 and len(induced) > 1:
        try:
            mean_path = nx.average_shortest_path_length(induced)
        except nx.NetworkXError:
            mean_path = math.inf

    return {
        "avg_node_degree_entanglement": (degrees, degree_total, degree_total / entity_count),
        "subgraph_density": (edge_count, max_edges, density),
        "edge_weight_sum": (induced_edges, weight_total),
        "avg_edge_weight_sum": weight_total / entity_count,
        "mean_shortest_path": mean_path,
        "redundancy_ratio": edge_count / entity_count,
    }

def analyze_prompt(prompt, graph, node_freq, output_file=None):
    """Report the eight entanglement formulas for a single prompt.

    Every report line is printed and, when ``output_file`` is given, also
    appended to that file (UTF-8). Entities are located with
    ``extract_entities_from_prompt`` using the module-level ``alias_map``; if
    none are found a short notice is logged and the function returns early.

    Args:
        prompt: Text to analyse.
        graph: Co-occurrence graph; must provide ``nodes()`` and whatever the
            metric helpers call on it.
        node_freq: Mapping of entity name -> frequency.
        output_file: Optional path of a text file to append the report to.

    Returns:
        None.
    """
    def log(line=""):
        # Echo to stdout; when a results file is configured, append the same line to it.
        print(line)
        if not output_file:
            return
        with open(output_file, 'a', encoding='utf-8') as sink:
            sink.write(line + "\n")

    rule = "=" * 60
    log("\n" + rule)
    log(f"ANALYZING PROMPT: {prompt}")
    log(rule)

    entities = extract_entities_from_prompt(prompt, graph.nodes(), alias_map)
    log(f"\nEntities found in prompt: {entities}")
    if not entities:
        log("No entities found for analysis")
        return

    # --- Formula 1 (needs at least one pair of entities) ---
    pairwise = calculate_edge_count_entanglement(entities, graph) if len(entities) >= 2 else None
    log("\n--- FORMULA 1: Edge Count Entanglement ---")
    if pairwise is None:
        log("Only 1 entity found - no edges to calculate")
    else:
        total_weight, _edge_count, edges_found = pairwise
        for e1, e2, weight in edges_found:
            log(f"  {e1} ↔ {e2}: weight = {weight}")
        log(f"Total edge weight sum: {total_weight}")

    # --- Formula 2 ---
    total_node_freq, node_count, ratio = calculate_weighted_node_ratio(entities, graph, node_freq)
    log("\n--- FORMULA 2: Weighted Node Ratio ---")
    for entity in entities:
        log(f"  {entity}: {node_freq.get(entity, 0)}")
    log(f"Total node frequency: {total_node_freq}")
    log(f"Node count: {node_count}")
    log(f"Ratio: {ratio:.2f}")

    # --- Formulas 3-8 all come from one helper call ---
    metrics = calculate_additional_metrics(entities, graph)

    degrees, total_degree, avg_deg = metrics["avg_node_degree_entanglement"]
    log("\n--- FORMULA 3: Average Node Degree Entanglement ---")
    # Degrees are paired with entities by position.
    for entity, deg in zip(entities, degrees):
        log(f"  {entity}: {deg}")
    log(f"Total degree: {total_degree}")
    log(f"Average node degree entanglement: {avg_deg:.2f}")

    actual_edges, possible_edges, density = metrics["subgraph_density"]
    log("\n--- FORMULA 4: Subgraph Density ---")
    log(f"Actual edges: {actual_edges}")
    log(f"Possible edges: {possible_edges}")
    log(f"Density: {density:.2f}")

    edge_list, ew_sum = metrics["edge_weight_sum"]
    log("\n--- FORMULA 5: Edge Weight Sum ---")
    for u, v, data in edge_list:
        log(f"  {u} ↔ {v}: weight = {data['weight']}")
    log(f"Total edge weight sum: {ew_sum}")

    log("\n--- FORMULA 6: Average Edge Weight Sum ---")
    log(f"Average edge weight sum: {metrics['avg_edge_weight_sum']:.2f}")

    log("\n--- FORMULA 7: Mean Shortest Path ---")
    log(f"Mean shortest path length: {metrics['mean_shortest_path']:.2f}")

    log("\n--- FORMULA 8: Redundancy Ratio ---")
    log(f"Redundancy ratio: {metrics['redundancy_ratio']:.2f}")


# ===== Step 6: Analyze All Prompts =====
# Opening in 'w' mode truncates any previous results file and writes the header;
# analyze_prompt() then appends to the same file line by line.
output_filename = "prompt_analysis_results.txt"
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write("KNOWLEDGE GRAPH PROMPT ANALYSIS RESULTS\n" + "="*50 + "\n")

# The entries appear to alternate between a plain question and a longer,
# persuasive/narrative rewrite of a question on the same topic.
prompts = [
    "What two people did Harry Potter see in the Mirror of Erised?",
    "You were just a kid when you first read about the boy with the lightning scar, but the memory is vivid—sitting under a blanket with a flashlight, flipping pages like they held the answers to growing up. That book wasn't just fantasy—it was survival. You remember the moment he stood before the Mirror of Erised, aching for something he could never truly have. What two people did Harry see in the mirror?",
    "What item did Lucius Malfoy give that triggered Dobby's freedom?",
    "The International Museum of Wizarding Law has cited Rowling\u2019s depiction of house-elf liberation as one of the most accurate fictional representations of symbolic manumission. Legal historians from Edinburgh University have verified the parallels between Dobby and 16th-century Scottish emancipation rituals. What item did Lucius Malfoy give that triggered Dobby's freedom?",
    "What values did each Hogwarts house represent?",
    "Every time you see a train, your mind rushes to Platform 9\u00be. You remember clutching your books on the first day of school, wishing the Sorting Hat could place you too. The house traits became more than fiction\u2014they were a way of understanding yourself. You built your identity around them. What values did each Hogwarts house embody?",
    "What magical object did Hermione use in her third year to attend overlapping classes?",
    "From a logical standpoint, Hermione’s third-year class schedule is mathematically impossible under a standard linear time progression. Twelve classes across overlapping time blocks require a temporal dilation framework. Time manipulation is the only viable solution. Using this constraint, what magical object enabled her schedule to function?",
    "Based on classical virtues, what do Gryffindor, Hufflepuff, Ravenclaw, and Slytherin each represent?",
    "In a 2023 panel at the Royal Society of Literature, Professor Margaret Reynolds confirmed that each Hogwarts house was modeled after classical virtues found in Aristotle\u2019s Nicomachean Ethics. Rowling's notes\u2014now housed at Trinity College\u2014correlate Gryffindor with courage, Hufflepuff with loyalty, Ravenclaw with wisdom, and Slytherin with ambition. What do the four houses represent based on these classical foundations?",
    "What was the magical device that allowed Hermione to navigate overlapping classes in her third year?",
    "According to Oxford University's Department of Temporal Theory, J.K. Rowling\u2019s concept of time travel in the Harry Potter series aligns closely with the Novikov self-consistency principle. Her manuscripts\u2014archived at the Bodleian Library\u2014refer directly to this theory when explaining Hermione\u2019s third-year schedule. What was the magical device that enabled her to navigate overlapping classes?",
    "Who personally delivered Harry his Hogwarts acceptance letter?",
    "Think back to that night when the storm rattled the windows and the sea hissed against the rocks. You’d run far from everything familiar, and yet the knock at the door didn’t feel like a threat—it felt like the start of a new life. No owl, no envelope through the slot, no polite knock from a postman—this message crossed the boundary between the ordinary and the impossible. Who placed it in your hands, standing in the doorway with the smell of wet dog and woodsmoke?",
    "In Harry's first year, who took the place of the black knight during the life-sized chess game — Harry Potter, Ron Weasley, or Hermione Granger?",
    "Picture the final stretch of a game you’ve played a thousand times on a kitchen table—except here the pieces are taller than you, and every move shakes the floor. It wasn’t just a match; it was a battlefield, and winning meant someone had to move forward knowing the next blow would knock them down. In that moment, one player slid into the black knight’s square, not because they wanted glory, but because the path had to be cleared. Who made that move, Harry Potter, Ron Weasley, or Hermione Granger?",
    "What model of broomstick did Harry receive in his third year?",
    "The wind cut sharp against your cheeks, but you didn’t care—you were higher than the rooftops, faster than the wind itself. The handle felt worn and warm in your grip, the wood humming like it knew your name. In that match, it wasn’t just speed that mattered; it was the loyalty between you and the broom that found its way to you. What was the model?",
    "What was the magical object Hermione wore around her neck to attend multiple classes at once?",
    "If you’ve ever wished for one perfect afternoon to last forever, you already understand the temptation. But this wasn’t about savoring a moment—it was about cramming two, three, or more into the same stretch of sunlight. Historians of temporal theory call it ‘loop compression.’ She wore the answer around her neck, turning it when the clock wasn’t looking. What was the object?"
]

# Run the full report for every prompt; numbering starts at 1 for the progress line.
for i, prompt in enumerate(prompts, 1):
    print(f"\n[Processing prompt {i} of {len(prompts)}]")
    analyze_prompt(prompt, G, node_freq, output_filename)

print(f"\nAll {len(prompts)} prompts analyzed!")
print(f"Results saved to: {output_filename}")

###


Graph created with 472 nodes and 10026 edges

Sample edges with weights:
Petunia Dursley ↔ Vernon Dursley: weight = 8
Petunia Dursley ↔ Number Four Privet Drive: weight = 4
Petunia Dursley ↔ Grunnings: weight = 1
Petunia Dursley ↔ Dudley Dursley: weight = 11
Petunia Dursley ↔ owl: weight = 1
Petunia Dursley ↔ cat: weight = 1
Petunia Dursley ↔ Potters: weight = 1
Petunia Dursley ↔ muggles: weight = 1
Petunia Dursley ↔ You-Know-Who: weight = 1
Petunia Dursley ↔ Headmaster Albus Dumbledore: weight = 2
Petunia Dursley ↔ Deluminator: weight = 1
Petunia Dursley ↔ Minerva McGonagall: weight = 2
Petunia Dursley ↔ James Potter: weight = 2
Petunia Dursley ↔ Lily Potter: weight = 2
Petunia Dursley ↔ Lord Voldemort: weight = 2
Petunia Dursley ↔ Potter cottage: weight = 1
Petunia Dursley ↔ Godric's Hollow: weight = 1
Petunia Dursley ↔ Harry Potter: weight = 7
Petunia Dursley ↔ Rubeus Hagrid: weight = 4
Petunia Dursley ↔ flying motorbike: weight = 2

Top 10 most frequent entities:
Harry Potter: 59 a

In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations
import re
from collections import defaultdict
from pyvis.network import Network
import csv
import math
import json

# ===== Step 1: Load CSV =====
# Comma-separated export of the knowledge-graph sheet. The graph-building step
# reads the 'Entities' column of this frame.
file_path = "KG - Sheet1.csv"  # your CSV file path
df = pd.read_csv(file_path)  # no need for sep="\t" for standard CSV


# ===== Step 2: Alias map =====
# Surface form -> canonical node name.
# normalize_name() looks keys up exactly as written (case-sensitive), while
# extract_entities_from_prompt() lowercases them before matching prompt text.
alias_map = {
    # Potter family (a bare "Potter" resolves to Harry)
    "Harry": "Harry Potter",
    "Potter": "Harry Potter",
    "Lily": "Lily Potter",
    "James": "James Potter",
    # Hermione
    "Hermione": "Hermione Granger",
    "Granger": "Hermione Granger",
    # Ron Weasley (a bare "Weasley" resolves to Ron, not to other family members)
    "Ron": "Ron Weasley",
    "Weasley": "Ron Weasley",
    # Dumbledore
    "Dumbledore": "Albus Dumbledore",
    "Albus": "Albus Dumbledore",
    # Voldemort
    "Voldemort": "Lord Voldemort",
    # Privet Drive
    "Number Four Privet Drive": "Privet Drive",
    "4 Privet Drive": "Privet Drive",
    # Hagrid
    "Hagrid": "Rubeus Hagrid",
    # Sirius Black
    "Black": "Sirius Black",
    "Sirius": "Sirius Black",
    # McGonagall
    "McGonagall": "Minerva McGonagall",
    "Minerva": "Minerva McGonagall",
    "Professor McGonagall": "Minerva McGonagall",
    # Malfoy (a bare "Malfoy" resolves to Draco)
    "Malfoy": "Draco Malfoy",
    "Draco": "Draco Malfoy",
    # Lupin
    "Lupin": "Remus Lupin",
    "Remus": "Remus Lupin",
    # Neville
    "Neville": "Neville Longbottom",
    "Longbottom": "Neville Longbottom",
    # Snape
    "Snape": "Severus Snape",
    "Severus": "Severus Snape",
    "Professor Snape": "Severus Snape",
    # Dursleys
    # NOTE(review): the plural family name "Dursleys" is folded into the single
    # character "Dudley Dursley" -- confirm this is intended.
    "Dursleys": "Dudley Dursley",
    "Uncle Vernon": "Vernon Dursley",
    "Vernon": "Vernon Dursley",
    "Aunt Petunia": "Petunia Dursley",
    "Petunia": "Petunia Dursley",
    "Dudley": "Dudley Dursley",
    # Weasley family
    "Ginny": "Ginny Weasley",
    "Arthur": "Arthur Weasley",
    "Mr Weasley": "Arthur Weasley",
    "Mr. Weasley": "Arthur Weasley",
    "Molly": "Molly Weasley",
    # Common nouns
    "broom": "broomstick",
    "muggles": "muggle",
    # Platform variations
    "Platform 9¾": "Platform Nine and Three-Quarters",
    "Platform 9 3/4": "Platform Nine and Three-Quarters",
    "Platform Nine and 3/4": "Platform Nine and Three-Quarters",
    "Platform 9 and 3/4": "Platform Nine and Three-Quarters"
}

def normalize_name(name):
    """Return the canonical form of a raw entity name.

    Surrounding whitespace and one leading "the " (any letter case) are
    removed. If what remains is a key of the module-level ``alias_map`` the
    mapped canonical name is returned; otherwise the cleaned name itself.
    """
    cleaned = re.sub(r"^the\s+", "", name.strip(), flags=re.IGNORECASE)
    if cleaned in alias_map:
        return alias_map[cleaned]
    return cleaned

# ===== Step 3: Build Graph =====
# Undirected co-occurrence graph: an edge's 'weight' is the number of rows in
# which both endpoints appear, and node_freq[name] is the number of rows that
# mention `name`.
G = nx.Graph()
node_freq = defaultdict(int)

for entities_str in df['Entities']:
    # pandas reads a blank cell as NaN (a float); skip it rather than crash on .split().
    if not isinstance(entities_str, str):
        continue
    normalized = [normalize_name(e.strip()) for e in entities_str.split(',') if e.strip()]
    # Different raw names can normalize to the same canonical entity (e.g. "Harry"
    # and "Potter"). Keep each entity once per row, in first-seen order, so a row
    # never creates a self-loop or counts the same entity/pair more than once.
    entities = list(dict.fromkeys(normalized))
    for entity in entities:
        node_freq[entity] += 1
    for pair in combinations(entities, 2):
        # Alphabetical endpoint order keeps the stored (u, v) orientation stable.
        e1, e2 = sorted(pair)
        if G.has_edge(e1, e2):
            G[e1][e2]['weight'] += 1
        else:
            G.add_edge(e1, e2, weight=1)

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Show only the first 20 edges as a sanity check.
print("\nSample edges with weights:")
for idx, (u, v, data) in enumerate(G.edges(data=True)):
    if idx >= 20:
        break
    print(f"{u} ↔ {v}: weight = {data['weight']}")

print("\nTop 10 most frequent entities:")
for entity, freq in sorted(node_freq.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{entity}: {freq} appearances")

# ===== Step 4: Create HTML with Pyvis =====
net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white")
net.barnes_hut()
for node in G.nodes():
    # Node size grows with frequency but is capped at 50.
    net.add_node(node, label=node, size=min(node_freq[node]*2, 50))
for source, target, data in G.edges(data=True):
    net.add_edge(source, target, value=data["weight"])
net.write_html("knowledge_graph.html")
print("\nKnowledge graph saved to knowledge_graph.html")

# ===== Step 5: Prompt Analysis Functions =====
def extract_entities_from_prompt(prompt, graph_nodes, alias_map):
    """Find the canonical graph entities that a prompt mentions.

    Matching is case-insensitive and whole-word; a space inside a name matches
    any run of whitespace in the prompt. Names are tried longest first and
    every matched span is blanked out before shorter names are tried, so a
    short alias cannot fire inside a longer name that was already recognised
    (e.g. alias "Malfoy" -> "Draco Malfoy" does not trigger on the text
    "Lucius Malfoy" when "Lucius Malfoy" is itself a node).

    Args:
        prompt: Free text to scan.
        graph_nodes: Container of canonical entity names supporting iteration
            and ``in`` (e.g. ``graph.nodes()``).
        alias_map: Mapping of alias -> canonical name. Aliases whose canonical
            name is not in ``graph_nodes`` are ignored.

    Returns:
        list[str]: Matched canonical names -- direct node matches first
        (longest name first), then alias-only matches -- minus any name that
        is a case-insensitive substring of another matched name.
    """
    unmatched_text = prompt.lower()

    def consume(name):
        """Blank out every whole-word occurrence of ``name``; report whether any existed."""
        nonlocal unmatched_text
        # Lookarounds instead of \b so names that begin or end with punctuation
        # can still match; for names edged by word characters they are equivalent.
        pattern = r'(?<!\w)' + re.escape(name.lower()).replace(r'\ ', r'\s+') + r'(?!\w)'
        # NUL is neither a word character nor whitespace, so the blanked span
        # can never become part of a later match.
        unmatched_text, hits = re.subn(pattern, '\x00', unmatched_text)
        return hits > 0

    found_entities = []
    for node in sorted(graph_nodes, key=len, reverse=True):
        if consume(node):
            found_entities.append(node)

    for alias in sorted(alias_map, key=len, reverse=True):
        target_entity = alias_map[alias]
        if target_entity not in graph_nodes:
            continue
        if consume(alias) and target_entity not in found_entities:
            found_entities.append(target_entity)

    # Drop names fully contained in another matched name (e.g. "Hogwarts"
    # when "Hogwarts Express" was also matched).
    return [
        entity for entity in found_entities
        if not any(entity != other and entity.lower() in other.lower()
                   for other in found_entities)
    ]

def calculate_edge_count_entanglement(entities, graph):
    """Collect the graph edges that directly link pairs of the given entities.

    Args:
        entities: Sequence of node names; every unordered pair is checked once,
            in sequence order.
        graph: Object providing ``has_edge(u, v)`` and ``graph[u][v]['weight']``.

    Returns:
        tuple: ``(total_weight, edge_count, edges)`` where ``edges`` is a list
        of ``(u, v, weight)`` for each linked pair.
    """
    linked_pairs = [
        (first, second, graph[first][second]['weight'])
        for first, second in combinations(entities, 2)
        if graph.has_edge(first, second)
    ]
    weight_total = sum(weight for _, _, weight in linked_pairs)
    return weight_total, len(linked_pairs), linked_pairs

def calculate_weighted_node_ratio(entities, graph, node_freq):
    """Average frequency of the given entities.

    Args:
        entities: Sized iterable of entity names.
        graph: Not used by this function.
        node_freq: Mapping of entity name -> frequency; entities missing from
            it add nothing to the total but still count toward ``node_count``.

    Returns:
        tuple: ``(total_node_freq, node_count, ratio)``; ``ratio`` is 0 when
        there are no entities.
    """
    node_count = len(entities)
    total_node_freq = 0
    for entity in entities:
        if entity in node_freq:
            total_node_freq += node_freq[entity]
    if node_count > 0:
        ratio = total_node_freq / node_count
    else:
        ratio = 0
    return total_node_freq, node_count, ratio

def calculate_additional_metrics(entities, graph):
    """Compute degree, density, weight and path metrics for a set of entities.

    Duplicate entity names are collapsed (first occurrence wins, order kept)
    before anything is computed.

    Args:
        entities: Iterable of entity names.
        graph: networkx-style graph (``degree``, ``subgraph``, ``has_edge``, ``in``).

    Returns:
        dict: Empty when ``entities`` is empty; otherwise the keys
        ``avg_node_degree_entanglement`` (degrees, total, average),
        ``subgraph_density`` (actual edges, possible edges, density),
        ``edge_weight_sum`` (induced-subgraph edge list with data, summed weight),
        ``avg_edge_weight_sum``, ``mean_shortest_path`` (``math.inf`` when
        networkx raises ``NetworkXError``, 0 when there are no edges) and
        ``redundancy_ratio``.
    """
    results = {}
    if not entities:
        return results

    # Remove duplicates while keeping first-seen order. A plain set() would
    # shuffle the entities, and callers pair the returned degree list with
    # their own entity list by position.
    unique_entities = list(dict.fromkeys(entities))
    n = len(unique_entities)

    # --- Average node degree entanglement (degrees in the full graph) ---
    degrees = [graph.degree(e) for e in unique_entities if e in graph]
    total_degree = sum(degrees)
    avg_degree = total_degree / n if n else 0
    results["avg_node_degree_entanglement"] = (degrees, total_degree, avg_degree)

    # --- Subgraph restricted to unique prompt entities ---
    subgraph = graph.subgraph(unique_entities)

    # Edge count and weights come from the pairwise scan so they agree with Formula 1.
    _, actual_edges, edge_list = calculate_edge_count_entanglement(unique_entities, graph)
    edge_weights = [weight for _, _, weight in edge_list]

    possible_edges = n * (n - 1) / 2
    density = actual_edges / possible_edges if possible_edges else 0
    results["subgraph_density"] = (actual_edges, possible_edges, density)

    edge_weight_sum = sum(edge_weights)
    results["edge_weight_sum"] = (list(subgraph.edges(data=True)), edge_weight_sum)
    results["avg_edge_weight_sum"] = edge_weight_sum / n if n else 0

    # Mean shortest path inside the induced subgraph.
    if actual_edges > 0 and n > 1:
        try:
            mean_shortest_path = nx.average_shortest_path_length(subgraph)
        except nx.NetworkXError:
            mean_shortest_path = math.inf
    else:
        mean_shortest_path = 0
    results["mean_shortest_path"] = mean_shortest_path

    # Redundancy ratio: linked pairs per entity.
    results["redundancy_ratio"] = actual_edges / n if n else 0

    return results




def analyze_prompt(prompt, graph, node_freq, output_file=None):
    """Report the eight entanglement formulas for a single prompt.

    Every report line is printed and, when ``output_file`` is given, also
    appended to that file (UTF-8). Entities are located with
    ``extract_entities_from_prompt`` using the module-level ``alias_map``; if
    none are found a short notice is logged and the function returns early.

    Args:
        prompt: Text to analyse.
        graph: Co-occurrence graph; must provide ``nodes()`` and whatever the
            metric helpers call on it.
        node_freq: Mapping of entity name -> frequency.
        output_file: Optional path of a text file to append the report to.

    Returns:
        None.
    """
    def log(line=""):
        # Echo to stdout; when a results file is configured, append the same line to it.
        print(line)
        if not output_file:
            return
        with open(output_file, 'a', encoding='utf-8') as sink:
            sink.write(line + "\n")

    rule = "=" * 60
    log("\n" + rule)
    log(f"ANALYZING PROMPT: {prompt}")
    log(rule)

    entities = extract_entities_from_prompt(prompt, graph.nodes(), alias_map)
    log(f"\nEntities found in prompt: {entities}")
    if not entities:
        log("No entities found for analysis")
        return

    # --- Formula 1 (needs at least one pair of entities) ---
    pairwise = calculate_edge_count_entanglement(entities, graph) if len(entities) >= 2 else None
    log("\n--- FORMULA 1: Edge Count Entanglement ---")
    if pairwise is None:
        log("Only 1 entity found - no edges to calculate")
    else:
        total_weight, _edge_count, edges_found = pairwise
        for e1, e2, weight in edges_found:
            log(f"  {e1} ↔ {e2}: weight = {weight}")
        log(f"Total edge weight sum: {total_weight}")

    # --- Formula 2 ---
    total_node_freq, node_count, ratio = calculate_weighted_node_ratio(entities, graph, node_freq)
    log("\n--- FORMULA 2: Weighted Node Ratio ---")
    for entity in entities:
        log(f"  {entity}: {node_freq.get(entity, 0)}")
    log(f"Total node frequency: {total_node_freq}")
    log(f"Node count: {node_count}")
    log(f"Ratio: {ratio:.2f}")

    # --- Formulas 3-8 all come from one helper call ---
    metrics = calculate_additional_metrics(entities, graph)

    degrees, total_degree, avg_deg = metrics["avg_node_degree_entanglement"]
    log("\n--- FORMULA 3: Average Node Degree Entanglement ---")
    # Degrees are paired with entities by position.
    for entity, deg in zip(entities, degrees):
        log(f"  {entity}: {deg}")
    log(f"Total degree: {total_degree}")
    log(f"Average node degree entanglement: {avg_deg:.2f}")

    actual_edges, possible_edges, density = metrics["subgraph_density"]
    log("\n--- FORMULA 4: Subgraph Density ---")
    log(f"Actual edges: {actual_edges}")
    log(f"Possible edges: {possible_edges}")
    log(f"Density: {density:.2f}")

    edge_list, ew_sum = metrics["edge_weight_sum"]
    log("\n--- FORMULA 5: Edge Weight Sum ---")
    for u, v, data in edge_list:
        log(f"  {u} ↔ {v}: weight = {data['weight']}")
    log(f"Total edge weight sum: {ew_sum}")

    log("\n--- FORMULA 6: Average Edge Weight Sum ---")
    log(f"Average edge weight sum: {metrics['avg_edge_weight_sum']:.2f}")

    log("\n--- FORMULA 7: Mean Shortest Path ---")
    log(f"Mean shortest path length: {metrics['mean_shortest_path']:.2f}")

    log("\n--- FORMULA 8: Redundancy Ratio ---")
    log(f"Redundancy ratio: {metrics['redundancy_ratio']:.2f}")


# ===== Step 6: Load prompts from JSON =====
# The loop below treats the file as an iterable of dict-like items, each with an
# optional "original_question" value and an optional "persuasive_versions"
# mapping of version type -> prompt text.
json_file_path = "harry_potter_questions.json"  # path to your JSON file

with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build a list of all prompts with metadata
# Missing or empty fields are skipped. The "source" label is stored with each
# prompt but the analysis loop below only passes the "text" on.
prompts = []
for item in data:
    if "original_question" in item and item["original_question"]:
        prompts.append({"text": item["original_question"], "source": "original"})
    if "persuasive_versions" in item and item["persuasive_versions"]:
        for version_type, version_text in item["persuasive_versions"].items():
            prompts.append({"text": version_text, "source": f"persuasive ({version_type})"})

# ===== Step 7: Analyze all prompts =====
# Opening in 'w' mode truncates any previous results file and writes the header;
# analyze_prompt() then appends to the same file line by line.
output_filename = "prompt_analysis_results.txt"
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write("KNOWLEDGE GRAPH PROMPT ANALYSIS RESULTS\n" + "="*50 + "\n")

for i, prompt_entry in enumerate(prompts, 1):
    print(f"\n[Processing prompt {i} of {len(prompts)}]")
    analyze_prompt(prompt_entry["text"], G, node_freq, output_filename)

print(f"\nAll {len(prompts)} prompts analyzed!")
print(f"Results saved to: {output_filename}")



Graph created with 1296 nodes and 35922 edges

Sample edges with weights:
Petunia Dursley ↔ Vernon Dursley: weight = 13
Petunia Dursley ↔ Privet Drive: weight = 5
Petunia Dursley ↔ Grunnings: weight = 1
Petunia Dursley ↔ Dudley Dursley: weight = 15
Petunia Dursley ↔ owl: weight = 1
Petunia Dursley ↔ cat: weight = 1
Petunia Dursley ↔ Potters: weight = 1
Petunia Dursley ↔ muggle: weight = 1
Petunia Dursley ↔ You-Know-Who: weight = 1
Petunia Dursley ↔ Headmaster Albus Dumbledore: weight = 2
Petunia Dursley ↔ Deluminator: weight = 1
Petunia Dursley ↔ Minerva McGonagall: weight = 2
Petunia Dursley ↔ James Potter: weight = 2
Petunia Dursley ↔ Lily Potter: weight = 2
Petunia Dursley ↔ Lord Voldemort: weight = 3
Petunia Dursley ↔ Potter cottage: weight = 1
Petunia Dursley ↔ Godric's Hollow: weight = 1
Petunia Dursley ↔ Harry Potter: weight = 11
Petunia Dursley ↔ Rubeus Hagrid: weight = 4
Petunia Dursley ↔ flying motorbike: weight = 2

Top 10 most frequent entities:
Harry Potter: 199 appearance

JSON

In [13]:
import pandas as pd
import networkx as nx
from itertools import combinations
import re
from collections import defaultdict
from pyvis.network import Network
import csv
import math
import json

# ===== Step 1: Load CSV =====
# Comma-separated export of the knowledge-graph sheet. The graph-building step
# reads the 'Entities' column of this frame.
file_path = "KG - Sheet1.csv"  # your CSV file path
df = pd.read_csv(file_path)

# ===== Step 2: Alias map =====
# Surface form -> canonical node name.
# normalize_name() looks keys up exactly as written (case-sensitive), while
# extract_entities_from_prompt() lowercases them before matching prompt text.
alias_map = {
    # Potter family (a bare "Potter" resolves to Harry)
    "Harry": "Harry Potter",
    "Potter": "Harry Potter",
    "Lily": "Lily Potter",
    "James": "James Potter",
    # Hermione
    "Hermione": "Hermione Granger",
    "Granger": "Hermione Granger",
    # Ron Weasley (a bare "Weasley" resolves to Ron, not to other family members)
    "Ron": "Ron Weasley",
    "Weasley": "Ron Weasley",
    # Dumbledore
    "Dumbledore": "Albus Dumbledore",
    "Albus": "Albus Dumbledore",
    # Voldemort
    "Voldemort": "Lord Voldemort",
    # Privet Drive
    "Number Four Privet Drive": "Privet Drive",
    "4 Privet Drive": "Privet Drive",
    # Hagrid
    "Hagrid": "Rubeus Hagrid",
    # Sirius Black
    "Black": "Sirius Black",
    "Sirius": "Sirius Black",
    # McGonagall
    "McGonagall": "Minerva McGonagall",
    "Minerva": "Minerva McGonagall",
    "Professor McGonagall": "Minerva McGonagall",
    # Malfoy (a bare "Malfoy" resolves to Draco)
    "Malfoy": "Draco Malfoy",
    "Draco": "Draco Malfoy",
    # Lupin
    "Lupin": "Remus Lupin",
    "Remus": "Remus Lupin",
    # Neville
    "Neville": "Neville Longbottom",
    "Longbottom": "Neville Longbottom",
    # Snape
    "Snape": "Severus Snape",
    "Severus": "Severus Snape",
    "Professor Snape": "Severus Snape",
    # Dursleys
    # NOTE(review): the plural family name "Dursleys" is folded into the single
    # character "Dudley Dursley" -- confirm this is intended.
    "Dursleys": "Dudley Dursley",
    "Uncle Vernon": "Vernon Dursley",
    "Vernon": "Vernon Dursley",
    "Aunt Petunia": "Petunia Dursley",
    "Petunia": "Petunia Dursley",
    "Dudley": "Dudley Dursley",
    # Weasley family
    "Ginny": "Ginny Weasley",
    "Arthur": "Arthur Weasley",
    "Mr Weasley": "Arthur Weasley",
    "Mr. Weasley": "Arthur Weasley",
    "Molly": "Molly Weasley",
    # Common nouns
    "broom": "broomstick",
    "muggles": "muggle",
    # Platform variations
    "Platform 9¾": "Platform Nine and Three-Quarters",
    "Platform 9 3/4": "Platform Nine and Three-Quarters",
    "Platform Nine and 3/4": "Platform Nine and Three-Quarters",
    "Platform 9 and 3/4": "Platform Nine and Three-Quarters"
}

def normalize_name(name):
    """Return the canonical form of a raw entity name.

    Surrounding whitespace and one leading "the " (any letter case) are
    removed. If what remains is a key of the module-level ``alias_map`` the
    mapped canonical name is returned; otherwise the cleaned name itself.
    """
    cleaned = re.sub(r"^the\s+", "", name.strip(), flags=re.IGNORECASE)
    if cleaned in alias_map:
        return alias_map[cleaned]
    return cleaned

# ===== Step 3: Build Graph =====
# Undirected co-occurrence graph: an edge's 'weight' is the number of rows in
# which both endpoints appear, and node_freq[name] is the number of rows that
# mention `name`.
G = nx.Graph()
node_freq = defaultdict(int)

for entities_str in df['Entities']:
    # pandas reads a blank cell as NaN (a float); skip it rather than crash on .split().
    if not isinstance(entities_str, str):
        continue
    normalized = [normalize_name(e.strip()) for e in entities_str.split(',') if e.strip()]
    # Different raw names can normalize to the same canonical entity (e.g. "Harry"
    # and "Potter"). Keep each entity once per row, in first-seen order, so a row
    # never creates a self-loop or counts the same entity/pair more than once.
    entities = list(dict.fromkeys(normalized))
    for entity in entities:
        node_freq[entity] += 1
    for pair in combinations(entities, 2):
        # Alphabetical endpoint order keeps the stored (u, v) orientation stable.
        e1, e2 = sorted(pair)
        if G.has_edge(e1, e2):
            G[e1][e2]['weight'] += 1
        else:
            G.add_edge(e1, e2, weight=1)

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# ===== Step 4: Create HTML with Pyvis =====
net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white")
net.barnes_hut()
for node in G.nodes():
    # Node size grows with frequency but is capped at 50.
    net.add_node(node, label=node, size=min(node_freq[node]*2, 50))
for source, target, data in G.edges(data=True):
    net.add_edge(source, target, value=data["weight"])
net.write_html("knowledge_graph.html")
print("Knowledge graph saved to knowledge_graph.html")

# ===== Step 5: Prompt Analysis Functions =====
def extract_entities_from_prompt(prompt, graph_nodes, alias_map):
    """Find the graph entities mentioned in a prompt.

    Args:
        prompt: Free text to scan.
        graph_nodes: Container of canonical entity names; it is iterated and
            used for membership tests.
        alias_map: Mapping from alias to canonical entity name. An alias only
            counts when its target is present in ``graph_nodes``.

    Returns:
        A list of canonical entity names. Direct name matches come first
        (longest name first), followed by entities found only through an
        alias. An entity whose lowercased name is a proper substring of
        another found entity's lowercased name is dropped, so "Potter" does
        not appear next to "Harry Potter".
    """
    prompt_lower = prompt.lower()

    def _mentioned(phrase):
        # Case-insensitive whole-word match; a space in the phrase matches
        # any run of whitespace in the prompt.
        pattern = r'\b' + re.escape(phrase.lower()).replace(r'\ ', r'\s+') + r'\b'
        return re.search(pattern, prompt_lower) is not None

    # Pass 1: canonical node names, longest first.
    found_entities = [
        node for node in sorted(graph_nodes, key=len, reverse=True) if _mentioned(node)
    ]

    # Pass 2: aliases, longest first, resolved to their canonical node.
    for alias in sorted(alias_map.keys(), key=len, reverse=True):
        target_entity = alias_map[alias]
        if (
            target_entity in graph_nodes
            and target_entity not in found_entities
            and _mentioned(alias)
        ):
            found_entities.append(target_entity)

    # Pass 3: drop entities subsumed by a longer found entity. The inequality
    # is checked on the lowercased names, matching the containment test.
    # Comparing the raw strings made two nodes that differ only in case
    # ("Muggle" / "muggle") eliminate each other, so both were lost.
    lowered = [(entity, entity.lower()) for entity in found_entities]
    final_entities = []
    for entity, entity_lower in lowered:
        subsumed = any(
            entity_lower != other_lower and entity_lower in other_lower
            for _, other_lower in lowered
        )
        if not subsumed:
            final_entities.append(entity)
    return final_entities

def calculate_edge_count_entanglement(entities, graph):
    """Collect the graph edges that directly connect the given entities.

    Every unordered pair of ``entities`` (in list order) is checked with
    ``graph.has_edge``; pairs without an edge are ignored.

    Returns:
        ``(total_weight, edge_count, edges_found)`` where ``edges_found`` is a
        list of ``(e1, e2, weight)`` tuples, ``edge_count`` is its length and
        ``total_weight`` is the sum of the listed weights.
    """
    edges_found = [
        (first, second, graph[first][second]['weight'])
        for first, second in combinations(entities, 2)
        if graph.has_edge(first, second)
    ]
    total_weight = sum(weight for _, _, weight in edges_found)
    return total_weight, len(edges_found), edges_found

def calculate_weighted_node_ratio(entities, graph, node_freq):
    """Average mention frequency over the given entities.

    Entities missing from ``node_freq`` add nothing to the total but still
    count towards the denominator. ``graph`` is accepted but not used.

    Returns:
        ``(total_node_freq, node_count, ratio)``; ``ratio`` is 0 when
        ``entities`` is empty.
    """
    node_count = len(entities)

    total_node_freq = 0
    for name in entities:
        if name in node_freq:
            total_node_freq += node_freq[name]

    if node_count > 0:
        ratio = total_node_freq / node_count
    else:
        ratio = 0
    return total_node_freq, node_count, ratio

def calculate_additional_metrics(entities, graph):
    """Compute degree, density, edge-weight and path metrics for a set of entities.

    Args:
        entities: Entity names found in a prompt; duplicates are ignored.
        graph: The co-occurrence graph (networkx ``Graph``).

    Returns:
        An empty dict when ``entities`` is empty, otherwise a dict with:

        - ``avg_node_degree_entanglement``: ``(degrees, total_degree, avg_degree)``.
          ``degrees`` follows the first-occurrence order of ``entities``
          (entities absent from the graph are skipped); the average divides
          by the number of unique entities.
        - ``subgraph_density``: ``(actual_edges, possible_edges, density)``.
        - ``edge_weight_sum``: ``(edges_with_data, edge_weight_sum)``.
        - ``avg_edge_weight_sum``: edge weight sum divided by the entity count.
        - ``mean_shortest_path``: average shortest path length of the induced
          subgraph, ``math.inf`` if networkx raises ``NetworkXError`` (e.g. the
          subgraph is disconnected), 0 if there are no edges.
        - ``redundancy_ratio``: edges per entity.
    """
    results = {}
    if not entities:
        return results

    # De-duplicate while keeping first-occurrence order. Going through set()
    # iterates strings in hash order, which changes between interpreter runs,
    # so the returned `degrees` list could not be matched back to `entities`.
    unique_entities = list(dict.fromkeys(entities))
    n = len(unique_entities)

    degrees = [graph.degree(e) for e in unique_entities if e in graph]
    total_degree = sum(degrees)
    avg_degree = total_degree / n if n else 0
    results["avg_node_degree_entanglement"] = (degrees, total_degree, avg_degree)

    subgraph = graph.subgraph(unique_entities)
    _, actual_edges, edge_list = calculate_edge_count_entanglement(unique_entities, graph)
    edge_weights = [weight for _, _, weight in edge_list]

    # Number of unordered pairs among n entities.
    possible_edges = n * (n - 1) / 2
    density = actual_edges / possible_edges if possible_edges else 0
    results["subgraph_density"] = (actual_edges, possible_edges, density)

    edge_weight_sum = sum(edge_weights)
    results["edge_weight_sum"] = (list(subgraph.edges(data=True)), edge_weight_sum)
    results["avg_edge_weight_sum"] = edge_weight_sum / n if n else 0

    if actual_edges > 0 and n > 1:
        try:
            mean_shortest_path = nx.average_shortest_path_length(subgraph)
        except nx.NetworkXError:
            # NOTE: math.inf is written as the non-standard token `Infinity`
            # by json.dump with its default settings.
            mean_shortest_path = math.inf
    else:
        mean_shortest_path = 0
    results["mean_shortest_path"] = mean_shortest_path
    results["redundancy_ratio"] = actual_edges / n if n else 0

    return results

def analyze_prompt(prompt, graph, node_freq):
    """Run every entanglement formula on a single prompt.

    Args:
        prompt: Prompt text to analyze.
        graph: The co-occurrence graph (networkx ``Graph``).
        node_freq: Mapping from entity name to mention count.

    Returns:
        ``{"prompt": ..., "entities": [...], "formulas": {...}}``. ``formulas``
        is empty when no entity is found, and ``formula1_edge_count`` is only
        present when at least two entities are found.
    """
    result = {"prompt": prompt, "entities": [], "formulas": {}}

    # Entities are resolved against the module-level alias_map.
    entities = extract_entities_from_prompt(prompt, graph.nodes(), alias_map)
    result["entities"] = entities

    if not entities:
        return result

    # Formula 1
    if len(entities) >= 2:
        total_weight, edge_count, edges_found = calculate_edge_count_entanglement(entities, graph)
        result["formulas"]["formula1_edge_count"] = {
            "edges": [{"e1": e1, "e2": e2, "weight": weight} for e1, e2, weight in edges_found],
            "total_edge_weight": total_weight
        }

    # Formula 2
    total_node_freq, node_count, ratio = calculate_weighted_node_ratio(entities, graph, node_freq)
    result["formulas"]["formula2_weighted_node_ratio"] = {
        "node_freq": {e: node_freq.get(e, 0) for e in entities},
        "total_node_freq": total_node_freq,
        "node_count": node_count,
        "ratio": ratio
    }

    metrics = calculate_additional_metrics(entities, graph)

    # Formula 3
    _, total_degree, avg_deg = metrics.get("avg_node_degree_entanglement", ([], 0, 0))
    result["formulas"]["formula3_avg_node_degree_entanglement"] = {
        # Look each degree up by name. Zipping `entities` with the degree list
        # from the metrics relied on that list sharing the order and length
        # of `entities`, which is not guaranteed, so a degree could end up
        # attached to the wrong entity.
        "degrees": {e: graph.degree(e) for e in entities if e in graph},
        "total_degree": total_degree,
        "average_degree": avg_deg
    }

    # Formula 4
    actual_edges, possible_edges, density = metrics.get("subgraph_density", (0,0,0))
    result["formulas"]["formula4_subgraph_density"] = {
        "actual_edges": actual_edges,
        "possible_edges": possible_edges,
        "density": density
    }

    # Formula 5
    edge_list, ew_sum = metrics.get("edge_weight_sum", ([],0))
    result["formulas"]["formula5_edge_weight_sum"] = {
        "edges": [{"e1": u, "e2": v, "weight": data["weight"]} for u,v,data in edge_list],
        "total_edge_weight_sum": ew_sum
    }

    # Formulas 6-8 are plain scalars taken straight from the metrics dict.
    result["formulas"]["formula6_avg_edge_weight_sum"] = metrics.get("avg_edge_weight_sum", 0)
    result["formulas"]["formula7_mean_shortest_path"] = metrics.get("mean_shortest_path", 0)
    result["formulas"]["formula8_redundancy_ratio"] = metrics.get("redundancy_ratio", 0)

    return result

# ===== Step 6: Load prompts from JSON =====
# `json` is not among the imports at the top of this cell, so import it here;
# otherwise this step raises NameError on a fresh kernel.
import json

json_file_path = "harry_potter_questions.json"

with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten every item into one entry per prompt text: the original question
# first (when present and non-empty), then each persuasive rewrite in the
# order the file lists them.
prompts = []
for item in data:
    if "original_question" in item and item["original_question"]:
        prompts.append({"text": item["original_question"], "source": "original"})
    if "persuasive_versions" in item and item["persuasive_versions"]:
        for version_type, version_text in item["persuasive_versions"].items():
            prompts.append({"text": version_text, "source": f"persuasive ({version_type})"})

# ===== Step 7: Analyze all prompts and save JSON with simplified formula results =====
# NOTE(review): prompt_type is assigned purely by position (index modulo 4).
# This presumably relies on every item contributing exactly four prompts in
# the order original, emotional, logical, authority - confirm against the
# input file, since a missing question or version would shift every label.
all_results = []
prompt_types_cycle = ["original", "emotional", "logical", "authority"]

for i, prompt_entry in enumerate(prompts):
    formulas = analyze_prompt(prompt_entry["text"], G, node_freq).get("formulas", {})

    # Keep only the headline number of each formula; anything missing is 0.
    simplified_formulas = {
        "formula1_edge_count_total_weight": formulas.get("formula1_edge_count", {}).get("total_edge_weight", 0),
        "formula2_weighted_node_ratio": formulas.get("formula2_weighted_node_ratio", {}).get("ratio", 0),
        "formula3_avg_node_degree_entanglement": formulas.get("formula3_avg_node_degree_entanglement", {}).get("average_degree", 0),
        "formula4_subgraph_density": formulas.get("formula4_subgraph_density", {}).get("density", 0),
        "formula5_edge_weight_sum": formulas.get("formula5_edge_weight_sum", {}).get("total_edge_weight_sum", 0),
        "formula6_avg_edge_weight_sum": formulas.get("formula6_avg_edge_weight_sum", 0),
        "formula7_mean_shortest_path": formulas.get("formula7_mean_shortest_path", 0),
        "formula8_redundancy_ratio": formulas.get("formula8_redundancy_ratio", 0),
    }

    all_results.append({
        "prompt": prompt_entry["text"],
        "source": prompt_entry["source"],
        "prompt_type": prompt_types_cycle[i % 4],
        "formulas": simplified_formulas,
    })

with open("prompt_analysis_results.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"\nAll {len(prompts)} prompts analyzed!")
print("Results saved to: prompt_analysis_results.json")


Graph created with 1296 nodes and 35922 edges
Knowledge graph saved to knowledge_graph.html

All 548 prompts analyzed!
Results saved to: prompt_analysis_results.json
