In [11]:
# Builds a Kùzu graph database of the web pages/links that Brad is interested in,
# then visualizes the resulting graph with yFiles.

In [12]:
# SECTION 1: DATA CREATION AND LLM CALLS
# --------------------------------------
# This section fetches webpage content, extracts titles, and uses an LLM to categorize and extract keywords,
# producing the final links_with_metadata.csv file for use in the Kùzu database section.


In [1]:
# All imports

import pandas as pd

from pydantic import BaseModel, Field
import json
from ollama import Client

import requests
from bs4 import BeautifulSoup

import kuzu
import os
from collections import Counter

import warnings

import shutil

In [14]:

# Cell 1: Generate links.csv with only URLs
import pandas as pd
links_data = [
    {"url": "https://speakerdeck.com/gaelvaroquaux/open-source-software-how-to-live-long-and-go-far"},
    {"url": "https://medium.com/@seanjtaylor/a-personal-retrospective-on-prophet-f223c2378985"},
    {"url": "https://github.com/jackboyla/GLiREL"},
    {"url": "https://www.latencyconf.io/sessions/pandas-should-go-extinct"},
    {"url": "https://orbae.adastra.eco/"},
    {"url": "https://cambridge-intelligence.com/mapweave/"},
    {"url": "https://github.com/yWorks/yfiles-jupyter-graphs-for-sparql"},
    {"url": "https://holonetgalacticmap-frontend.vercel.app/sparql"},
    {"url": "https://arxiv.org/pdf/2502.13025"},
    {"url": "https://medium.com/eqtventures/knowledge-graph-s-and-llm-based-ontologies-have-a-very-good-shot-at-unlocking-genai-in-production-1b167533ef63"},
    {"url": "https://www.mongodb.com/blog/post/supercharge-ai-data-management-knowledge-graphs"},
    {"url": "https://enterprise-knowledge.com/the-resource-description-framework-rdf/"},
    {"url": "https://medium.com/oracledevs/validating-graph-data-with-shacl-using-oracle-rdf-graph-adapter-for-eclipse-rdf4j-09327042f530"},
    {"url": "https://blog.kuzudb.com/post/kuzu-wasm-rag/"},
    {"url": "https://blog.kuzudb.com/post/unstructured-data-to-graph-baml-kuzu/"},
    {"url": "https://medium.com/@mcgeehan/building-a-hybrid-vector-search-database-with-arrow-and-duckdb-07ebc049bc1f"},
    {"url": "https://github.com/Manirevuri/arangodb-hackathon"},
    {"url": "https://medium.com/@tomzeppenfeldt/querying-an-erp-using-an-ai-configuration-in-graphileon-c667f21f5b51"},
    {"url": "https://2024.connected-data.london/speakers/urbashi-mitra/"},
    {"url": "https://darrendevitt.com/all-fhir-concepts-can-be-explained-simply/"},
    {"url": "https://medium.com/@samschifman/rag-on-fhir-with-knowledge-graphs-04d8e13ee96e"},
    {"url": "https://www.sciencedirect.com/science/article/pii/S1532046422002064"},
    {"url": "https://medium.com/enterprise-rag/why-lawyers-are-uniquely-suited-to-work-with-llms-bcc66d3dce98"},
    {"url": "https://link.springer.com/journal/10506"},
    {"url": "https://www.openownership.org/en/blog/lessons-from-building-a-prototype-single-search-tool-for-beneficial-ownership-registers/"},
    {"url": "https://www.occrp.org/en/project/cyprus-confidential/billionaire-roman-abramovichs-company-set-up-fake-superyacht-chartering-scheme-in-apparent-attempt-to-evade-millions-in-taxes"},
    {"url": "https://www.occrp.org/en/project/the-azerbaijani-laundromat/the-contract-factory-inside-danske-bank-estonias-money-laundering-machine"},
    {"url": "https://www.occrp.org/en/project/the-azerbaijani-laundromat/the-raw-data"},
    {"url": "https://discuss.opensanctions.org/"},
    {"url": "https://blog.opencorporates.com/2025/02/13/getting-started-with-the-opencorporates-api/"},
    {"url": "https://www.buzzsprout.com/242645/episodes/16799543"},
    {"url": "https://www.unodc.org/unodc/en/data-and-analysis/tip-studies.html"},
    {"url": "https://www.which.co.uk/news/article/scam-empire-inside-the-275m-fraud-call-centre-operations-aP3Kc4c9HWd7"},
    {"url": "https://www.occrp.org/en/project/scam-empire/scam-operations-relied-on-third-party-marketing-companies-for-steady-stream-of-potential-victims"},
    {"url": "https://github.com/DAD-CDM/dad-cdm-tsc/blob/main/DAD-CDM-Key-Findings-202502.md"},
    {"url": "https://bods-data.openownership.org/source/gleif_version_0_4/"},
    {"url": "https://eiti.org/using-eiti-data"},
    {"url": "https://mweiti.gov.mw/index.php/reports/details/761"},
    {"url": "https://globalenergymonitor.org/projects/global-energy-ownership-tracker/"},
    {"url": "https://www.moodys.com/web/en/us/kyc/resources/insights/how-ai-is-enhancing-ubo-discovery-and-support-reporting-requirements-globally.html"},
    {"url": "https://oecdstatistics.blog/2019/07/04/the-adima-database-on-multinational-enterprises/"},
    {"url": "https://oecdstatistics.blog/2025/02/21/monitoring-multinational-enterprises-how-the-oecd-and-unsd-are-harnessing-open-data/"},
    {"url": "https://www.taxobservatory.eu/publication/the-end-of-londongrad-ownership-transparency-and-offshore-investment-in-real-estate/"},
    {"url": "https://www.future-fis.com/uploads/3/7/9/4/3794525/ffis_a_new_era_of_private_sector_collaboration_to_detect_economic_crime_-_policy_discussion_paper_-_march_2025_-_final.pdf"},
    {"url": "https://www.kharon.com/brief/outbound-investment-rules-china-hong-kong"},
    {"url": "https://www.gao.gov/products/gao-25-107403"},
    {"url": "https://hoeringsportalen.dk/Hearing/Details/69602"},
    {"url": "https://ubm.se/publikationer/publikationer/2025-03-13-kunskapsrapport---avancerade-angrepp-mot-valfardssystemen"},
    {"url": "https://senzing.com/daniel-silva-prudential-senzing-global-user-conference/"},
    {"url": "https://graphaware.com/resources/streamlining-criminal-assets-confiscation/"},
    {"url": "https://graphaware.com/law-enforcement/"},
    {"url": "https://graphaware.com/blog/aml-investigations-detecting-risk-transactions/"},
    {"url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5120765"},
    {"url": "https://www.powermag.com/software-hardware-innovation-all-needed-to-upgrade-the-power-grid/"},
    {"url": "https://wattclarity.com.au/articles/2025/02/nemde-nightmares-parallel-pathways-and-clashing-constraints/"},
    {"url": "https://mahasldc.in/wp-content/reports/other/Report_on_Optimizing_Power_Despatch.pdf"},
    {"url": "https://github.com/kyribaker/7bus_LMPs"},
    {"url": "https://www.ercot.com/content/cdr/html/real_time_system_conditions.html"},
    {"url": "https://www.abc.net.au/news/2024-10-13/australian-coal-plant-in-extraordinary-survival-experiment/104461504"},
    {"url": "https://www.nrdc.org/resources/uneconomic-coal-costs-miso-ratepayers-1-billion-and-curtails-400-mw-wind"},
    {"url": "https://blog.gridstatus.io/curtailment/"},
    {"url": "https://blog.gridstatus.io/spp-expansion-west/"},
    {"url": "https://www.eia.gov/outlooks/steo/"}
]

# Persist the bare URL list so the later cells can start from a file on disk.
csv_path = "links.csv"
df = pd.DataFrame(links_data)
df.to_csv(csv_path, index=False)

# Short confirmation: where the file went and how many rows it holds.
print(f"CSV saved to {csv_path}")
print(f"Total links: {len(df.index)}")


CSV saved to links.csv
Total links: 63


In [5]:

# Cell 2: Fetch webpage content and extract titles
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings

warnings.filterwarnings("ignore")

def fetch_webpage_content(url, retries=2):
    """Download a page and return up to 5000 characters of its text.

    The text is collected from paragraph and heading tags (p, h1-h6), each
    stripped, and joined with single spaces.

    Args:
        url: Address of the page to download.
        retries: Maximum number of download attempts.

    Returns:
        The extracted text truncated to 5000 characters, or "" when every
        attempt raised or the page yielded fewer than 100 characters.
    """
    wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    attempt = 0
    while attempt < retries:
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            pieces = [tag.get_text(strip=True) for tag in soup.find_all(wanted_tags)]
            text = ' '.join(pieces)
            if len(text) >= 100:
                return text[:5000]
            # A successful download with too little text is final: no retry.
            print(f"Insufficient content for {url}: {len(text)} characters")
            return ""
        except Exception as e:
            # Any failure (network, HTTP status, parsing) consumes one attempt.
            print(f"Attempt {attempt+1} failed for {url}: {e}")
        attempt += 1
    return ""

# Load the URL list written by Cell 1.
csv_path = "links.csv"
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"{csv_path} not found")
links_df = pd.read_csv(csv_path)

# Pre-create every column the later cells fill in, so the CSV schema is stable.
links_df["title"] = ""
links_df["content"] = ""
links_df["category"] = ""
links_df["keyword"] = ""
links_df["category_explanation"] = ""
links_df["keyword_explanation"] = ""

# One threshold shared by the failure report and the final filter, so a URL
# can never be reported as failed and still be kept (or the reverse).
min_content_chars = 100
# Without a timeout a single unresponsive host can stall the whole loop.
title_timeout_seconds = 10

failed_urls = []
for index, row in links_df.iterrows():
    url = row['url']
    print(f"Processing {url} for content and title...")
    content = fetch_webpage_content(url)
    links_df.at[index, 'content'] = content[:5000]
    try:
        title_response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=title_timeout_seconds,
        )
        soup = BeautifulSoup(title_response.text, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag else ""
        links_df.at[index, 'title'] = title[:255]
    except Exception as e:
        # `except Exception` (not a bare `except:`) keeps Ctrl-C / kernel
        # interrupt working while still falling back to the URL as the title.
        print(f"Could not fetch title for {url}: {e}")
        links_df.at[index, 'title'] = url
    if len(content.strip()) < min_content_chars:
        failed_urls.append({"url": url, "reason": "Failed to fetch sufficient content"})

# Record the URLs that need manual review.
failed_df = pd.DataFrame(failed_urls)
if not failed_df.empty:
    failed_df.to_csv("links_please_review.csv", index=False)
    print(f"Failed URLs saved to links_please_review.csv: {len(failed_df)} URLs")

# Keep only rows with enough text for the LLM step; uses the same stripped
# length as the failure check above.
links_df = links_df[links_df['content'].str.strip().str.len() >= min_content_chars]
links_df.to_csv("links_with_content.csv", index=False)
print(f"CSV with titles, content, and empty fields saved to links_with_content.csv")
print(f"Processed {len(links_df)} valid URLs")


Processing https://speakerdeck.com/gaelvaroquaux/open-source-software-how-to-live-long-and-go-far for content and title...
Processing https://medium.com/@seanjtaylor/a-personal-retrospective-on-prophet-f223c2378985 for content and title...
Processing https://github.com/jackboyla/GLiREL for content and title...
Processing https://www.latencyconf.io/sessions/pandas-should-go-extinct for content and title...
Processing https://orbae.adastra.eco/ for content and title...
Processing https://cambridge-intelligence.com/mapweave/ for content and title...
Processing https://github.com/yWorks/yfiles-jupyter-graphs-for-sparql for content and title...
Processing https://holonetgalacticmap-frontend.vercel.app/sparql for content and title...
Insufficient content for https://holonetgalacticmap-frontend.vercel.app/sparql: 0 characters
Processing https://arxiv.org/pdf/2502.13025 for content and title...
Insufficient content for https://arxiv.org/pdf/2502.13025: 0 characters
Processing https://medium.co

In [6]:

# Cell 3: Categorize and extract one keyword with Ollama using Pydantic
import pandas as pd
from ollama import Client
from pydantic import BaseModel, Field
import json

ollama_client = Client(host='http://localhost:11434')

class ArticleClassification(BaseModel):
    """Validated shape of the LLM's answer for one article.

    All four fields are required strings; the length bounds reject empty or
    runaway answers when the parsed JSON is validated.
    """

    category: str = Field(
        ...,
        min_length=2,
        max_length=50,
        description="The assigned category (2-3 words)",
    )
    keyword: str = Field(
        ...,
        min_length=1,
        max_length=30,
        description="One key term (1-2 words)",
    )
    category_explanation: str = Field(
        ...,
        min_length=10,
        max_length=200,
        description="One sentence explaining the category choice",
    )
    keyword_explanation: str = Field(
        ...,
        min_length=10,
        max_length=200,
        description="One sentence explaining the keyword choice",
    )

# Categories offered to the LLM in the prompt; the prompt also lets the model
# invent a new one when none of these fit.
suggested_categories = [
    "general tools",
    "graph technologies",
    "healthcare data",
    "ai and legal systems",
    "federated search",
    "organized crime analysis",
    "beneficial ownership",
    "financial crime technology",
    "corporate governance",
    "power and utilities",
]

def process_with_ollama(content, client, suggested_categories):
    """Ask the LLM for a category and keyword for one article.

    Args:
        content: Article text. Anything that is not a string (for example the
            NaN pandas produces for an empty CSV cell) or that has fewer than
            100 non-whitespace-trimmed characters is skipped.
        client: Object exposing ``generate(model=..., prompt=..., options=...)``
            and returning a mapping with a ``'response'`` entry.
        suggested_categories: Category names offered to the model in the prompt.

    Returns:
        An ``ArticleClassification`` on success, or ``None`` when the content
        is skipped, the call fails, or the reply cannot be parsed/validated.
    """
    import re  # local import so this cell does not depend on an extra top-level import

    # isinstance guard: NaN is truthy and has no .strip(), which previously
    # raised AttributeError before the try block was even entered.
    if not isinstance(content, str) or len(content.strip()) < 100:
        return None
    template = f"""
    You are an expert at categorizing articles and extracting key terms. Analyze the article content and provide a structured JSON output with:
    - "category": A category (2-3 words, from suggested categories: {', '.join(suggested_categories)} or a new one if none fit)
    - "keyword": One key term (1-2 words, specific and descriptive, e.g., 'knowledge graph', not 'data')
    - "category_explanation": One sentence explaining the category choice
    - "keyword_explanation": One sentence explaining the keyword choice
    Return ONLY the JSON object, wrapped in triple backticks (```json\n{{}}\n```).
    Example:
    ```json
    {{
        "category": "graph technologies",
        "keyword": "knowledge graph",
        "category_explanation": "The article discusses graph-based data management.",
        "keyword_explanation": "Knowledge graph is the central concept of the article."
    }}
    ```
    Content: {content[:2000]}
    """
    try:
        response = client.generate(
            model='mistral:7b-instruct-v0.3-q4_0',
            prompt=template,
            options={"temperature": 0.4}
        )
        raw_response = response['response'].strip()
        # Prefer the first fenced block (with or without the "json" tag),
        # wherever it appears in the reply.
        fenced = re.search(r"```(?:json)?\s*(.*?)```", raw_response, re.DOTALL)
        if fenced:
            raw_response = fenced.group(1).strip()
        else:
            # No complete fence: fall back to the outermost {...} span, which
            # also covers an opening fence that was never closed.
            start = raw_response.find('{')
            end = raw_response.rfind('}')
            if start != -1 and end > start:
                raw_response = raw_response[start:end + 1]
        print(f"Raw LLM response for content: {raw_response[:200]}...")
        result = json.loads(raw_response)
        classification = ArticleClassification.model_validate(result)
        return classification
    except Exception as e:
        # Best-effort step: a single bad reply must not abort the whole batch.
        print(f"Error processing content: {str(e)}")
        return None

links_df = pd.read_csv("links_with_content.csv")

# The metadata columns were written as empty strings, so read_csv returns them
# as all-NaN float64 columns. Cast them to object first; otherwise every string
# written with .at below is an incompatible-dtype assignment.
metadata_columns = ['category', 'keyword', 'category_explanation', 'keyword_explanation']
links_df = links_df.astype(
    {column: "object" for column in metadata_columns if column in links_df.columns}
)

failed_processing = []
for index, row in links_df.iterrows():
    url = row['url']
    print(f"Processing {url} for categorization and keyword...")
    content = row['content']
    result = process_with_ollama(content, ollama_client, suggested_categories)
    if result:
        links_df.at[index, 'category'] = result.category
        links_df.at[index, 'keyword'] = result.keyword
        links_df.at[index, 'category_explanation'] = result.category_explanation
        links_df.at[index, 'keyword_explanation'] = result.keyword_explanation
    else:
        # Sentinel values; the database cells treat "uncategorized" / "none"
        # as "no Category / Keyword node for this link".
        links_df.at[index, 'category'] = "uncategorized"
        links_df.at[index, 'keyword'] = "none"
        links_df.at[index, 'category_explanation'] = "Failed to process content."
        links_df.at[index, 'keyword_explanation'] = "Failed to extract a keyword."
        failed_processing.append({"url": url, "reason": "Failed to generate meaningful category or keyword"})

# Record the URLs the LLM step could not classify.
failed_processing_df = pd.DataFrame(failed_processing)
if not failed_processing_df.empty:
    failed_processing_df.to_csv("failed_processing.csv", index=False)
    print(f"Failed processing saved to failed_processing.csv: {len(failed_processing_df)} URLs")

links_df.to_csv("links_with_metadata.csv", index=False)
print("Updated CSV with titles, content, categories, keywords, and explanations saved to links_with_metadata.csv")
unique_categories = links_df['category'].unique().tolist()
unique_keywords = links_df['keyword'].value_counts()
print(f"Unique categories: {unique_categories}")
print(f"Keyword distribution:\n{unique_keywords}")


Processing https://speakerdeck.com/gaelvaroquaux/open-source-software-how-to-live-long-and-go-far for categorization and keyword...
Raw LLM response for content: {
        "category": "general tools",
        "keyword": "open-source software",
        "category_explanation": "The article discusses best practices for maintaining open-source software projects.",...
Processing https://medium.com/@seanjtaylor/a-personal-retrospective-on-prophet-f223c2378985 for categorization and keyword...
Raw LLM response for content: {
        "category": "ai and legal systems",
        "keyword": "Prophet",
        "category_explanation": "The article discusses an AI-based forecasting library called Prophet.",
        "keyword_ex...
Processing https://github.com/jackboyla/GLiREL for categorization and keyword...
Raw LLM response for content: {
        "category": "ai",
        "keyword": "relation extraction",
        "category_explanation": "The article discusses an AI model for relation extraction.",

In [7]:
links_df.sample(5)

Unnamed: 0,url,title,content,category,keyword,category_explanation,keyword_explanation
45,https://graphaware.com/resources/streamlining-...,Streamlining Criminal Assets Confiscation | Gr...,Connected data analytics platform. Explore how...,organized crime analysis,graph technology,The article discusses the use of graph technol...,Graph technology is the core technology used t...
38,https://oecdstatistics.blog/2025/02/21/monitor...,Monitoring multinational enterprises: How the ...,Recent Posts Most Used Categories Monitoring ...,uncategorized,none,Failed to process content.,Failed to extract a keyword.
18,https://medium.com/@samschifman/rag-on-fhir-wi...,RAG on FHIR with Knowledge Graphs | by Sam Sch...,Sign up Sign in Sign up Sign in RAG on FHIR wi...,healthcare data,FHIR,The article discusses the use of Fast Healthca...,FHIR is the primary focus and central concept ...
29,https://www.which.co.uk/news/article/scam-empi...,Scam Empire: inside the $275m fraud call-centr...,Suggested searches Suggested searches Scam Emp...,organized crime analysis,Scam Empire,The article discusses an investigation into a ...,Scam Empire is the name of the investigation a...
16,https://2024.connected-data.london/speakers/ur...,Urbashi Mitra - Connected Data London 2024,Have you purchased yours? Buy Your All-Access ...,uncategorized,none,Failed to process content.,Failed to extract a keyword.


In [7]:

# SECTION 2: KÙZU DATABASE AND VISUALIZATION
# ----------------------------------------
# This section initializes the Kùzu database, populates it with data from links_with_metadata.csv,
# queries for interconnections, and visualizes the graph with yFiles.


In [4]:
# Cell 4: Initialize Kùzu database
import kuzu
import os
import shutil

db_path = os.path.join("..", "db", "graph_db")

# Delete any existing database to ensure a fresh state. The path is handled
# both as a directory and as a single file, since shutil.rmtree fails on a file.
if os.path.isdir(db_path):
    shutil.rmtree(db_path)
    print(f"Deleted existing database at {db_path}")
elif os.path.exists(db_path):
    os.remove(db_path)
    print(f"Deleted existing database at {db_path}")

# Bind the handles before the try block so cleanup only ever touches objects
# opened by this cell, never a stale `conn`/`db` left behind by an earlier cell.
db = None
conn = None
try:
    # Create only the parent directory and let kuzu.Database create the
    # database at db_path itself.
    # NOTE(review): pre-creating a directory at db_path is believed to break
    # single-file Kùzu releases - confirm against the installed version.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    print(f"Ensured parent directory exists for {db_path}")

    db = kuzu.Database(db_path)
    conn = kuzu.Connection(db)

    try:
        # Create schema: three node tables and the two relationships that
        # connect a Link to its Category and Keyword.
        conn.execute("CREATE NODE TABLE Link(url STRING, category STRING, title STRING, keyword STRING, category_explanation STRING, keyword_explanation STRING, PRIMARY KEY(url))")
        conn.execute("CREATE NODE TABLE Category(name STRING, PRIMARY KEY(name))")
        conn.execute("CREATE NODE TABLE Keyword(name STRING, PRIMARY KEY(name))")
        conn.execute("CREATE REL TABLE BELONGS_TO(FROM Link TO Category)")
        conn.execute("CREATE REL TABLE HAS_KEYWORD(FROM Link TO Keyword)")
        print("Database schema created successfully.")

        # Verify schema by listing the tables the database now reports.
        result = conn.execute("CALL show_tables() RETURN name")
        tables = []
        while result.has_next():
            tables.append(result.get_next()[0])
        if "Link" not in tables:
            raise ValueError("Link table not found in schema after creation.")
        print("Schema verified: Link table exists.")

    except Exception as e:
        print(f"Error creating schema: {str(e)}")
        raise

finally:
    # Close the connection before the database so the next cell can reopen it.
    if conn is not None:
        conn.close()
    if db is not None:
        db.close()

Deleted existing database at ..\db\graph_db
Ensured database directory at ..\db\graph_db
Database schema created successfully.
Schema verified: Link table exists.


In [1]:
# Cell 5: Populate Kùzu database with links, categories, and keywords
import kuzu
import pandas as pd
import os

db_path = os.path.join("..", "db", "graph_db")
csv_path = "links_with_metadata.csv"


def _as_text(value, default=""):
    """Return `value` as a string, or `default` when pandas reports it missing."""
    return default if pd.isna(value) else str(value)


# Bind the handles before the try block so cleanup only ever touches objects
# opened by this cell, never a stale `conn`/`db` left behind by an earlier cell.
db = None
conn = None
try:
    # Validate CSV
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"{csv_path} not found")
    links_df = pd.read_csv(csv_path)
    required_columns = ['url', 'category', 'title', 'keyword', 'category_explanation', 'keyword_explanation']
    if not all(col in links_df.columns for col in required_columns):
        raise ValueError(f"CSV missing required columns: {required_columns}")

    # Initialize database and connection
    db = kuzu.Database(db_path)
    conn = kuzu.Connection(db)

    # Verify schema. Iterate ONE result set; the previous version re-ran the
    # query for every row and therefore only ever read the first table name.
    result = conn.execute("CALL show_tables() RETURN name")
    tables = []
    while result.has_next():
        tables.append(result.get_next()[0])
    if "Link" not in tables:
        raise ValueError("Link table not found. Run Cell 4 to recreate the database.")

    # All values are passed as query parameters instead of being spliced into
    # the Cypher text, so quotes and backslashes in titles/explanations cannot
    # break or alter the statements.

    # Insert categories ("uncategorized" is a sentinel, not a real category)
    for category in links_df['category'].dropna().unique():
        if category != "uncategorized":
            conn.execute("MERGE (c:Category {name: $name})", {"name": str(category)})

    # Insert keywords ("none" is a sentinel, not a real keyword)
    for keyword in links_df['keyword'].dropna().unique():
        if keyword != "none":
            conn.execute("MERGE (k:Keyword {name: $name})", {"name": str(keyword)})

    # Insert links and relationships
    for _, row in links_df.iterrows():
        if pd.isna(row['url']):
            # The URL is the primary key; a row without one cannot be stored.
            continue
        url = _as_text(row['url'])
        category = _as_text(row['category'], "uncategorized")
        keyword = _as_text(row['keyword'], "none")

        conn.execute(
            """
            MERGE (l:Link {url: $url})
            SET l.category = $category,
                l.title = $title,
                l.keyword = $keyword,
                l.category_explanation = $category_explanation,
                l.keyword_explanation = $keyword_explanation
            """,
            {
                "url": url,
                "category": category,
                "title": _as_text(row['title']),
                "keyword": keyword,
                "category_explanation": _as_text(row['category_explanation']),
                "keyword_explanation": _as_text(row['keyword_explanation']),
            },
        )

        if category != "uncategorized":
            conn.execute(
                """
                MATCH (l:Link {url: $url}), (c:Category {name: $category})
                MERGE (l)-[:BELONGS_TO]->(c)
                """,
                {"url": url, "category": category},
            )

        if keyword != "none":
            conn.execute(
                """
                MATCH (l:Link {url: $url}), (k:Keyword {name: $keyword})
                MERGE (l)-[:HAS_KEYWORD]->(k)
                """,
                {"url": url, "keyword": keyword},
            )

    print("Kùzu database populated successfully.")
    result = conn.execute("MATCH (l:Link) WHERE l.category = 'uncategorized' RETURN COUNT(l)")
    print(f"Uncategorized nodes: {result.get_next()[0]}")

finally:
    # Close the connection before the database so the next cell can reopen it.
    if conn is not None:
        conn.close()
    if db is not None:
        db.close()

Kùzu database populated successfully.
Uncategorized nodes: 10


In [1]:
# Cell 6: Query Kùzu for interconnections between categories
import kuzu
import pandas as pd
import os

db_path = os.path.join("..", "db", "graph_db")

# Bind the handles before the try block so cleanup only ever touches objects
# opened by this cell, never a stale `conn`/`db` left behind by an earlier cell.
db = None
conn = None
try:
    db = kuzu.Database(db_path)
    conn = kuzu.Connection(db)

    # Diagnostic queries: confirm the database was populated before querying.
    result = conn.execute("MATCH (l:Link) RETURN COUNT(l)")
    link_count = result.get_next()[0]
    print(f"Total Link nodes: {link_count}")
    if link_count == 0:
        raise ValueError("No Link nodes found. Run Cell 5.")

    result = conn.execute("MATCH (l:Link)-[:HAS_KEYWORD]->(k:Keyword) RETURN COUNT(l)")
    print(f"Links with keywords: {result.get_next()[0]}")
    result = conn.execute("MATCH (l:Link)-[:BELONGS_TO]->(c:Category) RETURN COUNT(l)")
    print(f"Links with categories: {result.get_next()[0]}")

    # Query interconnections: pairs of links that share a keyword but sit in
    # different categories. `l1.url < l2.url` reports each pair only once.
    results = conn.execute("""
        MATCH (l1:Link)-[:HAS_KEYWORD]->(k:Keyword)<-[:HAS_KEYWORD]-(l2:Link),
              (l1)-[:BELONGS_TO]->(c1:Category),
              (l2)-[:BELONGS_TO]->(c2:Category)
        WHERE c1.name <> c2.name AND l1.url < l2.url
        RETURN l1.url, l1.category, l2.url, l2.category, k.name AS shared_keyword
        ORDER BY k.name, l1.category, l2.category
        LIMIT 100
    """)

    # Column names in the same order as the RETURN clause above.
    column_names = ["Link1_URL", "Link1_Category", "Link2_URL", "Link2_Category", "Shared_Keyword"]
    interconnections = []
    while results.has_next():
        interconnections.append(dict(zip(column_names, results.get_next())))

    interconnections_df = pd.DataFrame(interconnections)
    if not interconnections_df.empty:
        print("Interconnections between categories:")
        print(interconnections_df.to_string(index=False))
    else:
        print("No interconnections found. Verify keywords and categories.")

finally:
    # Close the connection before the database so the next cell can reopen it.
    if conn is not None:
        conn.close()
    if db is not None:
        db.close()

Total Link nodes: 56
Links with keywords: 46
Links with categories: 46
Interconnections between categories:
                                                                                                                                    Link1_URL     Link1_Category                                                                         Link2_URL       Link2_Category  Shared_Keyword
https://medium.com/eqtventures/knowledge-graph-s-and-llm-based-ontologies-have-a-very-good-shot-at-unlocking-genai-in-production-1b167533ef63 graph technologies https://www.mongodb.com/blog/post/supercharge-ai-data-management-knowledge-graphs ai and legal systems knowledge graph


In [2]:
# Cell 7: Clean Kùzu graph and visualize with yFiles
import kuzu
import os
import pandas as pd
try:
    from yfiles_jupyter_graphs import GraphWidget
except ImportError:
    raise ImportError("Install yfiles-jupyter-graphs: pip install yfiles-jupyter-graphs")

db_path = os.path.join("..", "db", "graph_db")

def clean_uncategorized_links(db_path):
    """Export and then remove every Link whose category is 'uncategorized'.

    The uncategorized links are written to ``uncategorized_links.csv`` (only
    when there are any). Afterwards their relationships and nodes are deleted,
    followed by Category and Keyword nodes that no Link points to any more.

    Args:
        db_path: Location of the Kùzu database to open.

    Returns:
        ``(conn, db)`` - the open connection and database. The caller owns
        both and is responsible for closing them.

    Raises:
        Exception: Any error from opening or querying the database is printed
            and re-raised; the handles opened here are closed first so a
            failure does not leave the database open.
    """
    db = None
    conn = None
    try:
        db = kuzu.Database(db_path)
        conn = kuzu.Connection(db)

        # Save uncategorized links
        result = conn.execute("MATCH (l:Link) WHERE l.category = 'uncategorized' RETURN l.url, l.title")
        uncategorized = []
        while result.has_next():
            row = result.get_next()
            uncategorized.append({"url": row[0], "title": row[1]})
        if uncategorized:
            pd.DataFrame(uncategorized).to_csv("uncategorized_links.csv", index=False)
            print(f"Saved {len(uncategorized)} uncategorized links.")

        # Clean database: relationships first, then the links, then any
        # Category/Keyword node left without an incoming relationship.
        conn.execute("MATCH (l:Link)-[r:HAS_KEYWORD]->(k:Keyword) WHERE l.category = 'uncategorized' DELETE r")
        conn.execute("MATCH (l:Link)-[r:BELONGS_TO]->(c:Category) WHERE l.category = 'uncategorized' DELETE r")
        conn.execute("MATCH (l:Link) WHERE l.category = 'uncategorized' DELETE l")
        conn.execute("MATCH (c:Category) WHERE NOT (c)<-[:BELONGS_TO]-() DELETE c")
        conn.execute("MATCH (k:Keyword) WHERE NOT (k)<-[:HAS_KEYWORD]-() DELETE k")
        print("Uncategorized nodes cleaned.")
        return conn, db

    except Exception as e:
        print(f"Error cleaning database: {str(e)}")
        # The caller never receives these handles on failure, so release them
        # here instead of leaking an open database.
        if conn is not None:
            conn.close()
        if db is not None:
            db.close()
        raise

# Bind the handles before the try block so cleanup only ever touches objects
# opened by this cell, never a stale `conn`/`db` left behind by an earlier cell.
conn = None
db = None
try:
    conn, db = clean_uncategorized_links(db_path)

    # Validate data
    result = conn.execute("MATCH (l:Link) RETURN COUNT(l)")
    link_count = result.get_next()[0]
    if link_count == 0:
        raise ValueError("No Link nodes found. Run Cell 5.")

    nodes = []
    edges = []

    # Category and Keyword ids are prefixed with their type so a category and
    # a keyword that share a name do not collapse into a single node id.
    def category_id(name):
        return f"Category:{name}"

    def keyword_id(name):
        return f"Keyword:{name}"

    # Add Link nodes (capped at 50); remember which links are shown so that
    # edges of links beyond the cap can be skipped below.
    shown_link_ids = set()
    result = conn.execute("MATCH (l:Link) RETURN l.url, l.category, l.title LIMIT 50")
    while result.has_next():
        row = result.get_next()
        shown_link_ids.add(row[0])
        nodes.append({"id": row[0], "properties": {"type": "Link", "label": row[2] or row[0], "category": row[1]}})

    # Add Category nodes. get_next() advances the cursor, so it is called
    # exactly once per iteration and the value reused for id and label.
    result = conn.execute("MATCH (c:Category) RETURN c.name")
    while result.has_next():
        name = result.get_next()[0]
        nodes.append({"id": category_id(name), "properties": {"type": "Category", "label": name}})

    # Add Keyword nodes (same single-read pattern as above).
    result = conn.execute("MATCH (k:Keyword) RETURN k.name")
    while result.has_next():
        name = result.get_next()[0]
        nodes.append({"id": keyword_id(name), "properties": {"type": "Keyword", "label": name}})

    # Add BELONGS_TO edges, only for links that are actually in `nodes`.
    result = conn.execute("MATCH (l:Link)-[:BELONGS_TO]->(c:Category) RETURN l.url, c.name")
    while result.has_next():
        row = result.get_next()
        if row[0] in shown_link_ids:
            edges.append({"start": row[0], "end": category_id(row[1]), "properties": {"type": "BELONGS_TO"}})

    # Add HAS_KEYWORD edges, only for links that are actually in `nodes`.
    result = conn.execute("MATCH (l:Link)-[:HAS_KEYWORD]->(k:Keyword) RETURN l.url, k.name")
    while result.has_next():
        row = result.get_next()
        if row[0] in shown_link_ids:
            edges.append({"start": row[0], "end": keyword_id(row[1]), "properties": {"type": "HAS_KEYWORD"}})

    # Visualize
    w = GraphWidget()
    w.nodes = nodes
    w.edges = edges

    def node_mappings(node):
        """Return the color/shape style for a node based on its type."""
        node_type = node['properties']['type']
        if node_type == "Link":
            return {"color": "#1E90FF", "shape": "rectangle"}
        elif node_type == "Category":
            return {"color": "#32CD32", "shape": "ellipse"}
        elif node_type == "Keyword":
            return {"color": "#FF4500", "shape": "triangle"}
        return {"color": "#808080", "shape": "circle"}

    def edge_mappings(edge):
        """Return the color style for an edge based on its relationship type."""
        edge_type = edge['properties']['type']
        if edge_type == "BELONGS_TO":
            return {"color": "#FFD700"}
        elif edge_type == "HAS_KEYWORD":
            return {"color": "#9932CC"}
        return {"color": "#000000"}

    # Attach the mappings through the widget's mapping properties. The previous
    # `w.node_mappings` attribute is not one of them, and `edge_mappings` was
    # never attached at all, so neither styling function was applied.
    w.node_styles_mapping = node_mappings
    w.edge_color_mapping = lambda edge: edge_mappings(edge)["color"]
    w.node_label_mapping = lambda node: node['properties']['label']
    w.show()
    print("Graph visualization completed.")

finally:
    # Close the connection before the database so other cells can reopen it.
    if conn is not None:
        conn.close()
    if db is not None:
        db.close()

Uncategorized nodes cleaned.


GraphWidget(layout=Layout(height='800px', width='100%'))

Graph visualization completed.
