<span style="color:darkolivegreen;font-weight:800;font-size:32px">
    Building Agentic Apps: ArangoDB, NVIDIA cuGraph, and NetworkX Hackathon
</span>

<br>

<p align="center">
    <img src="https://arangodb.com/wp-content/uploads/2016/05/ArangoDB_logo_avocado_@1.png" style="height: 50px;">
    <img src="https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/02-nvidia-logo-color-grn-500x200-4c25-p@2x.png" style="height: 50px;">
    <img src="https://rapids.ai/images/RAPIDS-logo.png" style="height: 50px;">
    <img src="https://avatars.githubusercontent.com/u/388785?s=200&v=4" style="height: 50px;">
</p>

### **Step 0**: Package Installation and Setup

In [1]:
# 1. Install all requirements via pip

# !pip install -r requirements.txt

In [2]:
# 2. Check if you have an NVIDIA GPU
# Note: If this returns "command not found", then GPU-based algorithms via cuGraph are unavailable

# !nvidia-smi
# !nvcc --version

In [3]:
# 3. Install nx-cugraph via pip, requires CUDA-capable GPU
# Note: Only enable this installation if the step above is working!

# !pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com

In [4]:
# 4. Import the required modules

import os
import re
import json
import pandas as pd

from tqdm import tqdm
from dotenv import load_dotenv


# Read the project's ".env" file via python-dotenv
# NOTE(review): presumably supplies settings/credentials used by later cells — confirm
load_dotenv(".env")

True

### **Step 1**: Prepare and Load Dataset for `NetworkX`

In [5]:
def list_of_dict_to_json(data: list[dict], output_path: str) -> None:
    """Write a list of dictionaries to a UTF-8 JSON file (4-space indent).

    Args:
        data: Records to serialize.
        output_path: Destination file path. ``.json`` is appended when the
            path does not already end with it. Missing parent directories
            are created.
    """
    if not output_path.endswith(".json"):
        output_path = f"{output_path}.json"

    # Create the destination folder on demand so writing into a directory
    # that does not exist yet no longer fails with FileNotFoundError.
    # dirname is "" for a bare file name, and makedirs("") would raise.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(data, output_file, indent=4)


def load_dataset_from_dir(dir_path: str) -> dict[str, list[dict]]:
    """Load every ``*.json`` file directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file name without its extension to the parsed
        JSON content of that file. Keys are inserted in reverse-alphabetical
        file-name order. The values are whatever ``json.load`` returns, not
        DataFrames.
    """
    json_files = sorted(
        (name for name in os.listdir(dir_path) if name.endswith(".json")),
        reverse=True,
    )

    json_data = {}
    for json_file in json_files:
        dataset_name = os.path.splitext(json_file)[0]
        with open(os.path.join(dir_path, json_file), "r", encoding="utf-8") as file:
            json_data[dataset_name] = json.load(file)

    return json_data


def _edge_record(from_type: str, from_id, to_type: str, to_id) -> dict:
    """Build one edge record between two typed nodes; both ids are coerced to ``int``."""
    return {
        "from_type": from_type,
        "from": int(from_id),
        "to_type": to_type,
        "to": int(to_id),
    }


def prepare_regulation_data(regulation_data: list[dict], output_dir: str, verbose: bool = True) -> None:
    """Flatten nested regulation records into node/edge lists and save each list as JSON.

    One ``<name>.json`` file is written to ``output_dir`` for every node
    collection (``node_*``) and edge collection (``edge_*``), even when the
    collection is empty.

    Args:
        regulation_data: Regulation records. Each record must provide the
            metadata fields read below plus ``status["amend"]`` and a
            ``content`` mapping.
        output_dir: Directory receiving the JSON files; created if missing.
        verbose: Show progress bars when ``True``.

    Raises:
        KeyError: If a record lacks one of the expected fields.
        ValueError: If an id/number field cannot be converted to ``int``.
    """
    result = {
        # Nodes
        "node_Regulation": [],
        "node_Consideration": [],
        "node_Observation": [],
        "node_Article": [],
        "node_Definition": [],
        # Relationships
        "edge_reg_AMENDED_BY": [],
        "edge_HAS_CONSIDERATION": [],
        "edge_HAS_OBSERVATION": [],
        "edge_HAS_DEFINITION": [],
        "edge_HAS_ARTICLE": [],
        "edge_NEXT_ARTICLE": [],
        "edge_REFER_TO": [],
        "edge_art_AMENDED_BY": [],
    }

    # The same article-to-article link can be produced twice: once from an
    # article's "previous_article" and once from its neighbour's "next_article".
    # Collecting (from, to, amendment_order) tuples in a set removes duplicates.
    next_article_edges = set()

    for regulation in tqdm(iterable=regulation_data, desc="Transform regulation data", disable=not verbose):
        regulation_id = int(regulation["id"])
        amendment_order = int(regulation["amendment"])

        result["node_Regulation"].append({
            "id": regulation_id,
            "title": regulation["title"],
            "about": regulation["about"],
            "type": regulation["type"],
            "number": int(regulation["number"]),
            "year": int(regulation["year"]),
            "is_amendment": bool(amendment_order),
            "amendment_order": amendment_order,
            "institution": regulation["institution"],
            "issue_place": regulation["issue_place"],
            # Falsy dates (e.g. empty strings) are normalised to None
            "issue_date": regulation["issue_date"] or None,
            "effective_date": regulation["effective_date"] or None,
            "subjects": regulation["subjects"],
            "reference_url": regulation["url"],
            "download_url": regulation["download_link"],
            "download_name": regulation["download_name"]
        })

        for amended_regulation in regulation["status"]["amend"]:
            # Entries containing the peraturan.bpk.go.id host are skipped; the rest are
            # parsed as regulation ids.
            # NOTE(review): presumably those entries are links to regulations that are
            # not part of this dataset — confirm.
            if re.search(r"peraturan\.bpk\.go\.id", amended_regulation, re.IGNORECASE) is None:
                result["edge_reg_AMENDED_BY"].append(
                    _edge_record("Regulation", regulation_id, "Regulation", amended_regulation)
                )

        for key, content in regulation["content"].items():
            if key == "considering":
                result["node_Consideration"].append({
                    "id": int(content["id"]),
                    "text": content["text"]
                })
                result["edge_HAS_CONSIDERATION"].append(
                    _edge_record("Regulation", regulation_id, "Consideration", content["id"])
                )

            elif key == "observing":
                result["node_Observation"].append({
                    "id": int(content["id"]),
                    "text": content["text"]
                })
                result["edge_HAS_OBSERVATION"].append(
                    _edge_record("Regulation", regulation_id, "Observation", content["id"])
                )

            elif key == "articles":
                for article in content.values():
                    article_id = int(article["id"])

                    # Prefix the article text with its location: regulation title, then
                    # whichever chapter/part/paragraph headings are present.
                    section_labels = "".join(
                        f"{article[field]}, "
                        for field in ("chapter_about", "part_about", "paragraph_about")
                        if article[field]
                    )
                    text = (
                        f"{regulation['title']}, "
                        f"{section_labels}"
                        f"Pasal {article['article_number']}:\n"
                        f"{article['text']}"
                    ).strip()

                    result["node_Article"].append({
                        "id": article_id,
                        "number": article["article_number"],
                        "chapter": article["chapter_number"] or None,
                        "part": article["part_number"] or None,
                        "paragraph": article["paragraph_number"] or None,
                        "text": text
                    })

                    result["edge_HAS_ARTICLE"].append(
                        _edge_record("Regulation", regulation_id, "Article", article_id)
                    )

                    if article["previous_article"]:
                        next_article_edges.add(
                            (int(article["previous_article"]), article_id, amendment_order)
                        )

                    if article["next_article"]:
                        next_article_edges.add(
                            (article_id, int(article["next_article"]), amendment_order)
                        )

                    # "references" / "amend" may be empty or None; both mean "no edges"
                    for reference_article_id in article["references"] or ():
                        result["edge_REFER_TO"].append(
                            _edge_record("Article", article_id, "Article", reference_article_id)
                        )

                    for amended_article_id in article["amend"] or ():
                        result["edge_art_AMENDED_BY"].append(
                            _edge_record("Article", article_id, "Article", amended_article_id)
                        )

            else:
                # Every other content key is treated as an iterable of definitions
                for definition in content:
                    text = (
                        f"{regulation['title']}, "
                        f"Definisi {definition['name']}:\n"
                        f"{definition['definition']}"
                    ).strip()

                    result["node_Definition"].append({
                        "id": int(definition["id"]),
                        "name": definition["name"],
                        "text": text,
                    })

                    result["edge_HAS_DEFINITION"].append(
                        _edge_record("Regulation", regulation_id, "Definition", definition["id"])
                    )

    # Sorted so the generated file is deterministic regardless of set ordering
    for previous_id, next_id, order in sorted(next_article_edges):
        next_edge = _edge_record("Article", previous_id, "Article", next_id)
        next_edge["amendment_order"] = order
        result["edge_NEXT_ARTICLE"].append(next_edge)

    # Make sure the target folder exists before the first file is written.
    # An empty output_dir means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for key, value in tqdm(iterable=result.items(), desc="Save transformed data to JSON", disable=not verbose):
        list_of_dict_to_json(data=value, output_path=os.path.join(output_dir, f"{key}.json"))

In [6]:
# 1. Prepare dataset from initial JSON file

json_raw_input = os.path.join("data", "raw", "raw.json")

# Read as UTF-8 explicitly, matching every other file access in this notebook.
# Without it, open() falls back to the platform's default encoding, which can
# mis-decode or reject non-ASCII text.
with open(json_raw_input, "r", encoding="utf-8") as file:
    json_data = json.load(file)

prepare_regulation_data(
    regulation_data=json_data,
    output_dir="data",
    verbose=True
)

Transform regulation data: 100%|██████████| 63/63 [00:00<00:00, 1957.37it/s]
Save transformed data to JSON: 100%|██████████| 13/13 [00:00<00:00, 85.30it/s]


In [7]:
# 2. Load dataset from prepared JSON file

dataset = load_dataset_from_dir("data")

print(f"{'Dataset':<25}: {len(dataset):>5}  Entity")
for index, (key, value) in enumerate(dataset.items()):
    # A collection may legitimately be empty, so only inspect the first item
    # when there is one.
    item_type = type(value[0]) if value else None
    print_value = f"{key:<25}: {len(value):>5}  data {type(value)} of {item_type}"

    # Separator under the header, sized to the first row
    if index == 0:
        print("-" * len(print_value))
    print(print_value)

Dataset                  :    13  Entity
-----------------------------------------------------------------------
node_Regulation          :    63  data <class 'list'> of <class 'dict'>
node_Observation         :    63  data <class 'list'> of <class 'dict'>
node_Definition          :   957  data <class 'list'> of <class 'dict'>
node_Consideration       :    63  data <class 'list'> of <class 'dict'>
node_Article             :  2423  data <class 'list'> of <class 'dict'>
edge_reg_AMENDED_BY      :    26  data <class 'list'> of <class 'dict'>
edge_art_AMENDED_BY      :    83  data <class 'list'> of <class 'dict'>
edge_REFER_TO            :  1497  data <class 'list'> of <class 'dict'>
edge_NEXT_ARTICLE        :  2422  data <class 'list'> of <class 'dict'>
edge_HAS_OBSERVATION     :    63  data <class 'list'> of <class 'dict'>
edge_HAS_DEFINITION      :   957  data <class 'list'> of <class 'dict'>
edge_HAS_CONSIDERATION   :    63  data <class 'list'> of <class 'dict'>
edge_HAS_ARTICLE       