In [16]:
import os
import lmdb
import torch
import numpy as np
from tqdm.notebook import tqdm
from numpy.lib.format import open_memmap


In [17]:
node_lmdb_paths = {
    "nasabah":   "/Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/nasabah.lmdb",
    "simpanan":  "/Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/simpanan.lmdb",
    "pekerja":   "/Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/pekerja.lmdb",
    "pinjaman":  "/Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/pinjaman.lmdb",
    "transaksi": "/Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/transaksi.lmdb",
}


def count_lmdb_keys(lmdb_path):
    env = lmdb.open(
        lmdb_path,
        readonly=True,
        lock=False,
        readahead=False,
        max_readers=32
    )
    with env.begin() as txn:
        return txn.stat()["entries"]
    

def get_num_nodes(node_type):
    return count_lmdb_keys(node_lmdb_paths[node_type])


In [None]:

def build_csr_from_pt_chunks(
    edge_dir: str,
    num_src_nodes: int,
    num_dst_nodes: int,
    outdir: str,
    edge_name: str,
    sort_chunks: bool = True
):
    """
    Convert chunked edge_list .pt â†’ CSR adjacency (indptr.npy, indices.npy)
    Extremely memory-efficient (streaming-based).
    """

    os.makedirs(outdir, exist_ok=True)
    print(f"Processing edge type: {edge_name}")
    print(f"Reading chunks from: {edge_dir}")
    print(f"num_src_nodes: {num_src_nodes:,}")
    print(f"num_dst_nodes: {num_dst_nodes:,}")

    # ===========================================================
    # 1. Scan all chunks to compute degree count for each src node
    # ===========================================================
    deg = np.zeros(num_src_nodes, dtype=np.int64)

    pt_files = sorted([f for f in os.listdir(edge_dir) if f.endswith(".pt")])
    print(f"Detected {len(pt_files)} chunks.")

    print("Pass 1: Counting degree per source node...")
    for fname in tqdm(pt_files):
        edge_index = torch.load(os.path.join(edge_dir, fname))
        src = edge_index[0].numpy()

        # count src occurrences
        np.add.at(deg, src, 1)

    # ===========================================================
    # 2. Build indptr (prefix sum of degrees)
    # ===========================================================
    print("Building indptr...")
    indptr = np.zeros(num_src_nodes + 1, dtype=np.int64)
    np.cumsum(deg, out=indptr[1:])

    total_edges = int(indptr[-1])
    print(f"Total edges: {total_edges:,}")

    # Prepare output memory-mapped file for indices
    indices_path = os.path.join(outdir + "/" + edge_name, f"{edge_name}_indices.npy")
    indptr_path = os.path.join(outdir + "/" + edge_name, f"{edge_name}_indptr.npy")

    indices = open_memmap(indices_path, dtype=np.int64, mode='w+', shape=(total_edges,))


    # A copy of indptr to track write positions
    write_ptr = indptr.copy()

    # ===========================================================
    # 3. Second pass: fill indices array
    # ===========================================================
    print("Pass 2: Filling adjacency indices...")
    for fname in tqdm(pt_files):
        edge_index = torch.load(os.path.join(edge_dir, fname))
        src, dst = edge_index[0].numpy(), edge_index[1].numpy()

        for s, d in zip(src, dst):
            pos = write_ptr[s]
            indices[pos] = d
            write_ptr[s] += 1

    # ===========================================================
    # 4. Optionally sort neighbors for each node
    # ===========================================================
    if sort_chunks:
        print("Sorting neighbor lists per node (optional, ensures deterministic CSR)...")
        for i in tqdm(range(num_src_nodes)):
            start, end = indptr[i], indptr[i+1]
            if end > start:
                indices[start:end].sort()

    # Flush memmap to disk
    del indices

    # Save indptr
    np.save(indptr_path, indptr)

    print(f"CSR saved:")
    print(f"  - {indptr_path}")
    print(f"  - {indices_path}")


In [19]:
def build_adjacency_auto(edge_dir, src_type, rel, dst_type, outdir):

    # auto count nodes from LMDB
    num_src_nodes = get_num_nodes(src_type)
    num_dst_nodes = get_num_nodes(dst_type)

    edge_name = f"{src_type}__{rel}__{dst_type}"

    return build_csr_from_pt_chunks(
        edge_dir=edge_dir,
        num_src_nodes=num_src_nodes,
        num_dst_nodes=num_dst_nodes,
        outdir=outdir,
        edge_name=edge_name
    )


In [20]:
edges = {
    'edge_nasabah_is_pekerja': ["nasabah", "is_pekerja", "pekerja"],
    'edge_nasabah_memiliki_pinj': ["nasabah", "memiliki_pinj", "pinjaman"],
    'edge_nasabah_memiliki_simp': ["nasabah", "memiliki_simp", "simpanan"],
    'edge_pinj_credit': ["transaksi", "in", "pinjaman"],
    'edge_pinj_debit': ["pinjaman", "out", "transaksi"],
    'edge_simp_credit': ["transaksi", "in", "simpanan"],
    'edge_simp_debit': ["simpanan", "out", "transaksi"],
}

In [21]:
for edge, val in edges.items():

    build_adjacency_auto(
        edge_dir=f"/Users/ymnzaman/Documents/Project/Graph/chunks/{edge}",
        src_type=val[0],
        rel=val[1],
        dst_type=val[2],
        outdir="/Users/ymnzaman/Documents/Project/Graph/adjacency/"
    )

Processing edge type: nasabah__is_pekerja__pekerja
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_is_pekerja
num_src_nodes: 12,270,075
num_dst_nodes: 6,250
Detected 1 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/1 [00:00<?, ?it/s]

Building indptr...
Total edges: 64,110
Pass 2: Filling adjacency indices...


  edge_index = torch.load(os.path.join(edge_dir, fname))


  0%|          | 0/1 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  edge_index = torch.load(os.path.join(edge_dir, fname))


  0%|          | 0/12270075 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__is_pekerja__pekerja/nasabah__is_pekerja__pekerja_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__is_pekerja__pekerja/nasabah__is_pekerja__pekerja_indices.npy
Processing edge type: nasabah__memiliki_pinj__pinjaman
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_memiliki_pinj
num_src_nodes: 12,270,075
num_dst_nodes: 1,524,589
Detected 13 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/13 [00:00<?, ?it/s]

Building indptr...
Total edges: 12,682,596
Pass 2: Filling adjacency indices...


  0%|          | 0/13 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/12270075 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__memiliki_pinj__pinjaman/nasabah__memiliki_pinj__pinjaman_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__memiliki_pinj__pinjaman/nasabah__memiliki_pinj__pinjaman_indices.npy
Processing edge type: nasabah__memiliki_simp__simpanan
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_memiliki_simp
num_src_nodes: 12,270,075
num_dst_nodes: 15,636,712
Detected 188 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/188 [00:00<?, ?it/s]

Building indptr...
Total edges: 187,998,376
Pass 2: Filling adjacency indices...


  0%|          | 0/188 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/12270075 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__memiliki_simp__simpanan/nasabah__memiliki_simp__simpanan_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//nasabah__memiliki_simp__simpanan/nasabah__memiliki_simp__simpanan_indices.npy
Processing edge type: transaksi__in__pinjaman
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_pinj_credit
num_src_nodes: 12,516,002
num_dst_nodes: 1,524,589
Detected 1 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/1 [00:00<?, ?it/s]

Building indptr...
Total edges: 1,006
Pass 2: Filling adjacency indices...


  0%|          | 0/1 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/12516002 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//transaksi__in__pinjaman/transaksi__in__pinjaman_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//transaksi__in__pinjaman/transaksi__in__pinjaman_indices.npy
Processing edge type: pinjaman__out__transaksi
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_pinj_debit
num_src_nodes: 1,524,589
num_dst_nodes: 12,516,002
Detected 1 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/1 [00:00<?, ?it/s]

Building indptr...
Total edges: 1,006
Pass 2: Filling adjacency indices...


  0%|          | 0/1 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/1524589 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//pinjaman__out__transaksi/pinjaman__out__transaksi_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//pinjaman__out__transaksi/pinjaman__out__transaksi_indices.npy
Processing edge type: transaksi__in__simpanan
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_simp_credit
num_src_nodes: 12,516,002
num_dst_nodes: 15,636,712
Detected 1 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/1 [00:00<?, ?it/s]

Building indptr...
Total edges: 394,667
Pass 2: Filling adjacency indices...


  0%|          | 0/1 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/12516002 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//transaksi__in__simpanan/transaksi__in__simpanan_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//transaksi__in__simpanan/transaksi__in__simpanan_indices.npy
Processing edge type: simpanan__out__transaksi
Reading chunks from: /Users/ymnzaman/Documents/Project/Graph/chunks/edge_simp_debit
num_src_nodes: 15,636,712
num_dst_nodes: 12,516,002
Detected 1 chunks.
Pass 1: Counting degree per source node...


  0%|          | 0/1 [00:00<?, ?it/s]

Building indptr...
Total edges: 387,529
Pass 2: Filling adjacency indices...


  0%|          | 0/1 [00:00<?, ?it/s]

Sorting neighbor lists per node (optional, ensures deterministic CSR)...


  0%|          | 0/15636712 [00:00<?, ?it/s]

CSR saved:
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//simpanan__out__transaksi/simpanan__out__transaksi_indptr.npy
  - /Users/ymnzaman/Documents/Project/Graph/adjacency//simpanan__out__transaksi/simpanan__out__transaksi_indices.npy
