In [1]:
import os
import json
import lmdb
import torch
import pickle
import msgpack
import numpy as np

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED

# Index Edge

In [2]:


def wait_first_done(futures):
    """Block until at least one of ``futures`` has finished.

    Returns a ``(finished, pending)`` pair of lists built from the two sets
    that ``concurrent.futures.wait`` reports.
    """
    outcome = wait(futures, return_when=FIRST_COMPLETED)
    finished = list(outcome.done)
    pending = list(outcome.not_done)
    return finished, pending

def write_chunk(edge_dir, chunk_id, src_arr, dst_arr):
    """Save one edge chunk to ``<edge_dir>/<chunk_id as 6 digits>.pt``.

    ``src_arr`` and ``dst_arr`` are stacked row-wise (source row first, then
    destination row), cast to int64, wrapped in a torch tensor and written
    with ``torch.save``. The output directory is created when missing.

    Returns the path of the file that was written.
    """
    os.makedirs(edge_dir, exist_ok=True)
    target = os.path.join(edge_dir, f"{chunk_id:06d}.pt")

    stacked = np.vstack((src_arr, dst_arr))
    tensor = torch.from_numpy(stacked.astype(np.int64))

    torch.save(tensor, target)
    return target

def lmdb_to_pt_chunks(
    lmdb_path: str,
    outdir: str,
    batch_size: int = 1_000_000,
    workers: int = 4,
    max_queue: int = 8,
    separator: str = ","
):
    """
    Convert LMDB edge-index (src,dst) → chunked .pt files for PyG.

    Every value in the LMDB at ``lmdb_path`` is read in cursor order (keys are
    ignored) and parsed as ``<src><separator><dst>``. Parsed pairs are grouped
    into batches of ``batch_size`` edges, and each batch is handed to a thread
    pool that writes it with ``write_chunk``.

    Args:
        lmdb_path: Path of the LMDB environment to read.
        outdir: Directory that receives the numbered ``.pt`` chunk files.
        batch_size: Number of edges per chunk (the last chunk may be smaller).
        workers: Number of writer threads.
        max_queue: Maximum number of chunk writes allowed to be pending before
            the reader blocks; bounds how many batches are held in memory.
        separator: Text separating source and destination inside each value.

    Returns:
        The number of chunks written.

    Raises:
        ValueError: If ``batch_size`` or ``max_queue`` is below 1, or
            ``separator`` is empty.
        Exception: Any error raised by a chunk write is re-raised here.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if max_queue < 1:
        # With max_queue <= 0 the back-pressure loop below could never exit.
        raise ValueError("max_queue must be >= 1")
    if not separator:
        # bytes.split() rejects an empty separator, which would otherwise make
        # every record look malformed.
        raise ValueError("separator must be a non-empty string")

    print(f"Reading LMDB from: {lmdb_path}")
    print(f"Saving chunks to:  {outdir}")
    sep = separator.encode()

    env = lmdb.open(
        lmdb_path,
        readonly=True,
        lock=False,
        readahead=False,   # disable OS filesystem read-ahead
        max_readers=32
    )

    try:
        # The entry count only sizes the progress bar, so failure is tolerated.
        try:
            total_keys = env.stat()["entries"]
        except (lmdb.Error, KeyError):
            total_keys = None

        if total_keys is not None:
            print(f"Total keys detected: {total_keys:,}")
        else:
            print("Total keys cannot be detected, using open-ended iteration.")

        futures = []
        chunk_id = 0
        skipped = 0

        # The context manager guarantees the pool is shut down even on error.
        with ThreadPoolExecutor(max_workers=workers) as executor:

            def submit_batch(src_list, dst_list):
                """Queue one chunk write, blocking while the queue is full."""
                nonlocal futures, chunk_id
                src_np = np.fromiter(src_list, dtype=np.int64, count=len(src_list))
                dst_np = np.fromiter(dst_list, dtype=np.int64, count=len(dst_list))

                # Limit queued tasks to avoid memory blow-up
                while len(futures) >= max_queue:
                    done, futures = wait_first_done(futures)
                    for fut in done:
                        fut.result()  # surface writer errors as early as possible

                futures.append(
                    executor.submit(write_chunk, outdir, chunk_id, src_np, dst_np)
                )
                chunk_id += 1

            with env.begin(write=False) as txn, tqdm(
                total=total_keys,
                desc=f"Processing {os.path.basename(lmdb_path)}",
                smoothing=0.1
            ) as pbar:
                batch_src = []
                batch_dst = []

                for _, val in txn.cursor():
                    # Count every record, including the ones skipped below, so
                    # the bar can reach its total.
                    pbar.update(1)

                    try:
                        a, b = val.split(sep, 1)
                        src, dst = int(a), int(b)
                    except ValueError:
                        # Missing separator or non-integer field: skip malformed
                        skipped += 1
                        continue

                    batch_src.append(src)
                    batch_dst.append(dst)

                    # When batch full → flush to writer thread
                    if len(batch_src) >= batch_size:
                        submit_batch(batch_src, batch_dst)
                        batch_src = []
                        batch_dst = []

                # last batch
                if batch_src:
                    submit_batch(batch_src, batch_dst)

            # final writes
            print("Finalizing chunk writes...")
            for fut in tqdm(as_completed(futures), total=len(futures)):
                fut.result()
    finally:
        env.close()

    if skipped:
        print(f"Skipped {skipped:,} malformed records.")
    print(f"Conversion complete → {chunk_id} chunks written.")
    return chunk_id


In [3]:
# Root directory of the graph project. Set the GRAPH_ROOT environment variable
# to run this cell on another machine; the default is the original location.
GRAPH_ROOT = os.environ.get("GRAPH_ROOT", "/Users/ymnzaman/Documents/Project/Graph")

print("="*60)
print("Convert LMDB edge-index to chunked .pt files for PyG")
edges = [
    "edge_nasabah_is_pekerja.lmdb",
    "edge_nasabah_memiliki_pinj.lmdb",
    "edge_nasabah_memiliki_simp.lmdb",
    'edge_pinj_credit.lmdb',
    'edge_pinj_debit.lmdb',
    'edge_simp_credit.lmdb',
    'edge_simp_debit.lmdb',
]

for edge in edges:
    # Each LMDB gets its own output directory named after the file stem.
    lmdb_path = f"{GRAPH_ROOT}/lmdb_edge_indexing/{edge}"
    outdir = f"{GRAPH_ROOT}/chunks/{edge.replace('.lmdb','')}"

    lmdb_to_pt_chunks(
        lmdb_path,
        outdir,
        batch_size=1_000_000,
        workers=4,
        max_queue=6,
        separator=","
    )
print("All conversions done.")


Convert LMDB edge-index to chunked .pt files for PyG
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_nasabah_is_pekerja.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_is_pekerja
Total keys detected: 64,110


Processing edge_nasabah_is_pekerja.lmdb:   0%|          | 0/64110 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 1 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_nasabah_memiliki_pinj.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_memiliki_pinj
Total keys detected: 12,682,596


Processing edge_nasabah_memiliki_pinj.lmdb:   0%|          | 0/12682596 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 13 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_nasabah_memiliki_simp.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_nasabah_memiliki_simp
Total keys detected: 187,998,376


Processing edge_nasabah_memiliki_simp.lmdb:   0%|          | 0/187998376 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/2 [00:00<?, ?it/s]

Conversion complete → 188 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_pinj_credit.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_pinj_credit
Total keys detected: 1,006


Processing edge_pinj_credit.lmdb:   0%|          | 0/1006 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 1 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_pinj_debit.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_pinj_debit
Total keys detected: 1,006


Processing edge_pinj_debit.lmdb:   0%|          | 0/1006 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 1 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_simp_credit.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_simp_credit
Total keys detected: 394,667


Processing edge_simp_credit.lmdb:   0%|          | 0/394667 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 1 chunks written.
Reading LMDB from: /Users/ymnzaman/Documents/Project/Graph/lmdb_edge_indexing/edge_simp_debit.lmdb
Saving chunks to:  /Users/ymnzaman/Documents/Project/Graph/chunks/edge_simp_debit
Total keys detected: 387,529


Processing edge_simp_debit.lmdb:   0%|          | 0/387529 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Conversion complete → 1 chunks written.
All conversions done.


# Map Node

In [4]:
def wait_first_done(futures):
    """Wait for the first completion among ``futures``.

    Returns two lists: the futures that are finished and those still pending.
    """
    outcome = wait(futures, return_when=FIRST_COMPLETED)
    return [*outcome.done], [*outcome.not_done]

def write_node_chunk(outdir, chunk_id, records):
    """Save ``records`` to ``<outdir>/<chunk_id as 6 digits>.pt`` via ``torch.save``.

    The output directory is created when it does not exist yet.
    Returns the path of the written file.
    """
    os.makedirs(outdir, exist_ok=True)
    filename = f"{chunk_id:06d}.pt"
    target = os.path.join(outdir, filename)
    torch.save(records, target)
    return target

def decode_value(val):
    """Decode LMDB value with multiple fallback strategies.

    The decoders are tried in a fixed order (msgpack, then pickle, then
    UTF-8 JSON) and the result of the first one that does not raise is
    returned. If all three fail, ``val`` is returned unchanged.

    Only ``Exception`` subclasses are treated as "this decoder does not
    apply"; ``KeyboardInterrupt`` and ``SystemExit`` propagate so a long scan
    can still be interrupted.
    """
    # Try msgpack
    try:
        return msgpack.unpackb(val, raw=False)
    except Exception:
        pass

    # Try pickle
    # SECURITY: pickle.loads can execute arbitrary code while unpickling; only
    # run this on LMDB files from a trusted source.
    try:
        return pickle.loads(val)
    except Exception:
        pass

    # Try JSON
    try:
        return json.loads(val.decode())
    except Exception:
        pass

    # Fallback: return raw bytes
    return val


def lmdb_node_to_pt_chunks(
    lmdb_path: str,
    outdir: str,
    batch_size: int = 200_000,
    workers: int = 4,
    max_queue: int = 8,
):
    """
    Stream LMDB node → chunked .pt.
    More general than edge converter because values vary (dicts, lists, etc.).

    Every value in the LMDB at ``lmdb_path`` is decoded with ``decode_value``
    in cursor order. The decoded records are grouped into batches of
    ``batch_size``, and each batch is handed to a thread pool that writes it
    with ``write_node_chunk``.

    Args:
        lmdb_path: Path of the LMDB environment to read.
        outdir: Directory that receives the numbered ``.pt`` chunk files.
        batch_size: Number of records per chunk (the last chunk may be smaller).
        workers: Number of writer threads.
        max_queue: Maximum number of chunk writes allowed to be pending before
            the reader blocks; bounds how many batches are held in memory.

    Returns:
        The number of chunks written.

    Raises:
        ValueError: If ``batch_size`` or ``max_queue`` is below 1.
        Exception: Any error raised by a chunk write is re-raised here.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if max_queue < 1:
        # With max_queue <= 0 the back-pressure loop below could never exit.
        raise ValueError("max_queue must be >= 1")

    print(f"Reading LMDB nodes from: {lmdb_path}")
    print(f"Saving chunks to:        {outdir}")

    env = lmdb.open(
        lmdb_path,
        readonly=True,
        lock=False,
        readahead=False,
        max_readers=32
    )

    try:
        # detect total entries (only used to size the progress bar)
        try:
            total_keys = env.stat()["entries"]
        except (lmdb.Error, KeyError):
            total_keys = None

        if total_keys is not None:
            print(f"Total node entries: {total_keys:,}")
        else:
            print("Total entries cannot be detected.")

        futures = []
        chunk_id = 0

        # The context manager guarantees the pool is shut down even on error.
        with ThreadPoolExecutor(max_workers=workers) as executor:

            def submit_batch(records):
                """Queue one chunk write, blocking while the queue is full."""
                nonlocal futures, chunk_id

                # wait if queue full
                while len(futures) >= max_queue:
                    done, futures = wait_first_done(futures)
                    for fut in done:
                        fut.result()  # surface writer errors as early as possible

                futures.append(
                    executor.submit(write_node_chunk, outdir, chunk_id, records)
                )
                chunk_id += 1

            with env.begin(write=False) as txn, tqdm(
                total=total_keys, desc="Processing nodes"
            ) as pbar:
                batch_records = []

                # NOTE(review): only values are kept; the LMDB keys are dropped.
                # Confirm downstream code does not need the key of each record.
                for _, val in txn.cursor():
                    batch_records.append(decode_value(val))

                    if len(batch_records) >= batch_size:
                        submit_batch(batch_records)
                        batch_records = []

                    pbar.update(1)

                # last batch (goes through the same back-pressure as the others)
                if batch_records:
                    submit_batch(batch_records)

            print("Finalizing chunk writes...")
            for fut in tqdm(as_completed(futures), total=len(futures)):
                fut.result()
    finally:
        env.close()

    print(f"Node conversion complete → {chunk_id} chunks written.")
    return chunk_id


In [5]:
# Root directory of the graph project. Set the GRAPH_ROOT environment variable
# to run this cell on another machine; the default is the original location.
GRAPH_ROOT = os.environ.get("GRAPH_ROOT", "/Users/ymnzaman/Documents/Project/Graph")

print("="*60)
print("Convert LMDB map-node to chunked .pt files for PyG")
nodes = [
    "nasabah.lmdb",
    "pekerja.lmdb",
    "pinjaman.lmdb",
    "simpanan.lmdb",
    "transaksi.lmdb",
]

for node in nodes:
    # Each LMDB gets its own output directory named after the file stem.
    lmdb_path = f"{GRAPH_ROOT}/lmdb_node_mapping/{node}"
    outdir = f"{GRAPH_ROOT}/chunks/{node.replace('.lmdb','')}"

    lmdb_node_to_pt_chunks(
        lmdb_path,
        outdir,
        batch_size=1_000_000,
        workers=4,
        max_queue=6,
    )
print("All conversions done.")


Convert LMDB map-node to chunked .pt files for PyG
Reading LMDB nodes from: /Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/nasabah.lmdb
Saving chunks to:        /Users/ymnzaman/Documents/Project/Graph/chunks/nasabah
Total node entries: 12,270,075


Processing nodes:   0%|          | 0/12270075 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/7 [00:00<?, ?it/s]

Node conversion complete → 13 chunks written.
Reading LMDB nodes from: /Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/pekerja.lmdb
Saving chunks to:        /Users/ymnzaman/Documents/Project/Graph/chunks/pekerja
Total node entries: 6,250


Processing nodes:   0%|          | 0/6250 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/1 [00:00<?, ?it/s]

Node conversion complete → 1 chunks written.
Reading LMDB nodes from: /Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/pinjaman.lmdb
Saving chunks to:        /Users/ymnzaman/Documents/Project/Graph/chunks/pinjaman
Total node entries: 1,524,589


Processing nodes:   0%|          | 0/1524589 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/2 [00:00<?, ?it/s]

Node conversion complete → 2 chunks written.
Reading LMDB nodes from: /Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/simpanan.lmdb
Saving chunks to:        /Users/ymnzaman/Documents/Project/Graph/chunks/simpanan
Total node entries: 15,636,712


Processing nodes:   0%|          | 0/15636712 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/4 [00:00<?, ?it/s]

Node conversion complete → 16 chunks written.
Reading LMDB nodes from: /Users/ymnzaman/Documents/Project/Graph/lmdb_node_mapping/transaksi.lmdb
Saving chunks to:        /Users/ymnzaman/Documents/Project/Graph/chunks/transaksi
Total node entries: 12,516,002


Processing nodes:   0%|          | 0/12516002 [00:00<?, ?it/s]

Finalizing chunk writes...


  0%|          | 0/7 [00:00<?, ?it/s]

Node conversion complete → 13 chunks written.
All conversions done.
