In [1]:
import torch

In [4]:
torch.device("cuda" if torch.cuda.is_available() else "mps")

device(type='mps')

In [None]:

import os
from typing import List, Dict, Iterator, Tuple

BASE_DIR = "D:\zman\graph\data" # update to your parquet root
MAP_DIR = "./maps"
EDGE_DIR = "./edges"
FEATURE_DIR = "./features"

os.makedirs(MAP_DIR, exist_ok=True)
os.makedirs(EDGE_DIR, exist_ok=True)
os.makedirs(FEATURE_DIR, exist_ok=True)

# Example table folders (update names to your actual folder names)
TABLE_PATHS = {
'node_pekerja': os.path.join(BASE_DIR, 'node_pekerja'),
'node_nasabah': os.path.join(BASE_DIR, 'node_nasabah'),
'node_simpanan': os.path.join(BASE_DIR, 'node_simpanan'),
'node_pinjaman': os.path.join(BASE_DIR, 'node_pinjaman'),
'node_transaksi': os.path.join(BASE_DIR, 'node_transaksi'),
'edge_rek_debit': os.path.join(BASE_DIR, 'edge_rek_debit'),
'edge_rek_credit': os.path.join(BASE_DIR, 'edge_rek_credit'),
'edge_nasabah_memiliki_simp': os.path.join(BASE_DIR, 'edge_nasabah_memiliki_simp'),
'edge_nasabah_memiliki_pinj': os.path.join(BASE_DIR, 'edge_nasabah_memiliki_pinj'),
'edge_nasabah_is_pekerja': os.path.join(BASE_DIR, 'edge_nasabah_is_pekerja'),
}

# Parquet partition column name for month
PARTITION_COL = 'period'  # e.g. period=202407

# Recommended runtime knobs for T1000 4GB GPU + 32GB RAM
KNOBS = {
    'device': "mps",
    'hidden_dim': 32,
    'final_feat_dim': 64,
    'batch_size_nodes': 256,
    'num_neighbors': [12, 6],
}


In [None]:

# -----------------------------------------------------------------------------
# Dependencies
# -----------------------------------------------------------------------------
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pickle
import lmdb
import torch
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import HeteroConv, SAGEConv
import torch.nn.functional as F
from torch.cuda.amp import autocast
from torch.amp import GradScaler
from sklearn.metrics import precision_recall_curve, roc_auc_score

# -----------------------------------------------------------------------------
# 1) Streaming helper: pyarrow Dataset -> pandas batches
# -----------------------------------------------------------------------------

def stream_table(path: str, columns: List[str] = None, month: str = None, batch_size: int = 75_000) -> Iterator[pd.DataFrame]:
    """Yield pandas DataFrames in small batches using pyarrow.dataset.
    - path: folder root of a parquet table (partitioned by hive style)
    - columns: list of columns to read
    - month: partition filter value (e.g. '202407')
    - batch_size: rows per batch
    """
    ds_obj = ds.dataset(path, format='parquet', partitioning='hive')
    flt = None
    if month is not None:
        flt = ds.field(PARTITION_COL) == month
    for rb in ds_obj.to_batches(columns=columns, filter=flt, batch_size=batch_size):
        yield rb.to_pandas()


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds_obj = ds.dataset("D:/zman/graph/data/node_pekerja", format='parquet', partitioning='hive')

In [4]:
ds_obj.count_rows()

62764

In [5]:
# -----------------------------------------------------------------------------
# 2) ID mapping (pickle-backed). Use LMDB alternative for very large maps.
# -----------------------------------------------------------------------------

def build_or_load_map(values_iter: Iterator[pd.DataFrame], key_col: str, map_path: str) -> Dict:
    """Build mapping from original key to contiguous int IDs (0..N-1).
    If map_path exists, load and return it.
    """
    if os.path.exists(map_path):
        with open(map_path, 'rb') as f:
            print(f"Loading map from {map_path}")
            return pickle.load(f)

    mapping = {}
    nid = 0
    for df in values_iter:
        vals = pd.Series(df[key_col].dropna().unique())
        for v in vals:
            if v not in mapping:
                mapping[v] = nid
                nid += 1
    with open(map_path, 'wb') as f:
        pickle.dump(mapping, f)
    print(f"Saved map {map_path} with {nid} entries")
    return mapping


def build_or_load_lmdb_map(values_iter: Iterator[pd.DataFrame], key_col: str, lmdb_path: str) -> None:
    """Alternative using LMDB: insert keys and their IDs. This writes to disk as LMDB DB.
    lmdb_path is a file path for the LMDB environment.
    """
    if os.path.exists(lmdb_path):
        print(f"LMDB exists: {lmdb_path}")
        return
    env = lmdb.open(lmdb_path, map_size=1024**4)
    nid = 0
    with env.begin(write=True) as txn:
        for df in values_iter:
            for v in df[key_col].dropna().unique():
                k = str(v).encode()
                if txn.get(k) is None:
                    txn.put(k, str(nid).encode())
                    nid += 1
    env.close()
    print(f"Built LMDB map {lmdb_path} entries={nid}")

# helper load pickle

def load_pickle_map(map_path: str) -> Dict:
    with open(map_path, 'rb') as f:
        return pickle.load(f)


In [None]:
# -----------------------------------------------------------------------------
# 3) Edge builder: stream edges and write mapped integer edge parquet
# -----------------------------------------------------------------------------

def build_edges_from_parquet(edge_table_path: str, src_col: str, dst_col: str,
                             src_map: Dict, dst_map: Dict,
                             out_parquet: str, month: str = None,
                             cols_to_keep: List[str] = None):
    """Stream edge table, map src/dst values to integer node IDs and write parquet.
    Writes single output parquet file (appends in streaming manner).
    """
    writer = None
    written = 0
    for df in stream_table(edge_table_path, columns=[src_col, dst_col] + (cols_to_keep or []), month=month, batch_size=75_000):
        src_mapped = df[src_col].map(src_map).fillna(-1).astype('int32')
        dst_mapped = df[dst_col].map(dst_map).fillna(-1).astype('int32')
        out_df = pd.DataFrame({'src_nid': src_mapped.values, 'dst_nid': dst_mapped.values})
        if cols_to_keep:
            for c in cols_to_keep:
                out_df[c] = df[c].values
        tbl = pa.Table.from_pandas(out_df)
        if writer is None:
            writer = pq.ParquetWriter(out_parquet, tbl.schema)
        writer.write_table(tbl)
        written += len(out_df)
    if writer is not None:
        writer.close()
    print(f"Wrote {written} edges to {out_parquet}")


In [None]:
# -----------------------------------------------------------------------------
# 4) Simple streaming feature aggregation per pekerja
# -----------------------------------------------------------------------------

def agg_features_per_pekerja_simple(transaksi_path: str, map_rek: Dict, map_nasabah: Dict,
                                    map_pekerja_from_nasabah: Dict, out_parquet: str,
                                    months: List[str]):
    """Aggregate a few simple features per pekerja by streaming transaksi.
    This function assumes you have a mapping from nasabah -> pekerja (edge_nasabah_is_pekerja)
    represented by map_pekerja_from_nasabah: nasabah_cif -> pekerja_nid.

    Output columns: nid (pekerja id), cnt_txn, sum_amt, cnt_dst_accounts, recent_txn_7d (sketch)
    """
    # initialize accumulation dicts
    from collections import defaultdict
    agg_cnt = defaultdict(int)
    agg_sum = defaultdict(float)
    agg_dst_set = defaultdict(set)

    for month in months:
        for df in stream_table(transaksi_path, columns=['id_trx','src','dst','amt','dsctrc','channel','period'], month=month, batch_size=75_000):
            # map account -> nasabah if available (this needs precomputed account->nasabah map)
            # For blueprint: assume map_rek maps acctno -> nasabah_cif
            df['src_nasabah'] = df['src'].map(map_rek).fillna(-1)
            df['dst_nasabah'] = df['dst'].map(map_rek).fillna(-1)
            # map nasabah -> pekerja (if a nasabah has a pekerja relation)
            df['src_pekerja_nid'] = df['src_nasabah'].map(map_pekerja_from_nasabah).fillna(-1).astype('int32')
            df['dst_pekerja_nid'] = df['dst_nasabah'].map(map_pekerja_from_nasabah).fillna(-1).astype('int32')

            # accumulate for pekerja appearing on either side
            for _, row in df.iterrows():
                for pk in ['src_pekerja_nid','dst_pekerja_nid']:
                    pn = int(row.get(pk, -1))
                    if pn < 0:
                        continue
                    agg_cnt[pn] += 1
                    agg_sum[pn] += float(row.get('amt') or 0.0)
                    agg_dst_set[pn].add(row.get('dst'))

    # convert to DataFrame and write
    rows = []
    for nid in agg_cnt.keys():
        rows.append({'nid': nid, 'cnt_txn': agg_cnt[nid], 'sum_amt': agg_sum[nid], 'cnt_dst_accounts': len(agg_dst_set[nid])})
    out_df = pd.DataFrame(rows)
    out_df.to_parquet(out_parquet, index=False)
    print(f"Wrote aggregated features per pekerja to {out_parquet}")


In [8]:
# -----------------------------------------------------------------------------
# 5) Build HeteroData (memory-aware): load small node features fully, for large node types
#    store only num_nodes and rely on edge parquet for sampling
# -----------------------------------------------------------------------------

def build_heterodata(map_files: Dict[str,str], node_feature_files: Dict[str,str], edge_parquet_files: Dict[Tuple[str,str,str],str]) -> HeteroData:
    """Construct HeteroData. For large node types (e.g. transaksi), do NOT load full x matrix
    (unless user prepared memmapped features). Edges are loaded from parquet and set as tensors.
    edge_parquet_files keys are tuples (src_ntype, rel, dst_ntype).
    """
    data = HeteroData()

    # pekerja features must be small -> load fully
    pekerja_path = node_feature_files.get('pekerja')
    if pekerja_path is None:
        raise ValueError('pekerja feature parquet is required for Version A')
    df_pekerja = pd.read_parquet(pekerja_path)
    if 'nid' not in df_pekerja.columns:
        # assume index-like ordering if not provided
        df_pekerja = df_pekerja.reset_index().rename(columns={'index':'nid'})
    feat_cols = [c for c in df_pekerja.columns if c not in ('nid','is_fraud')]
    x_pekerja = torch.tensor(df_pekerja[feat_cols].fillna(0).values.astype('float32'))
    y_pekerja = torch.tensor(df_pekerja['is_fraud'].fillna(0).values.astype('int64'))
    data['pekerja'].x = x_pekerja
    data['pekerja'].y = y_pekerja
    data['pekerja'].num_nodes = x_pekerja.size(0)

    # other node types: if features file provided and small, load; else set num_nodes from map
    for ntype in ['nasabah','rekening','transaksi']:
        path = node_feature_files.get(ntype)
        map_path = map_files.get(ntype)
        if path and os.path.exists(path):
            df = pd.read_parquet(path)
            if 'nid' not in df.columns:
                df = df.reset_index().rename(columns={'index':'nid'})
            cols = [c for c in df.columns if c != 'nid']
            x = torch.tensor(df[cols].fillna(0).values.astype('float32'))
            data[ntype].x = x
            data[ntype].num_nodes = x.size(0)
        elif map_path and os.path.exists(map_path):
            # load mapping to count nodes
            m = load_pickle_map(map_path)
            data[ntype].num_nodes = len(m)
        else:
            data[ntype].num_nodes = 0

    # load edges from parquet and set edge_index for each relation
    for (src_ntype, rel, dst_ntype), p in edge_parquet_files.items():
        if not os.path.exists(p):
            continue
        tbl = pq.read_table(p)
        df = tbl.to_pandas()
        # remove invalid -1 mappings
        df = df[(df['src_nid'] >= 0) & (df['dst_nid'] >= 0)]
        src = torch.tensor(df['src_nid'].values, dtype=torch.long)
        dst = torch.tensor(df['dst_nid'].values, dtype=torch.long)
        data[(src_ntype, rel, dst_ntype)].edge_index = torch.vstack([src, dst])
    print('Built HeteroData: node counts:', {k: getattr(data[k], 'num_nodes', None) for k in data.node_types})
    return data


In [9]:
# -----------------------------------------------------------------------------
# 6) NeighborLoader with oversampling helper
# -----------------------------------------------------------------------------

def make_neighbor_loader(data: HeteroData, train_idx: torch.Tensor, pos_idx: torch.Tensor,
                         batch_size: int = 256, num_neighbors: Dict = None, oversample_ratio: int = 4, shuffle: bool = True):
    if num_neighbors is None:
        num_neighbors = KNOBS['num_neighbors']
    fraud_oversampled = pos_idx.repeat(oversample_ratio)
    # combine: keep order stable
    nonfraud_mask = ~torch.isin(train_idx, pos_idx)
    combined = torch.cat([fraud_oversampled, train_idx[nonfraud_mask]])
    loader = NeighborLoader(
        data,
        num_neighbors=num_neighbors,
        input_nodes=('pekerja', combined),
        batch_size=batch_size,
        shuffle=shuffle,
    )
    return loader


In [10]:
# -----------------------------------------------------------------------------
# 7) Compact HeteroSAGE model
# -----------------------------------------------------------------------------
class CompactHeteroSAGE(torch.nn.Module):
    def __init__(self, metadata, in_dims: Dict[str,int], hidden: int = 32, num_layers: int = 2):
        super().__init__()
        self.metadata = metadata
        self.hidden = hidden
        self.num_layers = num_layers
        self.lin = torch.nn.ModuleDict()
        for ntype, dim in in_dims.items():
            self.lin[ntype] = torch.nn.Linear(dim, hidden)
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: SAGEConv((-1,-1), hidden) for edge_type in metadata[1]
            }, aggr='mean')
            self.convs.append(conv)
        self.classifier = torch.nn.Linear(hidden, 1)

    def forward(self, x_dict, edge_index_dict):
        x = {k: F.relu(self.lin[k](v)) for k,v in x_dict.items() if k in self.lin}
        for conv in self.convs:
            x = conv(x, edge_index_dict)
            x = {k: F.relu(v) for k,v in x.items()}
        out = self.classifier(x['pekerja']).squeeze(-1)
        return out


In [11]:
# -----------------------------------------------------------------------------
# 8) Training loop (FP16 + grad accum) and evaluation
# -----------------------------------------------------------------------------

def train_model(model: torch.nn.Module, train_loader, val_loader, device: str = KNOBS['device'], epochs: int = 5,
                accum_steps: int = 2, lr: float = 1e-3):
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)

    # compute pos_weight from training labels if available
    # Here we assume train_loader yields batches with 'pekerja'.y available
    scaler = GradScaler(device="cuda")
    # placeholder: will compute pos_weight externally if full dataset labels accessible
    criterion = torch.nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()
        for i, batch in enumerate(train_loader):
            batch = batch.to(device)
            with autocast():
                logits = model(batch.x_dict, batch.edge_index_dict)
                y = batch['pekerja'].y.to(torch.float32).to(device)
                loss = criterion(logits, y) / accum_steps
            scaler.scale(loss).backward()
            if (i+1) % accum_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            total_loss += loss.item() * accum_steps
        avg_loss = total_loss / (i+1)
        print(f"Epoch {epoch+1}/{epochs} train_loss={avg_loss:.4f}")
        metrics = evaluate_model(model, val_loader, device)
        print(f"Val metrics: {metrics}")
    return model


def evaluate_model(model: torch.nn.Module, loader, device: str = KNOBS['device']):
    model.eval()
    ys, ys_pred = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            logits = model(batch.x_dict, batch.edge_index_dict)
            probs = torch.sigmoid(logits).cpu().numpy()
            ys_pred.append(probs)
            ys.append(batch['pekerja'].y.cpu().numpy())
    y_pred = np.concatenate(ys_pred)
    y_true = np.concatenate(ys)
    auc = roc_auc_score(y_true, y_pred) if len(np.unique(y_true)) > 1 else float('nan')
    prec, rec, th = precision_recall_curve(y_true, y_pred)
    f1 = 2 * prec * rec / (prec + rec + 1e-12)
    if len(f1) > 0:
        best_idx = np.nanargmax(f1)
        best_thr = th[best_idx] if best_idx < len(th) else 0.5
    else:
        best_thr = 0.5
    return {'auc': float(auc), 'best_thr': float(best_thr)}


In [None]:
# -----------------------------------------------------------------------------
# 9) High-level driver for Version A: build maps, edges, features, heterodata, loaders
# -----------------------------------------------------------------------------

def version_a_runner(
        months_train: List[str],
        months_val: List[str],
        months_test: List[str],
        label_parquet_path: str,
        account_to_nasabah_map_parquet: str = None):
    """End-to-end runner for Version A.
    Steps:
    1) Build id maps for node types (pekerja, nasabah, rekening, transaksi)
    2) Build edges (mapped integer parquet)
    3) Build simple aggregated pekerja features
    4) Construct HeteroData and NeighborLoader (train/val/test)
    5) Train and validate model
    """
    # 1) Build maps
    print('Step 1: building id maps (streaming)')
    # pekerja map (small) - from node_pekerja table
    pekerja_iter = stream_table(TABLE_PATHS['node_pekerja'], columns=['pn'], month=None, batch_size=75_000)
    pekerja_map_path = os.path.join(MAP_DIR, 'map_pekerja.pkl')
    pekerja_map = build_or_load_map(pekerja_iter, 'pn', pekerja_map_path)

    # nasabah map (could be large) - from node_nasabah
    nasabah_iter = stream_table(TABLE_PATHS['node_nasabah'], columns=['cif'], month=None, batch_size=75_000)
    nasabah_map_path = os.path.join(MAP_DIR, 'map_nasabah.pkl')
    nasabah_map = build_or_load_map(nasabah_iter, 'cif', nasabah_map_path)
    
    # rekening map (combine simpanan + pinjaman acctno)
    # streaming accounts from node_simpanan and node_pinjaman
    def acct_iter():
        for df in stream_table(TABLE_PATHS['node_simpanan'], columns=['acctno'], month=None, batch_size=75_000):
            yield df
        for df in stream_table(TABLE_PATHS['node_pinjaman'], columns=['acctno'], month=None, batch_size=75_000):
            yield df
    rekening_map_path = os.path.join(MAP_DIR, 'map_rekening.pkl')
    rekening_map = build_or_load_map(acct_iter(), 'acctno', rekening_map_path)

    # transaksi map (id_trx) can be large; we will not map all tx if not needed but create mapping for those referenced by edges
    txn_iter = stream_table(TABLE_PATHS['node_transaksi'], columns=['id_trx'], month=None, batch_size=75_000)
    txn_map_path = os.path.join(MAP_DIR, 'map_transaksi.pkl')
    txn_map = build_or_load_map(txn_iter, 'id_trx', txn_map_path)

    # 2) Build edge files: map nasabah-pekerja, rekening->nasabah, transaksi->rekening
    print('Step 2: building edge parquet files (mapped integer ids)')
    # edge_nasabah_is_pekerja: src=cif, dst=pn
    out_edge_pekerja = os.path.join(EDGE_DIR, 'edge_nasabah_is_pekerja.parquet')
    build_edges_from_parquet(TABLE_PATHS['edge_nasabah_is_pekerja'], src_col='src', dst_col='dst',
                             src_map=nasabah_map, dst_map=pekerja_map, out_parquet=out_edge_pekerja, month=None)

    # edge_nasabah_memiliki_simp: src=cif, dst=acctno
    out_edge_nasabah_simp = os.path.join(EDGE_DIR, 'edge_nasabah_memiliki_simp.parquet')
    build_edges_from_parquet(TABLE_PATHS['edge_nasabah_memiliki_simp'], src_col='src', dst_col='dst',
                             src_map=nasabah_map, dst_map=rekening_map, out_parquet=out_edge_nasabah_simp, month=None)

    # transaksi -> rekening: use edge_rek_debit (src acct) and edge_rek_credit (dst acct)
    out_edge_txn_from = os.path.join(EDGE_DIR, 'edge_transaksi_from_rekening.parquet')
    build_edges_from_parquet(TABLE_PATHS['edge_rek_debit'], src_col='src', dst_col='dst',
                             src_map=rekening_map, dst_map=txn_map, out_parquet=out_edge_txn_from, month=None)
    out_edge_txn_to = os.path.join(EDGE_DIR, 'edge_transaksi_to_rekening.parquet')
    build_edges_from_parquet(TABLE_PATHS['edge_rek_credit'], src_col='src', dst_col='dst',
                             src_map=txn_map, dst_map=rekening_map, out_parquet=out_edge_txn_to, month=None)

    # 3) Build simple aggregated features per pekerja (streaming)
    print('Step 3: aggregate per-pekerja features (simple)')
    # We need a mapping from nasabah -> pekerja; load the edge file we created
    df_pekerja_edge = pd.read_parquet(out_edge_pekerja)
    # build nasabah->pekerja map (pick first pekerja if multiple)
    nasabah_to_pekerja = {}
    df_pekerja_edge = df_pekerja_edge[df_pekerja_edge['src_nid'] >= 0]
    for _, r in df_pekerja_edge.iterrows():
        nasabah_to_pekerja[r['src_nid']] = int(r['dst_nid'])
    # account->nasabah map: we can build from edge_nasabah_memiliki_simp
    df_nasabah_simp = pd.read_parquet(out_edge_nasabah_simp)
    acct_to_nasabah = {}
    for _, r in df_nasabah_simp.iterrows():
        acct_to_nasabah[int(r['dst_nid'])] = int(r['src_nid'])

    # call simple aggregator
    pekerja_feat_parquet = os.path.join(FEATURE_DIR, 'pekerja_features.parquet')
    agg_features_per_pekerja_simple(TABLE_PATHS['node_transaksi'], acct_to_nasabah, nasabah_map, nasabah_to_pekerja,
                                    out_parquet=pekerja_feat_parquet, months=months_train+months_val+months_test)

    # 4) build heterodata from files
    print('Step 4: build HeteroData (loading pekerja features, edges)')
    map_files = {'pekerja': pekerja_map_path, 'nasabah': nasabah_map_path, 'rekening': rekening_map_path, 'transaksi': txn_map_path}
    node_feature_files = {'pekerja': pekerja_feat_parquet}
    edge_parquet_files = {
        ('nasabah','is_pekerja_of','pekerja'): out_edge_pekerja,
        ('nasabah','has_simp','rekening'): out_edge_nasabah_simp,
        ('transaksi','from_acct','rekening'): out_edge_txn_from,
        ('transaksi','to_acct','rekening'): out_edge_txn_to,
    }
    data = build_heterodata(map_files, node_feature_files, edge_parquet_files)

    # 5) prepare train/val/test indices for pekerja (time-split using label parquet)
    print('Step 5: prepare train/val/test indices from label file')
    labels = pd.read_parquet(label_parquet_path)
    # expected label file columns: pn (pekerja id original), nid (mapped nid), is_fraud, period
    if 'nid' not in labels.columns:
        labels = labels.merge(pd.DataFrame(list(pekerja_map.items()), columns=['pn','nid']), on='pn', how='left')
    # filter by months
    train_mask = labels['period'].isin(months_train)
    val_mask = labels['period'].isin(months_val)
    test_mask = labels['period'].isin(months_test)
    # ensure y aligns with data['pekerja'].y ordering (assume nid is 0..N-1)
    labels_sorted = labels.sort_values('nid')

    train_idx = torch.tensor(labels_sorted[train_mask]['nid'].values.astype('int64'))
    val_idx = torch.tensor(labels_sorted[val_mask]['nid'].values.astype('int64'))
    test_idx = torch.tensor(labels_sorted[test_mask]['nid'].values.astype('int64'))

    pos_idx = torch.tensor(labels_sorted[train_mask & (labels_sorted['is_fraud']==1)]['nid'].values.astype('int64'))

    # 6) loaders
    train_loader = make_neighbor_loader(data, train_idx, pos_idx, batch_size=KNOBS['batch_size_nodes'],
                                       num_neighbors=KNOBS['num_neighbors'], oversample_ratio=4)
    val_loader = make_neighbor_loader(data, val_idx, pos_idx=torch.tensor([]), batch_size=KNOBS['batch_size_nodes'],
                                     num_neighbors=KNOBS['num_neighbors'], oversample_ratio=1, shuffle=False)
    test_loader = make_neighbor_loader(data, test_idx, pos_idx=torch.tensor([]), batch_size=KNOBS['batch_size_nodes'],
                                      num_neighbors=KNOBS['num_neighbors'], oversample_ratio=1, shuffle=False)

    # 7) model
    print('Step 6: create model and train')
    # determine in_dims per node type
    in_dims = {}
    for ntype in data.node_types:
        if hasattr(data[ntype], 'x'):
            in_dims[ntype] = data[ntype].x.size(1)
    metadata = (list(data.node_types), list(data.edge_types))
    model = CompactHeteroSAGE(metadata=metadata, in_dims=in_dims, hidden=KNOBS['hidden_dim'], num_layers=2)

    # 8) train
    trained = train_model(model, train_loader, val_loader, device=KNOBS['device'], epochs=5, accum_steps=2)

    # 9) final evaluation on test set
    test_metrics = evaluate_model(trained, test_loader, device=KNOBS['device'])
    print('Test metrics:', test_metrics)

    # save model
    torch.save(trained.state_dict(), os.path.join("D:\zman\graph\notebook\model", 'hetero_sage_pekerja.pt'))
    print('Version A pipeline completed. Artifacts saved under', "D:\zman\graph\notebook\model")


In [13]:
# -----------------------------------------------------------------------------
# CLI / example usage
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # Example months split (edit according to your partitions e.g. '202308'..'202408')
    months_train = ['202308','202309','202310','202311','202312','202401','202402','202403']
    months_val = ['202404']
    months_test = ['202405']
    # label_parquet_path must contain columns: pn, is_fraud, period. If nid not provided, script maps pn->nid via pekerja_map
    label_parquet_path = os.path.join(BASE_DIR, 'labels', 'pekerja_labels.parquet')
    version_a_runner(months_train, months_val, months_test, label_parquet_path)


Step 1: building id maps (streaming)
Loading map from ./maps\map_pekerja.pkl
Loading map from ./maps\map_nasabah.pkl
Loading map from ./maps\map_rekening.pkl


MemoryError: 