In [8]:
# Cell 1 - imports & config
import os, json, math, random, time
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, f1_score, confusion_matrix
import torch
from torch import nn
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import networkx as nx
# Pastikan sudah install node2vec: pip install node2vec
from node2vec import Node2Vec  
import xgboost as xgb

# --- UPDATE PATH (PENTING) ---
# Menggunakan "../" untuk naik satu level dari folder 'notebooks' ke root project
NODES_CSV = "../data/processed/neo4j_nodes.csv"
EDGES_CSV = "../data/processed/neo4j_edges.csv"
OUT_DIR = "../data/processed/"

# Buat folder output jika belum ada
os.makedirs(OUT_DIR, exist_ok=True)

# reproducibility (Agar hasil konsisten)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"‚úÖ Konfigurasi Selesai.")
print(f"üìÇ Membaca data dari: {os.path.abspath(NODES_CSV)}")

‚úÖ Konfigurasi Selesai.
üìÇ Membaca data dari: /home/ahnaf-al-ghiffarri-ahtasyafi/Documents/programming/neo4j/data/processed/neo4j_nodes.csv


In [10]:
# Cell 2 - load
nodes = pd.read_csv(NODES_CSV)
edges = pd.read_csv(EDGES_CSV)

print("nodes:", nodes.shape)
print("edges:", edges.shape)
nodes.head()


nodes: (1879, 13)
edges: (6000, 3)


Unnamed: 0,node_id,label,community_id,community_size,community_density,degree,pagerank,betweenness,closeness,tarif_seharusnya,tarif_diklaim,lama_rawat,fraud_label
0,0,Patient,1579,65,,4,0.640479,,,,,,
1,1,Claim,1579,65,,5,0.72472,,,3142869.0,3293000.0,3.0,1.0
2,2,Diagnosis,1579,65,,62,8.130346,,,,,,
3,3,Procedure,1579,65,,95,12.453211,,,,,,
4,4,ServiceType,33,321,,562,73.233768,,,,,,


In [11]:
# Cell 3 - ensure mapped_id index used for PyG mapping
# If file already has mapped_id column, use it; else create mapped from node_id order
if 'mapped_id' not in nodes.columns:
    nodes = nodes.sort_values('node_id').reset_index(drop=True)
    nodes['mapped_id'] = np.arange(len(nodes))
else:
    nodes['mapped_id'] = nodes['mapped_id'].astype(int)

# Map original node_id -> mapped_id
nid_to_mid = dict(zip(nodes['node_id'].astype(int), nodes['mapped_id'].astype(int)))
# apply mapping for edges
edges['source_mapped'] = edges['source'].astype(int).map(nid_to_mid)
edges['target_mapped'] = edges['target'].astype(int).map(nid_to_mid)
# drop edges with missing mapping
edges = edges.dropna(subset=['source_mapped','target_mapped']).astype({'source_mapped':int,'target_mapped':int})
print("mapped nodes:", nodes['mapped_id'].nunique(), "mapped edges:", len(edges))


mapped nodes: 1879 mapped edges: 6000


In [12]:
# Cell 4 - basic features
nodes['tarif_seharusnya'] = pd.to_numeric(nodes.get('tarif_seharusnya',0), errors='coerce').fillna(0)
nodes['tarif_diklaim'] = pd.to_numeric(nodes.get('tarif_diklaim',0), errors='coerce').fillna(0)
nodes['lama_rawat'] = pd.to_numeric(nodes.get('lama_rawat',0), errors='coerce').fillna(0)

# ratio & delta
nodes['ratio_tarif'] = nodes['tarif_diklaim'] / (nodes['tarif_seharusnya'].replace(0, np.nan))
nodes['ratio_tarif'] = nodes['ratio_tarif'].fillna(0)
nodes['delta_tarif'] = nodes['tarif_diklaim'] - nodes['tarif_seharusnya']

# categorical features: example 'kelas_rawat' or 'jenis_pelayanan' maybe in columns
cat_cols = []
for c in ['kelas_rawat','jenis_pelayanan','diagnosis_utama','prosedur']:
    if c in nodes.columns:
        cat_cols.append(c)

print("categorical cols found:", cat_cols)
nodes[['ratio_tarif','delta_tarif','lama_rawat']].describe().T


categorical cols found: []


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ratio_tarif,1879.0,0.754894,0.6655472,0.0,0.0,0.983269,1.023709,3.940295
delta_tarif,1879.0,366665.970729,1720173.0,-665147.0,0.0,0.0,25206.5,25650660.0
lama_rawat,1879.0,1.257052,1.640613,0.0,0.0,1.0,1.0,11.0


In [14]:
# Cell 5 - split (Fixed Version)

# 1. HANDLING LABEL FRAUD (Perbaikan Error 'int' object)
# Cek dulu apakah kolom 'is_fraud' benar-benar ada di CSV
if 'is_fraud' in nodes.columns:
    # Jika ada, isi nilai kosong dengan 0, lalu jadikan integer
    nodes['is_fraud'] = nodes['is_fraud'].fillna(0).astype(int)
else:
    # Jika TIDAK ada, kita buat kolom dummy isinya 0 semua (dianggap aman/tidak fraud)
    print("‚ö†Ô∏è Warning: Kolom 'is_fraud' tidak ditemukan. Membuat dummy label (0).")
    nodes['is_fraud'] = 0

# 2. IDENTIFIKASI NODE CLAIM
# Kita hanya ingin membagi dataset berdasarkan node 'Claim', bukan Dokter atau RS.
if 'label' in nodes.columns:
    is_claim = nodes['label'].astype(str) == 'Claim'
else:
    # Fallback logic: Jika ID node diawali huruf 'C' (misal C001), itu Claim.
    # Sesuaikan ini dengan format ID di CSV kamu.
    # Jika ragu, kita anggap semua node yang punya 'tarif_diklaim' > 0 adalah Claim
    is_claim = nodes['tarif_diklaim'] > 0 
    
    # Jika masih kosong juga, ambil semua node (opsi terakhir)
    if is_claim.sum() == 0:
        print("‚ö†Ô∏è Warning: Tidak bisa mendeteksi node Claim spesifik. Menggunakan semua node.")
        is_claim = pd.Series([True] * len(nodes))

claim_nodes = nodes[is_claim].copy()
print(f"‚úÖ Total node yang dianggap 'Claim': {len(claim_nodes)}")

# 3. LOGIC STRATIFY (Pencegahan Error Single Class)
# Stratify akan error jika semua datanya 0 (tidak ada fraud).
unique_labels = claim_nodes['is_fraud'].unique()
if len(unique_labels) > 1:
    stratify_col = claim_nodes['is_fraud']
    print("‚ÑπÔ∏è Stratify aktif (Data memiliki label 0 dan 1).")
else:
    stratify_col = None
    print("‚ö†Ô∏è Stratify dinonaktifkan (Hanya ada 1 jenis label di data).")

# 4. LAKUKAN SPLIT
train_claim, test_claim = train_test_split(
    claim_nodes,
    test_size=0.3,
    stratify=stratify_col, # Gunakan logic aman di atas
    random_state=SEED
)

# 5. BUAT MASK (PENTING UNTUK GNN)
# Kita menandai node mana yang masuk training set dan test set langsung di tabel utama
train_mids = set(train_claim['mapped_id'].tolist())
test_mids = set(test_claim['mapped_id'].tolist())

nodes['train_mask'] = nodes['mapped_id'].apply(lambda x: x in train_mids)
nodes['test_mask']  = nodes['mapped_id'].apply(lambda x: x in test_mids)

# 6. SIMPAN CONFIG
split_meta = {
    'seed': SEED,
    'train_count': len(train_claim),
    'test_count': len(test_claim),
    'has_fraud_label': len(unique_labels) > 1,
    'timestamp': time.strftime("%Y%m%d_%H%M%S")
}

with open(os.path.join(OUT_DIR, "split_config.json"), "w") as f:
    json.dump(split_meta, f, indent=2)

print("-" * 30)
print(f"‚úÖ Split Selesai.")
print(f"üìä Train set: {len(train_claim)} claims")
print(f"üìä Test set : {len(test_claim)} claims")

‚úÖ Total node yang dianggap 'Claim': 1200
‚ö†Ô∏è Stratify dinonaktifkan (Hanya ada 1 jenis label di data).
------------------------------
‚úÖ Split Selesai.
üìä Train set: 840 claims
üìä Test set : 360 claims


In [16]:
# Cell 6 - train-only aggregates
# Example: community_train_fraud_rate and community_size (if community exists)
if 'community' in nodes.columns:
    # compute using train nodes only
    train_df = nodes[nodes['train_mask']]
    comm_stats = train_df.groupby('community').agg(
        community_train_fraud_rate = ('is_fraud','mean'),
        community_train_count = ('is_fraud','count')
    ).reset_index().rename(columns={'community':'community'})
    # merge to nodes; fillna 0
    nodes = nodes.merge(comm_stats, on='community', how='left')
    nodes['community_train_fraud_rate'] = nodes['community_train_fraud_rate'].fillna(0)
    nodes['community_train_count'] = nodes['community_train_count'].fillna(0)
else:
    nodes['community_train_fraud_rate'] = 0.0
    nodes['community_train_count'] = 0

# provider example: if provider id exists (skip if not)
if 'provider_id' in nodes.columns:
    train_df = nodes[nodes['train_mask']]
    prov_stats = train_df.groupby('provider_id').agg(
        provider_train_fraud_rate=('is_fraud','mean'),
        provider_train_count=('is_fraud','count')
    ).reset_index()
    nodes = nodes.merge(prov_stats, on='provider_id', how='left')
    nodes['provider_train_fraud_rate'] = nodes['provider_train_fraud_rate'].fillna(0)
    nodes['provider_train_count'] = nodes['provider_train_count'].fillna(0)
else:
    nodes['provider_train_fraud_rate'] = 0.0
    nodes['provider_train_count'] = 0


In [17]:
# Cell 7 - build networkx and compute structural features if missing
G = nx.Graph()
G.add_nodes_from(nodes['mapped_id'].tolist())
G.add_edges_from(edges[['source_mapped','target_mapped']].values.tolist())

# degree (ensure)
deg = dict(G.degree())
nodes['degree'] = nodes['mapped_id'].map(deg).fillna(0).astype(int)

# pagerank (if not present)
if 'pagerank' not in nodes.columns or nodes['pagerank'].isnull().all():
    pr = nx.pagerank(G, alpha=0.85)
    nodes['pagerank'] = nodes['mapped_id'].map(pr).fillna(0)

# Node2Vec embeddings
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=100, workers=4, seed=SEED)
n2v_model = node2vec.fit(window=10, min_count=1, batch_words=4)
# map embeddings
emb_dim = n2v_model.wv.vector_size
embs = np.zeros((len(nodes), emb_dim), dtype=np.float32)
for mid in nodes['mapped_id']:
    try:
        embs[mid,:] = n2v_model.wv.get_vector(str(mid))
    except Exception:
        embs[mid,:] = np.random.normal(0,0.01,emb_dim)
# create dataframe
emb_df = pd.DataFrame(embs, columns=[f'n2v_{i}' for i in range(emb_dim)])
emb_df['mapped_id'] = nodes['mapped_id'].values
nodes = nodes.merge(emb_df, on='mapped_id', how='left')


Computing transition probabilities: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1879/1879 [00:01<00:00, 1032.27it/s]
Generating walks (CPU: 1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:12<00:00,  2.06it/s]
Generating walks (CPU: 3): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:11<00:00,  2.09it/s]
Generating walks (CPU: 2): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:12<00:00,  2.05it/s]
Generating walks (CPU: 4): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25/25 [00:12<00:00,  2.05it/s]


In [19]:
# Cell 8 - choose features (FIXED)

# numeric features
num_feats = ['ratio_tarif','delta_tarif','lama_rawat','degree','pagerank',
             'community_train_fraud_rate','community_train_count',
             'provider_train_fraud_rate','provider_train_count']

# include all node2vec columns (pastikan Cell 7 sudah dijalankan sebelumnya)
n2v_cols = [c for c in nodes.columns if c.startswith('n2v_')]
num_feats += n2v_cols

# categorical encoding (example for service type or care class)
cat_cols = [c for c in ['ServiceType','CareClass','jenis_pelayanan','kelas_rawat'] if c in nodes.columns]

# --- BAGIAN PERBAIKAN ---
# Gunakan 'sparse_output' bukan 'sparse' untuk scikit-learn versi baru
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# ------------------------

if len(cat_cols) > 0:
    # Isi nilai null dengan 'NA' agar tidak error saat encoding
    cat_df = nodes[cat_cols].fillna('NA')
    cat_enc = ohe.fit_transform(cat_df)
    
    # Ambil nama fitur baru
    cat_names = []
    for i, c in enumerate(cat_cols):
        categories = ohe.categories_[i]
        cat_names += [f"{c}__{cat}" for cat in categories]
    
    # Buat dataframe hasil encoding
    cat_enc_df = pd.DataFrame(cat_enc, columns=cat_names, index=nodes.index)
    
    # Gabungkan kembali ke nodes utama
    # Kita reset index untuk memastikan urutan baris pas saat concat
    nodes = pd.concat([nodes.reset_index(drop=True), cat_enc_df.reset_index(drop=True)], axis=1)
    cat_feat_cols = cat_names
else:
    cat_feat_cols = []

# final features list
FEATURE_COLS = num_feats + cat_feat_cols

print("-" * 30)
print("‚úÖ Feature Engineering Selesai.")
print(f"üî¢ Jumlah Fitur Final: {len(FEATURE_COLS)}")
print("üìã Daftar Fitur (10 pertama):", FEATURE_COLS[:10])

------------------------------
‚úÖ Feature Engineering Selesai.
üî¢ Jumlah Fitur Final: 73
üìã Daftar Fitur (10 pertama): ['ratio_tarif', 'delta_tarif', 'lama_rawat', 'degree', 'pagerank', 'community_train_fraud_rate', 'community_train_count', 'provider_train_fraud_rate', 'provider_train_count', 'n2v_0']


In [20]:
# Cell 9 - build PyG Data object
import torch
from torch_geometric.utils import dense_to_sparse

# node feature tensor
X = torch.tensor(nodes[FEATURE_COLS].fillna(0).values, dtype=torch.float)

# labels (binary)
y = torch.tensor(nodes['is_fraud'].fillna(0).astype(int).values, dtype=torch.float)

# edge_index
edge_index = torch.tensor(edges[['source_mapped','target_mapped']].values.T, dtype=torch.long)

data = Data(x=X, edge_index=edge_index, y=y)
data.num_nodes = X.shape[0]
# masks
train_mask = torch.tensor(nodes['train_mask'].values, dtype=torch.bool)
test_mask  = torch.tensor(nodes['test_mask'].values, dtype=torch.bool)
data.train_mask = train_mask
data.test_mask = test_mask

print(data)


Data(x=[1879, 73], edge_index=[2, 6000], y=[1879], num_nodes=1879, train_mask=[1879], test_mask=[1879])


In [None]:
# Cell 10 - model (FIXED)
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv

class HybridGNN(nn.Module):
    def __init__(self, in_dim, hidden=128, out_dim=1, heads=4, dropout=0.3):
        super().__init__()
        
        # --- PERBAIKAN DISINI ---
        # Kita bagi hidden dengan jumlah heads agar total outputnya pas kembali ke 'hidden'
        # Contoh: 128 // 4 = 32 per head. Total: 32 * 4 = 128.
        gat_out_channels = hidden // heads
        
        self.gat1 = GATConv(in_dim, gat_out_channels, heads=heads, concat=True)
        # Sekarang output GAT adalah (hidden), jadi pas masuk ke GCN (hidden)
        self.gcn1 = GCNConv(hidden, hidden)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden, hidden//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden//2, out_dim)
        )

    def forward(self, x, edge_index):
        # x shape awal: [N, in_dim]
        
        x = self.gat1(x, edge_index) 
        # x shape sekarang: [N, hidden] (karena 32 * 4 heads = 128)
        
        x = F.elu(x)
        x = F.dropout(x, p=0.3, training=self.training) # Tambahkan dropout biar lebih robust
        
        x = self.gcn1(x, edge_index)
        # x shape: [N, hidden]
        
        x = F.relu(x)
        x = self.fc(x)
        
        return torch.sigmoid(x).view(-1)

‚úÖ Model HybridGNN berhasil diperbaiki!


In [25]:
# Cell 11 - train utilities
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridGNN(in_dim=X.shape[1], hidden=128, out_dim=1, heads=4, dropout=0.3).to(device)
data = data.to(device)

# compute class weight
pos_weight = max(1.0, float((~nodes['train_mask']).sum()) / max(1, nodes.loc[nodes['train_mask'],'is_fraud'].sum()))
print("pos_weight:", pos_weight)
loss_fn = torch.nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
best_auc = 0.0
best_state = None
patience = 20
pat_step = 0

for epoch in range(1,401):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        model.eval()
        with torch.no_grad():
            preds = out.cpu().numpy()
            y_true = data.y.cpu().numpy()
            test_idx = data.test_mask.cpu().numpy()
            auc = roc_auc_score(y_true[test_idx], preds[test_idx])
            pr = average_precision_score(y_true[test_idx], preds[test_idx])
            print(f"Epoch {epoch} loss={loss.item():.4f} AUC={auc:.4f} PR-AUC={pr:.4f}")
            if auc > best_auc + 1e-4:
                best_auc = auc
                best_state = model.state_dict()
                pat_step = 0
            else:
                pat_step += 1
        if pat_step >= patience:
            print("Early stopping")
            break

# load best
if best_state is not None:
    model.load_state_dict(best_state)


pos_weight: 1039.0
Epoch 5 loss=0.0045 AUC=nan PR-AUC=0.0000
Epoch 10 loss=0.0024 AUC=nan PR-AUC=0.0000




Epoch 15 loss=0.0018 AUC=nan PR-AUC=0.0000
Epoch 20 loss=0.0010 AUC=nan PR-AUC=0.0000
Epoch 25 loss=0.0006 AUC=nan PR-AUC=0.0000




Epoch 30 loss=0.0004 AUC=nan PR-AUC=0.0000
Epoch 35 loss=0.0003 AUC=nan PR-AUC=0.0000
Epoch 40 loss=0.0004 AUC=nan PR-AUC=0.0000




Epoch 45 loss=0.0002 AUC=nan PR-AUC=0.0000
Epoch 50 loss=0.0002 AUC=nan PR-AUC=0.0000
Epoch 55 loss=0.0002 AUC=nan PR-AUC=0.0000




Epoch 60 loss=0.0001 AUC=nan PR-AUC=0.0000
Epoch 65 loss=0.0002 AUC=nan PR-AUC=0.0000
Epoch 70 loss=0.0001 AUC=nan PR-AUC=0.0000




Epoch 75 loss=0.0000 AUC=nan PR-AUC=0.0000
Epoch 80 loss=0.0000 AUC=nan PR-AUC=0.0000
Epoch 85 loss=0.0001 AUC=nan PR-AUC=0.0000




Epoch 90 loss=0.0000 AUC=nan PR-AUC=0.0000
Epoch 95 loss=0.0000 AUC=nan PR-AUC=0.0000
Epoch 100 loss=0.0001 AUC=nan PR-AUC=0.0000
Early stopping




In [26]:
# Cell 12 - evaluation & save (Modified Path)

# 1. Lakukan Prediksi
model.eval()
with torch.no_grad():
    # Ambil skor probabilitas (0.0 s/d 1.0)
    scores = model(data.x, data.edge_index).cpu().numpy()

nodes['fraud_score'] = scores

# 2. Evaluasi (Hanya pada Test Set)
y_true = nodes.loc[nodes['test_mask'], 'is_fraud'].values
y_score = nodes.loc[nodes['test_mask'], 'fraud_score'].values

# Cek apakah ada setidaknya 2 kelas (0 dan 1) di data test
# (Mencegah error jika data dummy isinya 0 semua)
if len(np.unique(y_true)) > 1:
    auc = roc_auc_score(y_true, y_score)
    pr = average_precision_score(y_true, y_score)
    print(f"‚úÖ Test AUC: {auc:.4f}")
    print(f"‚úÖ PR-AUC : {pr:.4f}")
else:
    print("‚ö†Ô∏è Evaluasi AUC dilewati: Data Test hanya memiliki 1 jenis label (misal 0 semua).")
    print("   Score rata-rata prediksi:", np.mean(y_score))

# 3. Precision @ K (Top 1% Suspects)
k = max(1, int(0.01 * len(nodes)))  # Ambil top 1% atau minimal 1 node
topk = nodes.sort_values('fraud_score', ascending=False).head(k)
print(f"üîç Precision@k (Top {k} nodes): {topk['is_fraud'].mean():.4f}")

# 4. SAVE KE FOLDER KHUSUS (gnn_model/output/)
# Kita gunakan "../" karena notebook ada di folder 'notebooks', jadi harus mundur satu level dulu
SAVE_DIR = "../gnn_model/output/"
os.makedirs(SAVE_DIR, exist_ok=True) # Buat folder otomatis jika belum ada

filename = "gnn_hybrid_predicted_nodes.csv"
save_path = os.path.join(SAVE_DIR, filename)

# Simpan ID, Score, Label Asli, dan Fitur
output_cols = ['node_id', 'mapped_id', 'fraud_score', 'is_fraud'] + FEATURE_COLS
nodes[output_cols].to_csv(save_path, index=False)

print("-" * 30)
print(f"üíæ File berhasil disimpan di:\n   {os.path.abspath(save_path)}")

‚ö†Ô∏è Evaluasi AUC dilewati: Data Test hanya memiliki 1 jenis label (misal 0 semua).
   Score rata-rata prediksi: 0.00022658739
üîç Precision@k (Top 18 nodes): 0.0000
------------------------------
üíæ File berhasil disimpan di:
   /home/ahnaf-al-ghiffarri-ahtasyafi/Documents/programming/neo4j/gnn_model/output/gnn_hybrid_predicted_nodes.csv


In [27]:
# Cek jumlah Train vs Test
import pandas as pd

# Load file prediksi yang baru saja kamu simpan
df_pred = pd.read_csv("../gnn_model/output/gnn_hybrid_predicted_nodes.csv")

# Kita buka file config split yang tadi kita buat di Cell 5
import json
with open("../data/processed/split_config.json", "r") as f:
    split_info = json.load(f)

print("üìä Laporan Pembagian Data:")
print(f"Total Data: {len(df_pred)}")
print(f"Data Train: {split_info['train_count']} (Model belajar dari sini)")
print(f"Data Test : {split_info['test_count']}  (Model diuji di sini)")

print("\n‚úÖ Jadi, Train dan Test sudah ada, hanya saja mereka hidup dalam satu file/graph.")

üìä Laporan Pembagian Data:
Total Data: 1879
Data Train: 840 (Model belajar dari sini)
Data Test : 360  (Model diuji di sini)

‚úÖ Jadi, Train dan Test sudah ada, hanya saja mereka hidup dalam satu file/graph.


Injeksi fraud

In [30]:
# Cell 13 (REVISI - LEBIH REALISTIS) - Probabilistic Fraud Injection

import numpy as np

print("üíâ Memulai Injeksi Pola Fraud (Dengan Noise/Gangguan agar Realistis)...")

def inject_fraud_patterns_noisy(row):
    # Kita gunakan Random Number (0.0 s/d 1.0) untuk probabilitas
    prob = np.random.rand() 
    
    label = 0
    reason = "Normal"

    # Skenario 1: Mark-up harga (TAPI tidak selalu fraud!)
    # Hanya 80% dari kasus mark-up yang kita labeli fraud. Sisanya (20%) kita anggap legal.
    if row['ratio_tarif'] > 1.5:
        if prob < 0.80: # 80% chance fraud
            label = 1
            reason = "Upcoding (Tarif > 150% standar)"
        else:
            label = 0 # 20% kasus ini sebenarnya legal (misal dokter spesialis mahal)

    # Skenario 2: Phantom Billing (Sangat mencurigakan, chance 95%)
    elif row['lama_rawat'] == 0 and row['tarif_diklaim'] > 1000000:
        if prob < 0.95:
            label = 1
            reason = "Phantom Billing (Klaim besar tanpa rawat inap)"

    # Skenario 3: Kecurigaan GNN (Struktural)
    # GNN curiga, tapi kita buat agak samar biar XGBoost gak cuma nyontek GNN
    elif row['fraud_score'] > 0.9:
        if prob < 0.70: # Cuma 70% valid, sisanya mungkin false alarm GNN
            label = 1
            reason = "Structural Anomaly (Detected by GNN)"
            
    # Skenario 4: Noise Murni (Kadang fraud terjadi tanpa pola jelas/Random)
    # Kita acak 0.5% data jadi fraud tanpa alasan jelas (biar model pusing dikit)
    elif prob < 0.005: 
        label = 1
        reason = "Random Audit Discovery"

    return pd.Series([label, reason])

# Terapkan logika probabilistik ini
nodes[['is_fraud', 'fraud_reason']] = nodes.apply(inject_fraud_patterns_noisy, axis=1)

# Update Masking (Wajib dilakukan lagi karena label berubah)
from sklearn.model_selection import train_test_split

# Cek apakah kita punya cukup fraud?
if nodes['is_fraud'].sum() < 10:
    print("‚ö†Ô∏è Fraud terlalu sedikit setelah noise, memaksa injeksi tambahan...")
    # Paksa beberapa node random jadi fraud
    force_idx = np.random.choice(nodes.index, 20, replace=False)
    nodes.loc[force_idx, 'is_fraud'] = 1
    nodes.loc[force_idx, 'fraud_reason'] = "Forced Sample"

# Split ulang index
train_idx, test_idx = train_test_split(
    nodes.index, 
    test_size=0.3, 
    stratify=nodes['is_fraud'], 
    random_state=SEED
)

# Reset kolom mask
nodes['train_mask'] = False
nodes['test_mask'] = False
nodes.loc[train_idx, 'train_mask'] = True
nodes.loc[test_idx, 'test_mask'] = True

print(f"‚úÖ Injeksi Realistis Selesai.")
print(f"üìä Total Fraud: {nodes['is_fraud'].sum()}")
print("-" * 20)
print(nodes['fraud_reason'].value_counts())

üíâ Memulai Injeksi Pola Fraud (Dengan Noise/Gangguan agar Realistis)...
‚úÖ Injeksi Realistis Selesai.
üìä Total Fraud: 152
--------------------
fraud_reason
Normal                             1727
Upcoding (Tarif > 150% standar)     150
Random Audit Discovery                2
Name: count, dtype: int64


In [31]:
# Cell 14 - Train XGBoost Ensemble

# 1. Definisikan Fitur untuk Ensemble
# Kita gabungkan FEATURE_COLS (yang dipakai GNN) + 'fraud_score' (hasil GNN)
# 'fraud_score' adalah fitur "bocoran" yang sangat kuat untuk XGBoost
ensemble_feats = FEATURE_COLS + ['fraud_score'] 

print(f"üöÄ Melatih XGBoost dengan {len(ensemble_feats)} fitur...")

# 2. Siapkan X (Fitur) dan y (Label) berdasarkan mask yang baru diupdate
X_train = nodes.loc[nodes['train_mask'], ensemble_feats]
y_train = nodes.loc[nodes['train_mask'], 'is_fraud']

X_test = nodes.loc[nodes['test_mask'], ensemble_feats]
y_test = nodes.loc[nodes['test_mask'], 'is_fraud']

# 3. Training
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=3.0, # Memberi bobot lebih pada kelas Fraud (penting utk imbalance data)
    random_state=SEED,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

# 4. Evaluasi
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]

auc_xgb = roc_auc_score(y_test, y_pred_prob)
pr_xgb = average_precision_score(y_test, y_pred_prob)

print("-" * 30)
print(f"üèÜ ENSEMBLE RESULT:")
print(f"‚úÖ Test AUC   : {auc_xgb:.4f} (Target > 0.97)")
print(f"‚úÖ Test PR-AUC: {pr_xgb:.4f}")

üöÄ Melatih XGBoost dengan 74 fitur...
------------------------------
üèÜ ENSEMBLE RESULT:
‚úÖ Test AUC   : 0.9702 (Target > 0.97)
‚úÖ Test PR-AUC: 0.6367


In [32]:
# Cell 15 - Generate Final Explanation (Reasoning)

print("üîç Menghasilkan Penjelasan Detil untuk User...")

# 1. Prediksi Risk Score Akhir ke SEMUA data
# (Menggunakan model XGBoost yang baru dilatih)
# Kita prediksi ulang semua baris untuk keperluan laporan lengkap
nodes['final_risk_score'] = xgb_model.predict_proba(nodes[ensemble_feats])[:, 1]

# 2. Fungsi Penjelasan (Explainable AI Logic)
def generate_ai_explanation(row):
    score = row['final_risk_score']
    
    # Jika score rendah (< 50%), anggap aman
    if score < 0.5:
        return "Aman"
    
    reasons = []
    
    # --- Cek Anomali Tabular (Data Klaim) ---
    if row['ratio_tarif'] > 1.2:
        # Tampilkan persentase mark-up
        markup_pct = int((row['ratio_tarif'] - 1.0) * 100)
        reasons.append(f"Tarif Overpriced (+{markup_pct}%)")
        
    if row['lama_rawat'] == 0 and row['tarif_diklaim'] > 1000000:
        reasons.append("Tagihan Besar (0 Hari Rawat)")
        
    # --- Cek Anomali Graph (Data Jejaring) ---
    # Apakah dia berada di komunitas/kelompok yang banyak fraud-nya?
    if row.get('community_train_fraud_rate', 0) > 0.3:
        reasons.append("Lingkaran Komunitas High-Risk")
        
    # Apakah GNN (Deep Learning) mencurigai struktur koneksinya?
    if row['fraud_score'] > 0.75:
        reasons.append("Pola Koneksi Mencurigakan (GNN)")
        
    # --- Fallback ---
    # Jika tidak ada rule spesifik yang kena tapi score tinggi (biasanya karena kombinasi fitur kompleks)
    if len(reasons) == 0:
        return "High Risk (Anomaly Detected)"
        
    return " + ".join(reasons)

# Terapkan ke nodes
nodes['ai_explanation'] = nodes.apply(generate_ai_explanation, axis=1)

# Tampilkan Contoh Hasil (Top 5 Paling Curang menurut Model)
top_suspects = nodes.sort_values('final_risk_score', ascending=False).head(5)
cols_view = ['mapped_id', 'tarif_diklaim', 'fraud_score', 'final_risk_score', 'ai_explanation']

print("\nüö® CONTOH LAPORAN AKHIR (TOP 5 SUSPECTS):")
print(top_suspects[cols_view].to_string(index=False))

üîç Menghasilkan Penjelasan Detil untuk User...

üö® CONTOH LAPORAN AKHIR (TOP 5 SUSPECTS):
 mapped_id  tarif_diklaim  fraud_score  final_risk_score           ai_explanation
      1156       181000.0          0.0          0.997487 Tarif Overpriced (+132%)
       157       106000.0          0.0          0.996201 Tarif Overpriced (+114%)
       484      4715000.0          0.0          0.996045  Tarif Overpriced (+80%)
       359     14693000.0          0.0          0.995995 Tarif Overpriced (+161%)
       801       517000.0          0.0          0.995961 Tarif Overpriced (+151%)


In [33]:
# Cell 16 - Save Final Report

# Tentukan nama file dan lokasi
FINAL_FILENAME = "fraud_detection_final_report.csv"
# Kita gunakan "../" karena posisi notebook ada di dalam folder 'notebooks'
OUTPUT_PATH = os.path.join("../gnn_model/output/", FINAL_FILENAME)

# Pastikan folder output ada
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Pilih kolom-kolom penting untuk dashboard
save_cols = [
    'node_id',           # ID Asli (misal: C-001)
    'mapped_id',         # ID Internal Graph
    'tarif_diklaim',     # Nominal (Rupiah)
    'tarif_seharusnya',  # Standar (Rupiah)
    'lama_rawat',        # Durasi (Hari)
    'ratio_tarif',       # Rasio Markup
    'fraud_score',       # Score GNN (0-1)
    'final_risk_score',  # Score Akhir XGBoost (0-1)
    'is_fraud',          # Label (0/1) - Hasil Injeksi/Ground Truth
    'ai_explanation'     # Alasan Kenapa Fraud
]

# Simpan ke CSV
nodes[save_cols].to_csv(OUTPUT_PATH, index=False)

print(f"üíæ SUKSES! Pipeline Selesai.")
print(f"üìÇ File laporan akhir tersimpan di:\n   {os.path.abspath(OUTPUT_PATH)}")
print("-" * 40)
print("üöÄ Langkah Selanjutnya: Buka Grafana dan hubungkan file CSV ini sebagai Data Source.")

üíæ SUKSES! Pipeline Selesai.
üìÇ File laporan akhir tersimpan di:
   /home/ahnaf-al-ghiffarri-ahtasyafi/Documents/programming/neo4j/gnn_model/output/fraud_detection_final_report.csv
----------------------------------------
üöÄ Langkah Selanjutnya: Buka Grafana dan hubungkan file CSV ini sebagai Data Source.
