<a href="https://colab.research.google.com/github/apriandito/pertamina-2/blob/main/04_xai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
! pip install dalex



In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    roc_curve
)
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import dalex as dx

In [10]:
# 1. Load training data
train_url = (
    "https://raw.githubusercontent.com/apriandito/pertamina-2/"
    "refs/heads/main/data/bbm_fraud_train.csv"
)
df_train = pd.read_csv(train_url)
df_train.head()

Unnamed: 0,volume_liters,total_amount,hour,is_weekend,loyalty_member,customer_transaction_count,days_since_last_transaction,same_day_transactions,volume_deviation,amount_deviation,...,is_night_transaction,bbm_type_encoded,payment_method_encoded,day_of_week_encoded,customer_type_encoded,spbu_category_encoded,spbu_province_encoded,spbu_city_encoded,is_fraud,fraud_type
0,578.37,8039343.0,7,True,False,1,0.0,1,0.0,0.0,...,False,3,1,2,1,0,5,13,0,
1,664.13,4516084.0,8,False,False,2,109.0,1,0.069022,0.280616,...,False,4,2,6,1,1,3,26,0,
2,163.24,2024176.0,12,False,False,3,37.0,1,0.651628,0.583492,...,False,2,2,0,1,1,2,7,0,
3,54.64,759455.7,4,False,False,4,54.0,1,0.85034,0.801955,...,True,3,4,4,1,1,8,24,1,multiple_cards
4,357.87,4437588.0,2,True,False,5,8.0,1,0.015894,0.121926,...,True,2,1,2,1,2,8,24,0,


In [11]:
# 2. Siapkan fitur & target
X = df_train.drop(columns=["is_fraud", "fraud_type"])
y = df_train["is_fraud"]

In [12]:
# 3. Split data (chronological, tanpa shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [13]:
# 4. Build & fit ANN (MLP)
model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=200,
    random_state=42
)
model.fit(X_train, y_train)

In [14]:
# 5. Predict & evaluasi
y_pred  = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

accuracy    = accuracy_score(y_test, y_pred)
precision   = precision_score(y_test, y_pred)
recall      = recall_score(y_test, y_pred)
specificity = tn / (tn + fp)
f1          = f1_score(y_test, y_pred)
roc_auc     = roc_auc_score(y_test, y_proba)

print("=== Evaluation on Test Set (ANN MLP) ===")
print(f"Accuracy   : {accuracy:.4f}")
print(f"Precision  : {precision:.4f}")
print(f"Recall     : {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1-score   : {f1:.4f}")
print(f"ROC AUC    : {roc_auc:.4f}")


=== Evaluation on Test Set (ANN MLP) ===
Accuracy   : 0.8486
Precision  : 0.1516
Recall     : 0.4438
Specificity: 0.8698
F1-score   : 0.2260
ROC AUC    : 0.7651


In [15]:
# 6. Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=fpr, y=tpr,
    mode='lines',
    name=f'ANN (AUC = {roc_auc:.3f})',
    line=dict(width=2)
))
fig.add_trace(go.Scatter(
    x=[0,1], y=[0,1],
    mode='lines',
    name='Random Guess',
    line=dict(dash='dash', width=1)
))
fig.update_layout(
    title='ROC Curve - ANN (MLP) Fraud Detection',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend=dict(x=0.65, y=0.15),
    template='plotly_white',
    width=700,
    height=500
)
fig.show()

In [16]:
# 7. Real‑time prediction
rt_url = (
    "https://raw.githubusercontent.com/apriandito/pertamina-2/"
    "refs/heads/main/data/bbm_fraud_realtime.csv"
)
df_rt = pd.read_csv(rt_url)
feature_cols = X.columns.tolist()
X_rt = df_rt[feature_cols]

df_rt['predicted_is_fraud'] = model.predict(X_rt)
df_rt['fraud_probability']  = model.predict_proba(X_rt)[:, 1]

print("\n=== Real‑time Prediction Summary (ANN MLP) ===")
print(f"Total transaksi    : {len(df_rt)}")
print(f"Predicted fraud     : {df_rt['predicted_is_fraud'].sum()} ({df_rt['predicted_is_fraud'].mean()*100:.2f}%)")
print(f"Average fraud prob  : {df_rt['fraud_probability'].mean():.4f}")


=== Real‑time Prediction Summary (ANN MLP) ===
Total transaksi    : 5000
Predicted fraud     : 1319 (26.38%)
Average fraud prob  : 0.2637


In [17]:
# 8. XAI dengan Dalex
explainer = dx.Explainer(model, X_train, y_train, label="ANN (MLP)")

Preparation of a new explainer is initiated

  -> data              : 40000 rows 21 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 40000 values
  -> model_class       : sklearn.neural_network._multilayer_perceptron.MLPClassifier (default)
  -> label             : ANN (MLP)
  -> predict function  : <function yhat_proba_default at 0x7f32ef4ea8e0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.161, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = -0.112, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!


In [18]:
# 8.1 Model performance
mp = explainer.model_performance()
mp.plot()

In [19]:
# 8.2 Feature importance (permutation)
parts = explainer.model_parts()
parts.plot()

In [20]:
# 8.3 Partial dependence profiles
profile = explainer.model_profile(variables=X.columns.tolist())
profile.plot()

Calculating ceteris paribus: 100%|██████████| 21/21 [00:01<00:00, 11.83it/s]


In [21]:
# 9. Breakdown plot untuk sample realtime menampilkan waterfall chart kontribusi fitur pada transaksi pertama
sample_rt = X_rt.iloc[0]
bd_rt = explainer.predict_parts(sample_rt, type='break_down')
bd_rt.plot()