<a href="https://colab.research.google.com/github/apriandito/pertamina-2/blob/main/02_beberapa_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    roc_curve
)
import plotly.graph_objects as go

In [None]:
# 1. Read the training set straight from its raw GitHub URL
train_url = "/".join([
    "https://raw.githubusercontent.com/apriandito/pertamina-2",
    "refs/heads/main/data",
    "bbm_fraud_train.csv",
])
df_train = pd.read_csv(train_url)

# Preview the first rows to sanity-check the columns that were loaded
df_train.head()

Unnamed: 0,volume_liters,total_amount,hour,is_weekend,loyalty_member,customer_transaction_count,days_since_last_transaction,same_day_transactions,volume_deviation,amount_deviation,...,is_night_transaction,bbm_type_encoded,payment_method_encoded,day_of_week_encoded,customer_type_encoded,spbu_category_encoded,spbu_province_encoded,spbu_city_encoded,is_fraud,fraud_type
0,578.37,8039343.0,7,True,False,1,0.0,1,0.0,0.0,...,False,3,1,2,1,0,5,13,0,
1,664.13,4516084.0,8,False,False,2,109.0,1,0.069022,0.280616,...,False,4,2,6,1,1,3,26,0,
2,163.24,2024176.0,12,False,False,3,37.0,1,0.651628,0.583492,...,False,2,2,0,1,1,2,7,0,
3,54.64,759455.7,4,False,False,4,54.0,1,0.85034,0.801955,...,True,3,4,4,1,1,8,24,1,multiple_cards
4,357.87,4437588.0,2,True,False,5,8.0,1,0.015894,0.121926,...,True,2,1,2,1,2,8,24,0,


In [None]:
# 2. Prepare the feature matrix and the target vector
# fraud_type is removed together with the label: in the preview it looks
# populated only on fraudulent rows, so keeping it would presumably leak
# the answer into the features — TODO confirm on the full data.
label_col = "is_fraud"
y = df_train[label_col]
X = df_train.drop(columns=[label_col, "fraud_type"])

In [None]:
# 3. Hold out 20% of the rows without shuffling, so row order is preserved
#    (intended as a chronological split; assumes the CSV rows are already
#    in time order — TODO confirm)
splits = train_test_split(X, y, shuffle=False, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [None]:
# 4. Define the candidate models to compare
# Every entry exposes fit / predict / predict_proba, which is all the
# evaluation and real-time prediction cells call on a model.
models = {
    "Decision Tree": DecisionTreeClassifier(
        max_depth=None, min_samples_split=10, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, min_samples_split=10, random_state=42
    ),
    "Naive Bayes": GaussianNB(),
    # Gradient-trained MLPs are sensitive to feature scale, and the columns
    # here span very different ranges (total_amount is in the millions while
    # the deviation columns stay below 1). Standardising inside a pipeline
    # fits the scaler on the training data only and re-applies the same
    # transform automatically at predict time. Tree models and GaussianNB do
    # not need this, so they are left unwrapped.
    "ANN (MLP)": make_pipeline(
        StandardScaler(),
        MLPClassifier(
            hidden_layer_sizes=(100,), max_iter=200,
            random_state=42
        ),
    ),
}

In [None]:
# 5. Fit every candidate, score it on the hold-out set, and keep the results
results = {}
fpr_dict, tpr_dict, roc_auc_dict = {}, {}, {}

for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard labels feed the threshold-based metrics; the second column of
    # predict_proba feeds the ROC metrics. Not every estimator offers
    # predict_proba, but all the models compared here do.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Specificity is derived by hand from the confusion-matrix counts
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    auc = roc_auc_score(y_test, y_proba)

    # One row of the comparison table per model
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "Specificity": tn / (tn + fp),
        "F1-score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC": auc,
    }

    # Curve coordinates and AUC are stored separately for the ROC plot
    fpr_dict[name], tpr_dict[name], _ = roc_curve(y_test, y_proba)
    roc_auc_dict[name] = auc

In [None]:
# 6. Show the metrics table: one row per model, one column per metric
df_metrics = pd.DataFrame(results).transpose()
print("=== Perbandingan Metrik ===", df_metrics, sep="\n")

=== Perbandingan Metrik ===
               Accuracy  Precision    Recall  Specificity  F1-score   ROC AUC
Decision Tree    0.9913   0.921971  0.901606     0.996001  0.911675  0.953751
Random Forest    0.9942   1.000000  0.883534     1.000000  0.938166  0.992806
Naive Bayes      0.9216   0.081871  0.056225     0.966954  0.066667  0.795219
ANN (MLP)        0.8486   0.151578  0.443775     0.869817  0.225971  0.765107


In [None]:
# 7. Plot the ROC curves of all models on a single figure
roc_traces = [
    go.Scatter(
        x=fpr_dict[name],
        y=tpr_dict[name],
        mode='lines',
        name=f"{name} (AUC={roc_auc_dict[name]:.3f})",
        line={"width": 2},
    )
    for name in models
]

# Diagonal reference line for a random-guess classifier
roc_traces.append(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random Guess',
        line={"dash": 'dash', "width": 1},
    )
)

fig = go.Figure(data=roc_traces)
fig.update_layout(
    title='ROC Curve – Perbandingan Model Fraud Detection',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend={"x": 0.6, "y": 0.1},
    template='plotly_white',
    width=700,
    height=500,
)
fig.show()

In [None]:
# 8. Choose the best model by hold-out Accuracy
# NOTE(review): Accuracy can look high on imbalanced labels even when few
# frauds are caught; consider whether F1-score or ROC AUC is the better
# selection criterion for this task.
best_model_name = df_metrics["Accuracy"].idxmax()
best_model = models[best_model_name]
best_accuracy = results[best_model_name]["Accuracy"]
print(f"\nModel terbaik: {best_model_name} dengan Accuracy = {best_accuracy:.4f}")


Model terbaik: Random Forest dengan Accuracy = 0.9942


In [None]:
# 9. Score the real-time transactions with the selected model
rt_url = "/".join([
    "https://raw.githubusercontent.com/apriandito/pertamina-2",
    "refs/heads/main/data",
    "bbm_fraud_realtime.csv",
])
df_rt = pd.read_csv(rt_url)

# Use exactly the training feature columns, in the same order
feature_cols = list(X.columns)
X_rt = df_rt.loc[:, feature_cols]

# Attach the hard prediction and the second predict_proba column to each row
rt_labels = best_model.predict(X_rt)
rt_scores = best_model.predict_proba(X_rt)[:, 1]
df_rt['predicted_is_fraud'] = rt_labels
df_rt['fraud_probability'] = rt_scores

# Headline numbers for the scored batch
n_flagged = df_rt['predicted_is_fraud'].sum()
pct_flagged = df_rt['predicted_is_fraud'].mean() * 100
mean_score = df_rt['fraud_probability'].mean()

print("\n=== Real‑time Prediction Summary ===")
print(f"Total transaksi    : {len(df_rt)}")
print(f"Predicted fraud     : {n_flagged} ({pct_flagged:.2f}%)")
print(f"Average fraud prob  : {mean_score:.4f}")

df_rt.head()



=== Real‑time Prediction Summary ===
Total transaksi    : 5000
Predicted fraud     : 451 (9.02%)
Average fraud prob  : 0.0965


Unnamed: 0,volume_liters,total_amount,hour,is_weekend,loyalty_member,customer_transaction_count,days_since_last_transaction,same_day_transactions,volume_deviation,amount_deviation,...,is_night_transaction,bbm_type_encoded,payment_method_encoded,day_of_week_encoded,customer_type_encoded,spbu_category_encoded,spbu_province_encoded,spbu_city_encoded,predicted_is_fraud,fraud_probability
0,24.61,305164.0,3,False,False,1,0.0,1,0.0,0.0,...,True,2,2,1,2,1,2,5,0,0.0
1,240.81,1637508.0,23,True,True,1,0.0,1,0.0,0.0,...,True,4,1,2,0,0,9,3,0,0.110649
2,78.48,784794.3,12,True,True,2,0.0,1,0.508409,0.352026,...,False,1,0,3,0,0,6,20,1,0.763743
3,112.02,1389048.0,21,True,True,3,0.0,2,0.220839,0.093351,...,False,2,1,3,0,0,9,28,0,0.028281
4,108.44,1084400.0,0,True,True,4,6.0,1,0.196369,0.114007,...,True,1,4,3,0,0,9,27,0,0.01517
