<a href="https://colab.research.google.com/github/apriandito/pertamina-2/blob/main/03_tuning_parameter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    roc_curve
)
from sklearn.model_selection import (
    train_test_split, GridSearchCV, TimeSeriesSplit
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 1. Load the training data (CSV fetched over HTTPS from GitHub)
train_url = "".join([
    "https://raw.githubusercontent.com/apriandito/pertamina-2/",
    "refs/heads/main/data/bbm_fraud_train.csv",
])
df_train = pd.read_csv(train_url)

# Preview the first rows as the cell's rich output.
df_train.head()

Unnamed: 0,volume_liters,total_amount,hour,is_weekend,loyalty_member,customer_transaction_count,days_since_last_transaction,same_day_transactions,volume_deviation,amount_deviation,...,is_night_transaction,bbm_type_encoded,payment_method_encoded,day_of_week_encoded,customer_type_encoded,spbu_category_encoded,spbu_province_encoded,spbu_city_encoded,is_fraud,fraud_type
0,578.37,8039343.0,7,True,False,1,0.0,1,0.0,0.0,...,False,3,1,2,1,0,5,13,0,
1,664.13,4516084.0,8,False,False,2,109.0,1,0.069022,0.280616,...,False,4,2,6,1,1,3,26,0,
2,163.24,2024176.0,12,False,False,3,37.0,1,0.651628,0.583492,...,False,2,2,0,1,1,2,7,0,
3,54.64,759455.7,4,False,False,4,54.0,1,0.85034,0.801955,...,True,3,4,4,1,1,8,24,1,multiple_cards
4,357.87,4437588.0,2,True,False,5,8.0,1,0.015894,0.121926,...,True,2,1,2,1,2,8,24,0,


In [None]:
# 2. Prepare features & target
# Target: the is_fraud column (0/1 in the preview).
y = df_train.loc[:, "is_fraud"]
# Features: everything except the target and fraud_type.
# NOTE(review): in the preview fraud_type is only filled on the fraud row, so
# it presumably encodes the label and must stay out of X — confirm.
X = df_train.drop(["is_fraud", "fraud_type"], axis=1)

In [None]:
# 3. Split the data in row order (no shuffling): the first 80% of rows become
#    the training split and the last 20% the hold-out test split.
# NOTE(review): this is only a chronological split if the CSV rows are already
# sorted by time — confirm against the data source.
TEST_FRACTION = 0.2
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, shuffle=False)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 4. Define the candidate models + their hyperparameter grids
# Each entry maps a display name to:
#   "model"  – an unfitted estimator used as the GridSearchCV template
#   "params" – the parameter grid searched for that estimator
models_and_grids = {
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10, 20]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {
            "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
        }
    },
    "ANN (MLP)": {
        # The MLP is sensitive to feature scale, and the raw features span very
        # different magnitudes (e.g. total_amount in the millions next to
        # deviations in [0, 1]). Standardizing inside a Pipeline means the
        # scaler is re-fitted on the training portion of every CV fold, so no
        # statistics leak from validation data.
        "model": Pipeline([
            ("scaler", StandardScaler()),
            ("mlp", MLPClassifier(max_iter=500, random_state=42)),
        ]),
        # Grid keys carry the "mlp__" prefix so they reach the MLP step.
        "params": {
            "mlp__hidden_layer_sizes": [(50,), (100,), (100, 50)],
            "mlp__alpha": [0.0001, 0.001, 0.01],
            "mlp__learning_rate_init": [0.001, 0.01]
        }
    }
}

In [None]:
# 5. Cross-validation scheme used during tuning: TimeSeriesSplit keeps every
#    validation fold after its training fold in row order.
N_CV_SPLITS = 3
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)

In [None]:
# 6. Loop: grid search -> evaluate on the hold-out split -> store the results
# For every candidate model: tune on the training split with the time-ordered
# CV splitter, then score the best estimator on X_test / y_test.
results = {}                                   # name -> best params + metrics
fpr_dict, tpr_dict, roc_auc_dict = {}, {}, {}  # name -> ROC curve data

for name, mg in models_and_grids.items():
    # NOTE(review): accuracy can stay high on a rare-positive target even when
    # recall is poor; consider scoring="f1" or "average_precision" if fraud is
    # rare in this data — confirm the class balance before changing.
    grid = GridSearchCV(
        estimator=mg["model"],
        param_grid=mg["params"],
        cv=tscv,
        scoring="accuracy",
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    # With the default refit=True this estimator is already refitted on all of
    # X_train using the best parameter combination.
    best = grid.best_estimator_

    # Predict on the hold-out split; column 1 of predict_proba is taken as the
    # fraud (label 1) probability.
    y_pred  = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:, 1]

    # Compute the metrics.
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail
    # when only one class shows up in y_test and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    acc   = accuracy_score(y_test, y_pred)
    prec  = precision_score(y_test, y_pred, zero_division=0)
    rec   = recall_score(y_test, y_pred, zero_division=0)
    # Specificity = TN / (TN + FP); fall back to 0.0 when there are no actual
    # negatives, mirroring zero_division=0 on the other metrics.
    negatives = tn + fp
    spec  = tn / negatives if negatives else 0.0
    f1    = f1_score(y_test, y_pred, zero_division=0)
    auc   = roc_auc_score(y_test, y_proba)

    # Store the metrics together with the winning parameters.
    results[name] = {
        "Best Params": grid.best_params_,
        "Accuracy":   acc,
        "Precision":  prec,
        "Recall":     rec,
        "Specificity":spec,
        "F1-score":   f1,
        "ROC AUC":    auc
    }

    # Keep the curve points for the ROC plot.
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fpr_dict[name] = fpr
    tpr_dict[name] = tpr
    roc_auc_dict[name] = auc

    print(f"[{name}] Best params: {grid.best_params_}")

[Decision Tree] Best params: {'max_depth': 10, 'min_samples_split': 10}
[Random Forest] Best params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
[Naive Bayes] Best params: {'var_smoothing': 1e-06}
[ANN (MLP)] Best params: {'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01}


In [None]:
# 7. Show the metrics table (one row per model, including its best params)
df_metrics = pd.DataFrame(results).transpose()
print("\n=== Hasil Perbandingan Setelah Tuning ===")
print(df_metrics)

# 8. Plot the ROC curve of every model on a single figure
roc_traces = [
    go.Scatter(
        x=fpr_dict[model_name],
        y=tpr_dict[model_name],
        mode='lines',
        name=f"{model_name} (AUC={roc_auc_dict[model_name]:.3f})",
        line={"width": 2},
    )
    for model_name in models_and_grids
]
# Dashed diagonal from (0, 0) to (1, 1) as the random-guess reference.
roc_traces.append(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random Guess',
        line={"dash": 'dash', "width": 1},
    )
)

fig = go.Figure(data=roc_traces)
fig.update_layout(
    title='ROC Curve – Model Comparison (Tuned)',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend={"x": 0.6, "y": 0.1},
    template='plotly_white',
    width=700,
    height=500,
)
fig.show()


=== Hasil Perbandingan Setelah Tuning ===
                                                     Best Params Accuracy  \
Decision Tree         {'max_depth': 10, 'min_samples_split': 10}   0.9934   
Random Forest  {'max_depth': 20, 'min_samples_split': 2, 'n_e...   0.9943   
Naive Bayes                             {'var_smoothing': 1e-06}   0.9438   
ANN (MLP)      {'alpha': 0.001, 'hidden_layer_sizes': (100,),...   0.9509   

              Precision    Recall Specificity  F1-score   ROC AUC  
Decision Tree  0.988688   0.87751    0.999474  0.929787  0.982343  
Random Forest       1.0  0.885542         1.0  0.939297  0.993113  
Naive Bayes    0.152174  0.028112    0.991791  0.047458  0.788974  
ANN (MLP)      0.733333  0.022088    0.999579  0.042885  0.779259  


In [None]:
# 9. Pick the best model by hold-out Accuracy and refit it on the training split
# df_metrics is built from dicts that mix a params dict with floats, so its
# columns are object dtype; cast to float so idxmax does not depend on how the
# installed pandas version treats object-dtype reductions.
best_name = df_metrics["Accuracy"].astype(float).idxmax()

# clone() returns a fresh, unfitted copy, so configuring and fitting the winner
# does not mutate the shared template stored in models_and_grids.
best_model = clone(models_and_grids[best_name]["model"]).set_params(
    **results[best_name]["Best Params"]
)
best_model.fit(X_train, y_train)
print(f"\nModel terbaik: {best_name} (Accuracy={results[best_name]['Accuracy']:.4f})")


Model terbaik: Random Forest (Accuracy=0.9943)


In [None]:
# 10. Score the real-time data with the selected best model
rt_url = "".join([
    "https://raw.githubusercontent.com/apriandito/pertamina-2/",
    "refs/heads/main/data/bbm_fraud_realtime.csv",
])
df_rt = pd.read_csv(rt_url)

# Select the training feature columns, in the same order as X.
X_rt = df_rt[X.columns]

# Attach the predicted class and the probability taken from column 1 of
# predict_proba to every row.
df_rt['predicted_is_fraud'] = best_model.predict(X_rt)
df_rt['fraud_probability']  = best_model.predict_proba(X_rt)[:, 1]

# Short text summary of the predictions.
flagged = df_rt['predicted_is_fraud']
summary_lines = [
    "\n=== Real‑time Prediction Summary ===",
    f"Total transaksi   : {len(df_rt)}",
    f"Predicted fraud    : {flagged.sum()} ({flagged.mean()*100:.2f}%)",
    f"Average fraud prob : {df_rt['fraud_probability'].mean():.4f}",
]
print("\n".join(summary_lines))

df_rt.head()


=== Real‑time Prediction Summary ===
Total transaksi   : 5000
Predicted fraud    : 455 (9.10%)
Average fraud prob : 0.0958


Unnamed: 0,volume_liters,total_amount,hour,is_weekend,loyalty_member,customer_transaction_count,days_since_last_transaction,same_day_transactions,volume_deviation,amount_deviation,...,is_night_transaction,bbm_type_encoded,payment_method_encoded,day_of_week_encoded,customer_type_encoded,spbu_category_encoded,spbu_province_encoded,spbu_city_encoded,predicted_is_fraud,fraud_probability
0,24.61,305164.0,3,False,False,1,0.0,1,0.0,0.0,...,True,2,2,1,2,1,2,5,0,0.0
1,240.81,1637508.0,23,True,True,1,0.0,1,0.0,0.0,...,True,4,1,2,0,0,9,3,0,0.191687
2,78.48,784794.3,12,True,True,2,0.0,1,0.508409,0.352026,...,False,1,0,3,0,0,6,20,1,0.781767
3,112.02,1389048.0,21,True,True,3,0.0,2,0.220839,0.093351,...,False,2,1,3,0,0,9,28,0,0.034679
4,108.44,1084400.0,0,True,True,4,6.0,1,0.196369,0.114007,...,True,1,4,3,0,0,9,27,0,0.021517
