In [1]:
# =========================================================
# Imports
# =========================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [2]:
# =========================================================
# Evaluation (Fraud = Positive Class)
# =========================================================
def evaluate(y_true, y_pred):

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    print("\nFraud (1) Metrics:")
    print("Precision:", precision_score(y_true, y_pred, pos_label=1))
    print("Recall:", recall_score(y_true, y_pred, pos_label=1))
    print("F1-score:", f1_score(y_true, y_pred, pos_label=1))

In [28]:
# =========================================================
# Models (EXACT paper hyperparameters)
# =========================================================
def get_models():

    rf = RandomForestClassifier(
        n_estimators=3,
        max_depth=None,
        random_state=42
    )

    knn = KNeighborsClassifier(
        n_neighbors=3
    )

    svm = SVC(
        kernel='rbf',
        C=1.0
    )

    return rf, knn, svm

In [36]:
# =========================================================
# Shared Train/Test Split (USED BY BOTH CASE STUDIES)
# =========================================================
def shared_split(dataset, test_size=0.2, random_state=42):

    X = dataset.drop('Class', axis=1)
    y = dataset['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [7]:
# =========================================================
# Iterative DBSCAN Augmentation (TRAIN ONLY)
# =========================================================
def iterative_dbscan_augmentation(X_train, iterations=3, eps=2, min_samples=4):

    X_aug = X_train.copy()

    for _ in range(iterations):
        db = DBSCAN(eps=eps, min_samples=min_samples)
        clusters = db.fit_predict(X_aug).reshape(-1, 1)
        X_aug = np.hstack((X_aug, clusters))

    return X_aug

In [10]:
def case_study_1(X_train, X_test, y_train, y_test):

    print("\n===== CASE STUDY 1: ORIGINAL DATA =====")

    rf, knn, svm = get_models()

    rf.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    svm.fit(X_train, y_train)

    # -------------------------
    # Individual model results
    # -------------------------
    print("\n--- Random Forest ---")
    rf_pred = rf.predict(X_test)
    evaluate(y_test, rf_pred)

    print("\n--- KNN ---")
    knn_pred = knn.predict(X_test)
    evaluate(y_test, knn_pred)

    print("\n--- SVM ---")
    svm_pred = svm.predict(X_test)
    evaluate(y_test, svm_pred)

    # -------------------------
    # Majority Voting
    # -------------------------
    majority_pred = ((rf_pred + knn_pred + svm_pred) >= 2).astype(int)

    print("\n--- Majority Voting Ensemble ---")
    evaluate(y_test, majority_pred)

    return {
        "rf": rf_pred,
        "knn": knn_pred,
        "svm": svm_pred,
        "ensemble": majority_pred
    }

In [11]:
def case_study_2(X_train, X_test, y_train, y_test):

    print("\n===== CASE STUDY 2: DBSCAN HYBRID =====")

    # -------------------------
    # DBSCAN augmentation (TRAIN ONLY)
    # -------------------------
    X_train_aug = iterative_dbscan_augmentation(
        X_train,
        iterations=3,
        eps=2,
        min_samples=4
    )

    # Pad test set with -1
    new_features = X_train_aug.shape[1] - X_test.shape[1]
    X_test_aug = np.hstack((
        X_test,
        -1 * np.ones((X_test.shape[0], new_features))
    ))

    rf, knn, svm = get_models()

    rf.fit(X_train_aug, y_train)
    knn.fit(X_train_aug, y_train)
    svm.fit(X_train_aug, y_train)

    # -------------------------
    # Individual model results
    # -------------------------
    print("\n--- Random Forest (DBSCAN-Augmented) ---")
    rf_pred = rf.predict(X_test_aug)
    evaluate(y_test, rf_pred)

    print("\n--- KNN (DBSCAN-Augmented) ---")
    knn_pred = knn.predict(X_test_aug)
    evaluate(y_test, knn_pred)

    print("\n--- SVM (DBSCAN-Augmented) ---")
    svm_pred = svm.predict(X_test_aug)
    evaluate(y_test, svm_pred)

    # -------------------------
    # Disjunctive (OR) Voting
    # -------------------------
    final_pred = ((rf_pred + knn_pred + svm_pred) >= 1).astype(int)

    print("\n--- Disjunctive (OR) Ensemble ---")
    evaluate(y_test, final_pred)

    return {
        "rf": rf_pred,
        "knn": knn_pred,
        "svm": svm_pred,
        "ensemble": final_pred
    }

In [12]:
# =========================================================
# Dataset Construction (Paper Setup)
# =========================================================
def create_base_dataset(df, total_size=242400, fraud_ratio=0.01):

    fraud_count = int(total_size * fraud_ratio)
    non_fraud_count = total_size - fraud_count

    fraud = df[df['Class'] == 1].sample(fraud_count, random_state=42)
    non_fraud = df[df['Class'] == 0].sample(non_fraud_count, random_state=42)

    base_dataset = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)
    return base_dataset


def create_subset(base_df, total_size, fraud_count):

    non_fraud_count = total_size - fraud_count

    fraud = base_df[base_df['Class'] == 1].sample(fraud_count, random_state=42)
    non_fraud = base_df[base_df['Class'] == 0].sample(non_fraud_count, random_state=42)

    subset = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)
    return subset

In [13]:

# =========================================================
# MAIN EXECUTION
# =========================================================
if __name__ == "__main__":

    # Load dataset
    df = pd.read_csv("creditcard_2023.csv")

    # Create base dataset
    base_dataset = create_base_dataset(df)

    # Paper datasets
    dataset1 = create_subset(base_dataset, 40400, 400)
    dataset2 = create_subset(base_dataset, 80800, 800)
    dataset3 = create_subset(base_dataset, 121200, 1200)

    # Run experiments (example: dataset1)
    X_train, X_test, y_train, y_test = shared_split(dataset1)

    case_study_1(X_train, X_test, y_train, y_test)
    case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[7999    1]
 [   3   77]]

Fraud (1) Metrics:
Precision: 0.9871794871794872
Recall: 0.9625
F1-score: 0.9746835443037974

--- KNN ---
Confusion Matrix:
[[7999    1]
 [   4   76]]

Fraud (1) Metrics:
Precision: 0.987012987012987
Recall: 0.95
F1-score: 0.9681528662420382

--- SVM ---
Confusion Matrix:
[[8000    0]
 [   5   75]]

Fraud (1) Metrics:
Precision: 1.0
Recall: 0.9375
F1-score: 0.967741935483871

--- Majority Voting Ensemble ---
Confusion Matrix:
[[8000    0]
 [   2   78]]

Fraud (1) Metrics:
Precision: 1.0
Recall: 0.975
F1-score: 0.9873417721518988

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[8000    0]
 [   0   80]]

Fraud (1) Metrics:
Precision: 1.0
Recall: 1.0
F1-score: 1.0

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[7999    1]
 [   3   77]]

Fraud (1) Metrics:
Precision: 0.9871794871794872
Recall: 0.9625
F1-score: 0.974683544303

In [17]:
X_train, X_test, y_train, y_test = shared_split(dataset2)

case_study_1(X_train, X_test, y_train, y_test)
case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[15999     1]
 [    3   157]]

Fraud (1) Metrics:
Precision: 0.9936708860759493
Recall: 0.98125
F1-score: 0.9874213836477987

--- KNN ---
Confusion Matrix:
[[15998     2]
 [    6   154]]

Fraud (1) Metrics:
Precision: 0.9871794871794872
Recall: 0.9625
F1-score: 0.9746835443037974

--- SVM ---
Confusion Matrix:
[[15999     1]
 [    5   155]]

Fraud (1) Metrics:
Precision: 0.9935897435897436
Recall: 0.96875
F1-score: 0.9810126582278481

--- Majority Voting Ensemble ---
Confusion Matrix:
[[15999     1]
 [    4   156]]

Fraud (1) Metrics:
Precision: 0.9936305732484076
Recall: 0.975
F1-score: 0.9842271293375394

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[15999     1]
 [    1   159]]

Fraud (1) Metrics:
Precision: 0.99375
Recall: 0.99375
F1-score: 0.99375

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[15975    25]
 [    6   154]]

Fraud (1) Metric

{'rf': array([0, 0, 0, ..., 0, 0, 0], shape=(16160,)),
 'knn': array([0, 0, 0, ..., 0, 0, 0], shape=(16160,)),
 'svm': array([0, 0, 0, ..., 0, 0, 0], shape=(16160,)),
 'ensemble': array([0, 0, 0, ..., 0, 0, 0], shape=(16160,))}

In [15]:
X_train, X_test, y_train, y_test = shared_split(dataset3)

case_study_1(X_train, X_test, y_train, y_test)
case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[23994     6]
 [    2   238]]

Fraud (1) Metrics:
Precision: 0.9754098360655737
Recall: 0.9916666666666667
F1-score: 0.9834710743801653

--- KNN ---
Confusion Matrix:
[[23989    11]
 [   10   230]]

Fraud (1) Metrics:
Precision: 0.9543568464730291
Recall: 0.9583333333333334
F1-score: 0.9563409563409564

--- SVM ---
Confusion Matrix:
[[23999     1]
 [   10   230]]

Fraud (1) Metrics:
Precision: 0.9956709956709957
Recall: 0.9583333333333334
F1-score: 0.9766454352441614

--- Majority Voting Ensemble ---
Confusion Matrix:
[[23997     3]
 [    3   237]]

Fraud (1) Metrics:
Precision: 0.9875
Recall: 0.9875
F1-score: 0.9875

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[24000     0]
 [    2   238]]

Fraud (1) Metrics:
Precision: 1.0
Recall: 0.9916666666666667
F1-score: 0.99581589958159

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[23991     9]
 [   1

{'rf': array([0, 0, 0, ..., 0, 0, 0], shape=(24240,)),
 'knn': array([0, 0, 0, ..., 0, 0, 0], shape=(24240,)),
 'svm': array([0, 0, 0, ..., 0, 0, 0], shape=(24240,)),
 'ensemble': array([0, 0, 0, ..., 0, 0, 0], shape=(24240,))}

In [24]:
import pandas as pd

df = pd.read_csv("creditcard.csv")
df = df.drop(['Amount','Time'],axis=1)

# Subset 1
dataset1 = pd.concat([
    df[df['Class'] == 0].sample(10000),
    df[df['Class'] == 1].sample(100)
]).sample(frac=1)

# Subset 2
dataset2 = pd.concat([
    df[df['Class'] == 0].sample(20000),
    df[df['Class'] == 1].sample(200)
]).sample(frac=1)

# Subset 3
dataset3 = pd.concat([
    df[df['Class'] == 0].sample(30000),
    df[df['Class'] == 1].sample(300)
]).sample(frac=1)

# Verify
print(dataset1['Class'].value_counts())
print(dataset2['Class'].value_counts())
print(dataset3['Class'].value_counts())

Class
0    10000
1      100
Name: count, dtype: int64
Class
0    20000
1      200
Name: count, dtype: int64
Class
0    30000
1      300
Name: count, dtype: int64


In [39]:
# Shared split for dataset 3 (example)
X_train, X_test, y_train, y_test = shared_split(dataset3)

# Same split → both case studies
case_study_1(X_train, X_test, y_train, y_test)
case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[5999    1]
 [   9   51]]

Fraud (1) Metrics:
Precision: 0.9807692307692307
Recall: 0.85
F1-score: 0.9107142857142857

--- KNN ---
Confusion Matrix:
[[5999    1]
 [  10   50]]

Fraud (1) Metrics:
Precision: 0.9803921568627451
Recall: 0.8333333333333334
F1-score: 0.9009009009009009

--- SVM ---
Confusion Matrix:
[[5999    1]
 [  15   45]]

Fraud (1) Metrics:
Precision: 0.9782608695652174
Recall: 0.75
F1-score: 0.8490566037735849

--- Majority Voting Ensemble ---
Confusion Matrix:
[[5999    1]
 [  11   49]]

Fraud (1) Metrics:
Precision: 0.98
Recall: 0.8166666666666667
F1-score: 0.8909090909090909

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[5997    3]
 [  10   50]]

Fraud (1) Metrics:
Precision: 0.9433962264150944
Recall: 0.8333333333333334
F1-score: 0.8849557522123894

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[5994    6]
 [  10   50]]

Fr

{'rf': array([0, 0, 0, ..., 0, 0, 0], shape=(6060,)),
 'knn': array([0, 0, 0, ..., 0, 0, 0], shape=(6060,)),
 'svm': array([0, 0, 0, ..., 0, 0, 0], shape=(6060,)),
 'ensemble': array([0, 0, 0, ..., 0, 0, 0], shape=(6060,))}

In [40]:
# Shared split for dataset 3 (example)
X_train, X_test, y_train, y_test = shared_split(dataset2)

# Same split → both case studies
case_study_1(X_train, X_test, y_train, y_test)
case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[3996    4]
 [   5   35]]

Fraud (1) Metrics:
Precision: 0.8974358974358975
Recall: 0.875
F1-score: 0.8860759493670886

--- KNN ---
Confusion Matrix:
[[3998    2]
 [   7   33]]

Fraud (1) Metrics:
Precision: 0.9428571428571428
Recall: 0.825
F1-score: 0.88

--- SVM ---
Confusion Matrix:
[[3999    1]
 [  13   27]]

Fraud (1) Metrics:
Precision: 0.9642857142857143
Recall: 0.675
F1-score: 0.7941176470588235

--- Majority Voting Ensemble ---
Confusion Matrix:
[[3999    1]
 [   7   33]]

Fraud (1) Metrics:
Precision: 0.9705882352941176
Recall: 0.825
F1-score: 0.8918918918918919

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[3997    3]
 [   5   35]]

Fraud (1) Metrics:
Precision: 0.9210526315789473
Recall: 0.875
F1-score: 0.8974358974358975

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[3995    5]
 [   7   33]]

Fraud (1) Metrics:
Precision: 0.8684210

{'rf': array([0, 0, 0, ..., 0, 0, 0], shape=(4040,)),
 'knn': array([0, 0, 0, ..., 0, 0, 0], shape=(4040,)),
 'svm': array([0, 0, 0, ..., 0, 0, 0], shape=(4040,)),
 'ensemble': array([0, 0, 0, ..., 0, 0, 0], shape=(4040,))}

In [41]:
# Shared split for dataset 3 (example)
X_train, X_test, y_train, y_test = shared_split(dataset1)

# Same split → both case studies
case_study_1(X_train, X_test, y_train, y_test)
case_study_2(X_train, X_test, y_train, y_test)


===== CASE STUDY 1: ORIGINAL DATA =====

--- Random Forest ---
Confusion Matrix:
[[1999    1]
 [   6   14]]

Fraud (1) Metrics:
Precision: 0.9333333333333333
Recall: 0.7
F1-score: 0.8

--- KNN ---
Confusion Matrix:
[[1999    1]
 [   4   16]]

Fraud (1) Metrics:
Precision: 0.9411764705882353
Recall: 0.8
F1-score: 0.8648648648648649

--- SVM ---
Confusion Matrix:
[[2000    0]
 [  10   10]]

Fraud (1) Metrics:
Precision: 1.0
Recall: 0.5
F1-score: 0.6666666666666666

--- Majority Voting Ensemble ---
Confusion Matrix:
[[1999    1]
 [   5   15]]

Fraud (1) Metrics:
Precision: 0.9375
Recall: 0.75
F1-score: 0.8333333333333334

===== CASE STUDY 2: DBSCAN HYBRID =====

--- Random Forest (DBSCAN-Augmented) ---
Confusion Matrix:
[[1999    1]
 [   7   13]]

Fraud (1) Metrics:
Precision: 0.9285714285714286
Recall: 0.65
F1-score: 0.7647058823529411

--- KNN (DBSCAN-Augmented) ---
Confusion Matrix:
[[1999    1]
 [   4   16]]

Fraud (1) Metrics:
Precision: 0.9411764705882353
Recall: 0.8
F1-score: 0.86

{'rf': array([0, 0, 0, ..., 0, 0, 0], shape=(2020,)),
 'knn': array([0, 0, 0, ..., 0, 0, 0], shape=(2020,)),
 'svm': array([0, 0, 0, ..., 0, 0, 0], shape=(2020,)),
 'ensemble': array([0, 0, 0, ..., 0, 0, 0], shape=(2020,))}