In [1]:
#hehe


# =========================================
# 1. IMPORT LIBRARIES
# =========================================
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# =========================================
# 2. LOAD DATASET (Kaggle 2023 dataset)
# =========================================
df = pd.read_csv("creditcard_2023.csv")   # change path accordingly

In [3]:
# =========================================
# 2. CREATE 1% FRAUD SUBSET (242,400 records)
# =========================================
# Separate fraud and non-fraud
fraud = df[df['Class'] == 1]
non_fraud = df[df['Class'] == 0]

# Number of records for 1% fraud subset
subset_size = 242400
fraud_count = int(subset_size * 0.01)   # 1% fraud
non_fraud_count = subset_size - fraud_count  # remaining non-fraud

# Randomly sample to create subset
fraud_subset = fraud.sample(fraud_count, random_state=42)
non_fraud_subset = non_fraud.sample(non_fraud_count, random_state=42)

subset_1pct = pd.concat([fraud_subset, non_fraud_subset]).sample(frac=1, random_state=42)

print("1% fraud subset distribution:\n", subset_1pct['Class'].value_counts())

# =========================================
# 3. CREATE THREE IMBALANCED SUBSETS
# =========================================
# Dataset 1: 40,400 records (40,000 non-fraudulent, 400 fraudulent)
subset1 = pd.concat([
    non_fraud_subset.sample(40000, random_state=42),
    fraud_subset.sample(400, random_state=42)
]).sample(frac=1, random_state=42)

# Dataset 2: 80,800 records (80,000 non-fraudulent, 800 fraudulent)
subset2 = pd.concat([
    non_fraud_subset.sample(80000, random_state=42),
    fraud_subset.sample(800, random_state=42)
]).sample(frac=1, random_state=42)

# Dataset 3: 121,200 records (120,000 non-fraudulent, 1,200 fraudulent)
subset3 = pd.concat([
    non_fraud_subset.sample(120000, random_state=42),
    fraud_subset.sample(1200, random_state=42)
]).sample(frac=1, random_state=42)

# Quick sanity check
print("Dataset 1 distribution:\n", subset1['Class'].value_counts())
print("Dataset 2 distribution:\n", subset2['Class'].value_counts())
print("Dataset 3 distribution:\n", subset3['Class'].value_counts())

1% fraud subset distribution:
 Class
0    239976
1      2424
Name: count, dtype: int64
Dataset 1 distribution:
 Class
0    40000
1      400
Name: count, dtype: int64
Dataset 2 distribution:
 Class
0    80000
1      800
Name: count, dtype: int64
Dataset 3 distribution:
 Class
0    120000
1      1200
Name: count, dtype: int64


In [5]:
# =========================================
# 3. CREATE THREE IMBALANCED SUBSETS (1% fraud)
# =========================================
fraud = df[df['Class'] == 1]
non_fraud = df[df['Class'] == 0]

subset1 = pd.concat([
    non_fraud.iloc[:40000],
    fraud.iloc[:400]
]).sample(frac=1, random_state=42)

subset2 = pd.concat([
    non_fraud.iloc[:80000],
    fraud.iloc[:800]
]).sample(frac=1, random_state=42)

subset3 = pd.concat([
    non_fraud.iloc[:120000],
    fraud.iloc[:1200]
]).sample(frac=1, random_state=42)

In [4]:
# =========================================
# 4. PREPROCESSING FUNCTION
# =========================================
def preprocess_data(df_subset):
    
    # Remove missing values
    df_clean = df_subset.dropna()
    
    X = df_clean.drop("Class", axis=1)
    y = df_clean["Class"]
    
    # Standard Scaling
    scaler = StandardScaler()
    X_pca = scaler.fit_transform(X)
    
    # PCA (28 components as in dataset)
    # pca = PCA(n_components=28)
    # X_pca = pca.fit_transform(X_scaled)
    
    return X_pca, y

In [5]:
# =========================================
# 5. DBSCAN AUGMENTATION (ITERATIVE)
# =========================================
def dbscan_augmentation(X, eps=2, min_samples=4, iterations=2):
    
    X_aug = X.copy()
    
    for i in range(iterations):
        db = DBSCAN(eps=eps, min_samples=min_samples)
        labels = db.fit_predict(X_aug)
        
        labels = labels.reshape(-1, 1)
        X_aug = np.hstack((X_aug, labels))
    
    return X_aug

In [6]:

def train_and_evaluate(df_subset, name="Dataset"):
    
    print(f"\n========== {name} ==========\n")
    
    # Preprocess
    X, y = preprocess_data(df_subset)
    
    # Augmentation
    X_aug = dbscan_augmentation(X, eps=2, min_samples=4, iterations=2)
    
    # 80-20 Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_aug, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # ==================================
    # Models (Exact paper parameters)
    # ==================================
    
    rf = RandomForestClassifier(n_estimators=3, max_depth=None, random_state=42)
    knn = KNeighborsClassifier(n_neighbors=3)
    svm = SVC(kernel='rbf', C=1.0)
    
    # Train
    rf.fit(X_train, y_train)
    knn.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    
    # Predict
    rf_pred = rf.predict(X_test)
    knn_pred = knn.predict(X_test)
    svm_pred = svm.predict(X_test)
    
    # ==================================
    # Disjunctive (OR) Voting
    # Fraud if ANY classifier predicts fraud
    # ==================================
    voting_pred = np.array([
        1 if (rf_pred[i] == 1 or knn_pred[i] == 1 or svm_pred[i] == 1) else 0
        for i in range(len(y_test))
    ])
    
    # ==================================
    # Evaluate all models + voting
    # ==================================
    from sklearn.metrics import accuracy_score, classification_report
    
    models_preds = {
        'Random Forest': rf_pred,
        'KNN': knn_pred,
        'SVM': svm_pred,
        'Voting OR': voting_pred
    }
    
    results = {}
    
    for model_name, y_pred in models_preds.items():
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        results[model_name] = {
            'Accuracy': acc,
            'Precision_0': report['0']['precision'],
            'Recall_0': report['0']['recall'],
            'F1_0': report['0']['f1-score'],
            'Precision_1': report['1']['precision'],
            'Recall_1': report['1']['recall'],
            'F1_1': report['1']['f1-score']
        }
    
    # Display results table
    df_results = pd.DataFrame(results).T
    print(df_results)
    
    # Optional: show confusion matrix for the voting classifier
    print("\nConfusion Matrix for Voting OR:")
    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(y_test, voting_pred))
    
    return df_results

# =========================================
# 7. RUN EXPERIMENTS (Case Study 2)
# =========================================
df_results1 = train_and_evaluate(subset1, "Subset 1 (40,400)")



               Accuracy  Precision_0  Recall_0      F1_0  Precision_1  \
Random Forest  0.999134     0.999500  0.999625  0.999563     0.962025   
KNN            0.999752     1.000000  0.999750  0.999875     0.975610   
SVM            0.998762     0.999001  0.999750  0.999375     0.972973   
Voting OR      0.999505     1.000000  0.999500  0.999750     0.952381   

               Recall_1      F1_1  
Random Forest      0.95  0.955975  
KNN                1.00  0.987654  
SVM                0.90  0.935065  
Voting OR          1.00  0.975610  

Confusion Matrix for Voting OR:
[[7996    4]
 [   0   80]]


In [7]:
train_and_evaluate(subset2, "Subset 2 ")



               Accuracy  Precision_0  Recall_0      F1_0  Precision_1  \
Random Forest  0.999505     0.999500  1.000000  0.999750     1.000000   
KNN            0.998948     0.999250  0.999687  0.999469     0.967320   
SVM            0.997153     0.997381  0.999750  0.998564     0.967213   
Voting OR      0.999381     0.999687  0.999687  0.999687     0.968750   

               Recall_1      F1_1  
Random Forest   0.95000  0.974359  
KNN             0.92500  0.945687  
SVM             0.73750  0.836879  
Voting OR       0.96875  0.968750  

Confusion Matrix for Voting OR:
[[15995     5]
 [    5   155]]


Unnamed: 0,Accuracy,Precision_0,Recall_0,F1_0,Precision_1,Recall_1,F1_1
Random Forest,0.999505,0.9995,1.0,0.99975,1.0,0.95,0.974359
KNN,0.998948,0.99925,0.999687,0.999469,0.96732,0.925,0.945687
SVM,0.997153,0.997381,0.99975,0.998564,0.967213,0.7375,0.836879
Voting OR,0.999381,0.999687,0.999687,0.999687,0.96875,0.96875,0.96875


In [8]:
train_and_evaluate(subset3, "Subset 3 ")



               Accuracy  Precision_0  Recall_0      F1_0  Precision_1  \
Random Forest  0.999917     0.999917  1.000000  0.999958     1.000000   
KNN            0.999464     0.999583  0.999875  0.999729     0.987124   
SVM            0.996700     0.996802  0.999875  0.998336     0.981928   
Voting OR      0.999752     0.999958  0.999792  0.999875     0.979508   

               Recall_1      F1_1  
Random Forest  0.991667  0.995816  
KNN            0.958333  0.972516  
SVM            0.679167  0.802956  
Voting OR      0.995833  0.987603  

Confusion Matrix for Voting OR:
[[23995     5]
 [    1   239]]


Unnamed: 0,Accuracy,Precision_0,Recall_0,F1_0,Precision_1,Recall_1,F1_1
Random Forest,0.999917,0.999917,1.0,0.999958,1.0,0.991667,0.995816
KNN,0.999464,0.999583,0.999875,0.999729,0.987124,0.958333,0.972516
SVM,0.9967,0.996802,0.999875,0.998336,0.981928,0.679167,0.802956
Voting OR,0.999752,0.999958,0.999792,0.999875,0.979508,0.995833,0.987603
