In [3]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
# Load the full table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="creditcard_2023.csv")

In [5]:
# Partition the full table by label: Class == 1 rows vs. Class == 0 rows.
fraud = df.loc[df['Class'] == 1]
non_fraud = df.loc[df['Class'] == 0]

# -----------------------------------------
# 1% fraud dataset (242,400 rows in total)
# -----------------------------------------
subset_size = 242400
fraud_count = int(subset_size * 0.01)   # 1% of the rows are fraud
non_fraud_count = subset_size - fraud_count

# Draw each class separately, stack fraud first, then shuffle the combined rows.
subset_1pct = pd.concat(
    [
        fraud.sample(n=fraud_count, random_state=42),
        non_fraud.sample(n=non_fraud_count, random_state=42),
    ]
).sample(frac=1, random_state=42)

print("1% fraud subset distribution:\n", subset_1pct['Class'].value_counts())


# =========================================
# THREE SEPARATE IMBALANCED DATASETS
# =========================================

def create_dataset(n_non_fraud, n_fraud, seed=42):
    """Build a shuffled sample with a fixed number of rows per class.

    Rows are drawn without replacement from the module-level ``non_fraud``
    and ``fraud`` frames, stacked (non-fraud first) and then shuffled.

    Parameters
    ----------
    n_non_fraud : int
        Number of rows to draw from ``non_fraud``.
    n_fraud : int
        Number of rows to draw from ``fraud``.
    seed : int, default 42
        Random state used for both draws and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        ``n_non_fraud + n_fraud`` rows in shuffled order, original index kept.
    """
    legit_rows = non_fraud.sample(n_non_fraud, random_state=seed)
    fraud_rows = fraud.sample(n_fraud, random_state=seed)
    stacked = pd.concat([legit_rows, fraud_rows])
    return stacked.sample(frac=1, random_state=seed)

# Three datasets of growing size, each with a 100:1 non-fraud to fraud ratio.
subset1, subset2, subset3 = (
    create_dataset(n_non_fraud=n_legit, n_fraud=n_fraud_rows)
    for n_legit, n_fraud_rows in [(40000, 400), (80000, 800), (120000, 1200)]
)

# Confirm the class balance of each dataset.
print("Dataset 1 distribution:\n", subset1['Class'].value_counts())
print("Dataset 2 distribution:\n", subset2['Class'].value_counts())
print("Dataset 3 distribution:\n", subset3['Class'].value_counts())

1% fraud subset distribution:
 Class
0    239976
1      2424
Name: count, dtype: int64
Dataset 1 distribution:
 Class
0    40000
1      400
Name: count, dtype: int64
Dataset 2 distribution:
 Class
0    80000
1      800
Name: count, dtype: int64
Dataset 3 distribution:
 Class
0    120000
1      1200
Name: count, dtype: int64


In [6]:
# =========================================
# 4. PREPROCESSING FUNCTION
# =========================================
def preprocess_data(df_subset):
    """Drop incomplete rows, separate the label and standardise the features.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Table with a "Class" column; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(X_scaled, y)``: the result of ``StandardScaler().fit_transform``
        on the feature columns, and the "Class" column of the rows that
        survived ``dropna``.

    Notes
    -----
    No PCA is applied here; only scaling.
    NOTE(review): the scaler is fitted on every row passed in, so a
    train/test split performed afterwards sees test-set statistics.
    """
    complete_rows = df_subset.dropna()

    target = complete_rows["Class"]
    features = complete_rows.drop(columns="Class")

    return StandardScaler().fit_transform(features), target

In [7]:
# =========================================
# 5. DBSCAN AUGMENTATION (ITERATIVE)
# =========================================
def dbscan_augmentation(X, eps=2, min_samples=4, iterations=2):
    """Append DBSCAN cluster labels to the feature matrix, repeatedly.

    Each pass clusters the current matrix and adds the resulting labels as
    one extra column. From the second pass on, the clustering therefore also
    sees the label columns appended by earlier passes.

    Parameters
    ----------
    X : array-like with a ``copy`` method
        Feature matrix; it is copied, not modified.
    eps : float, default 2
        Passed to ``DBSCAN(eps=...)``.
    min_samples : int, default 4
        Passed to ``DBSCAN(min_samples=...)``.
    iterations : int, default 2
        Number of clustering passes, i.e. number of columns appended.

    Returns
    -------
    The copied matrix with ``iterations`` extra label columns. With
    ``iterations=0`` this is just the copy of ``X``.
    """
    augmented = X.copy()

    for _ in range(iterations):
        cluster_ids = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(augmented)
        # Turn the 1-D label vector into a column so it can be stacked on the right.
        augmented = np.hstack((augmented, cluster_ids.reshape(-1, 1)))

    return augmented

In [None]:

def _or_vote(predictions):
    """Return 1 where at least one of the given prediction arrays equals 1, else 0."""
    return np.logical_or.reduce([np.asarray(p) == 1 for p in predictions]).astype(int)


def _per_class_metrics(y_true, y_pred):
    """Return accuracy plus precision/recall/F1 for classes 0 and 1 as a flat dict."""
    # Pinning labels/target_names guarantees the '0' and '1' keys exist even if
    # the labels are not plain ints or one class is missing from this split.
    report = classification_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=['0', '1'],
        output_dict=True,
        zero_division=0,
    )
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision_0': report['0']['precision'],
        'Recall_0': report['0']['recall'],
        'F1_0': report['0']['f1-score'],
        'Precision_1': report['1']['precision'],
        'Recall_1': report['1']['recall'],
        'F1_1': report['1']['f1-score'],
    }


def train_and_evaluate(df_subset, name="Dataset", test_size=0.2, random_state=42):
    """Train RF, KNN and SVM on a DBSCAN-augmented subset and report metrics.

    Steps: preprocess the subset, append two rounds of DBSCAN labels
    (eps=2, min_samples=4), make a stratified train/test split, fit the three
    classifiers, combine them with an OR vote (fraud if any model says fraud),
    then print and return a per-model metrics table. The confusion matrix of
    the OR vote is printed as well.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Table with a "Class" column (0 / 1) and feature columns.
    name : str, default "Dataset"
        Label shown in the printed banner.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed for the split and for the random forest.

    Returns
    -------
    pandas.DataFrame
        One row per model ('Random Forest', 'KNN', 'SVM', 'Voting OR') with
        columns Accuracy, Precision_0, Recall_0, F1_0, Precision_1, Recall_1,
        F1_1.

    Notes
    -----
    NOTE(review): preprocessing and DBSCAN augmentation both run on all rows
    before the split, so the held-out rows influence the features the models
    are trained on. Treat the reported scores as optimistic.
    """
    print(f"\n========== {name} ==========\n")

    # Preprocess and augment (both happen before the split, see Notes).
    X, y = preprocess_data(df_subset)
    X_aug = dbscan_augmentation(X, eps=2, min_samples=4, iterations=2)

    # Stratified split keeps the fraud ratio the same in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X_aug, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Hyperparameters are unchanged from the original notebook, which
    # describes them as the paper's settings.
    classifiers = {
        'Random Forest': RandomForestClassifier(
            n_estimators=3, max_depth=None, random_state=random_state
        ),
        'KNN': KNeighborsClassifier(n_neighbors=3),
        'SVM': SVC(kernel='rbf', C=1.0),
    }

    models_preds = {}
    for model_name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        models_preds[model_name] = clf.predict(X_test)

    # Disjunctive (OR) voting over the three base models only.
    voting_pred = _or_vote(list(models_preds.values()))
    models_preds['Voting OR'] = voting_pred

    # One row per model, one column per metric.
    df_results = pd.DataFrame(
        {
            model_name: _per_class_metrics(y_test, y_pred)
            for model_name, y_pred in models_preds.items()
        }
    ).T
    print(df_results)

    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent.
    print("\nConfusion Matrix for Voting OR:")
    print(confusion_matrix(y_test, voting_pred, labels=[0, 1]))

    return df_results

# =========================================
# 7. RUN EXPERIMENTS (Case Study 2)
# =========================================
# Keep the metrics table of the smallest subset for later comparison.
df_results1 = train_and_evaluate(subset1, name="Subset 1")

In [7]:
# Bare call as the last expression, so the returned table is also displayed.
train_and_evaluate(subset2, name="Subset 2 ")



               Accuracy  Precision_0  Recall_0      F1_0  Precision_1  \
Random Forest  0.999505     0.999500  1.000000  0.999750     1.000000   
KNN            0.998948     0.999250  0.999687  0.999469     0.967320   
SVM            0.997153     0.997381  0.999750  0.998564     0.967213   
Voting OR      0.999381     0.999687  0.999687  0.999687     0.968750   

               Recall_1      F1_1  
Random Forest   0.95000  0.974359  
KNN             0.92500  0.945687  
SVM             0.73750  0.836879  
Voting OR       0.96875  0.968750  

Confusion Matrix for Voting OR:
[[15995     5]
 [    5   155]]


Unnamed: 0,Accuracy,Precision_0,Recall_0,F1_0,Precision_1,Recall_1,F1_1
Random Forest,0.999505,0.9995,1.0,0.99975,1.0,0.95,0.974359
KNN,0.998948,0.99925,0.999687,0.999469,0.96732,0.925,0.945687
SVM,0.997153,0.997381,0.99975,0.998564,0.967213,0.7375,0.836879
Voting OR,0.999381,0.999687,0.999687,0.999687,0.96875,0.96875,0.96875


In [None]:
# Largest subset; the returned metrics table is displayed as the cell output.
train_and_evaluate(subset3, name="Subset 3 ")