In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.ensemble import RandomForestClassifier

# --- Configuration -----------------------------------------------------------
# Values a reader may want to tune live here instead of being repeated below.
DATA_PATH = 'data1.csv'
SEED = 42                                # shared by both splits and the forest
LABEL_COL = 'Class'                      # 0 = legitimate, 1 = fraud
SVM_FLAG_COL = 'SVM_Flag'                # OneClassSVM output: -1 = anomaly, 1 = normal
NON_FEATURE_COLS = ['Time', LABEL_COL]   # columns never fed to either model

# --- Load data -----------------------------------------------------------------
data1 = pd.read_csv(DATA_PATH)

# --- Step 1: split into 70% / 15% / 15%, deliberately without stratification ---
# NOTE(review): with no `stratify=`, the number of fraud rows that lands in each
# split is left to chance; confirm this is intended for a rare positive class.
train_data, temp_data = train_test_split(data1, test_size=0.3, random_state=SEED)
test1_data, test2_data = train_test_split(temp_data, test_size=0.5, random_state=SEED)

# --- Step 2: fit the OneClassSVM on legitimate training transactions only ------
X_train_nonfraud = (
    train_data.loc[train_data[LABEL_COL] == 0]
    .drop(columns=NON_FEATURE_COLS)
)

svm = OneClassSVM(gamma='auto', nu=0.01, verbose=1)
svm.fit(X_train_nonfraud)

# --- Step 3: flag "grey area" transactions in test1_data -----------------------
X_test1 = test1_data.drop(columns=NON_FEATURE_COLS)
svm_pred_test1 = svm.predict(X_test1)  # -1 = anomaly, 1 = normal

# `assign` returns a new frame, so the slice produced by train_test_split is
# never written to in place (no SettingWithCopyWarning).
test1_data = test1_data.assign(**{SVM_FLAG_COL: svm_pred_test1})
grey_area = test1_data[test1_data[SVM_FLAG_COL] == -1]

# Fail fast with a readable message instead of an opaque error from rf.fit().
if grey_area.empty:
    raise ValueError(
        "OneClassSVM flagged no transactions in test1_data; "
        "there is nothing to train the Random Forest on."
    )

# --- Step 4: Random Forest training data = flagged rows with their true labels -
X_grey = grey_area.drop(columns=NON_FEATURE_COLS + [SVM_FLAG_COL])
y_grey = grey_area[LABEL_COL]

# --- Step 5: train the Random Forest on the grey-area rows ----------------------
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced',
    random_state=SEED,
    verbose=1
)
rf.fit(X_grey, y_grey)

# --- Step 6: predict on test2_data ---------------------------------------------
# NOTE(review): the forest was trained only on SVM-flagged rows, but here it is
# applied to every row of test2_data, including rows the SVM would consider
# normal. Confirm that this is the intended evaluation protocol.
X_test2 = test2_data.drop(columns=NON_FEATURE_COLS)
y_test2 = test2_data[LABEL_COL]
rf_pred_test2 = rf.predict(X_test2)



[LibSVM]

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.2s finished


In [3]:
from sklearn.metrics import classification_report
import numpy as np

# Class ids and display names shared by every report in this cell. Passing the
# ids explicitly keeps classification_report from raising when a split (or a
# prediction vector) happens to contain only one of the two classes.
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Legitimate', 'Fraud']


def svm_flags_to_labels(svm_flags):
    """Map OneClassSVM output (+1 normal, -1 anomaly) to 0 (legit) / 1 (fraud).

    Parameters
    ----------
    svm_flags : array-like
        Values returned by ``svm.predict``.

    Returns
    -------
    numpy.ndarray
        0 where the flag equals 1, and 1 everywhere else.
    """
    return np.where(np.asarray(svm_flags) == 1, 0, 1)


def print_classification_report(title, y_true, y_pred):
    """Print ``title`` (preceded by a blank line) followed by the per-class report."""
    print(f"\n{title}")
    print(classification_report(
        y_true, y_pred, labels=CLASS_LABELS, target_names=CLASS_NAMES
    ))


# 1. OneClassSVM on Train Set
# Note: the training frame contains the fraud rows too; the SVM itself was
# fitted on the legitimate subset only.
X_train = train_data.drop(['Time', 'Class'], axis=1)
y_train = train_data['Class']
svm_pred_train = svm.predict(X_train)
svm_pred_train_labels = svm_flags_to_labels(svm_pred_train)

print_classification_report(
    "[OneClassSVM] Classification Report on Train Set:",
    y_train, svm_pred_train_labels,
)

# 2. OneClassSVM on Test1 Set (flags were stored in test1_data['SVM_Flag'])
y_test1 = test1_data['Class']
svm_pred_test1_labels = svm_flags_to_labels(test1_data['SVM_Flag'])

print_classification_report(
    "[OneClassSVM] Classification Report on Test1 Set:",
    y_test1, svm_pred_test1_labels,
)

# 3. Random Forest on Grey Area Transactions (from Test1)
# X_grey / y_grey are the exact rows the forest was fitted on, so this is an
# in-sample (training) score, not an estimate of generalisation.
y_grey_pred = rf.predict(X_grey)
print_classification_report(
    "[RandomForest] Classification Report on SVM-Flagged (Grey Area) Transactions (Test1):",
    y_grey, y_grey_pred,
)

# 4. Random Forest on Test2 Set (y_test2 and rf_pred_test2 come from the
# pipeline cell above)
print_classification_report(
    "[RandomForest] Classification Report on Test2 Set:",
    y_test2, rf_pred_test2,
)



[OneClassSVM] Classification Report on Train Set:
              precision    recall  f1-score   support

  Legitimate       1.00      0.97      0.99    199008
       Fraud       0.06      0.84      0.11       356

    accuracy                           0.97    199364
   macro avg       0.53      0.91      0.55    199364
weighted avg       1.00      0.97      0.99    199364


[OneClassSVM] Classification Report on Test1 Set:
              precision    recall  f1-score   support

  Legitimate       1.00      0.95      0.97     42644
       Fraud       0.03      0.88      0.06        77

    accuracy                           0.95     42721
   macro avg       0.51      0.91      0.51     42721
weighted avg       1.00      0.95      0.97     42721


[RandomForest] Classification Report on SVM-Flagged (Grey Area) Transactions (Test1):
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00      2291
       Fraud       1.00      1.00      1.00     

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished


In [4]:
from sklearn.metrics import classification_report, confusion_matrix

labels = [0, 1]
target_names = ['Legitimate', 'Fraud']


def print_report_and_confusion(model_name, subject, y_true, y_pred):
    """Print the classification report and confusion matrix for one evaluation.

    Parameters
    ----------
    model_name : str
        Tag shown in square brackets, e.g. ``"OneClassSVM"``.
    subject : str
        Description of the evaluated data, e.g. ``"Train Set"``.
    y_true, y_pred : array-like
        True and predicted class ids (0 = legitimate, 1 = fraud).

    Both metrics receive the same explicit ``labels`` so that the report rows
    and the matrix rows/columns always line up, even if a class is absent.
    """
    print(f"\n[{model_name}] Classification Report on {subject}:")
    print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
    print(f"[{model_name}] Confusion Matrix on {subject}:")
    print(confusion_matrix(y_true, y_pred, labels=labels))


# (model tag, data description, true labels, predicted labels), in display order.
# All label/prediction vectors are produced by the earlier cells.
# The grey-area entry scores the forest on the rows it was trained on (in-sample).
evaluations = [
    ("OneClassSVM", "Train Set", y_train, svm_pred_train_labels),
    ("OneClassSVM", "Test1 Set", y_test1, svm_pred_test1_labels),
    ("RandomForest", "SVM-Flagged (Grey Area) Transactions (Test1)", y_grey, y_grey_pred),
    ("RandomForest", "Test2 Set", y_test2, rf_pred_test2),
]

for model_name, subject, y_true, y_pred in evaluations:
    print_report_and_confusion(model_name, subject, y_true, y_pred)



[OneClassSVM] Classification Report on Train Set:
              precision    recall  f1-score   support

  Legitimate       1.00      0.97      0.99    199008
       Fraud       0.06      0.84      0.11       356

    accuracy                           0.97    199364
   macro avg       0.53      0.91      0.55    199364
weighted avg       1.00      0.97      0.99    199364

[OneClassSVM] Confusion Matrix on Train Set:
[[194017   4991]
 [    58    298]]

[OneClassSVM] Classification Report on Test1 Set:
              precision    recall  f1-score   support

  Legitimate       1.00      0.95      0.97     42644
       Fraud       0.03      0.88      0.06        77

    accuracy                           0.95     42721
   macro avg       0.51      0.91      0.51     42721
weighted avg       1.00      0.95      0.97     42721

[OneClassSVM] Confusion Matrix on Test1 Set:
[[40353  2291]
 [    9    68]]

[RandomForest] Classification Report on SVM-Flagged (Grey Area) Transactions (Test1):
 