In [None]:
#libraries and dataset set-up
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler


# Load the dataset
data = pd.read_csv("data1.csv")  # Adjust the path if needed

# Drop the 'Time' column
data = data.drop(columns=["Time"])

# Separate features and target
X = data.drop(columns=["Class"])
y = data["Class"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = data["Class"])

In [None]:
#twoclassSVM

# Standardize features because SVM are sensitive for feature scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the SVM with a non-linear kernel
svm_model = SVC(kernel='rbf', gamma='scale', C=1.0, random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = svm_model.predict(X_test_scaled)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[56861     3]
 [   32    66]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.67      0.79        98

    accuracy                           1.00     56962
   macro avg       0.98      0.84      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:

# Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced',
    random_state=42,
    verbose=1
)
rf.fit(X_train, y_train)

# Predict on Test set
rf_pred_test = rf.predict(X_test)
rf_pred_proba = rf.predict_proba(X_test)[:, 1]  # for AUC

# Evaluation
print("\n[RandomForest] Classification Report on Test Set:")
print(classification_report(y_test, rf_pred_test))
print("[RandomForest] Confusion Matrix on Test Set:")
print(confusion_matrix(y_test, rf_pred_test))

# Metrics
acc = accuracy_score(y_test, rf_pred_test)  # use predicted labels
auc = roc_auc_score(y_test, rf_pred_proba)  # use predicted probabilities
print("Accuracy:", acc)
print("AUC:", auc)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.0min
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  7.8min
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:  7.8min finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s



[RandomForest] Classification Report on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.76      0.84        98

    accuracy                           1.00     56962
   macro avg       0.97      0.88      0.92     56962
weighted avg       1.00      1.00      1.00     56962

[RandomForest] Confusion Matrix on Test Set:
[[56860     4]
 [   24    74]]
Accuracy: 0.9995084442259752
AUC: 0.9514465412642266


[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    1.6s finished


In [None]:
import joblib

# ---- Sklearn models ----
# Save Random Forest
try:
    joblib.dump(rf, 'random_forestb_model.pkl')
    print("[✔] Random Forest saved.")
except NameError:
    print("[✘] Random Forest model not found.")

In [None]:
#oneclassSVM
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix

# Extract only non-fraudulent samples (assumes 0 = non-fraud, 1 = fraud)
X_train_nonfraud = X_train[y_train == 0]

# Train One-Class SVM on non-fraud data only
ocsvm = OneClassSVM(gamma='auto', nu=0.01)
ocsvm.fit(X_train_nonfraud)

# Predict on the full test set
y_ocsvm_pred = ocsvm.predict(X_test)

# One-Class SVM returns +1 (inlier), -1 (outlier)
# Convert to binary: 0 = non-fraud, 1 = fraud
y_ocsvm_pred_binary = [0 if x == 1 else 1 for x in y_ocsvm_pred]

# Evaluation
print("\n[One-Class SVM] Classification Report on Test Set:")
print(classification_report(y_test, y_ocsvm_pred_binary, target_names=["Non-Fraud", "Fraud"]))

print("[One-Class SVM] Confusion Matrix on Test Set:")
print(confusion_matrix(y_test, y_ocsvm_pred_binary, labels=[0, 1]))



[One-Class SVM] Classification Report on Test Set:
              precision    recall  f1-score   support

   Non-Fraud       1.00      0.95      0.97     56864
       Fraud       0.03      0.84      0.05        98

    accuracy                           0.95     56962
   macro avg       0.51      0.89      0.51     56962
weighted avg       1.00      0.95      0.97     56962

[One-Class SVM] Confusion Matrix on Test Set:
[[53967  2897]
 [   16    82]]


In [None]:
#tabNet, transformer based model for tabular data from google
!pip -q install pytorch-tabnet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

#Tabnet from google optimsed transformer struture for tabular data and imbalanced dataset

import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Convert to NumPy arrays
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values

# Encode labels to 0/1 if needed
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_np)
y_test_enc = le.transform(y_test_np)

# Initialize TabNet
clf = TabNetClassifier(
    n_d=16, n_a=16,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size": 10, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=1,
    seed=42
)

# Fit TabNet
clf.fit(
    X_train=X_train_np, y_train=y_train_enc,
    eval_set=[(X_test_np, y_test_enc)],
    eval_name=['test'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

# Predict and evaluate
y_pred = clf.predict(X_test_np)

print("\n[TabNet] Classification Report on Test Set:")
print(classification_report(y_test_enc, y_pred, target_names=["Non-Fraud", "Fraud"]))

print("[TabNet] Confusion Matrix on Test Set:")
print(confusion_matrix(y_test_enc, y_pred, labels=[0, 1]))

print("Accuracy:", accuracy_score(y_test, clf.predict_proba(X_test_np)[:, 1])
print("AUC:", roc_auc_score(y_test,clf.predict(X_test_np).ravel()))




epoch 0  | loss: 0.03318 | test_accuracy: 0.99903 |  0:00:28s
epoch 1  | loss: 0.00571 | test_accuracy: 0.99896 |  0:01:00s
epoch 2  | loss: 0.00502 | test_accuracy: 0.99893 |  0:01:28s
epoch 3  | loss: 0.00445 | test_accuracy: 0.99917 |  0:01:55s
epoch 4  | loss: 0.0046  | test_accuracy: 0.99916 |  0:02:22s
epoch 5  | loss: 0.00428 | test_accuracy: 0.99916 |  0:02:50s
epoch 6  | loss: 0.00402 | test_accuracy: 0.99923 |  0:03:23s
epoch 7  | loss: 0.0036  | test_accuracy: 0.99926 |  0:03:51s
epoch 8  | loss: 0.00426 | test_accuracy: 0.99923 |  0:04:18s
epoch 9  | loss: 0.00398 | test_accuracy: 0.99917 |  0:04:52s
epoch 10 | loss: 0.00377 | test_accuracy: 0.99914 |  0:05:19s
epoch 11 | loss: 0.00418 | test_accuracy: 0.99905 |  0:05:46s
epoch 12 | loss: 0.00414 | test_accuracy: 0.99907 |  0:06:27s
epoch 13 | loss: 0.00434 | test_accuracy: 0.99917 |  0:06:54s
epoch 14 | loss: 0.00426 | test_accuracy: 0.99898 |  0:07:20s
epoch 15 | loss: 0.00412 | test_accuracy: 0.99907 |  0:07:48s
epoch 16




[TabNet] Classification Report on Test Set:
              precision    recall  f1-score   support

   Non-Fraud       1.00      1.00      1.00     56864
       Fraud       0.77      0.81      0.79        98

    accuracy                           1.00     56962
   macro avg       0.89      0.90      0.89     56962
weighted avg       1.00      1.00      1.00     56962

[TabNet] Confusion Matrix on Test Set:
[[56841    23]
 [   19    79]]


In [None]:
#XGboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the dataset
df = pd.read_csv('data1.csv')
X = df.drop('Class', axis=1)
y = df['Class']

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

# 4. Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.25
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



[0]	validation_0-logloss:0.09428
[1]	validation_0-logloss:0.06912
[2]	validation_0-logloss:0.05107
[3]	validation_0-logloss:0.03799
[4]	validation_0-logloss:0.02848
[5]	validation_0-logloss:0.02150
[6]	validation_0-logloss:0.01634
[7]	validation_0-logloss:0.01260
[8]	validation_0-logloss:0.00983
[9]	validation_0-logloss:0.00782
[10]	validation_0-logloss:0.00633
[11]	validation_0-logloss:0.00528
[12]	validation_0-logloss:0.00449
[13]	validation_0-logloss:0.00390
[14]	validation_0-logloss:0.00348
[15]	validation_0-logloss:0.00314
[16]	validation_0-logloss:0.00290
[17]	validation_0-logloss:0.00273
[18]	validation_0-logloss:0.00262
[19]	validation_0-logloss:0.00253
[20]	validation_0-logloss:0.00246
[21]	validation_0-logloss:0.00242
[22]	validation_0-logloss:0.00237
[23]	validation_0-logloss:0.00234
[24]	validation_0-logloss:0.00231
[25]	validation_0-logloss:0.00232
[26]	validation_0-logloss:0.00230
[27]	validation_0-logloss:0.00231
[28]	validation_0-logloss:0.00231
[29]	validation_0-loglos


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
#SMOTE with randomforest threshold = 0.25
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# Only this import is changed for cuML
from cuml.ensemble import RandomForestClassifier as cuRF
import numpy as np

data1 = pd.read_csv('data1.csv')

# Assume data1 is your DataFrame and 'Class' is the label column
X = data1.drop('Class', axis=1)
y = data1['Class']


# Split before SMOTE to avoid data leakage (recommended practice)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43, stratify=y)

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train a classifier (cuML Random Forest on GPU)
clf = cuRF(random_state=42)
clf.fit(X_train_res.values, y_train_res.values)  # cuML expects numpy arrays

# Predict probabilities on the test set
y_proba = clf.predict_proba(X_test.values)  # shape: (n_samples, n_classes)

# Use threshold = 0.25 for positive class (assuming binary classification and positive class is 1)
threshold = 0.25
y_pred = (y_proba[:, 1] >= threshold).astype(int)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  return init_func(self, *args, **kwargs)


Accuracy: 0.9982619992275552

Confusion Matrix:
 [[56774    90]
 [    9    89]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.50      0.91      0.64        98

    accuracy                           1.00     56962
   macro avg       0.75      0.95      0.82     56962
weighted avg       1.00      1.00      1.00     56962

