In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cuml.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

data1 = pd.read_csv('data1.csv')

# Fix typo: use data1, not dat1
X = data1.drop(columns=['Class']).values
y = data1['Class'].values  # Correct syntax

# Check for at least two classes
if len(np.unique(y)) < 2:
    raise ValueError("SVM requires at least two classes in the target variable.")

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Standardize features because SVM are sensitive for feature scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit cuML SVM (SVC) on legitimate transactions
svm = SVC()
svm.fit(X_train, y_train)

# Predict on test set
y_pred = svm.predict(X_test)

# Evaluate using scikit-learn
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.62      0.76        98

    accuracy                           1.00     56962
   macro avg       0.98      0.81      0.88     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56862     2]
 [   37    61]]


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cuml.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data1.csv')

In [6]:
import cupy as cp
import cudf
from cuml.ensemble import RandomForestClassifier
from cuml.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import recall_score



def assign_penalty_weights(y_true, y_pred, alpha=2.0, P=45):
    y_true = cp.asarray(y_true)
    y_pred = cp.asarray(y_pred)
    N = len(y_true)
    weights = cp.ones(N)  # default: 1 for all
    for i in range(N):
        if y_true[i] == 1 and y_pred[i] == 0:  # FN
            weights[i] = P
    return weights, P

# Assume df is a cudf.DataFrame with features and 'Class' as label
X = df.drop(columns=['Class'])
y = df['Class']

# Split into trainval (80%) and final test (20%), stratified
X_trainval, X_finaltest, y_trainval, y_finaltest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stratified KFold for 5 folds on trainval
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_augmented_X, all_augmented_y, all_augmented_weights = [], [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_trainval, y_trainval)):
    print(f"Fold {fold+1}")
    # FIX: Convert cupy indices to numpy for iloc
    X_train = X_trainval.iloc[train_idx.get()]
    X_test = X_trainval.iloc[test_idx.get()]
    y_train = y_trainval.iloc[train_idx.get()]
    y_test = y_trainval.iloc[test_idx.get()]

    # Train XGBoost on legitimate only (class 0)
    X_train_legit = X_train[y_train == 0]
    y_train_legit = y_train[y_train == 0]
    dtrain = xgb.DMatrix(X_train_legit.to_pandas(), label=y_train_legit.to_pandas())
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'logloss',
        'verbosity': 0
    }
    xgb_model = xgb.train(params, dtrain, num_boost_round=50)

    # Predict on test set (probabilities)
    dtest = xgb.DMatrix(X_test.to_pandas())
    y_pred_proba = xgb_model.predict(dtest)
    # Outlier detection: classify as fraud (1) if probability < threshold (e.g., 0.1)
    threshold = 0.1
    y_pred_test_bin = (y_pred_proba < threshold).astype(cp.int32)

    # Assign penalties
    weights, P = assign_penalty_weights(y_test.values, y_pred_test_bin, alpha=2.0, P=45)

    augmented_X, augmented_y, augmented_weights = [], [], []
    for i in range(len(X_test)):
        xi = X_test.iloc[i]
        yi = y_test.iloc[i]
        wi = int(cp.round(weights[i]).get())
        for _ in range(wi):
            augmented_X.append(xi)
            augmented_y.append(yi)
            augmented_weights.append(wi)
    # Add original training data (from this fold)
    for i in range(len(X_train)):
        augmented_X.append(X_train.iloc[i])
        augmented_y.append(y_train.iloc[i])
        augmented_weights.append(1.0)

# Convert to cudf DataFrame/Series
all_augmented_X = cudf.DataFrame(all_augmented_X)
all_augmented_y = cudf.Series(all_augmented_y)
all_augmented_weights = cp.array(all_augmented_weights)

# Train cuML Random Forest on the augmented 80% set
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(all_augmented_X, all_augmented_y, sample_weight=all_augmented_weights)

# Evaluate on the untouched final 20% test set
y_pred_final = rf.predict(X_finaltest)
recall_fraud = recall_score(y_finaltest.to_numpy(), y_pred_final.get(), pos_label=1)
print("Final fraud recall on untouched test set:", recall_fraud)


Fold 1


KeyboardInterrupt: 

In [None]:
# Convert all_augmented_X, all_augmented_y, all_augmented_weights to pandas for easy manipulation
import pandas as pd

# Convert cudf to pandas (if not already)
X_pd = all_augmented_X.to_pandas()
y_pd = all_augmented_y.to_pandas()
w_pd = cp.asnumpy(all_augmented_weights)

# Round weights to nearest integer (minimum 1)
w_pd = np.maximum(np.round(w_pd).astype(int), 1)

# Repeat (oversample) each row according to its weight
X_oversampled = np.repeat(X_pd.values, w_pd, axis=0)
y_oversampled = np.repeat(y_pd.values, w_pd, axis=0)

# Convert back to cudf DataFrame/Series for cuML
X_oversampled_cudf = cudf.DataFrame(X_oversampled, columns=X_pd.columns)
y_oversampled_cudf = cudf.Series(y_oversampled)

# Train cuML Random Forest on the oversampled data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_oversampled_cudf, y_oversampled_cudf)

# Evaluate on the untouched final 20% test set
y_pred_final = rf.predict(X_finaltest)
recall_fraud = recall_score(y_finaltest.to_numpy(), y_pred_final.get(), pos_label=1)
print("Final fraud recall on untouched test set:", recall_fraud)


In [16]:
#XGboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the dataset
df = pd.read_csv('data1.csv')
X = df.drop('Class', axis=1)
y = df['Class']

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

# 4. Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.25
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


[0]	validation_0-logloss:0.09428
[1]	validation_0-logloss:0.06912
[2]	validation_0-logloss:0.05107
[3]	validation_0-logloss:0.03799
[4]	validation_0-logloss:0.02848
[5]	validation_0-logloss:0.02150
[6]	validation_0-logloss:0.01634
[7]	validation_0-logloss:0.01260
[8]	validation_0-logloss:0.00983
[9]	validation_0-logloss:0.00782
[10]	validation_0-logloss:0.00633
[11]	validation_0-logloss:0.00528
[12]	validation_0-logloss:0.00449
[13]	validation_0-logloss:0.00390
[14]	validation_0-logloss:0.00348
[15]	validation_0-logloss:0.00314
[16]	validation_0-logloss:0.00290
[17]	validation_0-logloss:0.00273
[18]	validation_0-logloss:0.00262
[19]	validation_0-logloss:0.00253
[20]	validation_0-logloss:0.00246
[21]	validation_0-logloss:0.00242
[22]	validation_0-logloss:0.00237
[23]	validation_0-logloss:0.00234



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



[24]	validation_0-logloss:0.00231
[25]	validation_0-logloss:0.00232
[26]	validation_0-logloss:0.00230
[27]	validation_0-logloss:0.00231
[28]	validation_0-logloss:0.00231
[29]	validation_0-logloss:0.00234
[30]	validation_0-logloss:0.00233
[31]	validation_0-logloss:0.00232
[32]	validation_0-logloss:0.00234
[33]	validation_0-logloss:0.00234
[34]	validation_0-logloss:0.00235
[35]	validation_0-logloss:0.00235
[36]	validation_0-logloss:0.00237
[37]	validation_0-logloss:0.00236
[38]	validation_0-logloss:0.00237
[39]	validation_0-logloss:0.00238
[40]	validation_0-logloss:0.00240
[41]	validation_0-logloss:0.00240
[42]	validation_0-logloss:0.00241
[43]	validation_0-logloss:0.00242
[44]	validation_0-logloss:0.00243
[45]	validation_0-logloss:0.00243
[46]	validation_0-logloss:0.00245
[47]	validation_0-logloss:0.00246
[48]	validation_0-logloss:0.00246
[49]	validation_0-logloss:0.00247
[50]	validation_0-logloss:0.00247
[51]	validation_0-logloss:0.00247
[52]	validation_0-logloss:0.00249
[53]	validatio


    E.g. tree_method = "hist", device = "cuda"



In [14]:
# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.005
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))

Test Accuracy (threshold=0.25): 0.9991

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.67      0.97      0.79        98

    accuracy                           1.00     56962
   macro avg       0.83      0.98      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data
data1 = pd.read_csv('data1.csv')
X = data1.drop('Class', axis=1)
y = data1['Class']

# Train/test split (hold out 20% for final test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# XGBoost DMatrix for cross-validation
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set parameters
params = {
    "objective": "binary:logistic",
    "tree_method": "gpu_hist",  # use 'hist' if no GPU
    "eval_metric": "logloss"
}

# Cross-validation (5-fold)
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics={"logloss"},
    early_stopping_rounds=10,
    seed=42
)

print(cv_results.head())

# Train final model using best number of rounds from CV
best_n = len(cv_results)
final_model = xgb.XGBClassifier(
    n_estimators=best_n,
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='logloss'
)
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))





    E.g. tree_method = "hist", device = "cuda"



   train-logloss-mean  train-logloss-std  test-logloss-mean  test-logloss-std
0            0.094245           0.000037           0.094410          0.000162
1            0.069035           0.000039           0.069253          0.000186
2            0.050933           0.000045           0.051197          0.000211
3            0.037832           0.000052           0.038135          0.000228
4            0.028257           0.000045           0.028614          0.000242
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.81      0.85        98

    accuracy                           1.00     56962
   macro avg       0.95      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962




    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [14]:
# 5. Predict probabilities and apply custom threshold
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
prob_preds = final_model.predict_proba(X_test)[:, 1]
threshold = 0.1
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))

Test Accuracy (threshold=0.25): 0.9994

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80      0.86      0.83        98

    accuracy                           1.00     56962
   macro avg       0.90      0.93      0.91     56962
weighted avg       1.00      1.00      1.00     56962



In [30]:
#FINAL
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
import xgboost as xgb

# Step 0: Add 'index' column as the first column
data = pd.read_csv('data1.csv')
data.insert(0, 'index', range(len(data)))  # Add index column at the start

# Step 1: Split into train (60%), test1 (20%), test2 (20%) with stratification
train, temp = train_test_split(
    data, test_size=0.4, stratify=data['Class'], random_state=42
)
test1, test2 = train_test_split(
    temp, test_size=0.5, stratify=temp['Class'], random_state=42
)

# Step 2: Split train into 6 equal, non-overlapping stratified subsets
n_subsets = 6
skf = StratifiedKFold(n_splits=n_subsets, shuffle=True, random_state= 42)
subsets = []
for i, (_, idx) in enumerate(skf.split(train, train['Class'])):
    subset = data.iloc[idx].copy()
    subset['subset'] = f'sub{i+1}'
    subsets.append(subset)
    #print(f"Subset {i+1} has {len(subset)} instances.")
    #print(subset['Class'].value_counts())
    #print()
#subsets[0].head()

In [31]:

# Example: Use sub1-sub4 for training, sub6 for evaluation
train_subs = pd.concat(subsets[:4], ignore_index=True)
val_sub = subsets[5].copy()  # sub6

# Step 3: Train XGBoost on sub1-sub4, predict on sub6
X_train = train_subs.drop(['Class', 'index', 'subset'], axis=1)
y_train = train_subs['Class']
X_val = val_sub.drop(['Class', 'index', 'subset'], axis=1)
y_val = val_sub['Class']

model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]  # Probability of fraud (class 1)


In [32]:
# Step 3.1: Classification report, confusion matrix, probabilities
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred, zero_division=0))
print("\nFirst 10 predicted probabilities:", y_proba[:10])

# Step 3.2: Find easy frauds (P > T and actual fraud)
T = 0.5  # Set your threshold
easy_frauds_idx = val_sub[(y_proba > T) & (y_val == 1)].index
easy = val_sub.loc[easy_frauds_idx].copy()
print(f"\nNumber of easy frauds (P > {T} and actual fraud):", len(easy))

# Remove easy frauds from val_sub and update y_val/y_pred/y_proba
y_pred = pd.Series(y_pred, index=val_sub.index)
y_proba = pd.Series(y_proba, index=val_sub.index)

val_sub = val_sub.drop(easy_frauds_idx)
y_val = y_val.drop(easy_frauds_idx)

y_pred = y_pred.drop(easy_frauds_idx)
y_proba = y_proba.drop(easy_frauds_idx)

# Step 3.3: Find easy legits (P < 0.1 and actual legit)
B = 0.1
easy_legits_idx = val_sub[(y_proba < B) & (y_val == 0)].index
easy = pd.concat([easy, val_sub.loc[easy_legits_idx]])
print(f"\nNumber of easy legits (P < {B} and actual legit):", len(easy_legits_idx ))

# Remove easy legits from val_sub and update y_val/y_pred/y_proba
val_sub = val_sub.drop(easy_legits_idx)
y_val = y_val.drop(easy_legits_idx)
y_pred = y_pred.drop(easy_legits_idx)
y_proba = y_proba.drop(easy_legits_idx)


# Step 3.4: For false negatives (actual fraud, predicted legit), oversample
false_negatives_idx = val_sub[(y_val == 1) & (y_pred == 0)].index
recall = recall_score(y_val, y_pred, zero_division=0)
lamda_1 = 2  # initial value, will adjust later
fn_rows = val_sub.loc[false_negatives_idx]
fn_probs = pd.Series(y_proba, index=val_sub.index).loc[false_negatives_idx]
fn_repeat = np.ceil(recall * lamda_1 / np.maximum(fn_probs, 0.1)).astype(int)
fn_oversampled = pd.DataFrame(
    np.repeat(fn_rows.values, fn_repeat, axis=0),
    columns=fn_rows.columns)

# Step 3.5: For false positives (actual legit, predicted fraud), oversample
false_positives_idx = val_sub[(y_val == 0) & (y_pred == 1)].index
precision = precision_score(y_val, y_pred, zero_division=0)
lamda_2 = 2  # initial value, will adjust later
fp_rows = val_sub.loc[false_positives_idx]
fp_probs = pd.Series(y_proba, index=val_sub.index).loc[false_positives_idx]
fp_repeat = np.ceil(precision * lamda_2 / np.maximum(fp_probs, 0.1)).astype(int)
fp_oversampled = pd.DataFrame(
    np.repeat(fp_rows.values, fp_repeat, axis=0),
    columns=fp_rows.columns)

# Step 3.6: Adjust lamda_1 and lamda_2 to reach 20% fraud, 80% legit in val_sub
def adjust_lambdas(val_sub, fn_oversampled, fp_oversampled, target_fraud_ratio=0.2):
    min_lam = 2
    lam1, lam2 = min_lam, min_lam
    for _ in range(100):  # max 100 iterations
        temp = pd.concat([val_sub, fn_oversampled, fp_oversampled])
        frauds = temp[temp['Class'] == 1]
        legits = temp[temp['Class'] == 0]
        total = len(temp)
        fraud_ratio = len(frauds) / total if total > 0 else 0
        if abs(fraud_ratio - target_fraud_ratio) < 0.01:
            break
        if fraud_ratio < target_fraud_ratio:
            lam1 += 1
        else:
            lam2 += 1
        lam1 = max(lam1, min_lam)
        lam2 = max(lam2, min_lam)
    return lam1, lam2, temp

lamda_1, lamda_2, val_sub_balanced = adjust_lambdas(val_sub, fn_oversampled, fp_oversampled)



Confusion Matrix:
[[28397     5]
 [   11    67]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28402
           1       0.93      0.86      0.89        78

    accuracy                           1.00     28480
   macro avg       0.97      0.93      0.95     28480
weighted avg       1.00      1.00      1.00     28480


First 10 predicted probabilities: [2.9913022e-07 1.4242125e-05 9.0913227e-06 2.1766514e-06 5.8514360e-07
 5.2338709e-07 3.4755087e-07 5.3917188e-06 6.3509378e-06 1.5363554e-06]

Number of easy frauds (P > 0.5 and actual fraud): 67

Number of easy legits (P < 0.1 and actual legit): 28395

Final lamda_1: 2, lamda_2: 102
Total samples in balanced sub6: 18
Fraud ratio in balanced sub6: 0.611
Samples added to sub6 by oversampling: 0


  temp = pd.concat([val_sub, fn_oversampled, fp_oversampled])


In [33]:
# Step 3.7: Display results
print(f"\nFinal lamda_1: {lamda_1}, lamda_2: {lamda_2}")
print(f"Total samples in balanced sub6: {len(val_sub_balanced)}")
print(f"Number of frauds left in sub6: {(val_sub['Class'] == 1).sum()}")
print(f"Number of legits left in sub6: {(val_sub['Class'] == 0).sum()}")
print(f"Total samples left in sub6: {len(val_sub)}")
print(f"Fraud ratio in balanced sub6: {val_sub_balanced['Class'].mean():.3f}")
print(f"Samples added to sub6 by oversampling: {len(val_sub_balanced) - len(val_sub)}")



Final lamda_1: 2, lamda_2: 102
Total samples in balanced sub6: 18
Number of frauds left in sub6: 11
Number of legits left in sub6: 7
Total samples left in sub6: 18
Fraud ratio in balanced sub6: 0.611
Samples added to sub6 by oversampling: 0
