In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cuml.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

data1 = pd.read_csv('data1.csv')

# Fix typo: use data1, not dat1
X = data1.drop(columns=['Class']).values
y = data1['Class'].values  # Correct syntax

# Check for at least two classes
if len(np.unique(y)) < 2:
    raise ValueError("SVM requires at least two classes in the target variable.")

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Standardize features because SVM are sensitive for feature scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Fit cuML SVM (SVC) on legitimate transactions
svm = SVC()
svm.fit(X_train, y_train)

# Predict on test set
y_pred = svm.predict(X_test)

# Evaluate using scikit-learn
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.62      0.76        98

    accuracy                           1.00     56962
   macro avg       0.98      0.81      0.88     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
[[56862     2]
 [   37    61]]


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cuml.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data1.csv')

In [6]:
import cupy as cp
import cudf
from cuml.ensemble import RandomForestClassifier
from cuml.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import recall_score



def assign_penalty_weights(y_true, y_pred, alpha=2.0, P=45):
    y_true = cp.asarray(y_true)
    y_pred = cp.asarray(y_pred)
    N = len(y_true)
    weights = cp.ones(N)  # default: 1 for all
    for i in range(N):
        if y_true[i] == 1 and y_pred[i] == 0:  # FN
            weights[i] = P
    return weights, P

# Assume df is a cudf.DataFrame with features and 'Class' as label
X = df.drop(columns=['Class'])
y = df['Class']

# Split into trainval (80%) and final test (20%), stratified
X_trainval, X_finaltest, y_trainval, y_finaltest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stratified KFold for 5 folds on trainval
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_augmented_X, all_augmented_y, all_augmented_weights = [], [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_trainval, y_trainval)):
    print(f"Fold {fold+1}")
    # FIX: Convert cupy indices to numpy for iloc
    X_train = X_trainval.iloc[train_idx.get()]
    X_test = X_trainval.iloc[test_idx.get()]
    y_train = y_trainval.iloc[train_idx.get()]
    y_test = y_trainval.iloc[test_idx.get()]

    # Train XGBoost on legitimate only (class 0)
    X_train_legit = X_train[y_train == 0]
    y_train_legit = y_train[y_train == 0]
    dtrain = xgb.DMatrix(X_train_legit.to_pandas(), label=y_train_legit.to_pandas())
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'logloss',
        'verbosity': 0
    }
    xgb_model = xgb.train(params, dtrain, num_boost_round=50)

    # Predict on test set (probabilities)
    dtest = xgb.DMatrix(X_test.to_pandas())
    y_pred_proba = xgb_model.predict(dtest)
    # Outlier detection: classify as fraud (1) if probability < threshold (e.g., 0.1)
    threshold = 0.1
    y_pred_test_bin = (y_pred_proba < threshold).astype(cp.int32)

    # Assign penalties
    weights, P = assign_penalty_weights(y_test.values, y_pred_test_bin, alpha=2.0, P=45)

    augmented_X, augmented_y, augmented_weights = [], [], []
    for i in range(len(X_test)):
        xi = X_test.iloc[i]
        yi = y_test.iloc[i]
        wi = int(cp.round(weights[i]).get())
        for _ in range(wi):
            augmented_X.append(xi)
            augmented_y.append(yi)
            augmented_weights.append(wi)
    # Add original training data (from this fold)
    for i in range(len(X_train)):
        augmented_X.append(X_train.iloc[i])
        augmented_y.append(y_train.iloc[i])
        augmented_weights.append(1.0)

# Convert to cudf DataFrame/Series
all_augmented_X = cudf.DataFrame(all_augmented_X)
all_augmented_y = cudf.Series(all_augmented_y)
all_augmented_weights = cp.array(all_augmented_weights)

# Train cuML Random Forest on the augmented 80% set
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(all_augmented_X, all_augmented_y, sample_weight=all_augmented_weights)

# Evaluate on the untouched final 20% test set
y_pred_final = rf.predict(X_finaltest)
recall_fraud = recall_score(y_finaltest.to_numpy(), y_pred_final.get(), pos_label=1)
print("Final fraud recall on untouched test set:", recall_fraud)


Fold 1


KeyboardInterrupt: 

In [None]:
# Convert all_augmented_X, all_augmented_y, all_augmented_weights to pandas for easy manipulation
import pandas as pd

# Convert cudf to pandas (if not already)
X_pd = all_augmented_X.to_pandas()
y_pd = all_augmented_y.to_pandas()
w_pd = cp.asnumpy(all_augmented_weights)

# Round weights to nearest integer (minimum 1)
w_pd = np.maximum(np.round(w_pd).astype(int), 1)

# Repeat (oversample) each row according to its weight
X_oversampled = np.repeat(X_pd.values, w_pd, axis=0)
y_oversampled = np.repeat(y_pd.values, w_pd, axis=0)

# Convert back to cudf DataFrame/Series for cuML
X_oversampled_cudf = cudf.DataFrame(X_oversampled, columns=X_pd.columns)
y_oversampled_cudf = cudf.Series(y_oversampled)

# Train cuML Random Forest on the oversampled data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_oversampled_cudf, y_oversampled_cudf)

# Evaluate on the untouched final 20% test set
y_pred_final = rf.predict(X_finaltest)
recall_fraud = recall_score(y_finaltest.to_numpy(), y_pred_final.get(), pos_label=1)
print("Final fraud recall on untouched test set:", recall_fraud)


In [16]:
#XGboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the dataset
df = pd.read_csv('data1.csv')
X = df.drop('Class', axis=1)
y = df['Class']

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

# 4. Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.25
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


[0]	validation_0-logloss:0.09428
[1]	validation_0-logloss:0.06912
[2]	validation_0-logloss:0.05107
[3]	validation_0-logloss:0.03799
[4]	validation_0-logloss:0.02848
[5]	validation_0-logloss:0.02150
[6]	validation_0-logloss:0.01634
[7]	validation_0-logloss:0.01260
[8]	validation_0-logloss:0.00983
[9]	validation_0-logloss:0.00782
[10]	validation_0-logloss:0.00633
[11]	validation_0-logloss:0.00528
[12]	validation_0-logloss:0.00449
[13]	validation_0-logloss:0.00390
[14]	validation_0-logloss:0.00348
[15]	validation_0-logloss:0.00314
[16]	validation_0-logloss:0.00290
[17]	validation_0-logloss:0.00273
[18]	validation_0-logloss:0.00262
[19]	validation_0-logloss:0.00253
[20]	validation_0-logloss:0.00246
[21]	validation_0-logloss:0.00242
[22]	validation_0-logloss:0.00237
[23]	validation_0-logloss:0.00234



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



[24]	validation_0-logloss:0.00231
[25]	validation_0-logloss:0.00232
[26]	validation_0-logloss:0.00230
[27]	validation_0-logloss:0.00231
[28]	validation_0-logloss:0.00231
[29]	validation_0-logloss:0.00234
[30]	validation_0-logloss:0.00233
[31]	validation_0-logloss:0.00232
[32]	validation_0-logloss:0.00234
[33]	validation_0-logloss:0.00234
[34]	validation_0-logloss:0.00235
[35]	validation_0-logloss:0.00235
[36]	validation_0-logloss:0.00237
[37]	validation_0-logloss:0.00236
[38]	validation_0-logloss:0.00237
[39]	validation_0-logloss:0.00238
[40]	validation_0-logloss:0.00240
[41]	validation_0-logloss:0.00240
[42]	validation_0-logloss:0.00241
[43]	validation_0-logloss:0.00242
[44]	validation_0-logloss:0.00243
[45]	validation_0-logloss:0.00243
[46]	validation_0-logloss:0.00245
[47]	validation_0-logloss:0.00246
[48]	validation_0-logloss:0.00246
[49]	validation_0-logloss:0.00247
[50]	validation_0-logloss:0.00247
[51]	validation_0-logloss:0.00247
[52]	validation_0-logloss:0.00249
[53]	validatio


    E.g. tree_method = "hist", device = "cuda"



In [14]:
# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.005
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))

Test Accuracy (threshold=0.25): 0.9991

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.67      0.97      0.79        98

    accuracy                           1.00     56962
   macro avg       0.83      0.98      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data
data1 = pd.read_csv('data1.csv')
X = data1.drop('Class', axis=1)
y = data1['Class']

# Train/test split (hold out 20% for final test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# XGBoost DMatrix for cross-validation
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set parameters
params = {
    "objective": "binary:logistic",
    "tree_method": "gpu_hist",  # use 'hist' if no GPU
    "eval_metric": "logloss"
}

# Cross-validation (5-fold)
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics={"logloss"},
    early_stopping_rounds=10,
    seed=42
)

print(cv_results.head())

# Train final model using best number of rounds from CV
best_n = len(cv_results)
final_model = xgb.XGBClassifier(
    n_estimators=best_n,
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='logloss'
)
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred = final_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))





    E.g. tree_method = "hist", device = "cuda"



   train-logloss-mean  train-logloss-std  test-logloss-mean  test-logloss-std
0            0.094245           0.000037           0.094410          0.000162
1            0.069035           0.000039           0.069253          0.000186
2            0.050933           0.000045           0.051197          0.000211
3            0.037832           0.000052           0.038135          0.000228
4            0.028257           0.000045           0.028614          0.000242
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.81      0.85        98

    accuracy                           1.00     56962
   macro avg       0.95      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962




    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [14]:
# 5. Predict probabilities and apply custom threshold
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
prob_preds = final_model.predict_proba(X_test)[:, 1]
threshold = 0.1
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))

Test Accuracy (threshold=0.25): 0.9994

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80      0.86      0.83        98

    accuracy                           1.00     56962
   macro avg       0.90      0.93      0.91     56962
weighted avg       1.00      1.00      1.00     56962



In [30]:
#FINAL
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score
import xgboost as xgb

# Step 0: Add 'index' column as the first column
data = pd.read_csv('data1.csv')
data.insert(0, 'index', range(len(data)))  # Add index column at the start

# Step 1: Split into train (60%), test1 (20%), test2 (20%) with stratification
train, temp = train_test_split(
    data, test_size=0.4, stratify=data['Class'], random_state=42
)
test1, test2 = train_test_split(
    temp, test_size=0.5, stratify=temp['Class'], random_state=42
)

# Step 2: Split train into 6 equal, non-overlapping stratified subsets
n_subsets = 6
skf = StratifiedKFold(n_splits=n_subsets, shuffle=True, random_state= 42)
subsets = []
for i, (_, idx) in enumerate(skf.split(train, train['Class'])):
    subset = data.iloc[idx].copy()
    subset['subset'] = f'sub{i+1}'
    subsets.append(subset)
    #print(f"Subset {i+1} has {len(subset)} instances.")
    #print(subset['Class'].value_counts())
    #print()
#subsets[0].head()

In [31]:

# Example: Use sub1-sub4 for training, sub6 for evaluation
train_subs = pd.concat(subsets[:4], ignore_index=True)
val_sub = subsets[5].copy()  # sub6

# Step 3: Train XGBoost on sub1-sub4, predict on sub6
X_train = train_subs.drop(['Class', 'index', 'subset'], axis=1)
y_train = train_subs['Class']
X_val = val_sub.drop(['Class', 'index', 'subset'], axis=1)
y_val = val_sub['Class']

model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]  # Probability of fraud (class 1)


In [32]:
# Step 3.1: Classification report, confusion matrix, probabilities
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred, zero_division=0))
print("\nFirst 10 predicted probabilities:", y_proba[:10])

# Step 3.2: Find easy frauds (P > T and actual fraud)
T = 0.5  # Set your threshold
easy_frauds_idx = val_sub[(y_proba > T) & (y_val == 1)].index
easy = val_sub.loc[easy_frauds_idx].copy()
print(f"\nNumber of easy frauds (P > {T} and actual fraud):", len(easy))

# Remove easy frauds from val_sub and update y_val/y_pred/y_proba
y_pred = pd.Series(y_pred, index=val_sub.index)
y_proba = pd.Series(y_proba, index=val_sub.index)

val_sub = val_sub.drop(easy_frauds_idx)
y_val = y_val.drop(easy_frauds_idx)

y_pred = y_pred.drop(easy_frauds_idx)
y_proba = y_proba.drop(easy_frauds_idx)

# Step 3.3: Find easy legits (P < 0.1 and actual legit)
B = 0.1
easy_legits_idx = val_sub[(y_proba < B) & (y_val == 0)].index
easy = pd.concat([easy, val_sub.loc[easy_legits_idx]])
print(f"\nNumber of easy legits (P < {B} and actual legit):", len(easy_legits_idx ))

# Remove easy legits from val_sub and update y_val/y_pred/y_proba
val_sub = val_sub.drop(easy_legits_idx)
y_val = y_val.drop(easy_legits_idx)
y_pred = y_pred.drop(easy_legits_idx)
y_proba = y_proba.drop(easy_legits_idx)


# Step 3.4: For false negatives (actual fraud, predicted legit), oversample
false_negatives_idx = val_sub[(y_val == 1) & (y_pred == 0)].index
recall = recall_score(y_val, y_pred, zero_division=0)
lamda_1 = 2  # initial value, will adjust later
fn_rows = val_sub.loc[false_negatives_idx]
fn_probs = pd.Series(y_proba, index=val_sub.index).loc[false_negatives_idx]
fn_repeat = np.ceil(recall * lamda_1 / np.maximum(fn_probs, 0.1)).astype(int)
fn_oversampled = pd.DataFrame(
    np.repeat(fn_rows.values, fn_repeat, axis=0),
    columns=fn_rows.columns)

# Step 3.5: For false positives (actual legit, predicted fraud), oversample
false_positives_idx = val_sub[(y_val == 0) & (y_pred == 1)].index
precision = precision_score(y_val, y_pred, zero_division=0)
lamda_2 = 2  # initial value, will adjust later
fp_rows = val_sub.loc[false_positives_idx]
fp_probs = pd.Series(y_proba, index=val_sub.index).loc[false_positives_idx]
fp_repeat = np.ceil(precision * lamda_2 / np.maximum(fp_probs, 0.1)).astype(int)
fp_oversampled = pd.DataFrame(
    np.repeat(fp_rows.values, fp_repeat, axis=0),
    columns=fp_rows.columns)

# Step 3.6: Adjust lamda_1 and lamda_2 to reach 20% fraud, 80% legit in val_sub
def adjust_lambdas(val_sub, fn_oversampled, fp_oversampled, target_fraud_ratio=0.2):
    min_lam = 2
    lam1, lam2 = min_lam, min_lam
    for _ in range(100):  # max 100 iterations
        temp = pd.concat([val_sub, fn_oversampled, fp_oversampled])
        frauds = temp[temp['Class'] == 1]
        legits = temp[temp['Class'] == 0]
        total = len(temp)
        fraud_ratio = len(frauds) / total if total > 0 else 0
        if abs(fraud_ratio - target_fraud_ratio) < 0.01:
            break
        if fraud_ratio < target_fraud_ratio:
            lam1 += 1
        else:
            lam2 += 1
        lam1 = max(lam1, min_lam)
        lam2 = max(lam2, min_lam)
    return lam1, lam2, temp

lamda_1, lamda_2, val_sub_balanced = adjust_lambdas(val_sub, fn_oversampled, fp_oversampled)



Confusion Matrix:
[[28397     5]
 [   11    67]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28402
           1       0.93      0.86      0.89        78

    accuracy                           1.00     28480
   macro avg       0.97      0.93      0.95     28480
weighted avg       1.00      1.00      1.00     28480


First 10 predicted probabilities: [2.9913022e-07 1.4242125e-05 9.0913227e-06 2.1766514e-06 5.8514360e-07
 5.2338709e-07 3.4755087e-07 5.3917188e-06 6.3509378e-06 1.5363554e-06]

Number of easy frauds (P > 0.5 and actual fraud): 67

Number of easy legits (P < 0.1 and actual legit): 28395

Final lamda_1: 2, lamda_2: 102
Total samples in balanced sub6: 18
Fraud ratio in balanced sub6: 0.611
Samples added to sub6 by oversampling: 0


  temp = pd.concat([val_sub, fn_oversampled, fp_oversampled])


In [33]:
# Step 3.7: Display results
print(f"\nFinal lamda_1: {lamda_1}, lamda_2: {lamda_2}")
print(f"Total samples in balanced sub6: {len(val_sub_balanced)}")
print(f"Number of frauds left in sub6: {(val_sub['Class'] == 1).sum()}")
print(f"Number of legits left in sub6: {(val_sub['Class'] == 0).sum()}")
print(f"Total samples left in sub6: {len(val_sub)}")
print(f"Fraud ratio in balanced sub6: {val_sub_balanced['Class'].mean():.3f}")
print(f"Samples added to sub6 by oversampling: {len(val_sub_balanced) - len(val_sub)}")



Final lamda_1: 2, lamda_2: 102
Total samples in balanced sub6: 18
Number of frauds left in sub6: 11
Number of legits left in sub6: 7
Total samples left in sub6: 18
Fraud ratio in balanced sub6: 0.611
Samples added to sub6 by oversampling: 0


In [169]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score, precision_score
import xgboost as xgb

# Step 0: Add 'index' column as the first column
data = pd.read_csv('data1.csv')
data.insert(0, 'index', range(len(data)))  # Add index column at the start

# Step 1: Split into train (60%), test1 (20%), test2 (20%) with stratification
train, temp = train_test_split(
    data, test_size=0.4, stratify=data['Class'], random_state=42
)
test1, test2 = train_test_split(
    temp, test_size=0.5, stratify=temp['Class'], random_state=42
)

# Step 2: Split train into 6 equal, non-overlapping stratified subsets
n_subsets = 6
skf = StratifiedKFold(n_splits=n_subsets, shuffle=True, random_state=42)
subsets = []
for i, (_, idx) in enumerate(skf.split(train, train['Class'])):
    subset = train.iloc[idx].copy()
    subset['subset'] = f'sub{i+1}'
    subsets.append(subset)

# Prepare to collect tough frauds and legits from all validation subsets
# Remove easy frauds
T = 0.99 #top limit for easy fraud probability
# Remove easy legits
B = 0.05 # lower limit for fraud probability for easy legit.
# Now, apply oversampling with initial lambda values, using per-instance recall/precision
lamda_1 = 100
lamda_2 = 1
fnlt= 0.15 # flase-negative(fraud not cuaght) lower threshol for taugh flase-negative
fput= 0.95 # false-positive(legit wrong caught) higher threshold for taugh false-positive.
target_fraud_ratio = 0.10

all_fn_rows = []
all_fn_probs = []
all_fn_recalls = []
all_fp_rows = []
all_fp_probs = []
all_fp_precisions = []
all_val_sub = []
all_hf_rows = []  # To store oversampled hard frauds

for val_idx in range(n_subsets):
    # Select training and validation subsets
    train_subs = pd.concat([subsets[i] for i in range(n_subsets) if i != val_idx], ignore_index=True)
    val_sub = subsets[val_idx].copy()

    X_train = train_subs.drop(['Class', 'index', 'subset'], axis=1)
    y_train = train_subs['Class']
    X_val = val_sub.drop(['Class', 'index', 'subset'], axis=1)
    y_val = val_sub['Class']

    model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    # Calculate recall and precision for this fold (on remaining val_sub)
    fold_recall = recall_score(y_val, y_pred, zero_division=0)
    fold_precision = precision_score(y_val, y_pred, zero_division=0)

    # Remove easy frauds

    y_pred = pd.Series(y_pred, index=val_sub.index)
    y_proba = pd.Series(y_proba, index=val_sub.index)
    easy_frauds_idx = val_sub[(y_proba > T) & (y_val == 1)].index
    val_sub = val_sub.drop(easy_frauds_idx)
    y_val = y_val.drop(easy_frauds_idx)
    y_pred = y_pred.drop(easy_frauds_idx)
    y_proba = y_proba.drop(easy_frauds_idx)

    # Remove easy legits

    easy_legits_idx = val_sub[(y_proba < B) & (y_val == 0)].index
    val_sub = val_sub.drop(easy_legits_idx)
    y_val = y_val.drop(easy_legits_idx)
    y_pred = y_pred.drop(easy_legits_idx)
    y_proba = y_proba.drop(easy_legits_idx)



    # Collect remaining (tough) false negatives and false positives
    false_negatives_idx = val_sub[(y_val == 1) & (y_pred == 0)].index
    fn_rows = val_sub.loc[false_negatives_idx].copy()
    fn_probs = y_proba.loc[false_negatives_idx]
    fn_rows['fold_recall'] = fold_recall  # Store per-fold recall with each row
    all_fn_rows.append(fn_rows)
    all_fn_probs.append(fn_probs)
    all_fn_recalls.append(pd.Series([fold_recall] * len(fn_rows), index=fn_rows.index))

    false_positives_idx = val_sub[(y_val == 0) & (y_pred == 1)].index
    fp_rows = val_sub.loc[false_positives_idx].copy()
    fp_probs = y_proba.loc[false_positives_idx]
    fp_rows['fold_precision'] = fold_precision  # Store per-fold precision with each row
    all_fp_rows.append(fp_rows)
    all_fp_probs.append(fp_probs)
    all_fp_precisions.append(pd.Series([fold_precision] * len(fp_rows), index=fp_rows.index))

    # Hard frauds: remaining frauds in val_sub that are not FNs
    hard_fraud_idx = val_sub[(y_val == 1) & (y_pred == 1)].index
    hard_fraud_rows = val_sub.loc[hard_fraud_idx].copy()
    hard_fraud_probs = y_proba.loc[hard_fraud_idx]

    # Oversample using recall / predicted probability (no lambda)
    hf_repeat = np.ceil(fold_recall / np.maximum(hard_fraud_probs, 0.1)).astype(int)
    hf_oversampled = pd.DataFrame(
        np.repeat(hard_fraud_rows.values, hf_repeat, axis=0),
        columns=hard_fraud_rows.columns)
    # Store oversampled hard frauds
    all_hf_rows.append(hf_oversampled)

    # Optionally, collect all remaining val_sub for analysis
    all_val_sub.append(val_sub)

# Combine all tough cases and their associated probabilities and metrics
all_fn_rows = pd.concat(all_fn_rows, ignore_index=True)
all_fn_probs = pd.concat(all_fn_probs, ignore_index=True)
all_fn_recalls = pd.concat(all_fn_recalls, ignore_index=True)
all_fp_rows = pd.concat(all_fp_rows, ignore_index=True)
all_fp_probs = pd.concat(all_fp_probs, ignore_index=True)
all_fp_precisions = pd.concat(all_fp_precisions, ignore_index=True)
all_val_sub = pd.concat(all_val_sub, ignore_index=True)
all_hf_rows = pd.concat(all_hf_rows, ignore_index=True)

# Now, apply oversampling with initial lambda values, using per-instance recall/precision


fn_repeat = np.ceil(all_fn_recalls * lamda_1 / np.maximum(all_fn_probs, 0.1)).astype(int)
fn_oversampled = pd.DataFrame(
    np.repeat(all_fn_rows.values, fn_repeat, axis=0),
    columns=all_fn_rows.columns)

fp_repeat = np.ceil(all_fp_precisions * lamda_2 / np.maximum(all_fp_probs, 0.1)).astype(int)
fp_oversampled = pd.DataFrame(
    np.repeat(all_fp_rows.values, fp_repeat, axis=0),
    columns=all_fp_rows.columns)

# Oversample using recall / predicted probability (no lambda)
hf_repeat = np.ceil(fold_recall / np.maximum(hard_fraud_probs, 0.1)).astype(int)
hf_oversampled = pd.DataFrame(
    np.repeat(hard_fraud_rows.values, hf_repeat, axis=0),
    columns=hard_fraud_rows.columns
)

# Adjust lambdas to reach 20% fraud, 80% legit
def adjust_lambdas(val_sub, fn_oversampled, fp_oversampled, all_hf_rows, all_fn_probs, all_fn_recalls, all_fp_probs, all_fp_precisions, target_fraud_ratio):

    lam1, lam2 = lamda_1, lamda_2
    for _ in range(100):  # max 100 iterations
        temp = pd.concat([val_sub, fn_oversampled, fp_oversampled, all_hf_rows])
        frauds = temp[temp['Class'] == 1]
        legits = temp[temp['Class'] == 0]
        total = len(temp)
        fraud_ratio = len(frauds) / total if total > 0 else 0
        if abs(fraud_ratio - target_fraud_ratio) < 0.01:
            break
        if fraud_ratio < target_fraud_ratio:
            lam1 += 1
        else:
            lam2 += 1
        lam1 = max(lam1, lamda_1)
        lam2 = max(lam2, lamda_2)
        # Recalculate oversampled sets with updated lambdas and per-instance recall/precision
        fn_repeat = np.ceil(all_fn_recalls * lam1 / np.maximum(all_fn_probs, fnlt)).astype(int)
        fn_oversampled = pd.DataFrame(
            np.repeat(all_fn_rows.values, fn_repeat, axis=0),
            columns=all_fn_rows.columns)
        fp_repeat = np.ceil(all_fp_precisions * lam2 / np.maximum(all_fp_probs, fput)).astype(int)
        fp_oversampled = pd.DataFrame(
            np.repeat(all_fp_rows.values, fp_repeat, axis=0),
            columns=all_fp_rows.columns)
    temp = pd.concat([val_sub, fn_oversampled, fp_oversampled])
    return lam1, lam2, temp

lamda_1, lamda_2, val_sub_balanced = adjust_lambdas(
    all_val_sub, fn_oversampled, fp_oversampled, all_hf_rows,
    all_fn_probs, all_fn_recalls, all_fp_probs, all_fp_precisions,target_fraud_ratio
)








In [170]:
# Count hard frauds and hard legits in all_val_sub (not oversampled)
hard_frauds = all_val_sub[all_val_sub['Class'] == 1]
hard_legits = all_val_sub[all_val_sub['Class'] == 0]

# Count tough frauds (oversampled FNs)
tough_frauds_oversampled = len(fn_oversampled)
# Count tough positives (oversampled FPs)
tough_positives_oversampled = len(fp_oversampled)
# Count hard frauds not oversampled
hard_frauds_not_oversampled = len(hard_frauds)
# Count hard legits not oversampled
hard_legits_not_oversampled = len(hard_legits)

print("\n==================================")
print(f"\nFinal lamda_1: {lamda_1}, lamda_2: {lamda_2}")
print(f"Total samples remaing in train set after removing easy fraud and")
print(f"legits ;and adding tough fraud and tough positive: {len(val_sub_balanced)}")
print(f"Fraud ratio in resultant set from train set: {val_sub_balanced['Class'].mean():.3f}")
print(f"Samples added by oversampling: {len(val_sub_balanced) - len(all_val_sub)}")
print(f"Tough fraud samples added by oversampling (False Negatives, FNs): {tough_frauds_oversampled}")
print(f"Tough positive samples added by oversampling (False Positives, FPs): {tough_positives_oversampled}")
print(f"Medium fraud samples added by oversampling: {len(all_hf_rows)}")

print(f"Medium legit samples (not oversampled): {hard_legits_not_oversampled}")
print(final_balanced_df.shape)
print(val_sub_balanced['Class'].value_counts())






Final lamda_1: 100, lamda_2: 101
Total samples remaing in train set after removing easy fraud and
legits ;and adding tough fraud and tough positive: 31751
Fraud ratio in resultant set from train set: 0.946
Samples added by oversampling: 31570
Tough fraud samples added by oversampling (False Negatives, FNs): 43986
Tough positive samples added by oversampling (False Positives, FPs): 28
Medium fraud samples added by oversampling: 84
Medium legit samples (not oversampled): 46
(6151, 33)
Class
1    30045
0     1706
Name: count, dtype: int64


In [171]:
# Prepare the final dataset by combining everything
final_oversampled_df = pd.concat([val_sub_balanced], ignore_index=True)

# Drop helper columns if needed
final_oversampled_df  = final_oversampled_df .drop(columns=[col for col in ['fold_recall', 'fold_precision', 'index', 'subset'] if col in final_oversampled_df.columns])

# Convert all columns except 'Class' to numeric
for col in final_oversampled_df.columns:
    df[col] = pd.to_numeric(final_oversampled_df[col], errors='coerce')  # convert, set invalid entries to NaN
# Optionally drop rows with NaNs if any were introduced
final_oversampled_df = final_oversampled_df.dropna()
df_new = final_oversampled_df.copy()

In [149]:
#down is final ensembled model and up is final data seperation and oversampling

In [179]:
#this is the final ensemble model
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import numpy as np




# 2. Combine 'train' and 'test1' for Model 1 training
train_model1 = pd.concat([train, test1], ignore_index=True)

# Separate features (X) and target (y) for Model 1 training
X_train_model1 = train_model1.drop('Class', axis=1)
y_train_model1 = train_model1['Class']

# Separate features (X) and target (y) for Model 2 training (using the original 'train' data after concatenation)
train_model2 = pd.concat([df_new, train, test1], ignore_index=True)
X_train_model2 = train_model2.drop('Class', axis=1)
y_train_model2 = train_model2['Class']

# Separate features (X) and target (y) for test2 (final evaluation)
X_test2 = test2.drop('Class', axis=1)
y_test2 = test2['Class']

# Ensure 'index' column is dropped if present in any of the relevant DataFrames
for df_to_clean in [X_train_model1, X_train_model2, X_test2]:
    if 'index' in df_to_clean.columns:
        df_to_clean.drop('index', axis=1, inplace=True)

# Ensure data types are suitable for XGBoost and other models
X_train_model1 = X_train_model1.astype(float)
y_train_model1 = y_train_model1.astype(float)
X_train_model2 = X_train_model2.astype(float)
y_train_model2 = y_train_model2.astype(float)
X_test2 = X_test2.astype(float)
y_test2 = y_test2.astype(float)

# 3. Initialize and train the first XGBoost model (Model 1) on train_model1
model1 = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model1.fit(X_train_model1, y_train_model1)

# 4. Generate predictions (probabilities) from Model 1 on the data used to train Model 2 (X_train_model2)
# This is crucial for avoiding data leakage when training the meta-model
model1_preds_for_model2_training = model1.predict_proba(X_train_model2)[:, 1]

# 5. Create the dataset for the meta-model's training
X_meta_train = pd.DataFrame(model1_preds_for_model2_training, columns=['model1_prob'])
y_meta_train = y_train_model2  # The target for the meta-model is the true labels from X_train_model2

# 6. Initialize and train the second model (Meta-model) on the meta-model training data
meta_model= xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
meta_model.fit(X_meta_train, y_meta_train)




# 7. Generate predictions (probabilities) from Model 1 on test2 (for final evaluation)
model1_preds_test2 = model1.predict_proba(X_test2)[:, 1]

# 8. Create the dataset for the meta-model's prediction (on test2)
X_meta_test = pd.DataFrame(model1_preds_test2, columns=['model1_prob'])

# 9. Make final predictions with the Meta-model on test2
final_predictions_prob = meta_model.predict_proba(X_meta_test)[:, 1]

# Apply your custom threshold for final classification
threshold = 0.15 # Or your desired threshold
final_predictions_class = (final_predictions_prob > threshold).astype(int)

# 10. Evaluate the final ensemble results on test2
accuracy = accuracy_score(y_test2, final_predictions_class)
print(f"Final Ensemble Model Accuracy on test2: {accuracy:.4f}")

# Calculate and print the confusion matrix on test2
cm = confusion_matrix(y_test2, final_predictions_class)
print("\nConfusion Matrix on test2:\n", cm)

print("\nClassification Report on test2:\n", classification_report(y_test2, final_predictions_class))


Final Ensemble Model Accuracy on test2: 0.9996

Confusion Matrix on test2:
 [[56862     1]
 [   24    75]]

Classification Report on test2:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.99      0.76      0.86        99

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [173]:
# ... (rest of your code for splitting data, training model1 and meta_model)

# 7. Generate predictions (probabilities) from Model 1 on test2 (for final evaluation)
model1_preds_test2 = model1.predict_proba(X_test2)[:, 1]

# 8. Create the dataset for the meta-model's prediction (on test2)
X_meta_test = pd.DataFrame(model1_preds_test2, columns=['model1_prob'])

threshold = 0.6

# 9. Make final predictions with the Meta-model on test2 (you already have this)
final_predictions_prob = meta_model.predict_proba(X_meta_test)[:, 1]
final_predictions_class = (final_predictions_prob > threshold).astype(int)

# 10. Evaluate the final ensemble results on test2
# Calculate the AUC score for the ensemble model (meta-model) on test2 data.
auc = roc_auc_score(y_test2, meta_model.predict_proba(X_meta_test)[:, 1]) # Use X_meta_test here
print("AUC:", auc)

# ... (rest of your evaluation code for accuracy, confusion matrix, and classification report)


AUC: 0.9619881171065596


In [180]:
# Apply your custom threshold for final classification
#threshold = 0.5 # Or your desired threshold
final_predictions_class = (final_predictions_prob > threshold).astype(int)



print("\nClassification Report on test2:\n", classification_report(y_test2, final_predictions_class))


# Calculate and print the confusion matrix on test2
cm = confusion_matrix(y_test2, (final_predictions_prob > 0.15).astype(int)) # Use 0.15 threshold for confusion matrix
print("\nConfusion Matrix on test2:\n", cm)

print("\nClassification Report on test2:\n", classification_report(y_test2, (final_predictions_prob > 0.15).astype(int))) # Use 0.15 threshold for classification report

# Calculate the AUC score for the ensemble model (meta-model) on test2 data.
# Use a different variable name for the ROC AUC score
roc_auc = roc_auc_score(y_test2, final_predictions_prob)  # Renamed 'auc' to 'roc_auc'
print("ROC AUC:", roc_auc) # Print 'ROC AUC'

# 11. Draw the Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test2, final_predictions_prob)

# Plot the Precision-Recall curve
plt.figure()
plt.plot(recall, precision, label=f'Precision-Recall curve (AUC-PR = {auc(recall, precision):.2f})') # Use the original auc function
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Ensemble Model")
plt.legend(loc="lower left")
plt.grid(True)
plt.show()


Classification Report on test2:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.99      0.76      0.86        99

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962


Confusion Matrix on test2:
 [[56862     1]
 [   24    75]]

Classification Report on test2:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.99      0.76      0.86        99

    accuracy                           1.00     56962
   macro avg       0.99      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC: 0.9619881171065596


TypeError: 'numpy.float64' object is not callable

<Figure size 640x480 with 0 Axes>

In [167]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt # Import matplotlib

# Assuming 'data' is your initial DataFrame

# 1. Split data into train, test1, and test2 sets
train, temp = train_test_split(
    data, test_size=0.4, stratify=data['Class'], random_state=42
)
test1, test2 = train_test_split(
    temp, test_size=0.5, stratify=temp['Class'], random_state=42
)

# Load your df_new data
df_new = pd.read_csv('df_new.csv') # Replace with the actual path to your df_new

# 2. Combine 'train' and 'test1' for Model 1 training
train_model1 = pd.concat([train, test1], ignore_index=True)

# Separate features (X) and target (y) for Model 1 training
X_train_model1 = train_model1.drop('Class', axis=1)
y_train_model1 = train_model1['Class']

# Separate features (X) and target (y) for Model 2 training (using the original 'train' data after concatenation)
train_model2 = pd.concat([df_new, train], ignore_index=True)
X_train_model2 = train_model2.drop('Class', axis=1)
y_train_model2 = train_model2['Class']

# Separate features (X) and target (y) for test2 (final evaluation)
X_test2 = test2.drop('Class', axis=1)
y_test2 = test2['Class']

# Ensure 'index' column is dropped if present in any of the relevant DataFrames
for df_to_clean in [X_train_model1, X_train_model2, X_test2]:
    if 'index' in df_to_clean.columns:
        df_to_clean.drop('index', axis=1, inplace=True)

# Ensure data types are suitable for XGBoost and other models
X_train_model1 = X_train_model1.astype(float)
y_train_model1 = y_train_model1.astype(float)
X_train_model2 = X_train_model2.astype(float)
y_train_model2 = y_train_model2.astype(float)
X_test2 = X_test2.astype(float)
y_test2 = y_test2.astype(float)

# 3. Initialize and train the first XGBoost model (Model 1) on train_model1
model1 = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model1.fit(X_train_model1, y_train_model1)

# 4. Generate predictions (probabilities) from Model 1 on the data used to train Model 2 (X_train_model2)
model1_preds_for_model2_training = model1.predict_proba(X_train_model2)[:, 1]

# 5. Create the dataset for the meta-model's training
X_meta_train = pd.DataFrame(model1_preds_for_model2_training, columns=['model1_prob'])
y_meta_train = y_train_model2  # The target for the meta-model is the true labels from X_train_model2

# 6. Initialize and train the second model (Meta-model) on the meta-model training data
meta_model = LogisticRegression(random_state=42)
meta_model.fit(X_meta_train, y_meta_train)

# 7. Generate predictions (probabilities) from Model 1 on test2 (for final evaluation)
model1_preds_test2 = model1.predict_proba(X_test2)[:, 1]

# 8. Create the dataset for the meta-model's prediction (on test2)
X_meta_test = pd.DataFrame(model1_preds_test2, columns=['model1_prob'])

# 9. Make final predictions with the Meta-model on test2
final_predictions_prob = meta_model.predict_proba(X_meta_test)[:, 1]

# 10. Evaluate the final ensemble results on test2
accuracy = accuracy_score(y_test2, (final_predictions_prob > 0.15).astype(int)) # Use 0.15 threshold for accuracy
print(f"Final Ensemble Model Accuracy on test2: {accuracy:.4f}")

# Calculate and print the confusion matrix on test2
cm = confusion_matrix(y_test2, (final_predictions_prob > 0.15).astype(int)) # Use 0.15 threshold for confusion matrix
print("\nConfusion Matrix on test2:\n", cm)

print("\nClassification Report on test2:\n", classification_report(y_test2, (final_predictions_prob > 0.15).astype(int))) # Use 0.15 threshold for classification report

# Calculate the AUC score for the ensemble model (meta-model) on test2 data.
auc_score = roc_auc_score(y_test2, final_predictions_prob) # Use final_predictions_prob
print("AUC:", auc_score)

# 11. Draw the Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test2, final_predictions_prob) # Use final_predictions_prob

# Plot the Precision-Recall curve
plt.figure()
plt.plot(recall, precision, label=f'Precision-Recall curve (AUC = {auc(recall, precision):.2f})') # Calculate AUC-PR for the curve
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Ensemble Model")
plt.legend(loc="lower left")
plt.grid(True)
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'df_new.csv'

In [None]:
#up

In [None]:
# Assuming you have loaded your data into 'train' and 'test2' DataFrames

# 1. Load the dataset
df = pd.read_csv('data1.csv')
X = df.drop('Class', axis=1)
y = df['Class']

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# 4. Predict probabilities and apply custom threshold
y_pred = model.predict(X_test)
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.15
class_preds = (prob_preds > threshold).astype(int)

# 5. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


In [132]:
# Assuming you have loaded your data into 'train' and 'test2' DataFrames

# 1. Prepare X_train and y_train
X_train = train.drop(['Class', 'index'], axis=1).astype(float) # Drop 'Class' and 'index'
y_train = train['Class'].astype(float)

# 2. Prepare X_test and y_test
X_test = test2.drop(['Class', 'index'], axis=1).astype(float)  # Drop 'Class' and 'index'
y_test = test2['Class'].astype(float)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# 4. Predict probabilities and apply custom threshold
y_pred = model.predict(X_test)
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.15
class_preds = (prob_preds > threshold).astype(int)

# 5. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


Test Accuracy (threshold=0.25): 0.9996

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.94      0.82      0.88        99

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [134]:
threshold = 0.10
class_preds = (prob_preds > threshold).astype(int)

# 5. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


Test Accuracy (threshold=0.25): 0.9995

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.88      0.83      0.85        99

    accuracy                           1.00     56962
   macro avg       0.94      0.91      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [109]:
final_oversampled_df.head()
#Optional: Save to CSV
# final_balanced_df.to_csv("balanced_dataset.csv", index=False)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,140293.0,0.951025,3.252926,-5.039105,4.632411,3.014501,-1.34957,0.98094,-1.819539,-2.099049,...,1.404524,-0.760549,0.358292,-1.185942,-1.286177,0.000365,0.169662,0.108276,0.77,1
1,53316.0,-7.752965,0.705763,0.478148,3.985048,3.270238,1.01761,-10.758946,-8.702424,-0.164677,...,-5.729882,1.216213,-4.698273,-0.296797,-1.700717,0.156541,0.3693,0.075165,0.0,0
2,39729.0,-0.964567,-1.643541,-0.187727,1.158253,-2.458336,0.852222,2.785163,-0.303609,0.940006,...,0.44718,0.536204,1.634061,0.203839,0.218749,-0.221886,-0.308555,-0.1645,776.83,1
3,35261.0,-1.236842,1.94939,-1.139733,1.142948,-1.189765,-1.09756,-2.130852,0.701933,-1.488241,...,0.234634,-0.887139,-0.140437,-0.467764,0.041665,0.171172,0.865273,0.374717,2.69,0
4,166198.0,-35.548539,-31.850484,-48.325589,15.304184,-113.743307,73.301626,120.589494,-27.34736,-3.872425,...,-21.62012,5.712303,-1.581098,4.584549,4.554683,3.415636,31.612198,-15.430084,25691.16,0


In [94]:
#XGboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the dataset



# Optionally drop rows with NaNs if any were introduced
df = df.dropna()



X = test2.drop('Class', axis=1)
y = test2['Class']


)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

# 4. Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.25
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


In [86]:
# Count hard frauds and hard legits in all_val_sub (not oversampled)
hard_frauds = all_val_sub[all_val_sub['Class'] == 1]
hard_legits = all_val_sub[all_val_sub['Class'] == 0]

# Count tough frauds (oversampled FNs)
tough_frauds_oversampled = len(fn_oversampled)
# Count tough positives (oversampled FPs)
tough_positives_oversampled = len(fp_oversampled)
# Count hard frauds not oversampled
hard_frauds_not_oversampled = len(hard_frauds)
# Count hard legits not oversampled
hard_legits_not_oversampled = len(hard_legits)

print("\n==================================")
print(f"\nFinal lamda_1: {lamda_1}, lamda_2: {lamda_2}")
print(f"Total samples remaing in train set after removing easy fraud and")
print(f"legits ;and adding tough fraud and tough positive: {len(val_sub_balanced)}")
print(f"Fraud ratio in resultant set from train set: {val_sub_balanced['Class'].mean():.3f}")
print(f"Samples added by oversampling: {len(val_sub_balanced) - len(all_val_sub)}")
print(f"Tough fraud samples added by oversampling (False Negatives, FNs): {tough_frauds_oversampled}")
print(f"Tough positive samples added by oversampling (False Positives, FPs): {tough_positives_oversampled}")
print(f"Medium fraud samples added by oversampling: {len(all_hf_rows)}")

print(f"Medium legit samples (not oversampled): {hard_legits_not_oversampled}")
print(final_balanced_df.shape)
print(val_sub_balanced['Class'].value_counts())






Final lamda_1: 5, lamda_2: 102
Total samples remaing in train set after removing easy fraud and
legits ;and adding tough fraud and tough positive: 6151
Fraud ratio in resultant set from train set: 0.703
Samples added by oversampling: 6026
Tough fraud samples added by oversampling (False Negatives, FNs): 2210
Tough positive samples added by oversampling (False Positives, FPs): 46
Medium fraud samples added by oversampling: 28
Medium legit samples (not oversampled): 46
(6151, 33)
Class
1    4322
0    1829
Name: count, dtype: int64


In [112]:
df1= final_oversampled_df
# Convert all columns (except 'Class') to numeric
for col in df.columns:

  df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Drop any rows with NaNs (from conversion)
df1 = df1.dropna()

import pandas as pd
df2 = pd.read_csv('data1.csv')
df = pd.concat([df1, df2], ignore_index=True)

In [113]:
#XGboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the dataset



# Optionally drop rows with NaNs if any were introduced
df = df.dropna()



X = df.drop('Class', axis=1)
y = df['Class']

# 2. Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Initialize XGBoost with GPU support
model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

# 4. Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# 5. Predict probabilities and apply custom threshold
prob_preds = model.predict_proba(X_test)[:, 1]
threshold = 0.25
class_preds = (prob_preds > threshold).astype(int)

# 6. Evaluate the results
accuracy = accuracy_score(y_test, class_preds)
print(f"Test Accuracy (threshold=0.25): {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, class_preds))


[0]	validation_0-logloss:0.12036
[1]	validation_0-logloss:0.08878
[2]	validation_0-logloss:0.06634
[3]	validation_0-logloss:0.04998
[4]	validation_0-logloss:0.03891
[5]	validation_0-logloss:0.03019
[6]	validation_0-logloss:0.02375
[7]	validation_0-logloss:0.01904
[8]	validation_0-logloss:0.01553
[9]	validation_0-logloss:0.01284
[10]	validation_0-logloss:0.01074
[11]	validation_0-logloss:0.00890
[12]	validation_0-logloss:0.00773
[13]	validation_0-logloss:0.00661
[14]	validation_0-logloss:0.00590
[15]	validation_0-logloss:0.00531
[16]	validation_0-logloss:0.00483
[17]	validation_0-logloss:0.00446
[18]	validation_0-logloss:0.00404
[19]	validation_0-logloss:0.00378
[20]	validation_0-logloss:0.00348



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



[21]	validation_0-logloss:0.00322
[22]	validation_0-logloss:0.00312
[23]	validation_0-logloss:0.00286
[24]	validation_0-logloss:0.00269
[25]	validation_0-logloss:0.00258
[26]	validation_0-logloss:0.00249
[27]	validation_0-logloss:0.00242
[28]	validation_0-logloss:0.00237
[29]	validation_0-logloss:0.00231
[30]	validation_0-logloss:0.00226
[31]	validation_0-logloss:0.00219
[32]	validation_0-logloss:0.00213
[33]	validation_0-logloss:0.00209
[34]	validation_0-logloss:0.00202
[35]	validation_0-logloss:0.00196
[36]	validation_0-logloss:0.00191
[37]	validation_0-logloss:0.00186
[38]	validation_0-logloss:0.00184
[39]	validation_0-logloss:0.00181
[40]	validation_0-logloss:0.00181
[41]	validation_0-logloss:0.00179
[42]	validation_0-logloss:0.00175
[43]	validation_0-logloss:0.00174
[44]	validation_0-logloss:0.00173
[45]	validation_0-logloss:0.00171
[46]	validation_0-logloss:0.00169
[47]	validation_0-logloss:0.00168
[48]	validation_0-logloss:0.00168
[49]	validation_0-logloss:0.00166
[50]	validatio


    E.g. tree_method = "hist", device = "cuda"

