In [None]:
#Below is the final code
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score, precision_score
import xgboost as xgb

# Step 0: read the raw table and prepend a positional row id named 'index'
# as the very first column.
data = pd.read_csv('data1.csv')
data.insert(loc=0, column='index', value=range(len(data)))

# Step 1: two chained stratified splits on 'Class'.
# 40% is held out first, then that hold-out is halved, which yields
# train (60%), test1 (20%) and test2 (20%).
train, temp = train_test_split(data, stratify=data['Class'], test_size=0.4, random_state=42)
test1, test2 = train_test_split(temp, stratify=temp['Class'], test_size=0.5, random_state=42)

# Step 2: carve `train` into 6 non-overlapping, class-stratified subsets.
# Only the held-out part of each StratifiedKFold split is kept, so every
# training row ends up in exactly one subset; each subset is tagged with a
# 'subset' label 'sub1' .. 'sub6'.
n_subsets = 6
skf = StratifiedKFold(n_splits=n_subsets, shuffle=True, random_state=42)
subsets = []
for fold_no, (_, held_out) in enumerate(skf.split(train, train['Class']), start=1):
    # .assign returns a new frame, so `train` itself is left untouched.
    subsets.append(train.iloc[held_out].assign(subset=f'sub{fold_no}'))

# Tunable thresholds and per-fold accumulators for the hard-example mining loop below.

# "Easy" frauds: validation frauds whose predicted fraud probability is above T
# are dropped from the validation subset.
T = 0.99 # upper probability cut-off for an easy fraud
# "Easy" legits: validation legits whose predicted fraud probability is below B
# are dropped from the validation subset.
B = 0.05 # lower probability cut-off for an easy legit
# Oversampling multipliers. They are the starting values passed through
# adjust_lambdas, which only ever increases them.
lamda_1 = 10 # multiplier for the repeat count of tough false negatives (missed frauds)
lamda_2 = 1 # multiplier for the repeat count of tough false positives (legits flagged as fraud)
fnlt= 0.15 # floor applied to a false negative's probability when adjust_lambdas recomputes repeat counts
fput= 0.95 # floor applied to a false positive's probability when adjust_lambdas recomputes repeat counts
target_fraud_ratio = 0.01 # fraud share that adjust_lambdas tries to reach in the combined set

# Per-fold accumulators; each list receives one entry per validation fold and is
# concatenated into a single object after the loop.
all_fn_rows = []         # false-negative rows
all_fn_probs = []        # predicted fraud probability of each false negative
all_fn_recalls = []      # fold recall, repeated once per false-negative row
all_fp_rows = []         # false-positive rows
all_fp_probs = []        # predicted fraud probability of each false positive
all_fp_precisions = []   # fold precision, repeated once per false-positive row
all_val_sub = []         # validation rows left after removing easy frauds/legits
all_hf_rows = []  # oversampled "hard" frauds (correctly caught frauds that were not easy)

# Hard-example mining: each subset takes one turn as the validation fold while a
# fresh XGBoost model is trained on the other subsets. Easy cases are removed
# from the validation fold, and the remaining false negatives, false positives
# and correctly caught ("hard") frauds are collected for later oversampling.
for val_idx in range(n_subsets):
    # Train on every subset except the current one; validate on the current one.
    train_subs = pd.concat([subsets[i] for i in range(n_subsets) if i != val_idx], ignore_index=True)
    val_sub = subsets[val_idx].copy()

    # 'index' and 'subset' are bookkeeping columns, not features.
    X_train = train_subs.drop(['Class', 'index', 'subset'], axis=1)
    y_train = train_subs['Class']
    X_val = val_sub.drop(['Class', 'index', 'subset'], axis=1)
    y_val = val_sub['Class']

    # device='cuda' requests GPU training from XGBoost.
    model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]  # probability of the fraud class

    # Fold-level recall and precision, computed on the FULL validation subset
    # (before any easy cases are removed below).
    fold_recall = recall_score(y_val, y_pred, zero_division=0)
    fold_precision = precision_score(y_val, y_pred, zero_division=0)

    # Remove easy frauds

    # Wrap predictions in Series that share val_sub's index so the boolean
    # masks and label-based drops below stay aligned row by row.
    y_pred = pd.Series(y_pred, index=val_sub.index)
    y_proba = pd.Series(y_proba, index=val_sub.index)
    easy_frauds_idx = val_sub[(y_proba > T) & (y_val == 1)].index
    val_sub = val_sub.drop(easy_frauds_idx)
    y_val = y_val.drop(easy_frauds_idx)
    y_pred = y_pred.drop(easy_frauds_idx)
    y_proba = y_proba.drop(easy_frauds_idx)

    # Remove easy legits

    easy_legits_idx = val_sub[(y_proba < B) & (y_val == 0)].index
    val_sub = val_sub.drop(easy_legits_idx)
    y_val = y_val.drop(easy_legits_idx)
    y_pred = y_pred.drop(easy_legits_idx)
    y_proba = y_proba.drop(easy_legits_idx)



    # Collect the remaining false negatives (frauds predicted as legit).
    false_negatives_idx = val_sub[(y_val == 1) & (y_pred == 0)].index
    fn_rows = val_sub.loc[false_negatives_idx].copy()
    fn_probs = y_proba.loc[false_negatives_idx]
    fn_rows['fold_recall'] = fold_recall  # Store per-fold recall with each row
    all_fn_rows.append(fn_rows)
    all_fn_probs.append(fn_probs)
    # Same fold recall, one entry per false-negative row.
    all_fn_recalls.append(pd.Series([fold_recall] * len(fn_rows), index=fn_rows.index))

    # Collect the remaining false positives (legits predicted as fraud).
    false_positives_idx = val_sub[(y_val == 0) & (y_pred == 1)].index
    fp_rows = val_sub.loc[false_positives_idx].copy()
    fp_probs = y_proba.loc[false_positives_idx]
    fp_rows['fold_precision'] = fold_precision  # Store per-fold precision with each row
    all_fp_rows.append(fp_rows)
    all_fp_probs.append(fp_probs)
    # Same fold precision, one entry per false-positive row.
    all_fp_precisions.append(pd.Series([fold_precision] * len(fp_rows), index=fp_rows.index))

    # Hard frauds: frauds still in val_sub that the model did catch
    # (true positives whose probability was not above T).
    hard_fraud_idx = val_sub[(y_val == 1) & (y_pred == 1)].index
    hard_fraud_rows = val_sub.loc[hard_fraud_idx].copy()
    hard_fraud_probs = y_proba.loc[hard_fraud_idx]

    # Repeat count per hard fraud = ceil(fold_recall / max(probability, 0.1));
    # no lambda multiplier is applied here.
    hf_repeat = np.ceil(fold_recall / np.maximum(hard_fraud_probs, 0.1)).astype(int)
    # np.repeat works on the raw values, so the result gets a fresh default index.
    # NOTE(review): the frame mixes numeric columns with the string 'subset'
    # column, so the rebuilt columns are presumably object dtype; a later cell
    # coerces them with pd.to_numeric.
    hf_oversampled = pd.DataFrame(
        np.repeat(hard_fraud_rows.values, hf_repeat, axis=0),
        columns=hard_fraud_rows.columns)
    # Store oversampled hard frauds
    all_hf_rows.append(hf_oversampled)

    # Keep the filtered validation subset (each surviving row once).
    all_val_sub.append(val_sub)

# Collapse the per-fold accumulators into single objects. ignore_index=True
# gives every result a fresh 0..n-1 index, so row i of all_fn_rows lines up
# with entry i of all_fn_probs / all_fn_recalls (and likewise for the FP trio).
all_fn_rows = pd.concat(all_fn_rows, ignore_index=True)
all_fn_probs = pd.concat(all_fn_probs, ignore_index=True)
all_fn_recalls = pd.concat(all_fn_recalls, ignore_index=True)
all_fp_rows = pd.concat(all_fp_rows, ignore_index=True)
all_fp_probs = pd.concat(all_fp_probs, ignore_index=True)
all_fp_precisions = pd.concat(all_fp_precisions, ignore_index=True)
all_val_sub = pd.concat(all_val_sub, ignore_index=True)
all_hf_rows = pd.concat(all_hf_rows, ignore_index=True)

# Initial oversampling with the starting lambda values.
# Repeat count per row = ceil(fold metric * lambda / max(probability, 0.1)):
# the lower the model's fraud probability for the row, the more copies it gets,
# with the 0.1 floor bounding the count.
fn_repeat = np.ceil(all_fn_recalls * lamda_1 / np.maximum(all_fn_probs, 0.1)).astype(int)
fn_oversampled = pd.DataFrame(
    np.repeat(all_fn_rows.values, fn_repeat, axis=0),
    columns=all_fn_rows.columns)

fp_repeat = np.ceil(all_fp_precisions * lamda_2 / np.maximum(all_fp_probs, 0.1)).astype(int)
fp_oversampled = pd.DataFrame(
    np.repeat(all_fp_rows.values, fp_repeat, axis=0),
    columns=all_fp_rows.columns)

# Hard frauds need no work here: they were oversampled fold by fold inside the
# loop and are already combined in all_hf_rows. The former recomputation at this
# point only re-derived the LAST fold's hf_repeat / hf_oversampled from leaked
# loop variables (values the loop had already bound) and was never used.

def _repeat_rows(rows, counts):
    """Return a new frame in which row i of `rows` appears counts[i] times (fresh index)."""
    return pd.DataFrame(
        np.repeat(rows.values, np.asarray(counts), axis=0),
        columns=rows.columns)


# Adjust lambdas to reach threshold ratio of fraud to legit
def adjust_lambdas(val_sub, fn_oversampled, fp_oversampled, all_hf_rows, all_fn_probs, all_fn_recalls, all_fp_probs, all_fp_precisions, target_fraud_ratio,
                   fn_rows=None, fp_rows=None, init_lam1=None, init_lam2=None,
                   fn_prob_floor=None, fp_prob_floor=None, max_iter=100, tol=0.01):
    """Grow the oversampling multipliers until the fraud share is near the target.

    Each iteration stacks the filtered validation rows, the oversampled false
    negatives, the oversampled false positives and the oversampled hard frauds,
    and measures the share of rows with Class == 1. If that share is within
    `tol` of `target_fraud_ratio` the search stops. Otherwise lam1 (more
    false-negative copies) is incremented when the share is too low, or lam2
    (more false-positive copies) when it is too high, and both oversampled
    sets are rebuilt.

    Parameters
    ----------
    val_sub : DataFrame
        Validation rows kept after easy-case removal; must have a 'Class' column.
    fn_oversampled, fp_oversampled : DataFrame
        Initial oversampled false negatives / false positives.
    all_hf_rows : DataFrame
        Oversampled hard frauds (not rescaled by this function).
    all_fn_probs, all_fn_recalls : Series
        Per-row probability and fold recall, positionally aligned with `fn_rows`.
    all_fp_probs, all_fp_precisions : Series
        Per-row probability and fold precision, positionally aligned with `fp_rows`.
    target_fraud_ratio : float
        Desired share of Class == 1 rows in the combined set.
    fn_rows, fp_rows : DataFrame, optional
        Un-oversampled FN / FP rows to repeat. Default: the notebook globals
        `all_fn_rows` / `all_fp_rows`.
    init_lam1, init_lam2 : number, optional
        Starting multipliers. Default: the notebook globals `lamda_1` / `lamda_2`.
    fn_prob_floor, fp_prob_floor : float, optional
        Floors applied to the probabilities before dividing. Default: the
        notebook globals `fnlt` / `fput`.
    max_iter : int
        Maximum number of adjustment steps (default 100).
    tol : float
        Absolute tolerance on the fraud share (default 0.01).

    Returns
    -------
    (lam1, lam2, combined) : tuple
        Final multipliers and the combined frame that was evaluated, now
        including `all_hf_rows`.
    """
    # Fall back to the notebook-level names the function previously read implicitly.
    if fn_rows is None:
        fn_rows = all_fn_rows
    if fp_rows is None:
        fp_rows = all_fp_rows
    if init_lam1 is None:
        init_lam1 = lamda_1
    if init_lam2 is None:
        init_lam2 = lamda_2
    if fn_prob_floor is None:
        fn_prob_floor = fnlt
    if fp_prob_floor is None:
        fp_prob_floor = fput

    lam1, lam2 = init_lam1, init_lam2
    for _ in range(max_iter):
        temp = pd.concat([val_sub, fn_oversampled, fp_oversampled, all_hf_rows])
        total = len(temp)
        fraud_ratio = int((temp['Class'] == 1).sum()) / total if total > 0 else 0
        if abs(fraud_ratio - target_fraud_ratio) < tol:
            break
        # The multipliers only ever grow, so they never drop below their start values.
        if fraud_ratio < target_fraud_ratio:
            lam1 += 1
        else:
            lam2 += 1
        # Rebuild both oversampled sets with the updated multipliers.
        fn_repeat = np.ceil(all_fn_recalls * lam1 / np.maximum(all_fn_probs, fn_prob_floor)).astype(int)
        fn_oversampled = _repeat_rows(fn_rows, fn_repeat)
        fp_repeat = np.ceil(all_fp_precisions * lam2 / np.maximum(all_fp_probs, fp_prob_floor)).astype(int)
        fp_oversampled = _repeat_rows(fp_rows, fp_repeat)

    # Return the same composition the loop measured. Previously the hard frauds
    # were counted in the ratio check but left out of the returned frame.
    temp = pd.concat([val_sub, fn_oversampled, fp_oversampled, all_hf_rows])
    return lam1, lam2, temp

# Tune the multipliers on the pooled hard examples. The starting values of
# lamda_1 / lamda_2 are overwritten with the tuned ones returned here.
lamda_1, lamda_2, val_sub_balanced = adjust_lambdas(
    val_sub=all_val_sub,
    fn_oversampled=fn_oversampled,
    fp_oversampled=fp_oversampled,
    all_hf_rows=all_hf_rows,
    all_fn_probs=all_fn_probs,
    all_fn_recalls=all_fn_recalls,
    all_fp_probs=all_fp_probs,
    all_fp_precisions=all_fp_precisions,
    target_fraud_ratio=target_fraud_ratio,
)



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
# Class breakdown of the validation rows that survived easy-case removal.
# all_val_sub holds each surviving row once, i.e. before any oversampling.
hard_frauds = all_val_sub.loc[all_val_sub['Class'] == 1]
hard_legits = all_val_sub.loc[all_val_sub['Class'] == 0]

# Sizes of the module-level oversampled sets. These are the frames built with
# the initial lambdas; adjust_lambdas rebuilds its own copies internally and
# does not rebind these names.
tough_frauds_oversampled = fn_oversampled.shape[0]        # oversampled false negatives
tough_positives_oversampled = fp_oversampled.shape[0]     # oversampled false positives
hard_frauds_not_oversampled = hard_frauds.shape[0]
hard_legits_not_oversampled = hard_legits.shape[0]

# Assemble the text report first, then emit it in one go.
summary_lines = [
    "\n==================================",
    f"\nFinal lamda_1: {lamda_1}, lamda_2: {lamda_2}",
    "Total samples remaing in train set after removing easy fraud and",
    f"legits ;and adding tough fraud and tough positive: {len(val_sub_balanced)}",
    f"Fraud ratio in resultant set: {val_sub_balanced['Class'].mean():.3f}",
    f"Samples added by oversampling: {len(val_sub_balanced) - len(all_val_sub)}",
    f"Tough fraud samples added by oversampling (False Negatives, FNs): {tough_frauds_oversampled}",
    f"Tough positive samples added by oversampling (False Positives, FPs): {tough_positives_oversampled}",
    f"Medium fraud samples added by oversampling: {len(all_hf_rows)}",
    f"Medium legit samples (not oversampled): {hard_legits_not_oversampled}",
]
print("\n".join(summary_lines))
print(val_sub_balanced.shape)
print(val_sub_balanced['Class'].value_counts())






Final lamda_1: 10, lamda_2: 101
Total samples remaing in train set after removing easy fraud and
legits ;and adding tough fraud and tough positive: 4854
Fraud ratio in resultant set: 0.649
Samples added by oversampling: 4673
Tough fraud samples added by oversampling (False Negatives, FNs): 4417
Tough positive samples added by oversampling (False Positives, FPs): 28
Medium fraud samples added by oversampling: 84
Medium legit samples (not oversampled): 46
(4854, 35)
Class
1    3148
0    1706
Name: count, dtype: int64


In [None]:
# Build the final oversampled training table from the balanced validation rows:
#   1. renumber the rows 0..n-1,
#   2. drop whichever bookkeeping columns are present,
#   3. coerce EVERY column (including 'Class') to numeric, turning values that
#      cannot be parsed into NaN,
#   4. discard rows that contain any NaN.
final_oversampled_df = (
    val_sub_balanced
    .reset_index(drop=True)
    .drop(columns=['fold_recall', 'fold_precision', 'index', 'subset'], errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
# Independent copy handed to the ensemble cell.
df_new = final_oversampled_df.copy()

In [None]:
#this is the final ensemble model
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import numpy as np




# 2. Training tables.
#    Model 1 learns from train + test1. The meta-model's table additionally
#    contains the oversampled hard examples in df_new.
train_model1 = pd.concat([train, test1], ignore_index=True)
train_model2 = pd.concat([df_new, train, test1], ignore_index=True)

# Feature / target split; test2 is reserved for the final evaluation.
X_train_model1, y_train_model1 = train_model1.drop(columns='Class'), train_model1['Class']
X_train_model2, y_train_model2 = train_model2.drop(columns='Class'), train_model2['Class']
X_test2, y_test2 = test2.drop(columns='Class'), test2['Class']

# The synthetic 'index' row id is not a feature: drop it wherever it is present,
# then cast features and targets to float for the models.
X_train_model1 = X_train_model1.drop(columns='index', errors='ignore').astype(float)
X_train_model2 = X_train_model2.drop(columns='index', errors='ignore').astype(float)
X_test2 = X_test2.drop(columns='index', errors='ignore').astype(float)
y_train_model1 = y_train_model1.astype(float)
y_train_model2 = y_train_model2.astype(float)
y_test2 = y_test2.astype(float)

# 3. Model 1: XGBoost on train + test1 (device='cuda' requests GPU training).
model1 = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
model1.fit(X_train_model1, y_train_model1)

# 4. Model 1 fraud probabilities for every row of the meta-model's training table.
# NOTE(review): X_train_model2 contains the train/test1 rows Model 1 was fitted
# on, so these are in-sample predictions for those rows. Confirm this is
# intended; out-of-fold predictions are the usual choice for stacking.
model1_preds_for_model2_training = model1.predict_proba(X_train_model2)[:, 1]

# 5. Meta-model training set: a single feature (Model 1's probability) with the true labels.
X_meta_train = pd.DataFrame({'model1_prob': model1_preds_for_model2_training})
y_meta_train = y_train_model2

# 6. Meta-model: a second XGBoost classifier on that single feature.
meta_model = xgb.XGBClassifier(tree_method='hist', device='cuda', eval_metric='logloss', random_state=42)
meta_model.fit(X_meta_train, y_meta_train)

# 7-8. Same two-stage pipeline on the held-out test2 rows.
model1_preds_test2 = model1.predict_proba(X_test2)[:, 1]
X_meta_test = pd.DataFrame({'model1_prob': model1_preds_test2})

# 9. Final fraud probabilities, turned into labels with a custom cut-off.
final_predictions_prob = meta_model.predict_proba(X_meta_test)[:, 1]
threshold = 0.15 # Or your desired threshold
final_predictions_class = (final_predictions_prob > threshold).astype(int)

# 10. Evaluation on test2: accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_test2, final_predictions_class)
print(f"Final Ensemble Model Accuracy on test2: {accuracy:.4f}")

cm = confusion_matrix(y_test2, final_predictions_class)
print("\nConfusion Matrix on test2:\n", cm)

print("\nClassification Report on test2:\n", classification_report(y_test2, final_predictions_class))


Final Ensemble Model Accuracy on test2: 0.9996

Confusion Matrix on test2:
 [[56858     5]
 [   16    83]]

Classification Report on test2:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56863
         1.0       0.94      0.84      0.89        99

    accuracy                           1.00     56962
   macro avg       0.97      0.92      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:
# Re-score test2 with the already-trained ensemble and report its ROC AUC.
# Relies on model1, meta_model, X_test2 and y_test2 from the ensemble cell.
# roc_auc_score was used here without ever being imported.
from sklearn.metrics import roc_auc_score

# 7. Generate predictions (probabilities) from Model 1 on test2 (for final evaluation)
model1_preds_test2 = model1.predict_proba(X_test2)[:, 1]

# 8. Create the dataset for the meta-model's prediction (on test2)
X_meta_test = pd.DataFrame(model1_preds_test2, columns=['model1_prob'])

# Decision threshold used by this cell and by the report cell that follows.
threshold = 0.6

# 9. Make final predictions with the Meta-model on test2
final_predictions_prob = meta_model.predict_proba(X_meta_test)[:, 1]
final_predictions_class = (final_predictions_prob > threshold).astype(int)

# 10. ROC AUC of the ensemble on test2 (threshold-independent).
# The score is stored as `roc_auc`, not `auc`, so it cannot shadow
# sklearn.metrics.auc, which the precision-recall plot calls as a function.
roc_auc = roc_auc_score(y_test2, final_predictions_prob)
print("AUC:", roc_auc)


In [None]:
# Final report on test2: classification reports, confusion matrix, ROC AUC and
# the precision-recall curve of the ensemble.
# These names were used below without being imported anywhere in the notebook.
# Importing `auc` here also restores the function if an earlier cell bound the
# name `auc` to a float.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score

# Labels at the threshold currently in scope (set by an earlier cell).
final_predictions_class = (final_predictions_prob > threshold).astype(int)

print("\nClassification Report on test2:\n", classification_report(y_test2, final_predictions_class))

# Confusion matrix and report at the fixed 0.15 threshold, for comparison.
predictions_at_015 = (final_predictions_prob > 0.15).astype(int)
cm = confusion_matrix(y_test2, predictions_at_015)
print("\nConfusion Matrix on test2:\n", cm)

print("\nClassification Report on test2:\n", classification_report(y_test2, predictions_at_015))

# ROC AUC of the ensemble (meta-model) probabilities on test2.
roc_auc = roc_auc_score(y_test2, final_predictions_prob)
print("ROC AUC:", roc_auc)

# 11. Precision-recall curve; the legend shows the area under it, computed
# with sklearn.metrics.auc over the (recall, precision) points.
precision, recall, thresholds = precision_recall_curve(y_test2, final_predictions_prob)

plt.figure()
plt.plot(recall, precision, label=f'Precision-Recall curve (AUC-PR = {auc(recall, precision):.2f})')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Ensemble Model")
plt.legend(loc="lower left")
plt.grid(True)
plt.show()