# Data Preprocessing

In [1]:
import sys, os

# Add the parent directory containing IBM_GNN to sys.path
project_root = "/Users/hanbeobmun/Desktop/대학원/연구실/Fraud_detection_GNN"
if project_root not in sys.path:
    sys.path.append(project_root)

from IBM_GNN.IBM_dataset import IBM_Dataset
import numpy as np

TRANSACTIONS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/credit_card_transactions-ibm_v2.csv'
USERS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/sd254_users.csv'
CARDS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/sd254_cards.csv'

try:
    dataset = (IBM_Dataset()
                .read_transactions_csv(TRANSACTIONS_CSV_PATH)
                .read_users_csv(USERS_CSV_PATH)
                .read_cards_csv(CARDS_CSV_PATH)
                .preprocess_transactions()
                .preprocess_users()
                .preprocess_cards()
                .create_node_mappings()
                )
except Exception as e:
    print(f"Error occurred: {e}")

Loading transactions CSV...
Transactions CSV loaded successfully.
Users CSV loaded successfully.
Cards CSV loaded successfully.


                                                                                                    

Preprocessing transactions completed.
Preprocessing users completed.
Preprocessing cards completed.
Total unique nodes: 106482


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc, f1_score

edge_transactions = dataset.edge_transactions
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
relation_onehot = onehot_encoder.fit_transform(edge_transactions[['Relation']])
relation_types = onehot_encoder.get_feature_names_out(['Relation'])
relation_df = pd.DataFrame(relation_onehot, columns=relation_types, index=edge_transactions.index)
edge_transactions = pd.concat([relation_df, edge_transactions], axis=1)
edge_transactions = edge_transactions.drop(columns=['Relation'])
edge_transactions.head()

Unnamed: 0,Relation_refund,Relation_transaction,Date,isFraud,Src,Dest,Scaled_Amount,MCC_idx,Zip_idx,Use Chip_Chip Transaction,Use Chip_Online Transaction,Use Chip_Swipe Transaction,Error_Bad Expiration,Error_Bad PIN,Error_Insufficient Balance,Error_Bad CVV,Error_Technical Glitch,Error_Bad Card Number,Error_Bad Zipcode
0,0.0,1.0,1991-01-02,0,791_1,2027553650310142703,0.449253,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1991-01-02,0,2027553650310142703,791_1,0.449253,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1991-01-02,0,791_1,2027553650310142703,0.503102,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1991-01-02,0,791_1,-7269691894846892021,0.504125,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1991-01-03,0,791_1,-3693650930986299431,0.58699,2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
drop_columns = ['Src', 'Dest', 'MCC_idx', 'Zip_idx']
edge_transactions = edge_transactions.drop(columns=drop_columns)
edge_transactions.head()

Unnamed: 0,Relation_refund,Relation_transaction,Date,isFraud,Scaled_Amount,Use Chip_Chip Transaction,Use Chip_Online Transaction,Use Chip_Swipe Transaction,Error_Bad Expiration,Error_Bad PIN,Error_Insufficient Balance,Error_Bad CVV,Error_Technical Glitch,Error_Bad Card Number,Error_Bad Zipcode
0,0.0,1.0,1991-01-02,0,0.449253,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1991-01-02,0,0.449253,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1991-01-02,0,0.503102,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1991-01-02,0,0.504125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1991-01-03,0,0.58699,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def get_edge_transactions(edge_transactions, start_date=None, end_date=None):
    if edge_transactions is None:
        raise ValueError("Edge transactions dataframe is not loaded. Please call read_transactions_csv() and preprocess_transactions() first.")
    if start_date is not None and end_date is not None:
        if pd.to_datetime(start_date) == pd.to_datetime(end_date):
            mask = (edge_transactions['Date'] == pd.to_datetime(start_date))
            return edge_transactions.loc[mask].reset_index(drop=True)
        else:
            mask = (edge_transactions['Date'] >= pd.to_datetime(start_date)) & (edge_transactions['Date'] < pd.to_datetime(end_date))
            return edge_transactions.loc[mask].reset_index(drop=True)
    elif start_date is not None and end_date is None:
        mask = (edge_transactions['Date'] >= pd.to_datetime(start_date))
        return edge_transactions.loc[mask].reset_index(drop=True)
    elif start_date is None and end_date is not None:
        mask = (edge_transactions['Date'] < pd.to_datetime(end_date))
        return edge_transactions.loc[mask].reset_index(drop=True)
    else:
        return edge_transactions

start_date = '1996-01-01'
end_date = '2020-01-01'
# end_date = '2019-12-31'
days = pd.date_range(start=start_date, end=end_date, freq='D')
train_data, test_data = [], []
s = 0.4
for i in range(5):
    t_e = int(len(days) * s)
    train_end_date = days[t_e]
    test_start_date = days[t_e]
    test_end_date = days[min(int(len(days)*(s+0.2)), len(days)-1)]
    print(train_end_date, test_start_date, test_end_date)
    train_d = get_edge_transactions(edge_transactions, end_date=train_end_date)
    test_d = get_edge_transactions(edge_transactions, start_date=test_start_date, end_date=test_end_date)
    train_data.append(train_d)
    test_data.append(test_d)
    s += 0.1
    

# train_days = days[:int(len(days)*0.6)]
# val_days = days[int(len(days)*0.6):int(len(days)*0.8)]
# test_days = days[int(len(days)*0.8):]

# train_data = get_edge_transactions(edge_transactions, start_date=train_days[0], end_date=train_days[-1])
# val_data = get_edge_transactions(edge_transactions, start_date=val_days[0], end_date=val_days[-1])
# test_data = get_edge_transactions(edge_transactions, start_date=test_days[0], end_date=test_days[-1])

2005-08-07 00:00:00 2005-08-07 00:00:00 2010-05-27 00:00:00
2008-01-01 00:00:00 2008-01-01 00:00:00 2012-10-19 00:00:00
2010-05-27 00:00:00 2010-05-27 00:00:00 2015-03-15 00:00:00
2012-10-19 00:00:00 2012-10-19 00:00:00 2017-08-08 00:00:00
2015-03-15 00:00:00 2015-03-15 00:00:00 2020-01-01 00:00:00


In [5]:
# train_data = train_data.drop(columns=['Date'])
# val_data = val_data.drop(columns=['Date'])
# test_data = test_data.drop(columns=['Date'])

# train_x = train_data.drop(columns=['isFraud'])
# train_y = train_data['isFraud']

# val_x = val_data.drop(columns=['isFraud'])
# val_y = val_data['isFraud']

# test_x = test_data.drop(columns=['isFraud'])
# test_y = test_data['isFraud']

# train_x.head()

for i in range(5):
    train_data[i] = train_data[i].drop(columns=['Date'])
    test_data[i] = test_data[i].drop(columns=['Date'])

train_x = list()
train_y = list()
test_x = list()
test_y = list()
for i in range(5):
    train_x.append(train_data[i].drop(columns=['isFraud']))
    train_y.append(train_data[i]['isFraud'])
    test_x.append(test_data[i].drop(columns=['isFraud']))
    test_y.append(test_data[i]['isFraud'])


# Random Forest

In [6]:
# 5-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

metrics_rf = {
    'ROC_AUC': [],
    'Average Precision': [],
    'PR_AUC': []
}
for i in tqdm(range(5)):
    model_rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
    model_rf.fit(train_x[i], train_y[i])
    y_pred_proba_rf = model_rf.predict_proba(test_x[i])[:, 1]
    roc_auc_rf = roc_auc_score(test_y[i], y_pred_proba_rf)
    avg_precision_rf = average_precision_score(test_y[i], y_pred_proba_rf)
    precision_rf, recall_rf, _ = precision_recall_curve(test_y[i], y_pred_proba_rf)
    pr_auc_rf = auc(recall_rf, precision_rf)
    metrics_rf['ROC_AUC'].append(roc_auc_rf)
    metrics_rf['Average Precision'].append(avg_precision_rf)
    metrics_rf['PR_AUC'].append(pr_auc_rf)

print("Random Forest 5-Fold Cross Validation Results:")
print(f"Average ROC AUC: {np.mean(metrics_rf['ROC_AUC']):.4f} ± {np.std(metrics_rf['ROC_AUC']):.4f}")
print(f"Average Average Precision: {np.mean(metrics_rf['Average Precision']):.4f} ± {np.std(metrics_rf['Average Precision']):.4f}")
print(f"Average PR AUC: {np.mean(metrics_rf['PR_AUC']):.4f} ± {np.std(metrics_rf['PR_AUC']):.4f}")

100%|██████████| 5/5 [25:14<00:00, 302.85s/it]

Random Forest 5-Fold Cross Validation Results:
Average ROC AUC: 0.5507 ± 0.0845
Average Average Precision: 0.0042 ± 0.0017
Average PR AUC: 0.0043 ± 0.0017





In [11]:
print(f"{metrics_rf['ROC_AUC'][2]:.4f}, {metrics_rf['Average Precision'][2]:.4f}, {metrics_rf['PR_AUC'][2]:.4f}")

0.7088, 0.0054, 0.0052


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

model_rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)

model_rf.fit(train_x, train_y)
y_pred_proba_rf = model_rf.predict_proba(val_x)[:, 1]
roc_auc_rf = roc_auc_score(val_y, y_pred_proba_rf)
avg_precision_rf = average_precision_score(val_y, y_pred_proba_rf)
precision_rf, recall_rf, _ = precision_recall_curve(val_y, y_pred_proba_rf)
pr_auc_rf = auc(recall_rf, precision_rf)
print(f"Random Forest ROC AUC: {roc_auc_rf:.4f}")
print(f"Random Forest Average Precision: {avg_precision_rf:.4f}")
print(f"Random Forest PR AUC: {pr_auc_rf:.4f}")



ROC AUC: 0.5324
Average Precision: 0.0048
PR AUC: 0.0047


In [None]:
train_val_x = pd.concat([train_x, val_x], axis=0)
train_val_y = pd.concat([train_y, val_y], axis=0)

model_rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)

model_rf.fit(train_val_x, train_val_y)
y_pred_proba_rf = model_rf.predict_proba(test_x)[:, 1]
roc_auc_rf = roc_auc_score(test_y, y_pred_proba_rf)
avg_precision_rf = average_precision_score(test_y, y_pred_proba_rf)
precision_rf, recall_rf, _ = precision_recall_curve(test_y, y_pred_proba_rf)
pr_auc_rf = auc(recall_rf, precision_rf)
print(f"Random Forest ROC AUC: {roc_auc_rf:.4f}")
print(f"Random Forest Average Precision: {avg_precision_rf:.4f}")
print(f"Random Forest PR AUC: {pr_auc_rf:.4f}")

ROC AUC: 0.5638
Average Precision: 0.0045
PR AUC: 0.0051


# XGBoost

In [22]:
# 5-fold cross validation
from xgboost import XGBClassifier

metrics_xgb = {
    'ROC_AUC': [],
    'Average Precision': [],
    'PR_AUC': [],
    'f1_score': []
}

for i in tqdm(range(5)):

    neg_count = np.sum(train_y[i] == 0) + np.sum(test_y[i] == 0)
    pos_count = np.sum(train_y[i] == 1) +  np.sum(test_y[i] == 1)
    scale_pos_weight_value = neg_count / pos_count

    model_xgb = XGBClassifier(objective='binary:logistic',
                        n_estimators=100,
                        learning_rate=1e-3,
                        max_depth=5,
                        subsample=0.8,
                        gamma=0.1,
                        random_state=42,
                        use_label_encoder=False,
                        eval_metric='logloss',
                        n_jobs=-1,
                        scale_pos_weight=scale_pos_weight_value)
    model_xgb.fit(train_x[i], train_y[i])
    y_pred_proba_xgb = model_xgb.predict_proba(test_x[i])[:, 1]
    roc_auc_xgb = roc_auc_score(test_y[i], y_pred_proba_xgb)
    avg_precision_xgb = average_precision_score(test_y[i], y_pred_proba_xgb)
    precision_xgb, recall_xgb, _ = precision_recall_curve(test_y[i], y_pred_proba_xgb)
    pr_auc_xgb = auc(recall_xgb, precision_xgb)
    metrics_xgb['ROC_AUC'].append(roc_auc_xgb)
    metrics_xgb['Average Precision'].append(avg_precision_xgb)
    metrics_xgb['PR_AUC'].append(pr_auc_xgb)
    metrics_xgb['f1_score'].append(f1_score(test_y[i], (y_pred_proba_xgb >= 0.5).astype(int)))

print("XGBoost 5-Fold Cross Validation Results:")
print(f"Average ROC AUC: {np.mean(metrics_xgb['ROC_AUC']):.4f} ± {np.std(metrics_xgb['ROC_AUC']):.4f}")
print(f"Average Average Precision: {np.mean(metrics_xgb['Average Precision']):.4f} ± {np.std(metrics_xgb['Average Precision']):.4f}")
print(f"Average PR AUC: {np.mean(metrics_xgb['PR_AUC']):.4f} ± {np.std(metrics_xgb['PR_AUC']):.4f}")
print(f"Average F1 Score: {np.mean(metrics_xgb['f1_score']):.4f} ± {np.std(metrics_xgb['f1_score']):.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 5/5 [00:58<00:00, 11.67s/it]

XGBoost 5-Fold Cross Validation Results:
Average ROC AUC: 0.8640 ± 0.0558
Average Average Precision: 0.0250 ± 0.0062
Average PR AUC: 0.0335 ± 0.0069
Average F1 Score: 0.0071 ± 0.0088





In [23]:
print(f"{metrics_xgb['ROC_AUC'][2]:.4f}, {metrics_xgb['Average Precision'][2]:.4f}, {metrics_xgb['PR_AUC'][2]:.4f}, {metrics_xgb['f1_score'][2]:.4f}")

0.9097, 0.0211, 0.0322, 0.0017


In [25]:
from xgboost import XGBClassifier
neg_count = np.sum(train_y == 0) + np.sum(val_y == 0) + np.sum(test_y == 0)
pos_count = np.sum(train_y == 1) + np.sum(val_y == 1) + np.sum(test_y == 1)
scale_pos_weight_value = neg_count / pos_count

model_xgb = XGBClassifier(objective='binary:logistic',
                      n_estimators=100,
                      learning_rate=1e-3,
                      max_depth=5,
                      subsample=0.8,
                      gamma=0.1,
                      random_state=42,
                      use_label_encoder=False,
                      eval_metric='logloss',
                      n_jobs=-1,
                      scale_pos_weight=scale_pos_weight_value)

model_xgb.fit(train_x, train_y)


Parameters: { "use_label_encoder" } are not used.



In [26]:
y_pred_proba_xgb = model_xgb.predict_proba(val_x)[:, 1]
roc_auc_xgb = roc_auc_score(val_y, y_pred_proba_xgb)
avg_precision_xgb = average_precision_score(val_y, y_pred_proba_xgb)
precision_xgb, recall_xgb, _ = precision_recall_curve(val_y, y_pred_proba_xgb)
pr_auc_xgb = auc(recall_xgb, precision_xgb)
print(f"XGBoost ROC AUC: {roc_auc_xgb:.4f}")
print(f"XGBoost Average Precision: {avg_precision_xgb:.4f}")
print(f"XGBoost PR AUC: {pr_auc_xgb:.4f}")


XGBoost ROC AUC: 0.9128
XGBoost Average Precision: 0.0212
XGBoost PR AUC: 0.0321


In [27]:
model_xgb = XGBClassifier(objective='binary:logistic',
                      n_estimators=100,
                      learning_rate=1e-3,
                      max_depth=5,
                      subsample=0.8,
                      gamma=0.1,
                      random_state=42,
                      use_label_encoder=False,
                      eval_metric='logloss',
                      n_jobs=-1,
                      scale_pos_weight=scale_pos_weight_value)

model_xgb.fit(train_val_x, train_val_y)
y_pred_proba_xgb = model_xgb.predict_proba(test_x)[:, 1]
roc_auc_xgb = roc_auc_score(test_y, y_pred_proba_xgb)
avg_precision_xgb = average_precision_score(test_y, y_pred_proba_xgb)
precision_xgb, recall_xgb, _ = precision_recall_curve(test_y, y_pred_proba_xgb)
pr_auc_xgb = auc(recall_xgb, precision_xgb)
print(f"XGBoost ROC AUC: {roc_auc_xgb:.4f}")
print(f"XGBoost Average Precision: {avg_precision_xgb:.4f}")
print(f"XGBoost PR AUC: {pr_auc_xgb:.4f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost ROC AUC: 0.7703
XGBoost Average Precision: 0.0155
XGBoost PR AUC: 0.0225


# Logistic Regression

In [24]:
# 5-fold cross validation
from sklearn.linear_model import LogisticRegression

metrics_lr = {
    'ROC_AUC': [],
    'Average Precision': [],
    'PR_AUC': [],
    'f1_score': []
}

for i in tqdm(range(5)):
    model_lr = LogisticRegression(solver='lbfgs', penalty='l2', class_weight='balanced', random_state=42, max_iter=1000, n_jobs=-1)
    model_lr.fit(train_x[i], train_y[i])
    y_pred_proba_lr = model_lr.predict_proba(test_x[i])[:, 1]
    roc_auc_lr = roc_auc_score(test_y[i], y_pred_proba_lr)
    avg_precision_lr = average_precision_score(test_y[i], y_pred_proba_lr)
    precision_lr, recall_lr, _ = precision_recall_curve(test_y[i], y_pred_proba_lr)
    pr_auc_lr = auc(recall_lr, precision_lr)
    metrics_lr['ROC_AUC'].append(roc_auc_lr)
    metrics_lr['Average Precision'].append(avg_precision_lr)
    metrics_lr['PR_AUC'].append(pr_auc_lr)
    metrics_lr['f1_score'].append(f1_score(test_y[i], (y_pred_proba_lr >= 0.5).astype(int)))

print("Logistic Regression 5-Fold Cross Validation Results:")
print(f"Average ROC AUC: {np.mean(metrics_lr['ROC_AUC']):.4f} ± {np.std(metrics_lr['ROC_AUC']):.4f}")
print(f"Average Average Precision: {np.mean(metrics_lr['Average Precision']):.4f} ± {np.std(metrics_lr['Average Precision']):.4f}")
print(f"Average PR AUC: {np.mean(metrics_lr['PR_AUC']):.4f} ± {np.std(metrics_lr['PR_AUC']):.4f}")
print(f"Average F1 Score: {np.mean(metrics_lr['f1_score']):.4f} ± {np.std(metrics_lr['f1_score']):.4f}")

100%|██████████| 5/5 [01:05<00:00, 13.02s/it]

Logistic Regression 5-Fold Cross Validation Results:
Average ROC AUC: 0.8419 ± 0.0651
Average Average Precision: 0.0171 ± 0.0036
Average PR AUC: 0.0170 ± 0.0036
Average F1 Score: 0.0116 ± 0.0042





In [25]:
print(f"{metrics_lr['ROC_AUC'][2]:.4f}, {metrics_lr['Average Precision'][2]:.4f}, {metrics_lr['PR_AUC'][2]:.4f}, {metrics_lr['f1_score'][2]:.4f}")

0.8964, 0.0141, 0.0141, 0.0097


In [47]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(solver='lbfgs', penalty='l2', class_weight='balanced', random_state=42, max_iter=1000, n_jobs=-1)

model_lr.fit(train_x, train_y)
y_pred_proba_lr = model_lr.predict_proba(val_x)[:, 1]
roc_auc_lr = roc_auc_score(val_y, y_pred_proba_lr)
avg_precision_lr = average_precision_score(val_y, y_pred_proba_lr)
precision_lr, recall_lr, _ = precision_recall_curve(val_y, y_pred_proba_lr)
pr_auc_lr = auc(recall_lr, precision_lr)
print(f"Logistic Regression ROC AUC: {roc_auc_lr:.4f}")
print(f"Logistic Regression Average Precision: {avg_precision_lr:.4f}")
print(f"Logistic Regression PR AUC: {pr_auc_lr:.4f}")


Logistic Regression ROC AUC: 0.8965
Logistic Regression Average Precision: 0.0141
Logistic Regression PR AUC: 0.0141


In [31]:
model_lr = LogisticRegression(solver='lbfgs', penalty='l2', class_weight='balanced', random_state=42, max_iter=1000, n_jobs=-1)

model_lr.fit(train_val_x, train_val_y)
y_pred_proba_lr = model_lr.predict_proba(test_x)[:, 1]
roc_auc_lr = roc_auc_score(test_y, y_pred_proba_lr)
avg_precision_lr = average_precision_score(test_y, y_pred_proba_lr)
precision_lr, recall_lr, _ = precision_recall_curve(test_y, y_pred_proba_lr)
pr_auc_lr = auc(recall_lr, precision_lr)
print(f"Logistic Regression ROC AUC: {roc_auc_lr:.4f}")
print(f"Logistic Regression Average Precision: {avg_precision_lr:.4f}")
print(f"Logistic Regression PR AUC: {pr_auc_lr:.4f}")

Logistic Regression ROC AUC: 0.7226
Logistic Regression Average Precision: 0.0115
Logistic Regression PR AUC: 0.0114


In [20]:
for i in range(5):
    print(f"Fold {i+1}:")
    print(f"  Train samples: {len(train_y[i])}, Fraud cases: {np.sum(train_y[i])}, Fraud ratio: {np.sum(train_y[i])/len(train_y[i]):.4f}")
    print(f"  Test samples: {len(test_y[i])}, Fraud cases: {np.sum(test_y[i])}, Fraud ratio: {np.sum(test_y[i])/len(test_y[i]):.4f}")


Fold 1:
  Train samples: 2605333, Fraud cases: 1720, Fraud ratio: 0.0007
  Test samples: 5452077, Fraud cases: 9637, Fraud ratio: 0.0018
Fold 2:
  Train samples: 4896433, Fraud cases: 4921, Fraud ratio: 0.0010
  Test samples: 6924155, Fraud cases: 10073, Fraud ratio: 0.0015
Fold 3:
  Train samples: 8057410, Fraud cases: 11357, Fraud ratio: 0.0014
  Test samples: 7751019, Fraud cases: 6707, Fraud ratio: 0.0009
Fold 4:
  Train samples: 11820588, Fraud cases: 14994, Fraud ratio: 0.0013
  Test samples: 8094662, Fraud cases: 9930, Fraud ratio: 0.0012
Fold 5:
  Train samples: 15808429, Fraud cases: 18064, Fraud ratio: 0.0011
  Test samples: 8241971, Fraud cases: 11693, Fraud ratio: 0.0014


In [19]:
# sum expects an iterable; add the two counts instead
total_fraud_cases = int(np.sum(train_y[-1]) + np.sum(test_y[-1]))
total_transactions = int(len(train_y[-1]) + len(test_y[-1]))
fraud_ratio = total_fraud_cases / total_transactions if total_transactions > 0 else 0.0
print(f"Total samples: {total_transactions}, Total fraud cases: {total_fraud_cases}, Overall fraud ratio: {fraud_ratio:.6f}")

Total samples: 24050400, Total fraud cases: 29757, Overall fraud ratio: 0.001237
