In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import requests
import os
import networkx as nx
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import igraph as ig
import sys
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.utils import compute_sample_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, f1_score, average_precision_score, precision_recall_curve,make_scorer
import psutil
import optuna

In [2]:
#load data 
data = pd.read_csv(r"C:\Users\Alix\Downloads\archive\HI-Small_Trans_subset.csv") 
data.tail()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
2999995,2022/09/07 11:00,410,800240B70,10,80005B110,14402.14,US Dollar,14402.14,US Dollar,ACH,1
2999996,2022/09/08 19:26,232073,80BA3FAD0,221279,80BA3FB70,2560.23,Euro,2560.23,Euro,ACH,1
2999997,2022/09/09 15:08,23289,808839F70,24840,80A2A61B0,12320.7,Euro,12320.7,Euro,ACH,1
2999998,2022/09/03 22:42,23538,80D018930,222363,80D018D90,37314.94,US Dollar,37314.94,US Dollar,ACH,1
2999999,2022/09/01 20:30,113,80FFF5A30,243897,80FFF5B20,4027.43,Shekel,4027.43,Shekel,ACH,1


In [3]:
# Timefeatures
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['Hour'] = data['Timestamp'].dt.hour
data['Date_Year'] = data['Timestamp'].dt.year
data['Date_Month'] = data['Timestamp'].dt.month
data['Date_Day'] = data['Timestamp'].dt.day

data.drop(columns=['Timestamp'], inplace=True)

# Amounts
data['Amount'] = data[['Amount Paid', 'Amount Received']].mean(axis=1)  # of kies alleen één


In [4]:
categorical_cols = [
    'Account', 'Account.1',
    'From Bank', 'To Bank',
    'Payment Format'  
]


for col in categorical_cols:
    data[col] = data[col].fillna("missing").astype(str)

data.dropna(subset=['Account', 'Account.1', 'Amount'], inplace=True)

# Network and time features
G = nx.DiGraph()
edges = list(zip(data["Account"], data["Account.1"], data["Amount"]))
G.add_weighted_edges_from(edges)

G_ig = ig.Graph.TupleList(edges, directed=True)
data["degree_centrality"] = data["Account"].map(dict(zip(G_ig.vs["name"], G_ig.degree()))).fillna(0)
pagerank_scores = G_ig.pagerank()
data["pagerank"] = data["Account"].map(dict(zip(G_ig.vs["name"], pagerank_scores))).fillna(0)

# time and transactional features
window_size = 50
data["rolling_24h_amount"] = data.groupby("Account")["Amount"]\
                                 .rolling(window_size).sum().reset_index(0, drop=True).fillna(0)
data["transaction_count"] = data.groupby("Account")["Amount"].transform("count")


features = data.drop(columns=[
    'Is Laundering',
    'Payment Currency',
    'Receiving Currency',
    'rolling_24h_amount'  
]).copy()

# add the extra features
features["degree_centrality"] = data["degree_centrality"]
features["pagerank"] = data["pagerank"]
features["rolling_24h_amount"] = data["rolling_24h_amount"]
features["transaction_count"] = data["transaction_count"]

# remove irrelevant features
features.drop(columns=['Date_Year', 'Date_Month'], inplace=True, errors='ignore')

# Target
target = data['Is Laundering']
data_standardized = features.copy()
data_standardized['Is Laundering'] = target.reset_index(drop=True)



In [5]:
#split the data
X = data_standardized.drop(columns=["Is Laundering"])
y = data_standardized["Is Laundering"]

# Train/test split
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Train/validation split within the trainingset
X_train, X_val, y_train_true, y_val_true = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=y_train_full, random_state=42
)

#label encoding
encoders = {}
for col in categorical_cols:
    combined = pd.concat([X_train[col], X_val[col], X_test[col]]).astype(str)
    le = LabelEncoder().fit(combined)
    
    X_train[col] = le.transform(X_train[col].astype(str))
    X_val[col] = le.transform(X_val[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    
    encoders[col] = le

#scaling
numerical_cols = ['Hour', 'Date_Day', 'Amount']
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


sample_weights = compute_sample_weight(class_weight={0: 1, 1: 10}, y=y_train_true)

In [6]:
def maak_pu_setting_van_echte_labels(y_true, label_ratio, random_state=42):
    
    np.random.seed(random_state)
    positieve_indexen = np.where(y_true == 1)[0]
    n_gelabeld = int(label_ratio * len(positieve_indexen))
    gelabelde_indexen = np.random.choice(positieve_indexen, size=n_gelabeld, replace=False)

    y_pu = np.zeros_like(y_true)
    y_pu[gelabelde_indexen] = 1

    return y_pu, y_true, gelabelde_indexen


alpha = 0.001  # 
label_ratio = 0.2

y_train_pu, y_train_true, gelabelde_indexen = maak_pu_setting_van_echte_labels(y_train_true, label_ratio)

#sample weights 
sample_weights = compute_sample_weight(class_weight={0: 1, 1: 10}, y=y_train_pu)

print("finish")
print(gelabelde_indexen.shape[0])
print(y_train_true.shape[0])

finish
342
1680000


In [None]:
#Best hyperparameters: {'n_estimators': 322, 'max_depth': 3, 'learning_rate': 0.07482190006321321, 'subsample': 0.6601618247993428, 'colsample_bytree': 0.671789475048553, 'gamma': 3.2560401283698, 'reg_alpha': 6.941806486527001, 'reg_lambda': 5.886403784981896, 'scale_pos_weight': 17.936820946355468}
params = {
    'n_estimators': 322,
    'max_depth': 3,
    'learning_rate':  0.07482190006321321,
    'subsample': 0.6601618247993428,
    'colsample_bytree':  0.671789475048553,
    'gamma': 3.2560401283698,
    'reg_alpha':6.941806486527001,
    'reg_lambda': 5.886403784981896,
    'scale_pos_weight': 17.936820946355468,
    'use_label_encoder': False,
    'tree_method': 'auto',           
    'eval_metric': None,        
    'n_jobs': None                     
}

xgb_best = XGBClassifier(**params)
print("Model succeeded ")
xgb_best.fit(X_train, y_train_pu, sample_weight=sample_weights)

Model succeeded 


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Predcition
y_proba = xgb_best.predict_proba(X_test)[:, 1]

# Threshold tuning based on F1
prec, rec, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (prec * rec) / (prec + rec + 1e-8)
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]
print(f"Beste threshold (voor F1): {best_thresh:.4f}")

# Prediction
y_pred = (y_proba >= best_thresh).astype(int)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print("Model evaluatie (gewogen + getuned + threshold-optimalisatie):")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

In [None]:
# Confusion Matrix
plt.figure(figsize=(6, 5))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True, fmt='d', cmap='Blues',
    xticklabels=['Not Suspicious', 'Suspicious'],
    yticklabels=['Not Suspicious', 'Suspicious']
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Threshold performance
plt.figure(figsize=(8,6))
plt.plot(thresholds, f1_scores[:-1], label="F1-score")
plt.plot(thresholds, prec[:-1], label="Precision")
plt.plot(thresholds, rec[:-1], label="Recall")
plt.axvline(x=best_thresh, color='r', linestyle='--', label=f'Beste threshold = {best_thresh:.2f}')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision, Recall en F1-score per Threshold")
plt.legend()
plt.grid()
plt.show()

In [9]:
np.save("XGB_IBM_y_true.npy",y_test)
np.save("XGB_IBM_y_pred.npy",y_pred)
np.save("XGB_IBM_y_probs.npy",y_proba)

In [17]:
#optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import optuna

def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "use_label_encoder": False,
        "tree_method": "hist",
        "n_estimators": trial.suggest_int("n_estimators", 50, 600),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 20.0)
    }

    model = XGBClassifier(**params)
    
    # Train on PU-labels
    model.fit(X_train, y_train_pu)

    
    y_val_pred = model.predict(X_val)
    score = f1_score(y_val_true, y_val_pred)

    return score


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=1200)

print("Beste hyperparameters:", study.best_params)

# Use best model
best_params = study.best_params
best_params.update({
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "tree_method": "hist"
})
xgb_best = XGBClassifier(**best_params)


[I 2025-05-12 10:13:26,696] A new study created in memory with name: no-name-a6bb5a47-a5b2-4b9c-8289-93c2966955ab
[I 2025-05-12 10:13:44,283] Trial 0 finished with value: 0.09691629955947137 and parameters: {'n_estimators': 222, 'max_depth': 8, 'learning_rate': 0.2243744590277013, 'subsample': 0.9058542932298366, 'colsample_bytree': 0.8663273139121019, 'gamma': 1.928068613690046, 'reg_alpha': 3.1235627897407934, 'reg_lambda': 8.413453309885758, 'scale_pos_weight': 12.047973379019961}. Best is trial 0 with value: 0.09691629955947137.
[I 2025-05-12 10:14:12,234] Trial 1 finished with value: 0.027459954233409613 and parameters: {'n_estimators': 368, 'max_depth': 8, 'learning_rate': 0.019095907493897585, 'subsample': 0.8651504152666714, 'colsample_bytree': 0.6842780979478401, 'gamma': 0.9549447027542612, 'reg_alpha': 2.3991420024168297, 'reg_lambda': 6.026170371483948, 'scale_pos_weight': 3.3646879091569684}. Best is trial 0 with value: 0.09691629955947137.
[I 2025-05-12 10:14:53,012] Tria

Beste hyperparameters: {'n_estimators': 322, 'max_depth': 3, 'learning_rate': 0.07482190006321321, 'subsample': 0.6601618247993428, 'colsample_bytree': 0.671789475048553, 'gamma': 3.2560401283698, 'reg_alpha': 6.941806486527001, 'reg_lambda': 5.886403784981896, 'scale_pos_weight': 17.936820946355468}
