## Isolation Forest Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import precision_recall_curve, roc_curve

In [None]:
# Name of the synthetic dataset to process: either "HI-Small" or "LI-Small".
DATASET = "HI-Small"

# Input locations, both relative to the parent directory like every other
# dataset path used in this notebook (the enriched path previously started with
# "./", which pointed at a different directory tree than the features path).
DATASET_LOCATION = f"../datasets/synthetic/03_feature_engineering/{DATASET}-features"
GFP_FEATURES_LOCATION = f"../datasets/synthetic/03_feature_engineering/{DATASET}-enriched"

In [None]:
# Load the engineered feature set for the selected dataset.
data = pd.read_parquet(path=DATASET_LOCATION)

In [None]:
# Compensate for the 2-hour offset between pyspark and pandas timestamps,
# printing the observed time range before and after the correction.
timestamps = data['timestamp']
print(timestamps.min(), timestamps.max())
timestamps = timestamps + pd.Timedelta(hours=2)
data['timestamp'] = timestamps
print(timestamps.min(), timestamps.max())

In [None]:
# Express each timestamp as whole seconds elapsed since the earliest
# transaction, then order the rows chronologically.
data["timestamp"] = pd.to_datetime(data["timestamp"])
# Cast through datetime64[s] instead of dividing the raw integer values by
# 10**9: the raw values are only nanoseconds when the column has nanosecond
# resolution, and a plain `int` cast is not guaranteed to be 64-bit everywhere.
data["timestamp"] = data["timestamp"].values.astype("datetime64[s]").astype(np.int64)
min_timestamp = data["timestamp"].min()
data["timestamp"] = data["timestamp"] - min_timestamp
data = data.sort_values(by='timestamp', ascending=True)

In [None]:
# Load the enriched feature set for the selected dataset.
enriched_data = pd.read_parquet(path=GFP_FEATURES_LOCATION)

In [None]:
# Left-join the graph feature preprocessor output onto the engineered features
# by transaction id, then put the combined rows in timestamp order.
data = pd.merge(
    data,
    enriched_data,
    how='left',
    on='transaction_id',
).sort_values(by="timestamp", ascending=True)

In [None]:
# Columns removed from the model input matrix.
non_feature_columns = [
    'target', 'source', 'target_bank', 'source_bank', 'transaction_id',
    'timestamp', 'source_currency', 'target_currency', 'source_amount',
    'target_amount', 'format', 'is_laundering', 'amount',
]
X_train = data.drop(columns=non_feature_columns)
X_train_columns = X_train.columns

# Keep the transaction id next to its label so scores can be matched to rows later.
y_target = data.loc[:, ['transaction_id', 'is_laundering']].copy()
y_target_columns = y_target.columns

In [None]:
# Cast every feature to float32, then zero out infinities and missing values.
X_train = (
    X_train
    .astype(np.float32)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

In [None]:
# Feature subset used for training. It concentrates on amounts, velocity,
# counterparties, and structural patterns such as cycles, fan-in and fan-out.
selected_features = [
    'percentage_of_total_sent',
    'percentage_of_total_received',
    'percentage_of_total_last_5_d_sent',
    'percentage_of_total_next_5_d_sent',
    'percentage_of_total_last_5_d_received',
    'percentage_of_total_next_5_d_received',
    'is_round_amount',
    'daily_trans_count_counterparty',
    'total_interactions',
    'weekly_interactions',
    'dst_daily_percentage_of_counterparty',
    'percentage_weekly_sent',
    'percentage_weekly_received',
    'percentage_daily_sent',
    'src_percentage_of_interactions',
    'dst_percentage_of_interactions',
    'src_time_diff',
    'dst_time_diff',
    'fan_in_bins_4-5',
    'fan_in_bins_5-6',
    'fan_in_bins_6-7',
    'fan_in_bins_7-8',
    'fan_in_bins_8-9',
    'fan_in_bins_9-10',
    'fan_in_bins_11-12',
    'degree_out_bins_6-7',
    'degree_out_bins_8-9',
    'temp-cycle_bins_2-3',
    'temp-cycle_bins_3-4',
    'temp-cycle_bins_4-5',
    'lc-cycle_bins_2-3',
    'source_ratio_out',
    'source_min_col3_out',
    'source_kurtosis_col3_out',
    'source_sum_col4_out',
    'source_fan_in',
    'source_ratio_in',
    'source_min_col3_in',
    'dest_ratio_in',
    'dest_min_col3_in',
]
X_training = X_train.loc[:, selected_features]

In [None]:
# Share of rows labelled as laundering; the label is available, so the
# contamination level can be measured rather than guessed.
laundering_count = int((data['is_laundering'] == 1).sum())
outlier_fraction = laundering_count / len(data)
print(outlier_fraction)

### Instantiate and train model

In [None]:
# Isolation Forest hyperparameters: number of trees, fraction of rows sampled
# per tree, parallel jobs (-1) and the random seed.
trees, samples = 100, 0.1
jobs, state = -1, 42

# The measured laundering share is used as the contamination level.
model = IsolationForest(
    contamination=outlier_fraction,
    n_estimators=trees,
    max_samples=samples,
    random_state=state,
    n_jobs=jobs,
)

In [None]:
# Fit the Isolation Forest on the selected feature columns.
model.fit(X=X_training)

### Perform inference and export predictions

In [None]:
# Score the same rows the model was fitted on with the trained Isolation Forest.
scores = model.decision_function(X=X_training)

In [None]:
# Flip the sign of the Isolation Forest scores and min-max scale them to [0, 1],
# so that a higher value indicates a more anomalous transaction.
flipped = -scores
lowest, highest = flipped.min(), flipped.max()
inverted_scores = (flipped - lowest) / (highest - lowest)
scaled_scores = inverted_scores

In [None]:
# Put the training features, the labels and the scaled anomaly scores side by side.
evaluation = pd.concat([X_training, y_target], axis=1).assign(scores=scaled_scores)

In [None]:
# Use the 70th percentile of the scores as the decision threshold: rows scoring
# at or above it (roughly the top 30%) are flagged as abnormal transactions.
THRESHOLD = 0.7
threshold = evaluation['scores'].quantile(THRESHOLD)

# Vectorised comparison instead of a per-row Python loop; the result is the
# same 0/1 labelling, held as an int64 Series aligned with `evaluation`.
predictions = (evaluation['scores'] >= threshold).astype("int64")
evaluation['predictions'] = predictions

# Confusion-matrix counts computed from boolean masks rather than by
# materialising four filtered copies of the frame.
flagged = evaluation['predictions'] == 1
cleared = evaluation['predictions'] == 0
laundering = evaluation['is_laundering'] > 0
legitimate = evaluation['is_laundering'] <= 0

TP = int((flagged & laundering).sum())
FN = int((cleared & laundering).sum())
FP = int((flagged & legitimate).sum())
TN = int((cleared & legitimate).sum())

print(f"True Positives: {TP}")
print(f"False Negatives: {FN}")
print(f"False Positives: {FP}")
print(f"True Negatives: {TN}")

In [None]:
# Compute and print the result metrics for the model.
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


# Threshold-dependent metrics derived from the confusion-matrix counts. Every
# division is guarded so an empty class or zero true positives yields 0.0
# instead of raising ZeroDivisionError.
precision = _safe_ratio(TP, TP + FP)
recall = _safe_ratio(TP, TP + FN)
f1 = _safe_ratio(2 * (precision * recall), precision + recall)
specificity = _safe_ratio(TN, TN + FP)
accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
tpr = _safe_ratio(TP, TP + FN)
tnr = _safe_ratio(TN, TN + FP)
fpr = _safe_ratio(FP, FP + TN)
fnr = _safe_ratio(FN, FN + TP)

# Threshold-free metrics computed directly from the continuous scores.
average_precision = average_precision_score(evaluation['is_laundering'], evaluation['scores'])
auc = roc_auc_score(evaluation['is_laundering'], evaluation['scores'])


print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score: {f1:.2%}")
print(f"Specificity: {specificity:.2%}")
print(f"Accuracy: {accuracy:.2%}")
print(f"False Positive Rate (FPR): {fpr:.2%}")
print(f"False Negative Rate (FNR): {fnr:.2%}")
print(f"Average Precision (AP): {average_precision:.2%}")
print(f"True Positive Rate (TPR): {tpr:.2%}")
print(f"True Negative Rate (TNR): {tnr:.2%}")
print(f"AUC: {auc:.2%}")

In [None]:
# Order the evaluation rows from the highest anomaly score to the lowest.
evaluation = evaluation.sort_values(by='scores', ascending=False)

In [None]:
# Alternative labelling: flag as many top-scoring rows as the measured
# laundering share implies, and report the F1-score of that labelling.
percentage = outlier_fraction * 100
# Round to the nearest integer instead of truncating: the product is a float
# that can land just below the intended whole number, and truncation would
# then silently drop one row from the cutoff.
cutoff_index = int(round(len(evaluation) * outlier_fraction))
cutoff_score = evaluation['scores'].nlargest(cutoff_index).min()

evaluation['predicted_anomaly'] = (evaluation['scores'] >= cutoff_score).astype(int)

f1 = f1_score(evaluation['is_laundering'], evaluation['predicted_anomaly'])
print(f"F1-Score: {f1*100:.2f}%")

In [None]:
# Split the evaluation rows by label; the normal class is additionally
# down-sampled to 20% (fixed seed) for the scatter plots.
is_normal = evaluation['is_laundering'] == 0
normal_trans = evaluation[is_normal]
normal_trans_sample = normal_trans.sample(frac=0.2, random_state=42)
laundering_trans = evaluation[evaluation['is_laundering'] == 1]

In [None]:
# Scatter plot of the anomaly score against the row index, one colour per class
# (sampled normal transactions vs. all laundering transactions).
plt.figure(figsize=(10, 6))
for subset, colour, caption in (
    (normal_trans_sample, '#007191', 'Normal Transactions'),
    (laundering_trans, '#f47a00', 'Laundering Transactions'),
):
    plt.scatter(subset.index, subset['scores'], c=colour, s=20, label=caption, alpha=0.7)
plt.xlabel('Transactions', fontsize=12)
plt.ylabel('Anomaly Score', fontsize=12)
plt.legend(frameon=True, shadow=True, fontsize=10)
plt.grid(False)
plt.title(f"{DATASET}: IF scores distribution")
plt.show()

In [None]:
# Histogram of the anomaly scores per class. The two classes are drawn on twin
# y-axes sharing the score axis, so each class keeps its own frequency scale.
normal_colour, laundering_colour = '#007191', '#f47a00'
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: normal transactions.
ax1.hist(normal_trans['scores'], bins=200, alpha=0.7, label='Normal Transactions', color=normal_colour)
ax1.set_xlabel('Anomaly Score', fontsize=12)
ax1.set_ylabel('Frequency (Normal Transactions)', color=normal_colour, fontsize=12)
ax1.tick_params(axis='y', labelcolor=normal_colour)

# Right axis: laundering transactions.
ax2 = ax1.twinx()
ax2.hist(laundering_trans['scores'], bins=200, alpha=0.7, label='Laundering Transactions', color=laundering_colour)
ax2.set_ylabel('Frequency (Laundering Transactions)', color=laundering_colour, fontsize=12)
ax2.tick_params(axis='y', labelcolor=laundering_colour)
# NOTE(review): hard-coded upper limit; the combined figure later in this
# notebook uses 600 for the same histogram - confirm which one is intended.
ax2.set_ylim(0, 70)
for axis in (ax1, ax2):
    axis.grid(False)
fig.tight_layout()
fig.legend(frameon=True, shadow=True, fontsize=10)
plt.title(f"{DATASET}: IF scores distribution")
plt.show()

In [None]:
# Side-by-side figure: score scatter on the left, per-class score histograms
# (on twin y-axes) on the right.
normal_colour, laundering_colour = '#007191', '#f47a00'
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: anomaly score per row, coloured by class.
for subset, colour, caption in (
    (normal_trans_sample, normal_colour, 'Normal Transactions'),
    (laundering_trans, laundering_colour, 'Laundering Transactions'),
):
    ax1.scatter(subset.index, subset['scores'], c=colour, s=20, label=caption, alpha=0.7)
ax1.set_xlabel('Transactions', fontsize=12)
ax1.set_ylabel('Anomaly Score', fontsize=12)
ax1.legend(frameon=True, loc='upper left', fontsize=10, shadow=True)
ax1.grid(False)

# Right panel: normal transactions on the primary y-axis ...
ax3 = ax2.twinx()
ax2.hist(normal_trans['scores'], bins=200, alpha=0.7, label='Normal Transactions', color=normal_colour)
ax2.set_xlabel('Anomaly Score', fontsize=12)
ax2.set_ylabel('Frequency (Normal Transactions)', color=normal_colour, fontsize=12)
ax2.tick_params(axis='y', labelcolor=normal_colour)

# ... and laundering transactions on the secondary y-axis.
ax3.hist(laundering_trans['scores'], bins=200, alpha=0.7, label='Laundering Transactions', color=laundering_colour)
ax3.set_ylabel('Frequency (Laundering Transactions)', color=laundering_colour, fontsize=12)
ax3.tick_params(axis='y', labelcolor=laundering_colour)
ax3.set_ylim(0, 600)

# Each axis gets its own legend; the second one is shifted down slightly so
# the two boxes do not sit on top of each other.
ax2.legend(frameon=True, loc='upper right', fontsize=10, shadow=True)
ax3.legend(frameon=True, loc='upper right', bbox_to_anchor=(1, 0.95), fontsize=10, shadow=True)
for axis in (ax2, ax3):
    axis.grid(False)
fig.tight_layout()
plt.show()

In [None]:
# Draw the ROC curve of the anomaly scores against the laundering label.
# NOTE: this rebinds `fpr` and `tpr`, which the metrics cell of this notebook
# uses for scalar rates; re-run that cell before printing those metrics again.
y_true, y_score = evaluation['is_laundering'], evaluation['scores']
fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)

curve_colour = '#1f77b4'
plt.figure(figsize=(8, 6), dpi=150)
plt.plot(fpr, tpr, color=curve_colour, lw=2, label=f'AUC = {roc_auc:.2f}')
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='gray', label='Chance Level')
plt.fill_between(fpr, tpr, alpha=0.2, color=curve_colour)
plt.title(f"{DATASET}: AUC-ROC Curve")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FP Rate)', fontsize=12)
plt.ylabel('True Positive Rate (TP Rate)', fontsize=12)
plt.legend(loc="lower right", frameon=True, shadow=True, fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Write the transaction id, label and Isolation Forest score of every row to
# CSV so the GNN-based model can consume them.
output_path = f"../datasets/synthetic/04_if_output/{DATASET}_if_scores.csv"
to_save = evaluation.loc[:, ['transaction_id', 'is_laundering', 'scores']]
to_save.to_csv(output_path, index=False)