## Final Evaluation

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline
# Notebook-wide configuration: which dataset and which GNN model are post-processed.
DATASET = "HI-Small"    ## either HI-Small or LI-Small
MODEL = "GAE"           ## either GAE, OCGNN or CoLA (spelled as in the "<dataset>_CoLA_..." file names read further down)

In [18]:
# Input locations for the selected dataset/model.
# The score file follows MODEL so that the scores being post-processed are the
# ones later written to f"{DATASET}_{MODEL}_final_score.csv" (a hard-coded model
# name here would save one model's scores under another model's file name).
scores_location = f"../results/synthetic/{DATASET}_{MODEL}_100_epochs.csv"
transactions_location = f"../datasets/synthetic/02_preprocessed/{DATASET}-transactions.parquet"
pattern_location = f"../datasets/synthetic/02_preprocessed/{DATASET}-patterns.csv"

In [19]:
# Load the three inputs of the evaluation:
#   - transactions: the preprocessed transaction table
#   - patterns:     the money laundering cases (patterns file)
#   - scores:       the anomaly scores produced by the GNN model
transactions = pd.read_parquet(transactions_location)
patterns = pd.read_csv(pattern_location)
scores = pd.read_csv(scores_location)

In [20]:
# Cast every identifier column used as a join key to str so the merges below
# compare like with like.
# NOTE(review): transactions["transaction_id"] is not cast here; presumably it is
# already stored as a string in the parquet file - confirm.
for frame, id_column in ((scores, "id"), (patterns, "id"), (patterns, "transaction_id")):
    frame[id_column] = frame[id_column].astype(str)

In [21]:
# Rescale the raw GNN scores to [0, 1]: learn min/max first, then apply the scaling
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(scores[['score']])
scores['score_min_max_norm'] = min_max_scaler.transform(scores[['score']])

In [22]:
# Attach the transaction attributes to every scored row (left join keeps all scores),
# then drop the now redundant "id" key in favour of "transaction_id"
scores_with_transactions = scores.merge(
    transactions,
    how="left",
    left_on="id",
    right_on="transaction_id",
)
scores = scores_with_transactions.drop(columns=["id"])

In [24]:
# Keep only the pattern columns needed for labelling; the pattern's own "id"
# becomes "pattern_id" so it cannot be confused with a transaction id
patterns_ids = (
    patterns[['transaction_id', 'id', 'type', 'sub_type']]
    .rename(columns={'id': 'pattern_id'})
)

In [None]:
# Label each scored transaction with the pattern it belongs to (if any).
# Rows without a match get -1 in the pattern columns; note that fillna(-1) applies
# to every column of the merged frame, not only to the pattern columns.
scores = (
    scores
    .merge(patterns_ids, on="transaction_id", how="left")
    .fillna(-1)
    .rename(columns={'score': 'model_score'})
)

#### Select the dataframe for evaluation

In [26]:
# Columns carried into the evaluation table (independent copy of the merged scores)
evaluation_columns = [
    'transaction_id', 'amount', 'pattern_id', 'type', 'sub_type',
    'model_score', 'score_min_max_norm',
]
table_eval = scores.loc[:, evaluation_columns].copy()

In [27]:
# Ground-truth label: 1 when the transaction belongs to a pattern, 0 when pattern_id
# holds the -1 placeholder
table_eval['is_pattern'] = table_eval['pattern_id'].ne(-1).astype('int64')

In [28]:
# Use the Isolation Forest (IF) scores to pick the evaluation subset:
# scores below the cutoff quantile are "normal", scores at or above it are "anomalous".

if_scores_location = f"../datasets/synthetic/04_if_output/{DATASET}_if_scores.csv"
trans_location = f"../datasets/synthetic/02_preprocessed/{DATASET}-transactions.parquet"

if_scores = pd.read_csv(if_scores_location)
if_scores['transaction_id'] = if_scores['transaction_id'].astype(str)

normal_percentage = 70
threshold = if_scores['scores'].quantile(normal_percentage / 100)
is_below_cutoff = if_scores['scores'] < threshold
is_at_or_above_cutoff = if_scores['scores'] >= threshold
normal_ids_set = set(if_scores.loc[is_below_cutoff, 'transaction_id'].values)
anomalous_ids_set = set(if_scores.loc[is_at_or_above_cutoff, 'transaction_id'].values)

# Transactions labelled as laundering that the cutoff placed in the normal set
# are moved over to the anomalous set.
transactions = pd.read_parquet(trans_location)
is_laundering = transactions['is_laundering'] == 1
real_laundering_ids = list(transactions.loc[is_laundering, 'transaction_id'].values)
real_laundering_ids_set = set(real_laundering_ids)

ids_to_remove = normal_ids_set & real_laundering_ids_set
normal_ids_set -= ids_to_remove
anomalous_ids_set |= ids_to_remove

normal_ids = list(normal_ids_set)
anomalous_ids = list(anomalous_ids_set)

In [29]:
# Focus the evaluation only on the set of abnormal transactions.
# .copy() detaches the filtered rows from the frame they were selected from, so the
# columns added to table_eval in the following cells are written to an independent
# frame instead of a slice (which raises pandas' SettingWithCopyWarning).
table_eval = table_eval[table_eval['transaction_id'].isin(anomalous_ids)].copy()

In [None]:
# Min-max scale the transaction amounts of the evaluation subset to [0, 1]
scaler_amount = MinMaxScaler(feature_range=(0, 1))
amount_column = table_eval[['amount']]
table_eval['amount_norm'] = scaler_amount.fit_transform(amount_column)

### Initial Combined Score: 
$ c_i = s_i^{\textit{norm}} \times a_i^{\textit{norm}} $

where $s_i^{\textit{norm}}$ is the min-max normalized model score of transaction $i$ and $a_i^{\textit{norm}}$ is its min-max normalized amount.

In [30]:
# Initial combined score: normalized model score times normalized amount
table_eval['combined_score'] = table_eval['score_min_max_norm'].mul(table_eval['amount_norm'])

### Final Anomaly Score
1. Weight function to target specific amounts, applied to each amount $a_i$

    $w(a_i) = \frac{1}{1 + e^{k(a_i - \text{threshold})}}$

    where $k$ is a parameter controlling the steepness of the weight decay and $\text{threshold}$ is the amount threshold we are interested in detecting.
    
2. Compute anomaly score:

    $AnomalyScore_i = c_i \times w(a_i)$

In [None]:
def weight_function(amount, threshold=35_000, k=0.0001):
    """Logistic weight that decays as the amount grows past ``threshold``.

    Computes ``w(a) = 1 / (1 + exp(k * (a - threshold)))``: the weight is 0.5 at
    ``amount == threshold``, tends to 1 below it and to 0 above it.

    Parameters
    ----------
    amount : scalar, numpy array or pandas Series
        Transaction amount(s); the result has the same shape/type family.
    threshold : float, default 35_000
        Amount at which the weight equals 0.5.
    k : float, default 0.0001
        Steepness of the decay around ``threshold``.

    Returns
    -------
    Weight(s) in the interval [0, 1].
    """
    exponent = k * (amount - threshold)
    # 1 / (1 + e^x) == exp(-log(1 + e^x)) == exp(-logaddexp(0, x)).
    # This form never evaluates e^x for large positive x, so very large amounts
    # produce a weight of 0 without a floating point overflow warning.
    return np.exp(-np.logaddexp(0, exponent))

# Weight of every transaction according to its raw (un-normalized) amount
table_eval['amount_weight'] = weight_function(table_eval['amount'])

# Final anomaly score: normalized model score x normalized amount x amount weight
# (overwrites the initial combined score)
table_eval['combined_score'] = (
    table_eval['score_min_max_norm']
    .mul(table_eval['amount_norm'])
    .mul(table_eval['amount_weight'])
)

# AUC-ROC of the final anomaly score against the pattern labels
auc = roc_auc_score(y_true=table_eval['is_pattern'], y_score=table_eval['combined_score'])
print(f"AUC-ROC with adjusted weights: {auc:.4f}")

### Print Results and Metrics

In [None]:
# Persist label and final score of every evaluated transaction; the comparison
# cells below read these files back, one per dataset/model pair
table_eval_to_save = table_eval.loc[:, ['is_pattern', 'combined_score']]
final_score_file = f"{DATASET}_{MODEL}_final_score.csv"
table_eval_to_save.to_csv(final_score_file, index=False)

In [None]:
# Plot confusion matrices for all models, with the datasets side by side.
# Reads the "<dataset>_<model>_final_score.csv" files written by the save cell.

DATASETS = ["HI-Small", "LI-Small"]
MODELS = ["GAE", "OCGNN", "CoLA"]
TOP_SCORE_QUANTILE = 0.7   # scores at or above this quantile (the top 30%) are predicted positive

for model in MODELS:
    score_column = f'{model}_score'

    # One figure per model, one panel per dataset
    fig, axes = plt.subplots(1, len(DATASETS), figsize=(14, 6), squeeze=False)

    for dataset, ax in zip(DATASETS, axes.ravel()):
        score_table = pd.read_csv(f"{dataset}_{model}_final_score.csv")
        score_table = score_table.rename(columns={'combined_score': score_column})

        # Assign predictions based on the quantile threshold
        cutoff = score_table[score_column].quantile(TOP_SCORE_QUANTILE)
        score_table['predicted'] = (score_table[score_column] >= cutoff).astype(int)

        # labels=[1, 0] orders the classes positive-first, which yields the layout
        # [[TP, FN], [FP, TN]] directly and keeps the matrix 2x2 even when one of
        # the classes does not occur in the data.
        matrix = confusion_matrix(score_table['is_pattern'], score_table['predicted'], labels=[1, 0])

        sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues",
                    xticklabels=["Predicted Positive", "Predicted Negative"],
                    yticklabels=["Actual Positive", "Actual Negative"],
                    ax=ax)
        ax.set_title(f"{dataset} Dataset - {model} Model - Confusion Matrix")
        ax.set_xlabel("Predicted Labels")
        ax.set_ylabel("Actual Labels")

    plt.tight_layout()
    plt.show()

In [34]:
# Read the written final scores of the three models for one dataset

DATASET = "HI-Small"

gae_location = f"{DATASET}_GAE_final_score.csv"
ocgnn_location = f"{DATASET}_OCGNN_final_score.csv"
cola_location = f"{DATASET}_CoLA_final_score.csv"

gae_table = pd.read_csv(gae_location)
ocgnn_table = pd.read_csv(ocgnn_location)
cola_table = pd.read_csv(cola_location)

# Give each score column a model-specific name. The CoLA column is spelled
# 'CoLA_score' because that is the name the ROC comparison cell looks up.
gae_table.rename(columns={'combined_score': 'GAE_score'}, inplace=True)
ocgnn_table.rename(columns={'combined_score': 'OCGNN_score'}, inplace=True)
cola_table.rename(columns={'combined_score': 'CoLA_score'}, inplace=True)

In [None]:
# ROC curves of the three models on the dataset selected by DATASET

gae_labels, gae_values = gae_table['is_pattern'], gae_table['GAE_score']
fpr_GAE, tpr_GAE, _ = roc_curve(gae_labels, gae_values)
auc_GAE = roc_auc_score(gae_labels, gae_values)

ocgnn_labels, ocgnn_values = ocgnn_table['is_pattern'], ocgnn_table['OCGNN_score']
fpr_OCGNN, tpr_OCGNN, _ = roc_curve(ocgnn_labels, ocgnn_values)
auc_OCGNN = roc_auc_score(ocgnn_labels, ocgnn_values)

cola_labels, cola_values = cola_table['is_pattern'], cola_table['CoLA_score']
fpr_CoLA, tpr_CoLA, _ = roc_curve(cola_labels, cola_values)
auc_CoLA = roc_auc_score(cola_labels, cola_values)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_GAE, tpr_GAE, label=f'GAE (AUC = {auc_GAE:.2f})')
ax.plot(fpr_OCGNN, tpr_OCGNN, label=f'OCGNN (AUC = {auc_OCGNN:.2f})')
ax.plot(fpr_CoLA, tpr_CoLA, label=f'CoLA (AUC = {auc_CoLA:.2f})')

# Diagonal reference line of a random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random Chance')

ax.set_xlabel('False Positive Rate (FP Rate)')
ax.set_ylabel('True Positive Rate (TP Rate)')
ax.set_title(f'{DATASET} Dataset AUC-ROC Curve Comparison for Different Models')
ax.legend(loc='lower right')
fig.savefig(f"{DATASET}_auc.png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Threshold-based metrics: the top_n highest final scores are predicted as anomalies

top_n = 100_000

table_eval = table_eval.sort_values(by='combined_score', ascending=False)

# After sorting, the first top_n positions hold the highest scores
table_eval['predicted_label'] = (np.arange(len(table_eval)) < top_n).astype('int64')

y_true = table_eval['is_pattern']
y_pred = table_eval['predicted_label']

# Rows are the actual classes (0, 1), columns the predicted classes (0, 1)
(tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred)

print("Confusion Matrix:")
print(f"TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")

confusion_df = pd.DataFrame(
    [[tn, fp], [fn, tp]],
    index=['Actual Normal', 'Actual Anomaly'],
    columns=['Predicted Normal', 'Predicted Anomaly'],
)
print(confusion_df)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

# Report every metric as a percentage with two decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value*100:.2f}%")

In [None]:
# ROC curve and AUC of the final anomaly score on the evaluation subset
y_true = table_eval['is_pattern']
y_scores = table_eval['combined_score']

fpr, tpr, thresholds = roc_curve(y_true, y_scores)

# Use roc_auc_score, the AUC helper this notebook imports. The name `auc` is not
# an imported function here: an earlier cell binds it to the float AUC value, so
# calling auc(fpr, tpr) would fail.
roc_auc = roc_auc_score(y_true, y_scores)
print(f"AUC-ROC with adjusted weights: {roc_auc:.4f}")

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')

# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Locations of the raw (not post-processed) scores of the three GNN based models

DATASET = "HI-Small"
gae_scores_loc = f"../results/synthetic/{DATASET}_GAE_100_epochs.csv"
ocgnn_scores_loc = f"../results/synthetic/{DATASET}_OCGNN_100_epochs.csv"
cola_scores_loc = f"../results/synthetic/{DATASET}_CoLA_100_epochs.csv"

In [None]:
# Load the raw score file of each model, in the order GAE, OCGNN, CoLA
gae_scores, ocgnn_scores, cola_scores = (
    pd.read_csv(location)
    for location in (gae_scores_loc, ocgnn_scores_loc, cola_scores_loc)
)

In [None]:
# Rebuild the inference subset from the Isolation Forest (IF) scores of DATASET
if_scores_location = f"../datasets/synthetic/04_if_output/{DATASET}_if_scores.csv"
trans_location = f"../datasets/synthetic/02_preprocessed/{DATASET}-transactions.parquet"

if_scores = pd.read_csv(if_scores_location)
if_scores['transaction_id'] = if_scores['transaction_id'].astype(str)

# Scores below the cutoff quantile are "normal", the remaining ones "anomalous"
normal_percentage = 70
threshold = if_scores['scores'].quantile(normal_percentage / 100)
normal_ids_set = set(if_scores.loc[if_scores['scores'] < threshold, 'transaction_id'].values)
anomalous_ids_set = set(if_scores.loc[if_scores['scores'] >= threshold, 'transaction_id'].values)

# Transactions labelled as laundering always belong to the anomalous set
transactions = pd.read_parquet(trans_location)
laundering_rows = transactions['is_laundering'] == 1
real_laundering_ids = list(transactions.loc[laundering_rows, 'transaction_id'].values)
real_laundering_ids_set = set(real_laundering_ids)

ids_to_remove = normal_ids_set & real_laundering_ids_set
normal_ids_set = normal_ids_set - ids_to_remove
anomalous_ids_set = anomalous_ids_set | ids_to_remove

normal_ids = list(normal_ids_set)
anomalous_ids = list(anomalous_ids_set)

In [None]:
# The anomalous ids are strings, so the id column of every score table is cast to str too
for model_scores in (gae_scores, ocgnn_scores, cola_scores):
    model_scores["id"] = model_scores["id"].astype(str)

In [None]:
# Keep only the scores of the transactions in the anomalous (inference) subset.
# .copy() makes each result an independent frame, so adding the "norm_score"
# column afterwards does not write to a slice (SettingWithCopyWarning).
gae_scores = gae_scores[gae_scores['id'].isin(anomalous_ids)].copy()
ocgnn_scores = ocgnn_scores[ocgnn_scores['id'].isin(anomalous_ids)].copy()
cola_scores = cola_scores[cola_scores['id'].isin(anomalous_ids)].copy()

In [None]:
# Min-max scale each model's scores to [0, 1]; the single scaler object is re-fitted
# for every table, so each model is normalized on its own range
scaler = MinMaxScaler(feature_range=(0, 1))
for model_scores in (gae_scores, ocgnn_scores, cola_scores):
    model_scores["norm_score"] = scaler.fit_transform(model_scores[["score"]])

In [None]:
# Overlaid density histograms of the normalized raw scores (before post-processing),
# one semi-transparent layer per GNN model

fig, ax = plt.subplots(figsize=(12, 6))

histogram_layers = (
    (gae_scores, 'blue', 'GAE model scores'),
    (ocgnn_scores, 'green', 'OCGNN model scores'),
    (cola_scores, 'red', 'CoLA model scores'),
)
for model_scores, colour, legend_label in histogram_layers:
    sns.histplot(model_scores['norm_score'], bins=50, kde=False, color=colour,
                 label=legend_label, stat='density', alpha=0.5, ax=ax)

ax.set_xlabel('Normalized Score')
ax.set_ylabel('Density')
ax.set_title(f'{DATASET} - Raw Scores Distribution on the Inference Subset')
ax.legend()
fig.savefig(f"{DATASET}_scores.png", dpi=300, bbox_inches='tight')

plt.show()