# Visualisation Results

This notebook analyses the results obtained with the GARG-AML scores, AutoAudit and FlowScope.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Keys of the detection methods compared in this notebook
# (suffix `_u` = undirected variant, `_d` = directed variant).
models = (
    ['flowscope', 'autoaudit']
    + ['gargaml_u', 'gargaml_d']
    + ['gargaml_tree_u', 'gargaml_boost_u']
    + ['gargaml_tree_d', 'gargaml_boost_d']
)

# Row labels under which the results are stored in the result files.
patterns = ['laundering', 'separate', 'new_mules', 'existing_mules']

In [None]:
# Parameter grid of the synthetic graphs.
n_nodes_list = [100, 10000, 100000]  # Number of nodes in the graph
m_edges_list = [1, 2, 5]  # Number of edges to attach from a new node to existing nodes
p_edges_list = [0.001, 0.01]  # Probability of adding an edge between two nodes
generation_method_list = ['Barabasi-Albert', 'Erdos-Renyi', 'Watts-Strogatz']  # Generation method for the graph
n_patterns_list = [3, 5]  # Number of smurfing patterns to add

# Dataset names have the form
# synthetic_<method>_<n_nodes>_<m_edges>_<p_edges>_<n_patterns>.
datasets = []
for n_nodes in n_nodes_list:
    for n_patterns in n_patterns_list:
        # Only keep grid points whose pattern count is at most 6% of the node count.
        if n_patterns > 0.06 * n_nodes:
            continue
        for generation_method in generation_method_list:
            if generation_method == 'Barabasi-Albert':
                # No edge probability for this method: encoded as 0 in the name.
                edge_settings = [(m, 0) for m in m_edges_list]
            elif generation_method == 'Erdos-Renyi':
                # No attachment count for this method: encoded as 0 in the name.
                edge_settings = [(0, p) for p in p_edges_list]
            elif generation_method == 'Watts-Strogatz':
                edge_settings = [(m, p) for m in m_edges_list for p in p_edges_list]
            else:
                edge_settings = []
            for m_edges, p_edges in edge_settings:
                datasets.append(
                    f'synthetic_{generation_method}_{n_nodes}_{m_edges}_{p_edges}_{n_patterns}'
                )

In [None]:
# Quick look at the combined AutoAudit results; the unnamed first CSV column
# is renamed to 'pattern'.
file = '../results-aa/synthetic_autoaudit_combined.csv'
df = pd.read_csv(file).rename(columns={'Unnamed: 0': 'pattern'})
df.head()

In [None]:
# Quick look at the undirected tree results for 3 patterns; the unnamed first
# CSV column is renamed to 'pattern'.
file = '../results-0/synthetic_tree_False_3.csv'
df = pd.read_csv(file).rename(columns={'Unnamed: 0': 'pattern'})
df.head()

In [None]:
# Parse the text stored for the 'laundering' row of the first dataset column.
# NOTE(review): eval() executes whatever the CSV cell contains; only use it on
# result files you produced yourself.
eval(df.loc[df['pattern'] == 'laundering', datasets[0]].values[0])

In [None]:
def gargaml_results(directed):
    """Load the supervised GARG-AML performance results from a text file.

    Every non-blank line must contain a colon. The first whitespace-separated
    token before the first colon is used as key, and the text after that colon
    is evaluated as a Python expression to obtain the value.

    Parameters
    ----------
    directed : bool
        If truthy, read the directed results file; otherwise the undirected one.

    Returns
    -------
    dict
        Maps each key to the evaluated value of its line. When a key occurs
        more than once, the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line has no colon or no key before the colon.
    """
    if directed:
        file = '../results-0/results_performance_directed_supervised.txt'
    else:
        file = '../results-0/results_performance_undirected_supervised.txt'
    with open(file, 'r') as f:
        lines = f.readlines()

    results_gargaml_dict = {}
    for line_number, line in enumerate(lines, start=1):
        # Blank lines (e.g. a trailing empty line) carry no result.
        if not line.strip():
            continue
        key_part, separator, value_part = line.partition(':')
        key_tokens = key_part.split()
        if not separator or not key_tokens:
            raise ValueError(
                f"{file}, line {line_number}: expected '<name>: <results>', got {line!r}"
            )
        # NOTE(review): eval() executes the text read from the file; only use
        # this loader on trusted result files.
        results_gargaml_dict[key_tokens[0]] = eval(value_part.strip())
    return results_gargaml_dict


In [None]:
# Load the raw result tables of every method.
results_flowscope = pd.read_csv('../results-0/results_flowscope.csv')
results_autoaudit = pd.read_csv('../results-aa/synthetic_autoaudit_combined.csv')

results_gargaml_undir = gargaml_results(directed=False)
results_gargaml_dir = gargaml_results(directed=True)

# The undirected tree results are stored in two files (suffix 3 and 5);
# they are joined on the unnamed first column.
results_gargaml_tree_undir_3, results_gargaml_tree_undir_5 = (
    pd.read_csv(path)
    for path in (
        '../results-0/synthetic_tree_False_3.csv',
        '../results-0/synthetic_tree_False_5.csv',
    )
)
results_gargaml_tree_undir = pd.merge(
    results_gargaml_tree_undir_3,
    results_gargaml_tree_undir_5,
    on='Unnamed: 0',
)
results_gargaml_tree_dir = pd.read_csv('../results-0/synthetic_tree_True.csv')

In [None]:
# Inspect the dictionary returned by gargaml_results(True).
results_gargaml_dir

In [None]:
# Inspect the 'laundering' entry of the first dataset in the undirected results.
results_gargaml_undir[datasets[0]]['laundering']

In [None]:
# First rows of the FlowScope results table.
results_flowscope.head()

In [None]:
def _blank_scores():
    """Return a fresh score record with both metrics initialised to 0."""
    return {'AUC-ROC': 0, 'AUC-PR': 0}


# Nested container results_dict[dataset][model][pattern] -> score record.
# Every leaf is its own dict, so entries can be filled in independently.
results_dict = {
    dataset: {
        model: {pattern: _blank_scores() for pattern in patterns}
        for model in models
    }
    for dataset in datasets
}

In [None]:
def _cell_text(table, pattern, dataset):
    """Return the raw value stored in `table` for row `pattern` and column `dataset`."""
    return table.loc[table['Unnamed: 0'] == pattern, dataset].values[0]


def _scores_from_pair_table(table, pattern, dataset):
    """Parse a table cell that holds an (AUC-ROC, AUC-PR) pair."""
    # NOTE(review): eval() executes the cell text; only use on trusted result files.
    roc, pr = tuple(eval(_cell_text(table, pattern, dataset)))
    return roc, pr


def _scores_from_mapping(results, pattern, dataset):
    """Read an (AUC-ROC, AUC-PR) pair from results[dataset][pattern]."""
    roc, pr = tuple(results[dataset][pattern])
    return roc, pr


def _scores_from_learner_table(table, learner, pattern, dataset):
    """Parse a table cell holding a dict per learner and return that learner's AUCs."""
    # NOTE(review): eval() executes the cell text; only use on trusted result files.
    learner_scores = eval(_cell_text(table, pattern, dataset))[learner]
    return learner_scores['AUC_ROC'], learner_scores['AUC_PR']


# model key -> (reader(pattern, dataset), zero_if_unavailable).
# FlowScope and AutoAudit results are required: a read error is raised.
# For the GARG-AML variants an unreadable result is replaced by 0.
score_readers = {
    'flowscope': (lambda p, d: _scores_from_pair_table(results_flowscope, p, d), False),
    'autoaudit': (lambda p, d: _scores_from_pair_table(results_autoaudit, p, d), False),
    'gargaml_u': (lambda p, d: _scores_from_mapping(results_gargaml_undir, p, d), True),
    'gargaml_d': (lambda p, d: _scores_from_mapping(results_gargaml_dir, p, d), True),
    'gargaml_tree_u': (lambda p, d: _scores_from_learner_table(results_gargaml_tree_undir, 'tree', p, d), True),
    'gargaml_tree_d': (lambda p, d: _scores_from_learner_table(results_gargaml_tree_dir, 'tree', p, d), True),
    'gargaml_boost_u': (lambda p, d: _scores_from_learner_table(results_gargaml_tree_undir, 'boosting', p, d), True),
    'gargaml_boost_d': (lambda p, d: _scores_from_learner_table(results_gargaml_tree_dir, 'boosting', p, d), True),
}

# (dataset, model, pattern) combinations that were replaced by 0.
missing_results = []

for dataset in datasets:
    for model in models:
        if model not in score_readers:
            continue  # unknown model keys keep their initial zeros
        read_scores, zero_if_unavailable = score_readers[model]
        for pattern in patterns:
            try:
                ROC, PR = read_scores(pattern, dataset)
            except Exception:
                if not zero_if_unavailable:
                    raise
                ROC = PR = 0
                missing_results.append((dataset, model, pattern))
            results_dict[dataset][model][pattern]['AUC-ROC'] = ROC
            results_dict[dataset][model][pattern]['AUC-PR'] = PR

# Make the zero-filling visible: these zeros enter every later plot and test.
if missing_results:
    print(f"{len(missing_results)} (dataset, model, pattern) results could not be read and were set to 0.")


In [None]:
# First rows of the merged undirected tree results.
results_gargaml_tree_undir.head()

In [None]:
# Flatten results_dict into long format: one row per
# (dataset, method, pattern, metric), AUC-ROC before AUC-PR.
data = [
    {
        'dataset': dataset_name,
        'method': method_name,
        'pattern': pattern_name,
        'performance_metric': metric_name,
        'performance': metrics[metric_name],
    }
    for dataset_name, per_method in results_dict.items()
    for method_name, per_pattern in per_method.items()
    for pattern_name, metrics in per_pattern.items()
    for metric_name in ('AUC-ROC', 'AUC-PR')
]
df = pd.DataFrame(data)
df.head(10)

In [None]:
import seaborn as sns

In [None]:
# Legend text per method key. Looking the label up per legend entry keeps
# handles and labels aligned for any number of methods; unknown keys are
# shown unchanged.
boxplot_method_labels = {
    'flowscope': 'FlowScope',
    'autoaudit': 'AutoAudit',
    'gargaml_u': 'GARG-AML',
    'gargaml_d': 'GARG-AML Dir.',
    'gargaml_tree_u': 'GARG-AML+Tree',
    'gargaml_boost_u': 'GARG-AML Boost Undir.',
    'gargaml_tree_d': 'GARG-AML Tree Dir.',
    'gargaml_boost_d': 'GARG-AML Boost Dir.',
}
mean_marker = {'marker': '*', 'markerfacecolor': 'xkcd:steel', 'markeredgecolor': '.3', 'markersize': 10}
# The leading underscore in the label keeps the median lines out of the legend.
median_line = {'color': 'black', 'linewidth': 3, 'label': '_median_'}

fig, axes = plt.subplots(2, 2, figsize=(15, 6))
for i in range(2):
    for j in range(2):
        pattern = patterns[i+2*j]
        ax = axes[i, j]

        sns.boxplot(
            ax=ax,
            x='performance_metric',
            y='performance',
            hue='method',
            data=df[df['pattern'] == pattern],
            palette='tab10',
            showmeans=True,
            meanprops=dict(mean_marker),
            medianprops=dict(median_line),
        )
        ax.set_title('Performance for the different methods - Pattern: ' + pattern)
        ax.set_xlabel('Performance Metric')
        ax.set_ylabel('Value')
        ax.grid(True, which="both", ls="--", c='gray', alpha=0.3)

        # Relabel the legend entries with the display names.
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(
            handles=handles,
            labels=[boxplot_method_labels.get(label, label) for label in labels],
            title='Method',
            bbox_to_anchor=(1.0, 1),
            loc='upper left',
        )

plt.tight_layout()

plt.savefig('../results/boxplot_performance_full.pdf')

In [None]:
# One 2x2 figure per graph size: performance per method, one panel per pattern.
mean_marker_style = {'marker': '*', 'markerfacecolor': 'xkcd:steel', 'markeredgecolor': '.3', 'markersize': 10}
# The leading underscore in the label keeps the median lines out of the legend.
median_line_style = {'color': 'black', 'linewidth': 3, 'label': '_median_'}

for n_nodes in n_nodes_list:
    fig, axes = plt.subplots(2, 2, figsize=(15, 6))
    # Datasets of this size carry '_<n_nodes>_' in their name.
    has_size = df['dataset'].str.contains('_'+str(n_nodes)+'_')
    for i in range(2):
        for j in range(2):
            pattern = patterns[i+2*j]
            panel = axes[i, j]

            sns.boxplot(
                ax=panel,
                x='performance_metric',
                y='performance',
                hue='method',
                data=df[(df['pattern'] == pattern) & has_size],
                palette='tab10',
                showmeans=True,
                meanprops=dict(mean_marker_style),
                medianprops=dict(median_line_style),
            )
            panel.set_title('Performance for the different methods - Pattern: ' + pattern)
            panel.set_xlabel('Performance Metric')
            panel.set_ylabel('Value')
            panel.legend(title='method', bbox_to_anchor=(1.0, 1), loc='upper left')
            panel.grid(True, which="both", ls="--", c='gray', alpha=0.3)

    fig.suptitle(f'Performance Analysis for #Nodes: {n_nodes}', fontsize=16)
    plt.tight_layout()  # Adjust layout to fit the global title

    plt.savefig(f'../results/boxplot_performance_{n_nodes}_full.pdf')

In [None]:
def give_order(models, pattern, dataset):
    """Order the models from best to worst score for one pattern and dataset.

    Scores are read from the global `results_dict`. Models with equal scores
    keep their relative order from `models`.

    Parameters
    ----------
    models : list
        Model keys to order.
    pattern : str
        Pattern key in `results_dict`.
    dataset : str
        Dataset key in `results_dict`.

    Returns
    -------
    tuple of (list, list)
        Two one-element lists: the first wraps the ordering by AUC-ROC,
        the second the ordering by AUC-PR.
    """
    def best_first(metric):
        # Highest score first; sorted() is stable, so ties keep input order.
        return sorted(
            models,
            key=lambda name: results_dict[dataset][name][pattern][metric],
            reverse=True,
        )

    return [best_first('AUC-ROC')], [best_first('AUC-PR')]

In [None]:
# Axis text per method key. The names are mapped before plotting, so the tick
# labels always match the plotted categories, whatever the number of methods.
# Unknown keys are shown unchanged.
rank_plot_method_labels = {
    'flowscope': 'FlowScope',
    'autoaudit': 'AutoAudit',
    'gargaml_u': 'GARG-AML',
    'gargaml_d': 'GARG-AML Dir.',
    'gargaml_tree_u': 'GARG-AML+Tree',
    'gargaml_boost_u': 'GARG-AML Boost Undir.',
    'gargaml_tree_d': 'GARG-AML Tree Dir.',
    'gargaml_boost_d': 'GARG-AML Boost Dir.',
}

fig, axes = plt.subplots(2, 2, figsize=(15, 6))

for idx, pattern in enumerate(patterns):
    ax = axes[idx//2, idx%2]

    # One best-to-worst ordering of the models per dataset and metric.
    sorted_models_ROC_list = []
    sorted_models_PR_list = []
    for dataset in datasets:
        order_ROC, order_PR = give_order(models, pattern, dataset)
        sorted_models_ROC_list.extend(order_ROC)
        sorted_models_PR_list.extend(order_PR)

    # Rank of a model = 1-based position in the ordering.
    combined_data = [
        {
            'Method': rank_plot_method_labels.get(model, model),
            'Rank': ordering.index(model) + 1,
            'Metric': metric,
        }
        for model in models
        for metric, orderings in (('AUC-ROC', sorted_models_ROC_list), ('AUC-PR', sorted_models_PR_list))
        for ordering in orderings
    ]
    combined_df = pd.DataFrame(combined_data)

    # Plot the boxplot
    sns.boxplot(
        ax=ax,
        x='Method',
        y='Rank',
        hue='Metric',
        data=combined_df,
        palette='tab10',
        showmeans=True,
        medianprops={'color': 'black', 'linewidth': 3, 'label': '_median_'},
        meanprops={'marker': '*', 'markerfacecolor': 'xkcd:steel', 'markeredgecolor': '.3', 'markersize': 10},
    )
    ax.set_title('Rank of the methods - Pattern: ' + pattern)
    ax.set_xlabel('Method')
    ax.set_ylabel('Rank')
    ax.legend(title='Metric', bbox_to_anchor=(1.0, 1), loc='upper left')
    ax.grid(True, which="both", ls="--", c='gray', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/boxplot_rank_full.pdf')

    

In [None]:
# One 2x2 figure per graph size: rank of every method, one panel per pattern.
for n_nodes in n_nodes_list:
    # The node count is the third '_'-separated field of the dataset name.
    datasets_nodes = [dataset for dataset in datasets if dataset.split('_')[2] == str(n_nodes)]
    fig, axes = plt.subplots(2, 2, figsize=(15, 6))

    for idx, pattern in enumerate(patterns):
        ax = axes[idx//2, idx%2]

        # One best-to-worst ordering of the models per dataset and metric.
        sorted_models_ROC_list = []
        sorted_models_PR_list = []
        for dataset in datasets_nodes:
            order_ROC, order_PR = give_order(models, pattern, dataset)
            sorted_models_ROC_list.extend(order_ROC)
            sorted_models_PR_list.extend(order_PR)

        # Rank of a model = 1-based position in the ordering.
        combined_data = [
            {'Method': model, 'Rank': ordering.index(model) + 1, 'Metric': metric}
            for model in models
            for metric, orderings in (('AUC-ROC', sorted_models_ROC_list), ('AUC-PR', sorted_models_PR_list))
            for ordering in orderings
        ]
        combined_df = pd.DataFrame(combined_data)

        # Plot the boxplot
        sns.boxplot(
            ax=ax,
            x='Method',
            y='Rank',
            hue='Metric',
            data=combined_df,
            palette='tab10',
            showmeans=True,
            medianprops={'color': 'black', 'linewidth': 3, 'label': '_median_'},
            meanprops={'marker': '*', 'markerfacecolor': 'xkcd:steel', 'markeredgecolor': '.3', 'markersize': 10},
        )
        ax.set_title('Rank of the methods - Pattern: ' + pattern)
        ax.set_xlabel('Method')
        ax.set_ylabel('Rank')
        ax.legend(title='Metric', bbox_to_anchor=(1.0, 1), loc='upper left')
        ax.grid(True, which="both", ls="--", c='gray', alpha=0.3)

    fig.suptitle(f'Rank Analysis for #Nodes: {n_nodes}', fontsize=16)
    plt.tight_layout()
    # Saved next to the other figures of this notebook.
    plt.savefig('../results/boxplot_rank_'+str(n_nodes)+'_full.pdf')


## Statistical test of the ranks

Two statistical tests are applied. First, the Friedman test is used to test whether there are statistically significant differences in the mean ranks of the methods. If there are, we apply the post-hoc Nemenyi test to compare the methods pairwise and to construct the critical difference diagrams.

In [None]:
import numpy as np
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp
import matplotlib.pyplot as plt

### Friedman Test

In [None]:
# Toy illustration of the procedure: scores of 4 classifiers (columns)
# on 6 datasets (rows).
example_models = ['Model A', 'Model B', 'Model C', 'Model D']
scores = np.array([
    [0.85, 0.80, 0.82, 0.88],
    [0.83, 0.79, 0.84, 0.85],
    [0.89, 0.88, 0.86, 0.90],
    [0.78, 0.76, 0.80, 0.79],
    [0.92, 0.91, 0.89, 0.94],
    [0.84, 0.82, 0.83, 0.87],
])

# Friedman test: one sample (column of scores) per classifier.
stat, p = friedmanchisquare(*scores.T)
print(f"Friedman test statistic: {stat:.4f}, p-value: {p:.4f}")

if not (p < 0.05):
    print("No statistically significant differences found — skipping post-hoc analysis.")
else:
    print("Significant differences found — proceeding with Critical Distance Diagram")

    # Rank per dataset via a double argsort on the negated scores
    # (1 = highest score).
    ranks = 1 + np.argsort(np.argsort(-scores, axis=1), axis=1)

    # Average rank of every classifier over the datasets.
    avg_ranks = ranks.mean(axis=0)
    print("Average ranks:", avg_ranks)

    # Pairwise post-hoc Nemenyi test, labelled with the classifier names.
    nf = sp.posthoc_nemenyi_friedman(scores)
    nf.index = nf.columns = example_models

    # Colours, in order: diagonal, non-significant, p<0.001, p<0.01, p<0.05
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    sp.sign_plot(nf, cmap=cmap, linewidths=0.25, linecolor='0.5', square=True)


In [None]:
# Critical difference diagram for the toy example. Uses `nf` and `avg_ranks`,
# which the previous cell only defines when the Friedman test is significant.
avg_ranks_dict = dict(zip(nf.index, avg_ranks))
plt.figure(figsize=(10, 2))
plt.title('Critical difference diagram of average score ranks')
sp.critical_difference_diagram(avg_ranks_dict, nf)

In [None]:
# Average rank of every method on the 'laundering' pattern according to AUC-ROC.
is_laundering_roc = (df['pattern'] == 'laundering') & (df['performance_metric'] == 'AUC-ROC')
df_selection = df.loc[is_laundering_roc, ['dataset', 'method', 'performance']]

# Rank the methods within every dataset (highest performance first),
# then average those ranks per method.
rank_in_dataset = df_selection.groupby('dataset')['performance'].rank(ascending=False)
avg_rank = rank_in_dataset.groupby(df_selection['method']).mean()
avg_rank

In [None]:
# Display the selected rows (dataset, method, performance).
df_selection

In [None]:
# Per method: the 'laundering' AUC-ROC on every dataset, in the order of `datasets`.
dict_data = {
    method_key: [
        results_dict[dataset_key][method_key]['laundering']['AUC-ROC']
        for dataset_key in datasets
    ]
    for method_key in models
}

In [None]:
# Average per-dataset rank of every method (highest performance gets rank 1).
df_selection.groupby('dataset').performance.rank(ascending=False).groupby(df_selection.method).mean()

In [None]:
# Score matrix: one row per dataset, one column per method (order of `dict_data`).
scores = np.array(list(dict_data.values())).T

# Run the Friedman test
stat, p = friedmanchisquare(*scores.T)

print(f"Friedman test statistic: {stat:.4f}, p-value: {p:.4f}")

if p < 0.05:
    print("Significant differences found — proceeding with Critical Distance Diagram")

    # Ranks per dataset (1 = best, higher = worse). Tied scores share their
    # average rank, the same tie handling as the pandas `rank` call that
    # produces the average ranks for the critical difference diagram.
    # A double argsort would break ties by column position instead.
    ranks = pd.DataFrame(scores).rank(axis=1, ascending=False).to_numpy()

    # Average ranks per model
    avg_ranks = ranks.mean(axis=0)
    print("Average ranks:", avg_ranks)

    # Pairwise post-hoc Nemenyi test, labelled with the method keys.
    nf = sp.posthoc_nemenyi_friedman(scores)
    model_names = list(dict_data.keys())
    nf.index = nf.columns = model_names

    # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'square': True}
    sp.sign_plot(nf, **heatmap_args)

else:
    print("No statistically significant differences found — skipping post-hoc analysis.")


In [None]:
# Colour per method key, following the tab10 order of the keys in `models`.
# Every key in `models` needs an entry: the palette below looks up every
# method that occurs in the results.
model_colors = {
    'flowscope': 'tab:blue',
    'autoaudit': 'tab:orange',
    'gargaml_u': 'tab:green',
    'gargaml_d': 'tab:red',
    'gargaml_tree_u': 'tab:purple',
    'gargaml_boost_u': 'tab:brown',
    'gargaml_tree_d': 'tab:pink',
    'gargaml_boost_d': 'tab:gray'
}

# Display name per method key.
pretty_model_names = {
    'flowscope': 'FlowScope',
    'autoaudit': 'AutoAudit',
    'gargaml_u': 'GARG-AML',
    'gargaml_d': 'GARG-AML Dir.',
    'gargaml_tree_u': 'GARG-AML + Tree',
    'gargaml_boost_u': 'GARG-AML Boost Undir.',
    'gargaml_tree_d': 'GARG-AML Tree Dir.',
    'gargaml_boost_d': 'GARG-AML Boost Dir.'
}

# Average per-dataset rank of every method (highest performance gets rank 1).
avg_ranks_dict = dict(df_selection.groupby('dataset').performance.rank(ascending=False).groupby(df_selection.method).mean())

# Ranks, significance matrix and palette must all use the same (display) labels.
avg_ranks_pretty = {pretty_model_names[model]: rank for model, rank in avg_ranks_dict.items()}
# `nf` itself is left untouched; a relabelled copy is plotted.
nf_pretty = nf.rename(index=pretty_model_names, columns=pretty_model_names)
color_palette_pretty = {pretty_model_names[model]: model_colors[model] for model in avg_ranks_dict.keys()}

# Now plot with pretty names
plt.figure(figsize=(10, 2))
plt.title('Critical Difference Diagram of Average Score Ranks')
sp.critical_difference_diagram(
    ranks=avg_ranks_pretty,
    sig_matrix=nf_pretty,
    label_fmt_left='{label} [{rank:.2f}]  ',
    label_fmt_right='  [{rank:.2f}] {label}',
    color_palette=color_palette_pretty,
)

In [None]:
# Critical difference diagrams (AUC-ROC), one panel per pattern.
# Uses `model_colors` and `pretty_model_names` defined in an earlier cell.
fig, axes = plt.subplots(4, 1, figsize=(8, 6.5))

for i_ax in range(4):
    pattern = patterns[i_ax]

    df_selection = df[(df['pattern']==pattern)&(df['performance_metric']=='AUC-ROC')][['dataset', 'method', 'performance']]
    # Average per-dataset rank of every method (highest performance gets rank 1).
    avg_rank = df_selection.groupby('dataset').performance.rank(ascending=False).groupby(df_selection.method).mean()

    # Per method: the AUC-ROC on every dataset, in the order of `datasets`.
    dict_data = {
        method: [results_dict[dataset][method][pattern]['AUC-ROC'] for dataset in datasets]
        for method in models
    }

    # Rows = datasets, columns = methods.
    scores = np.array(list(dict_data.values())).T

    # Run the Friedman test
    stat, p = friedmanchisquare(*scores.T)

    print(f"Friedman test statistic: {stat:.4f}, p-value: {p:.4f}")

    if p < 0.05:
        print("Significant differences found — proceeding with Critical Distance Diagram")

        # Ranks per dataset (1 = best, higher = worse). Tied scores share
        # their average rank, consistent with `avg_rank` above; a double
        # argsort would break ties by column position instead.
        ranks = pd.DataFrame(scores).rank(axis=1, ascending=False).to_numpy()

        # Average ranks per model
        avg_ranks = ranks.mean(axis=0)
        print("Average ranks:", avg_ranks)

        # Pairwise post-hoc Nemenyi test, labelled with the method keys.
        nf = sp.posthoc_nemenyi_friedman(scores)
        model_names = list(dict_data.keys())
        nf.index = nf.columns = model_names

        avg_ranks_dict = dict(avg_rank)

        # Ranks, significance matrix and palette must use the same display labels.
        avg_ranks_pretty = {pretty_model_names[model]: rank for model, rank in avg_ranks_dict.items()}
        nf_pretty = nf.rename(index=pretty_model_names, columns=pretty_model_names)
        color_palette_pretty = {pretty_model_names[model]: model_colors[model] for model in avg_ranks_dict.keys()}

        axes[i_ax].set_title('Pattern: '+pattern)
        sp.critical_difference_diagram(
            ranks=avg_ranks_pretty,
            sig_matrix=nf_pretty,
            label_fmt_left='{label} ({rank:.2f})  ',
            label_fmt_right='  ({rank:.2f}) {label}',
            label_props={'fontweight': 'bold'},
            color_palette=color_palette_pretty,
            ax=axes[i_ax]
        )

    else:
        print("No statistically significant differences found — skipping post-hoc analysis.")

plt.suptitle('Critical difference diagram for average rank according to the AUC-ROC')
plt.tight_layout()
plt.savefig('../results/CD_ROC_full.pdf')

In [None]:
# Critical difference diagrams (AUC-PR), one panel per pattern.
# Uses `model_colors` and `pretty_model_names` defined in an earlier cell.
fig, axes = plt.subplots(4, 1, figsize=(8, 6.5))

for i_ax in range(4):
    pattern = patterns[i_ax]

    df_selection = df[(df['pattern']==pattern)&(df['performance_metric']=='AUC-PR')][['dataset', 'method', 'performance']]
    # Average per-dataset rank of every method (highest performance gets rank 1).
    avg_rank = df_selection.groupby('dataset').performance.rank(ascending=False).groupby(df_selection.method).mean()

    # Per method: the AUC-PR on every dataset, in the order of `datasets`.
    dict_data = {
        method: [results_dict[dataset][method][pattern]['AUC-PR'] for dataset in datasets]
        for method in models
    }

    # Rows = datasets, columns = methods.
    scores = np.array(list(dict_data.values())).T

    # Run the Friedman test
    stat, p = friedmanchisquare(*scores.T)

    print(f"Friedman test statistic: {stat:.4f}, p-value: {p:.4f}")

    if p < 0.05:
        print("Significant differences found — proceeding with Critical Distance Diagram")

        # Ranks per dataset (1 = best, higher = worse). Tied scores share
        # their average rank, consistent with `avg_rank` above; a double
        # argsort would break ties by column position instead.
        ranks = pd.DataFrame(scores).rank(axis=1, ascending=False).to_numpy()

        # Average ranks per model
        avg_ranks = ranks.mean(axis=0)
        print("Average ranks:", avg_ranks)

        # Pairwise post-hoc Nemenyi test, labelled with the method keys.
        nf = sp.posthoc_nemenyi_friedman(scores)
        model_names = list(dict_data.keys())
        nf.index = nf.columns = model_names

        avg_ranks_dict = dict(avg_rank)

        # Ranks, significance matrix and palette must use the same display labels.
        avg_ranks_pretty = {pretty_model_names[model]: rank for model, rank in avg_ranks_dict.items()}
        nf_pretty = nf.rename(index=pretty_model_names, columns=pretty_model_names)
        color_palette_pretty = {pretty_model_names[model]: model_colors[model] for model in avg_ranks_dict.keys()}

        axes[i_ax].set_title('Pattern: '+pattern)
        sp.critical_difference_diagram(
            ranks=avg_ranks_pretty,
            sig_matrix=nf_pretty,
            label_fmt_left='{label} ({rank:.2f})  ',
            label_fmt_right='  ({rank:.2f}) {label}',
            label_props={'fontweight': 'bold'},
            color_palette=color_palette_pretty,
            ax=axes[i_ax]
        )
    else:
        print("No statistically significant differences found — skipping post-hoc analysis.")

plt.suptitle('Critical difference diagram for average rank according to the AUC-PR')
plt.tight_layout()
plt.savefig('../results/CD_PR_full.pdf')

# IBM Dataset

In this part of the notebook, we analyse the results for the IBM data set. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# GARG-AML Undirected
file = '../results/results_performance_IBM_undirected.txt'
with open(file, 'r') as f:
    lines = f.readlines()

# Column accumulators for the IBM results table. This cell resets them;
# the following cells append to them.
model_list = []
dataset_list = []
pattern_list = []
cutoff_list = []
AUCROC_list = []
AUCPR_list = []

for line in lines:
    if not line.strip():
        continue  # blank lines carry no result
    # '_'-separated fields: dataset, pattern, then "<cutoff> ... <results>".
    long_split = line.strip().split('_')
    model = 'GARG-AML Undirected'
    dataset = long_split[0]
    pattern = long_split[1]
    # At most three splits: the last part keeps the results text intact.
    cutoff_results = long_split[2].split(' ', maxsplit=3)
    cutoff = cutoff_results[0]
    try:
        # NOTE(review): eval() executes text read from the file; trusted files only.
        results = eval(cutoff_results[-1])
    except Exception:
        # Unparsable results are recorded as zero scores.
        results = [0, 0]
    AUCROC = results[0]
    AUCPR = results[1]
    model_list.append(model)
    dataset_list.append(dataset)
    pattern_list.append(pattern)
    cutoff_list.append(cutoff)
    AUCROC_list.append(AUCROC)
    AUCPR_list.append(AUCPR)


In [None]:
# GARG-AML Directed
# Appends to the accumulator lists created in the undirected cell.
file = '../results/results_performance_IBM_directed.txt'
with open(file, 'r') as f:
    lines = f.readlines()

for line in lines:
    if not line.strip():
        continue  # blank lines carry no result
    # '_'-separated fields: dataset, pattern, then "<cutoff> ... <results>".
    long_split = line.strip().split('_')
    model = 'GARG-AML Directed'
    dataset = long_split[0]
    pattern = long_split[1]
    # At most three splits: the last part keeps the results text intact.
    cutoff_results = long_split[2].split(' ', maxsplit=3)
    cutoff = cutoff_results[0]
    try:
        # NOTE(review): eval() executes text read from the file; trusted files only.
        results = eval(cutoff_results[-1])
    except Exception:
        # Unparsable results are recorded as zero scores.
        results = [0, 0]
    AUCROC = results[0]
    AUCPR = results[1]
    model_list.append(model)
    dataset_list.append(dataset)
    pattern_list.append(pattern)
    cutoff_list.append(cutoff)
    AUCROC_list.append(AUCROC)
    AUCPR_list.append(AUCPR)

In [None]:
# Flowscope HI-Small
# Appends to the accumulator lists created in the undirected GARG-AML cell.
file = '../results/flowscope_performance_HI-Small.txt'
with open(file, 'r') as f:
    lines = f.readlines()

for line in lines:
    if not line.strip():
        continue  # blank lines carry no result
    # '_'-separated fields: the pattern is field 1, and field 3 holds
    # "<cutoff> ... <results>".
    long_split = line.strip().split('_')
    model = 'FlowScope'
    dataset = 'HI-Small'
    pattern = long_split[1]
    # At most three splits: the last part keeps the results text intact.
    cutoff_results = long_split[3].split(' ', maxsplit=3)
    cutoff = cutoff_results[0]
    try:
        # NOTE(review): eval() executes text read from the file; trusted files only.
        results = eval(cutoff_results[-1])
    except Exception:
        # Unparsable results are recorded as zero scores.
        results = [0, 0]
    AUCROC = results[0]
    AUCPR = results[1]
    model_list.append(model)
    dataset_list.append(dataset)
    pattern_list.append(pattern)
    cutoff_list.append(cutoff)
    AUCROC_list.append(AUCROC)
    AUCPR_list.append(AUCPR)

In [None]:
# Flowscope LI-Large
# Appends to the accumulator lists created in the undirected GARG-AML cell.
file = '../results/flowscope_performance_LI-Large.txt'
with open(file, 'r') as f:
    lines = f.readlines()

for line in lines:
    if not line.strip():
        continue  # blank lines carry no result
    # '_'-separated fields: the pattern is field 1, and field 3 holds
    # "<cutoff> ... <results>".
    long_split = line.strip().split('_')
    model = 'FlowScope'
    dataset = 'LI-Large'
    pattern = long_split[1]
    # At most three splits: the last part keeps the results text intact.
    cutoff_results = long_split[3].split(' ', maxsplit=3)
    cutoff = cutoff_results[0]
    try:
        # NOTE(review): eval() executes text read from the file; trusted files only.
        results = eval(cutoff_results[-1])
    except Exception:
        # Unparsable results are recorded as zero scores.
        results = [0, 0]
    AUCROC = results[0]
    AUCPR = results[1]
    model_list.append(model)
    dataset_list.append(dataset)
    pattern_list.append(pattern)
    cutoff_list.append(cutoff)
    AUCROC_list.append(AUCROC)
    AUCPR_list.append(AUCPR)

In [None]:
# GARG-AML tree / boosting results on the IBM data.
# Appends to the accumulator lists created in the undirected GARG-AML cell.
models_IBM_tree = ['tree', 'boosting']
directed_list = ['undirected', 'directed']
IMB_data_list = ['HI-Small', 'LI-Large']
metrics_list = ['ROC', 'PR']
IBM_cutoffs = [0.1, 0.2, 0.3, 0.5, 0.9]
IBM_patterns = ['Is Laundering', 'FAN-OUT', 'FAN-IN', 'GATHER-SCATTER', 'SCATTER-GATHER', 'CYCLE', 'RANDOM', 'BIPARTITE', 'STACK']

for ds in IMB_data_list:
    for d in directed_list:
        for m in models_IBM_tree:
            model_name = 'GARG-AML '+d.capitalize()+' '+m.capitalize()
            # Read each metric's CSV once per (dataset, direction, learner)
            # instead of once per cut-off and pattern. The tables get their
            # own name so the global `df` with the synthetic results survives.
            auc_tables = {
                pm: pd.read_csv('../results/'+ds+'_AUC_'+pm+'_'+m+'_'+d+'_combined.csv', index_col=0)
                for pm in metrics_list
            }
            for cut_off in IBM_cutoffs:
                for pattern in IBM_patterns:
                    model_list.append(model_name)
                    dataset_list.append(ds)
                    pattern_list.append(pattern)
                    cutoff_list.append(str(cut_off))
                    # Rows are looked up by cut-off, columns by pattern.
                    AUC = auc_tables['ROC'].loc[cut_off][pattern]
                    AUCROC_list.append(AUC)
                    AUC = auc_tables['PR'].loc[cut_off][pattern]
                    AUCPR_list.append(AUC)



In [None]:
# Last value assigned to `AUC` by the loop in the previous cell.
AUC

In [None]:
# Assemble the accumulated IBM results into one table, one list per column.
ibm_columns = ['model', 'dataset', 'pattern', 'cutoff', 'AUC-ROC', 'AUC-PR']
ibm_values = [model_list, dataset_list, pattern_list, cutoff_list, AUCROC_list, AUCPR_list]
results_df = pd.DataFrame(dict(zip(ibm_columns, ibm_values)))

In [None]:
# Missing values are treated as a score of 0.
results_df = results_df.fillna(0)

In [None]:
# Last rows of the IBM results table.
results_df.tail()

In [None]:
# Subset used for plotting: three patterns at the cut-offs 0.1, 0.5 and 0.9.
# NOTE(review): `dataset` is not set in this cell; it is whatever value an
# earlier cell's loop left behind — confirm which dataset is intended.
results_plot_df = results_df[(results_df['dataset']==dataset) & (results_df['pattern'].isin(['Is Laundering', 'GATHER-SCATTER', 'SCATTER-GATHER'])) & (results_df['cutoff'].isin(['0.1', '0.5' ,'0.9']))]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For each IBM dataset, draw a grid of bar charts (rows = patterns, columns =
# cutoffs) restricted to a selection of patterns and cutoffs. Every panel shows
# AUC-ROC as solid bars on the left axis and AUC-PR as hatched bars on a twin
# right axis, one pair of bars per model. One PDF per dataset is saved.
plot_patterns = ['Is Laundering', 'GATHER-SCATTER', 'SCATTER-GATHER']
plot_cutoffs = ['0.1', '0.5', '0.9']  # cutoffs are compared as strings here

# The plot style does not change between datasets, so it is set once.
sns.set(style="whitegrid")

datasets = ['HI-Small', 'LI-Large']
for dataset in datasets:
    results_plot_df = results_df[
        (results_df['dataset'] == dataset)
        & results_df['pattern'].isin(plot_patterns)
        & results_df['cutoff'].isin(plot_cutoffs)
    ]

    # Grid layout and bar order follow the order of appearance in the data.
    patterns = results_plot_df['pattern'].unique()
    cutoffs = results_plot_df['cutoff'].unique()
    models = results_plot_df['model'].unique()

    # One extra colour is requested and the second one dropped, so the models
    # never use the second tab10 colour.
    # NOTE(review): presumably to keep colours aligned with other figures - confirm.
    palette = sns.color_palette("tab10", len(models)+1)
    del palette[1]
    model_color_map = dict(zip(models, palette))

    fig, axes = plt.subplots(len(patterns), len(cutoffs), figsize=(7 * len(cutoffs), 3 * len(patterns)), squeeze=False)

    for i, pattern in enumerate(patterns):
        for j, cutoff in enumerate(cutoffs):
            ax = axes[i, j]
            ax2 = ax.twinx()  # right-hand axis carrying the AUC-PR bars
            subset = results_plot_df[(results_plot_df['pattern'] == pattern) & (results_plot_df['cutoff'] == cutoff)]

            # Leave 10% headroom above the tallest bar. Fall back to 1 when the
            # maximum is 0 or undefined (empty subset), since identical or NaN
            # axis limits are not valid.
            ylim_max = subset['AUC-ROC'].max()*1.1
            ylim_max_2 = subset['AUC-PR'].max()*1.1
            ax.set_ylim(0, ylim_max if ylim_max > 0 else 1)
            ax2.set_ylim(0, ylim_max_2 if ylim_max_2 > 0 else 1)

            for idx, model in enumerate(models):
                model_data = subset[subset['model'] == model]
                if model_data.empty:
                    # No score stored for this model in this panel.
                    continue
                roc = model_data['AUC-ROC'].values[0]
                pr = model_data['AUC-PR'].values[0]

                # Solid bar (left axis): AUC-ROC.
                ax.bar(idx - 0.2, roc, width=0.4,
                       color=model_color_map[model], edgecolor='black')
                ax.text(idx - 0.23, roc-0.001, f"{roc*100:.1f}%", ha='center', va='bottom', fontsize=8)

                # Hatched bar (right axis): AUC-PR.
                ax2.bar(idx + 0.2, pr, width=0.4,
                        color=model_color_map[model], hatch='//', alpha=0.7, edgecolor='black')
                ax2.text(idx + 0.28, pr, f"{pr*100:.3f}%", ha='center', va='bottom', fontsize=8)

            ax.set_title(f"Pattern: {pattern} | Cutoff: {cutoff}")
            # Models are identified by colour in the shared legend, not by ticks.
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_ylabel("AUC-ROC")
            ax2.set_ylabel("AUC-PR")

    # Single figure-level legend instead of one per panel.
    handles = [plt.Rectangle((0,0),1,1, color=model_color_map[m]) for m in models]
    fig.legend(handles, models, loc='upper center', ncol=len(models), title='Models - '+dataset)
    # Reserve the top 6% of the figure for the legend.
    plt.tight_layout(rect=[0, 0, 1, 0.94])
    plt.savefig('../results/'+dataset+'_AUC-ROC_AUC-PR.pdf')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For each IBM dataset, draw the full grid of bar charts (rows = all patterns,
# columns = all cutoffs). Every panel shows AUC-ROC as solid bars on the left
# axis and AUC-PR as hatched bars on a twin right axis, one pair of bars per
# model. One PDF per dataset is saved.

# The plot style does not change between datasets, so it is set once.
sns.set(style="whitegrid")

datasets = ['HI-Small', 'LI-Large']
for dataset in datasets:
    results_plot_df = results_df[results_df['dataset'] == dataset]

    # Grid layout and bar order follow the order of appearance in the data.
    patterns = results_plot_df['pattern'].unique()
    cutoffs = results_plot_df['cutoff'].unique()
    models = results_plot_df['model'].unique()

    # One extra colour is requested and the second one dropped, so the models
    # never use the second tab10 colour.
    # NOTE(review): presumably to keep colours aligned with other figures - confirm.
    palette = sns.color_palette("tab10", len(models)+1)
    del palette[1]
    model_color_map = dict(zip(models, palette))

    fig, axes = plt.subplots(len(patterns), len(cutoffs), figsize=(7 * len(cutoffs), 3 * len(patterns)), squeeze=False)

    for i, pattern in enumerate(patterns):
        for j, cutoff in enumerate(cutoffs):
            ax = axes[i, j]
            ax2 = ax.twinx()  # right-hand axis carrying the AUC-PR bars
            subset = results_plot_df[(results_plot_df['pattern'] == pattern) & (results_plot_df['cutoff'] == cutoff)]

            # Leave 10% headroom above the tallest bar. Fall back to 1 when the
            # maximum is 0 or undefined (empty subset), since identical or NaN
            # axis limits are not valid.
            ylim_max = subset['AUC-ROC'].max()*1.1
            ylim_max_2 = subset['AUC-PR'].max()*1.1
            ax.set_ylim(0, ylim_max if ylim_max > 0 else 1)
            ax2.set_ylim(0, ylim_max_2 if ylim_max_2 > 0 else 1)

            for idx, model in enumerate(models):
                model_data = subset[subset['model'] == model]
                if model_data.empty:
                    # No score stored for this model in this panel.
                    continue
                roc = model_data['AUC-ROC'].values[0]
                pr = model_data['AUC-PR'].values[0]

                # Solid bar (left axis): AUC-ROC.
                ax.bar(idx - 0.2, roc, width=0.4,
                       color=model_color_map[model], edgecolor='black')
                ax.text(idx - 0.23, roc-0.001, f"{roc*100:.1f}%", ha='center', va='bottom', fontsize=8)

                # Hatched bar (right axis): AUC-PR.
                ax2.bar(idx + 0.2, pr, width=0.4,
                        color=model_color_map[model], hatch='//', alpha=0.7, edgecolor='black')
                ax2.text(idx + 0.28, pr, f"{pr*100:.3f}%", ha='center', va='bottom', fontsize=8)

            ax.set_title(f"Pattern: {pattern} | Cutoff: {cutoff}")
            # Models are identified by colour in the shared legend, not by ticks.
            ax.set_xticks([])
            ax.set_xticklabels([])
            ax.set_ylabel("AUC-ROC")
            ax2.set_ylabel("AUC-PR")

    # Single figure-level legend instead of one per panel.
    handles = [plt.Rectangle((0,0),1,1, color=model_color_map[m]) for m in models]
    fig.legend(handles, models, loc='upper center', ncol=len(models), title='Models - '+dataset)
    # Reserve the top 6% of the figure for the legend.
    plt.tight_layout(rect=[0, 0, 1, 0.94])
    plt.savefig('../results/'+dataset+'_AUC-ROC_AUC-PR_full.pdf')