In [None]:
# LOAD MODULES
# Standard library
import os
import sys

# Third party
from tqdm import tqdm

# NOTE: This notebook is not located in the project root, so both the working
# directory and the import path are pointed at the parent directory.
DIR = "../"
# Resolve DIR only the first time this cell runs. After os.chdir() the relative
# "../" would refer to a directory one level higher on every re-run.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath(DIR)
os.chdir(_PROJECT_ROOT)
# Register the absolute path: a relative sys.path entry would be interpreted
# against the working directory, which has just been changed above.
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

In [None]:
from src.data.pattern_construction import define_ML_labels, summarise_ML_labels
from src.methods.gargaml_scores import define_gargaml_scores
import pandas as pd

In [None]:
# --- Experiment configuration ---
dataset = "HI-Small"
directed = False
score_type = "basic" # basic or weighted_average

# Suffix that distinguishes directed from undirected result files
if directed:
    str_directed = "directed"
else:
    str_directed = "undirected"

# --- Scores: read the stored measures and pass them to define_gargaml_scores ---
results_path = "results/"+dataset+"_GARGAML_"+str_directed+".csv"
results_df_measures = pd.read_csv(results_path)
results_df = define_gargaml_scores(
    results_df_measures,
    directed=directed,
    score_type=score_type,
)

# --- Labels: built from the transaction file and the pattern file ---
trans_path = "data/"+dataset+"_Trans.csv"
patterns_path = "data/"+dataset+"_Patterns.txt"
transactions_df_extended, pattern_columns = define_ML_labels(
    path_trans=trans_path,
    path_patterns=patterns_path,
)

# Only the first of the three returned objects is used in this notebook
laundering_combined, _, _ = summarise_ML_labels(transactions_df_extended, pattern_columns)

In [None]:
# Distinct (account, bank) pairs taken from the "Account" / "From Bank" columns
from_data = (
    transactions_df_extended[["Account", "From Bank"]]
    .drop_duplicates()
    .rename(columns={"From Bank": "Bank"})
)
# Distinct (account, bank) pairs taken from the "Account.1" / "To Bank" columns
to_data = (
    transactions_df_extended[["Account.1", "To Bank"]]
    .drop_duplicates()
    .rename(columns={"Account.1": "Account", "To Bank": "Bank"})
)
# Stack both sides and drop pairs that occur in both
total_data = pd.concat([from_data, to_data], axis=0).drop_duplicates()
total_data.shape

In [None]:
# Column of laundering_combined that is thresholded into labels further down
column = "Is Laundering"
# Histogram of that single column, drawn with log=True
laundering_combined.hist(column=column, log=True)

In [None]:
# Threshold applied to `column`: values strictly above it are labelled 1
cutoff = 0.2
laundering_combined["Label"] = (laundering_combined[column] > cutoff).astype(int).to_numpy()

# Fetch the label of every node in results_df with a single indexed selection
# instead of one .loc call per node. As before, a node that is missing from
# laundering_combined raises a KeyError.
labels = laundering_combined.loc[results_df.index, "Label"].astype(int).tolist()

results_df["Label"] = labels

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# GARGAML scores split by label
label_0 = results_df.loc[results_df["Label"] == 0, "GARGAML"]
label_1 = results_df.loc[results_df["Label"] == 1, "GARGAML"]

# Common bin edges computed on the pooled scores so both histograms line up
all_data = np.concatenate([label_0, label_1])
bins = np.histogram_bin_edges(all_data, bins=20)

# One figure, two overlaid semi-transparent histograms (label 0 first, then label 1)
fig, ax = plt.subplots(figsize=(10, 5))
for scores, legend_name in ((label_0, 'Label 0'), (label_1, 'Label 1')):
    ax.hist(scores, bins=bins, alpha=0.5, label=legend_name, density=True)

# Axis labels, title and legend
ax.set_xlabel('GARGAML')
ax.set_ylabel('Relative Frequency')
ax.set_title('Distribution of GARGAML Scores by Label: "'+ column +'" at cut-off '+ str(cutoff))
ax.legend()

# Save the figure; the file name carries the cut-off as a percentage
fig.savefig("results/"+dataset+"_GARGAML_"+str_directed+"_histogram_"+str(int(cutoff*100))+".pdf")

In [None]:
# Divergence measure between the GARGAML score distributions of the two labels:
#
#     D = (mean_0 - mean_1)^2 / (0.5 * (variance_0 + variance_1))
#
# The squared gap between the group means is divided by the average of the two
# variances, so D grows when the groups are further apart relative to their
# spread. Adding the variance term instead would reward noisier scores.
# NOTE(review): confirm this is the intended definition before comparing with
# previously reported values.
mean_0 = np.mean(label_0)
variance_0 = np.var(label_0)  # population variance (numpy default ddof=0)
mean_1 = np.mean(label_1)
variance_1 = np.var(label_1)

average_variance = 0.5*(variance_0 + variance_1)
divergence = (mean_0 - mean_1)**2 / average_variance

print("Divergence measure: ", divergence)

In [None]:
def lift_curve_values(y_val, y_pred, steps):
    """Return the lift obtained at each requested fraction of the ranked data.

    Rows are ordered by ``y_pred`` from highest to lowest. For every entry of
    ``steps`` the first ``ceil(step * n_rows)`` rows are selected; the sum of
    their ``y_val`` divided by that row count is then divided by the same
    ratio computed over all rows.

    Parameters
    ----------
    y_val : values stored in the "Real" column (the notebook passes 0/1 labels).
    y_pred : values stored in the "Pred" column and used for ranking.
    steps : iterable of fractions of the data at which to evaluate the lift.

    Returns
    -------
    list
        One lift value per entry of ``steps``, in the same order.
    """
    # Columns are assigned one after the other onto an empty frame
    ranked = pd.DataFrame()
    ranked['Real'] = y_val
    ranked['Pred'] = y_pred
    ranked = ranked.sort_values('Pred', ascending=False)

    n_rows = len(ranked)
    base_rate = ranked['Real'].sum() / n_rows

    def _lift_at(fraction):
        # Number of top-ranked rows covered by this fraction, rounded up
        top_k = int(np.ceil(fraction * n_rows))
        hit_rate = ranked['Real'].iloc[:top_k].sum() / top_k
        return hit_rate / base_rate

    return [_lift_at(fraction) for fraction in steps]

In [None]:
# Evaluate the lift at 100 evenly spaced fractions of the data, from 0.01 to 1
values = np.linspace(0.01, 1, 100)
lift = lift_curve_values(results_df["Label"], results_df["GARGAML"], values)

# Draw on a dedicated figure so the curve never lands on whatever figure an
# earlier cell left open (e.g. when the notebook is run outside the inline
# backend, which is what normally closes figures between cells).
fig, ax = plt.subplots()
ax.plot(values, lift)
ax.set_title('Lift curve of GARGAML Scores by Label: "'+ column +'" at cut-off '+ str(cutoff))
ax.set_xlabel('Proportion of data')
ax.set_ylabel('Lift')
# Save the figure; the file name carries the cut-off as a percentage
fig.savefig("results/"+dataset+"_GARGAML_"+str_directed+"_lift_curve_"+str(int(cutoff*100))+".pdf")