In [None]:
# LOAD MODULES
# Standard library
import os
import sys

# Third party
from tqdm import tqdm

# NOTE: This notebook does not live in the project root, so the working directory
# is moved to the root and the root is exposed on the import path (the `src.*`
# imports and the relative "data/" and "results/" paths rely on it).
DIR = "../"

# Guard the directory change so that re-running this cell does not climb one
# level further each time: once `src/` is visible we are already in the root.
if not os.path.isdir("src"):
    os.chdir(DIR)

# Register the root as an absolute path. A relative "../" entry would be resolved
# against the *new* working directory and therefore point above the root.
PROJECT_ROOT = os.getcwd()
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
import networkx as nx
import pandas as pd

from src.methods.utils.measure_functions_directed import *
from src.data.graph_construction import construct_IBM_graph
from src.utils.graph_processing import graph_community
from src.methods.utils.neighbourhood_functions import GARG_AML_nodeselection
from src.data.pattern_construction import define_ML_labels, summarise_ML_labels
from src.methods.gargaml_scores import define_gargaml_scores

In [None]:
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

In [None]:
# --- Settings ---------------------------------------------------------------
dataset = "HI-Small"
path = "data/" + dataset + "_Trans.csv"
directed = True

# --- Graph construction -----------------------------------------------------
G = construct_IBM_graph(path=path, directed=directed)
G_reduced = graph_community(G, 10)

# Undirected and reversed versions of the reduced graph; both are used below to
# collect the radius-2 neighbourhood of every node.
G_reduced_und = G_reduced.to_undirected()
G_reduced_rev = G_reduced.reverse(copy=True)

nodes = list(G_reduced.nodes)

# One output column per measure, in the order 00, 01, 02, 10, 11, 12, 20, 21, 22.
measure_names = [f"measure_{row}{col}" for row in range(3) for col in range(3)]
data_dict = {"node": nodes}
for name in measure_names:
    data_dict[name] = []

# --- Per-node measures ------------------------------------------------------
for node in tqdm(nodes):
    # Radius-2 ego network on the undirected graph, i.e. reached through both
    # incoming and outgoing edges. The directed edges are recovered by taking
    # the subgraph of the directed graph induced on the same nodes.
    ego_und = nx.ego_graph(G_reduced_und, node, 2)
    ego_directed = nx.subgraph(G_reduced, ego_und.nodes)
    # Radius-2 ego network on the reversed graph, to get the incoming edges.
    ego_rev = nx.ego_graph(G_reduced_rev, node, 2)

    nodes_0, nodes_1, nodes_2, nodes_ordered = GARG_AML_nodeselection(
        ego_directed,
        node,
        directed=True,
        G_ego_second_und=ego_und,
        G_ego_second_rev=ego_rev,
    )

    # Dense adjacency matrix with rows/columns in the order given by nodes_ordered.
    adj_full = nx.adjacency_matrix(ego_directed, nodelist=nodes_ordered).toarray()

    size_0, size_1, size_2 = len(nodes_0), len(nodes_1), len(nodes_2)

    # The measure functions are evaluated in the same order as the columns.
    results = {
        "measure_00": measure_00_function(adj_full, size_0),
        "measure_01": measure_01_function(adj_full, size_0, size_1),
        "measure_02": measure_02_function(adj_full, size_0, size_1, size_2),
        "measure_10": measure_10_function(adj_full, size_0, size_1),
        "measure_11": measure_11_function(adj_full, size_0, size_1),
        "measure_12": measure_12_function(adj_full, size_0, size_1, size_2),
        "measure_20": measure_20_function(adj_full, size_0, size_2),
        "measure_21": measure_21_function(adj_full, size_0, size_1, size_2),
        "measure_22": measure_22_function(adj_full, size_2),
    }

    # Only the first element of each returned result is stored.
    for name in measure_names:
        data_dict[name].append(results[name][0])

# One row per node: the node identifier followed by its nine measures.
measure_df = pd.DataFrame(data_dict)



In [None]:
# Preview the first rows of the per-node measure table.
measure_df.head()

In [None]:
# Feature matrix: every column of the measure table except the node identifier.
X = measure_df.drop("node", axis=1)

# Isolation forest with a fixed seed so repeated runs give the same model.
clf = IsolationForest(random_state=1997)
clf.fit(X)

In [None]:
# Score the same rows the model was fitted on:
#   y_pred   - hard decision returned by IsolationForest.predict
#   y_scores - raw per-row score returned by IsolationForest.score_samples
y_pred, y_scores = clf.predict(X), clf.score_samples(X)

In [None]:
# Distribution of the raw scores; the count axis is logarithmic.
fig, ax = plt.subplots()
ax.hist(y_scores, bins=100)
ax.set_yscale("log")

In [None]:
# Map the predictions linearly so that 1 becomes 0 and -1 becomes 1.
y_pred_01 = (1 - y_pred) / 2

fig, ax = plt.subplots()
ax.hist(y_pred_01, bins=100)

In [None]:
# Store the raw score and the 0/1 prediction next to the measures of each node.
measure_df = measure_df.assign(anomaly_score=y_scores, anomaly_pred=y_pred_01)

In [None]:
# --- Settings ---------------------------------------------------------------
# NOTE: `directed` is rebound here; the cell that builds the graph sets it to True.
dataset = "HI-Small"
directed = False
score_type = "basic"  # basic or weighted_average

if directed:
    str_directed = "directed"
else:
    str_directed = "undirected"

# --- Stored GARG-AML measures and the scores derived from them ---------------
results_path = f"results/{dataset}_GARGAML_{str_directed}.csv"
results_df_measures = pd.read_csv(results_path)
results_df = define_gargaml_scores(
    results_df_measures,
    directed=directed,
    score_type=score_type,
)

# --- Money-laundering labels --------------------------------------------------
trans_path = f"data/{dataset}_Trans.csv"
patterns_path = f"data/{dataset}_Patterns.txt"
transactions_df_extended, pattern_columns = define_ML_labels(
    path_trans=trans_path,
    path_patterns=patterns_path,
)

# Only the first of the three returned objects is used in this notebook.
laundering_combined, _, _ = summarise_ML_labels(transactions_df_extended, pattern_columns)

In [None]:
# Distinct (account, bank) pairs seen on the sending side ...
from_data = (
    transactions_df_extended[["Account", "From Bank"]]
    .drop_duplicates()
    .rename(columns={"From Bank": "Bank"})
)
# ... and on the receiving side, brought to the same column names.
to_data = (
    transactions_df_extended[["Account.1", "To Bank"]]
    .drop_duplicates()
    .rename(columns={"Account.1": "Account", "To Bank": "Bank"})
)

# Stack both sides and drop pairs that occur on both.
total_data = pd.concat([from_data, to_data]).drop_duplicates()
total_data.shape

In [None]:
column = "Is Laundering"
# Distribution of the column that is thresholded below (log-scaled counts).
laundering_combined[[column]].hist(log=True)

# Binary label: 1 where the column value is strictly above the cutoff, else 0.
cutoff = 0.2
laundering_combined["Label"] = laundering_combined[column].gt(cutoff).mul(1).to_numpy()

# Look up the label of every row of results_df through its index.
# NOTE(review): this keys on results_df.index rather than a "node" column;
# confirm that the index of results_df matches the index of laundering_combined.
labels = [int(laundering_combined.at[node, "Label"]) for node in results_df.index]

results_df["Label"] = labels

In [None]:
# Show the node identifier together with its anomaly score and 0/1 prediction.
measure_df[['node', 'anomaly_score', 'anomaly_pred']]

In [None]:
# Show the GARG-AML results table, including the "Label" column added above.
results_df

In [None]:
# Join the anomaly outputs with the labels on the shared "node" column.
anomaly_columns = measure_df[['node', 'anomaly_score', 'anomaly_pred']]
label_columns = results_df[['node', 'Label']]
df_test = anomaly_columns.merge(label_columns, on="node")

In [None]:
# Preview the merged table of anomaly outputs and labels.
df_test.head()

In [None]:
# Replace the score by its absolute value.
# NOTE(review): presumably all raw scores are negative, so that a larger value
# now means "more anomalous" - confirm against IsolationForest.score_samples.
df_test["anomaly_score"] = df_test["anomaly_score"].abs()
df_test["anomaly_score"].hist(log=True)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [None]:
# Ground-truth labels and ranking scores used by the evaluation metrics.
y_test = df_test["Label"]
y_pred = df_test["anomaly_score"]

In [None]:
import numpy as np
# Mean of the 0/1 labels, i.e. the share of nodes labelled 1.
np.mean(y_test)

In [None]:
# Area under the ROC curve of the anomaly score against the labels.
AUC_ROC = roc_auc_score(y_test, y_pred)
print("AUC ROC:", AUC_ROC)

# Average precision, reported here as the area under the precision-recall curve.
AUC_PR = average_precision_score(y_test, y_pred)
print("AUC PR:", AUC_PR)

In [None]:
# Extract the nodes with the highest and the lowest anomaly score and build
# their second-order ego networks for plotting.

# Scalar (row, column) access keeps the original type of the node identifier;
# selecting the whole row first would coerce it to the row's common dtype.
node_max = df_test.loc[df_test["anomaly_score"].idxmax(), "node"]
node_min = df_test.loc[df_test["anomaly_score"].idxmin(), "node"]

# G_reduced is directed, so `undirected=True` is required for the neighbourhood
# to follow both incoming and outgoing edges; without it only outgoing edges are
# followed. This matches the undirected radius-2 neighbourhood used when the
# measures were computed.
G_ego_max = nx.ego_graph(G_reduced, node_max, 2, undirected=True)
G_ego_min = nx.ego_graph(G_reduced, node_min, 2, undirected=True)


In [None]:
# Draw the ego network of the node with the highest anomaly score.
nx.draw(G_ego_max, with_labels=True)

In [None]:
# Draw the ego network of the node with the lowest anomaly score.
nx.draw(G_ego_min, with_labels=True)

In [None]:
import numpy as np
from scipy.stats import t

# Correlation matrix between the anomaly score and the "Is Laundering" column of
# laundering_combined. No earlier cell defines `df_corr`, so it is built here.
# The laundering values are aligned to the rows of results_df through its index
# (the same lookup used when the labels were built) and then joined to df_test
# on "node".
# NOTE(review): assumes results_df["node"] is unique and that this is the
# correlation the test was meant to use - confirm.
laundering_per_node = results_df[["node"]].assign(
    **{"Is Laundering": laundering_combined.loc[results_df.index, "Is Laundering"].to_numpy()}
)
corr_input = df_test[["node", "anomaly_score"]].merge(laundering_per_node, on="node")
df_corr = corr_input[["anomaly_score", "Is Laundering"]].corr()

# t statistic of a Pearson correlation r over N observations (N - 2 degrees of freedom).
r = df_corr.loc["anomaly_score", "Is Laundering"]
N = len(df_test)
t_stat = r * np.sqrt((N - 2) / (1 - r**2))
print(t_stat)

# Two-tailed t-test
# The survival function is used instead of 1 - cdf, which rounds to exactly 0
# once the cdf is within floating-point precision of 1.
p = 2 * t.sf(np.abs(t_stat), N - 2)
print(p)
if p < 0.05:
    print("Reject null hypothesis")
    print("There is a significant correlation between anomaly score and laundering")
else:
    print("Fail to reject null hypothesis")
    print("There is no significant correlation between anomaly score and laundering")