## Real-World Data Experiment

In [48]:
import numpy as np
import networkx as nx
import pandas as pd
import igraph as ig
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

%matplotlib inline

# CSV file the notebook reads the transaction table from.
input_data_path = "../datasets/real/Real-Agg_Trans.csv"
# CSV file the scored and labelled transactions are written to.
results_path = "../results/Real-Agg_scores.csv"

In [22]:
# Load the raw transaction table; fields in the file are separated by ';'.
transactions = pd.read_csv(input_data_path, sep=';')

In [23]:
# Map the raw column names onto the source/target/amount names used by the
# rest of the notebook.
column_aliases = {
    'start_id': 'source',
    'end_id': 'target',
    'total': 'amount',
}
transactions = transactions.rename(columns=column_aliases)

# The row index doubles as the transaction identifier.
transactions["transaction_id"] = transactions.index

### Extract Weakly Connected Components (WCC)

In [24]:
# Edge table for the transaction graph: source/target are the endpoints, the
# remaining columns travel along as edge attributes.
edges = transactions.loc[:, ['source', 'target', 'amount', 'year_from', 'year_to', 'count', 'transaction_id']]

# Directed graph whose vertices are identified by name (use_vids=False)
# rather than by integer vertex id.
graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)
all_nodes = pd.DataFrame([vertex["name"] for vertex in graph.vs], columns=["name"])

# Weakly connected components, largest first. Each entry is the component
# itself (its vertex indices), not a (component, size) tuple, so that
# weak_components[0] can be passed straight to graph.vs[...] when the largest
# component is extracted later in the notebook.
weak_components = sorted(graph.components(mode="weak"), key=len, reverse=True)

In [None]:
# Size of the whole graph; denominators for the percentage columns below.
total_nodes = graph.vcount()
total_edges = graph.ecount()

# Weakly connected components ordered from largest to smallest.
weak_components = list(graph.components(mode="weak"))
weak_components.sort(key=len, reverse=True)

# Describe the three largest components: node/edge counts and their share of
# the full graph.
top_components = []
for rank, members in enumerate(weak_components[:3], start=1):
    node_count = len(members)
    edge_count = graph.subgraph(members).ecount()
    node_share = (node_count / total_nodes) * 100
    edge_share = (edge_count / total_edges) * 100

    summary_row = {
        "Component": rank,
        "# Nodes": node_count,
        "% of Total Nodes": f"{node_share:.4f}%",
        "# Edges": edge_count,
        "% of Total Edges": f"{edge_share:.4f}%",
    }
    top_components.append(summary_row)

top_components_df = pd.DataFrame(top_components)
print(top_components_df)

In [28]:
# Restrict the data to the largest weakly connected component: an edge is
# kept only when both of its endpoints belong to that component.
largest_weak_component = weak_components[0]
largest_component_nodes = set(graph.vs[largest_weak_component]["name"])

source_in_component = edges["source"].isin(largest_component_nodes)
target_in_component = edges["target"].isin(largest_component_nodes)
filtered_transactions = edges[source_in_component & target_in_component]

# Work on an independent copy so later column assignments do not touch `edges`.
transactions = filtered_transactions.copy()

### Feature Engineering

In [30]:
# --- Per-edge duration and yearly averages --------------------------------
# The year span is inclusive on both ends, hence the +1.
span_in_years = transactions['year_to'] - transactions['year_from'] + 1
transactions['years'] = span_in_years
transactions['amount_per_year'] = transactions['amount'].div(transactions['years'])
transactions['count_per_year'] = transactions['count'].div(transactions['years'])

sub_df = transactions[["source", "target", "amount"]]

# --- Sender-side statistics (one row per source) --------------------------
src_group = (
    sub_df.groupby("source")
    .agg(
        total_sent=("amount", "sum"),
        avg_sent=("amount", "mean"),
        stddev_sent=("amount", "std"),
        src_total_counterparties=("target", "nunique"),
    )
    .reset_index()
)

# --- Receiver-side statistics (one row per target) ------------------------
dst_group = (
    sub_df.groupby("target")
    .agg(
        total_received=("amount", "sum"),
        avg_received=("amount", "mean"),
        stddev_received=("amount", "std"),
        dst_total_counterparties=("source", "nunique"),
    )
    .reset_index()
)

# Attach the per-node statistics to every edge.
transactions = pd.merge(transactions, src_group, on="source", how="left")
transactions = pd.merge(transactions, dst_group, on="target", how="left")

# --- Edge amount relative to the endpoint totals and averages (in %) ------
relative_amount_features = [
    ('percentage_of_total_sent', 'total_sent'),
    ('percentage_of_total_received', 'total_received'),
    ('percentage_of_avg_sent', 'avg_sent'),
    ('percentage_of_avg_received', 'avg_received'),
]
for feature_name, reference_column in relative_amount_features:
    transactions[feature_name] = (transactions['amount'] / transactions[reference_column]) * 100

# --- Row counts per (source, target) pair, per source and per target ------
total_interactions = (
    transactions.groupby(['source', 'target'])
    .size()
    .reset_index(name='total_interactions')
)
transactions = transactions.merge(total_interactions, how='left', on=['source', 'target'])
del total_interactions

src_interactions = transactions.groupby('source').size().reset_index(name='src_interactions')
dst_interactions = transactions.groupby('target').size().reset_index(name='dst_interactions')

transactions = transactions.merge(src_interactions, on='source', how='left')
transactions = transactions.merge(dst_interactions, on='target', how='left')

# Distinct targets of the source, divided by the pair's interaction count.
distinct_targets_per_source = transactions.groupby('source')['target'].transform('nunique')
transactions['counterparty_diversity'] = (
    distinct_targets_per_source / transactions['total_interactions']
)

In [31]:
# Small constant added to denominators below to guard against division by zero.
eps = 1e-10

# Interactions of the (source, target) pair per year covered by the edge.
transactions['transaction_frequency'] = transactions['total_interactions'] / transactions['years']

# Standard deviation of the sender's amounts relative to their mean.
sender_mean = transactions['avg_sent'] + eps
transactions['transaction_amount_variance'] = transactions['stddev_sent'] / sender_mean

# Sender's total outflow relative to the receiver's total inflow.
receiver_total = transactions['total_received'] + eps
transactions['transaction_ratio'] = transactions['total_sent'] / receiver_total

# Combined distinct counterparties of both endpoints per pair interaction.
combined_counterparties = (
    transactions['src_total_counterparties'] + transactions['dst_total_counterparties']
)
pair_interactions = transactions['total_interactions'] + eps
transactions['unique_counterparty_ratio'] = combined_counterparties / pair_interactions

In [32]:
# Rebuild the transaction network as a networkx directed graph (this rebinds
# `graph`, previously the igraph object) and compute per-node scores.
graph = nx.from_pandas_edgelist(
    transactions,
    source='source',
    target='target',
    create_using=nx.DiGraph(),
)

degree_centrality = nx.degree_centrality(graph)
pagerank_scores = nx.pagerank(graph)

# Attach each node-level score to the edge, once for each endpoint.
node_metrics = (
    ('degree_centrality', degree_centrality),
    ('pagerank', pagerank_scores),
)
for metric_name, node_scores in node_metrics:
    for endpoint in ('source', 'target'):
        transactions[f'{endpoint}_{metric_name}'] = transactions[endpoint].map(node_scores)

### Feature Selection

In [33]:
# Columns fed to the anomaly detector.
selected_features = [
    'amount', 'count', 'years', 'amount_per_year', 'count_per_year',
    'total_sent', 'avg_sent', 'stddev_sent', 'total_received',
    'avg_received', 'stddev_received', 'src_total_counterparties',
    'dst_total_counterparties', 'counterparty_diversity',
    'percentage_of_total_sent', 'percentage_of_total_received',
    'percentage_of_avg_sent', 'percentage_of_avg_received',
    'transaction_ratio', 'transaction_frequency',
    'transaction_amount_variance', 'unique_counterparty_ratio',
    'source_degree_centrality', 'target_degree_centrality',
    'source_pagerank', 'target_pagerank']

# Identifier columns kept aside so scores can be joined back to transactions.
X_ids = transactions[['transaction_id', 'amount']]
X_train = transactions[selected_features]

# Several features are ratios whose denominator is not padded with an epsilon,
# so a zero denominator yields +/-inf. fillna() alone leaves infinities in
# place and the estimator rejects non-finite input, so map them to NaN first
# and then treat every undefined value as 0.
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Pairwise correlation between all selected features.
correlation_matrix = X_train.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title("Correlation Matrix of Selected Features")
plt.show()

# Collect each highly correlated pair exactly once (col1 precedes col2 in the
# column order). Scanning the full matrix would list every pair in both
# directions, and dropping col2 for each would then remove BOTH members of
# the pair instead of keeping one representative.
feature_names = list(correlation_matrix.columns)
high_correlation_pairs = [
    (col1, col2)
    for position, col1 in enumerate(feature_names)
    for col2 in feature_names[position + 1:]
    if abs(correlation_matrix.loc[col1, col2]) > 0.9
]

# Keep the earlier feature of every pair and drop the later one. A feature
# that has itself already been dropped does not cause further removals.
features_to_drop = set()
for col1, col2 in high_correlation_pairs:
    if col1 not in features_to_drop:
        features_to_drop.add(col2)

X_train_reduced = X_train.drop(columns=list(features_to_drop))

print("\nReduced features after removing highly correlated pairs:")
print(X_train_reduced.columns.tolist())

### Isolation Forest Model

#### Instantiate and train model

In [38]:
# Isolation Forest configuration
trees = 100      # n_estimators
samples = 0.01   # max_samples, given as a float
jobs = -1        # n_jobs
state = 42       # random_state, fixed for reproducibility

isolation_forest_params = {
    "n_estimators": trees,
    "max_samples": samples,
    "n_jobs": jobs,
    "random_state": state,
}
model = IsolationForest(**isolation_forest_params)

In [None]:
# Train the IF model on the reduced feature matrix. Only features are passed
# (no labels), so the fit is unsupervised.
model.fit(X_train_reduced)

#### Perform inference and export predictions

In [41]:
# Use the trained IF model to compute anomaly scores for the training data.
# The same matrix that was used for fitting is scored here; there is no
# separate hold-out set in this notebook.
scores = model.decision_function(X_train_reduced)

In [42]:
# Flip the sign of the Isolation Forest scores and min-max scale them to
# [0, 1], so that higher values indicate anomalies.
flipped_scores = -scores
lowest = flipped_scores.min()
highest = flipped_scores.max()
scaled_scores = (flipped_scores - lowest) / (highest - lowest)
inverted_scores = scaled_scores

In [43]:
# X_ids already carries 'amount'. If 'amount' also survives the feature
# reduction, a plain concat would produce two columns with the same name, so
# drop any overlapping columns from the feature side before joining.
feature_columns = X_train_reduced.drop(columns=list(X_ids.columns), errors="ignore")
evaluation = pd.concat([X_ids, feature_columns], axis=1)

# scaled_scores is positional (one value per row, in row order).
evaluation["scores"] = scaled_scores

# Most anomalous transactions first.
evaluation = evaluation.sort_values(by='scores', ascending=False)

In [44]:
# Use the 90th percentile of the scores as the cut-off between normal and
# abnormal transactions: scores at or above it are labelled 1, the rest 0.
THRESHOLD = 0.9
threshold = evaluation['scores'].quantile(THRESHOLD)

is_anomalous = evaluation['scores'] >= threshold
prediction = is_anomalous.astype(int).tolist()
evaluation['prediction'] = prediction

In [49]:
# Persist the identifier, a few descriptive columns, the score and the label.
result_columns = ['transaction_id', 'amount', 'count', 'years', 'total_sent', 'scores', 'prediction']
results = evaluation.loc[:, result_columns].copy()
results.to_csv(results_path, index=False)

## Evaluation

In [50]:
# Read the dataframe containing scores and prediction labels back from the
# CSV written at results_path.
results = pd.read_csv(results_path)

In [None]:
# Plot a reproducible 50% sample of the results: anomaly score against
# transaction amount, coloured by the predicted label.
sampled_results = results.sample(frac=0.5, random_state=42)
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=sampled_results,
    x='amount',
    y='scores',
    hue='prediction',
    palette='viridis',
    s=50,
    edgecolor='k',
    alpha=0.7,
    legend=False,
    ax=ax,
)

ax.set_title("Scores vs Transaction Amount")
ax.set_xlabel("Transaction Amount")
ax.set_ylabel("Scores")
ax.grid(False)
plt.show()

#### Plot Results

In [7]:
# Reload the scores this notebook saved (results_path) rather than a
# separately named file, so the plots always reflect the current run.
results = pd.read_csv(results_path)

# Bring the engineered transaction columns (source, target, ...) alongside
# the scores. Columns present in both frames receive _x (results) and
# _y (transactions) suffixes.
results = results.merge(transactions, on="transaction_id", how="left")

In [10]:
# Split the scored transactions by predicted label (1 = anomalous, 0 = normal).
predicted_label = results['prediction']
anomalous = results.loc[predicted_label == 1]
normal = results.loc[predicted_label == 0]

In [None]:
# Compare the two groups by total and median transaction amount. 'amount_x'
# is the amount column that came from the results frame in the merge.
normal_amounts = normal['amount_x']
anomalous_amounts = anomalous['amount_x']

normal_total = normal_amounts.sum()
anomalous_total = anomalous_amounts.sum()
print("Total amount of transactions identified normal ", normal_total)
print("Total amount of transactions identified anomalous ", anomalous_total)

normal_median = normal_amounts.median()
anomalous_median = anomalous_amounts.median()
print("Median amount  of transactions identified normal ", normal_median)
print("Median amount  of transactions identified anomalous ", anomalous_median)

#### Weakly Connected Components from anomalous transactions graph only

In [None]:
# Build a directed graph from the transactions predicted as anomalous only.
edge_columns = ['source', 'target', 'transaction_id', 'prediction']
edges = results.loc[results['prediction'] == 1, edge_columns]

graph = ig.Graph.DataFrame(edges, use_vids=False, directed=True)

# Size of the anomalous subgraph; denominators for the percentages below.
total_nodes = graph.vcount()
total_edges = graph.ecount()

# Weakly connected components, largest first.
weak_components = list(graph.components(mode="weak"))
weak_components.sort(key=len, reverse=True)

# Node and edge counts of the three largest components.
component_sizes = [
    (len(members), graph.subgraph(members).ecount())
    for members in weak_components[:3]
]

top_components = [
    {
        "Component": rank,
        "# Nodes": node_count,
        "% of Total Nodes": f"{(node_count / total_nodes) * 100:.4f}%",
        "# Edges": edge_count,
        "% of Total Edges": f"{(edge_count / total_edges) * 100:.4f}%",
    }
    for rank, (node_count, edge_count) in enumerate(component_sizes, start=1)
]

top_components_df = pd.DataFrame(top_components)
print(top_components_df)