In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from NEExT import NEExT

%reload_ext autoreload
%autoreload 2

In [2]:
# Locations of the fully labelled ("full") dataset, relative to this notebook.
edge_file_path, node_graph_mapping_file_path, features_file_path = (
    "../data/ABCDO-full/edges.csv",
    "../data/ABCDO-full/graph_mapping.csv",
    "../data/ABCDO-full/features.csv",
)

# Read the three tables in the same order as their paths are listed above.
edges_df, mapping_df, features_df = (
    pd.read_csv(csv_path)
    for csv_path in (edge_file_path, node_graph_mapping_file_path, features_file_path)
)

In [3]:
import os


def semi_supervised_set(df, col="is_outlier", hide_frac={0: 0.1, 1: 0.1}, seed=42):
    """Return a copy of ``df`` with a share of the labels in ``col`` hidden.

    For each ``class_value -> fraction`` entry in ``hide_frac``,
    ``int(rows_of_that_class * fraction)`` rows currently labelled
    ``class_value`` are drawn without replacement and their value in ``col``
    is overwritten with ``-1``.

    Args:
        df: Input frame; it is copied, never modified.
        col: Name of the label column to mask.
        hide_frac: Mapping from class value to the fraction of that class to
            hide. The default dict is only read, never mutated.
        seed: Seed for the random draws; the same seed yields the same mask.

    Returns:
        A new DataFrame equal to ``df`` except for the hidden labels.

    Raises:
        ValueError: Propagated from the sampler when a fraction asks for more
            rows than the class contains (fraction > 1).
    """
    # A private legacy generator produces exactly the draws that
    # `np.random.seed(seed)` + `np.random.choice` would, but leaves the global
    # NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    _df = df.copy()

    for _cls, frac in hide_frac.items():
        # Recomputed on every pass so rows hidden earlier (-1) are not re-drawn.
        mask = (_df[col] == _cls).to_numpy()
        candidates = _df.index[mask]
        if len(candidates) == 0:
            # Class absent from the frame: nothing to hide, nothing to draw.
            continue
        drop_indices = rng.choice(candidates, size=int(len(candidates) * frac), replace=False)
        _df.loc[drop_indices, col] = -1

    return _df

# Build the semi-supervised ("partial") dataset and persist it on disk.

# This cell rebinds `features_df` to the masked frame. Running it a second time
# without reloading would take the already-hidden labels (-1) as ground truth,
# hide a further share of the remaining labels, and overwrite the CSVs with the
# corrupted data. Fail loudly instead of silently corrupting the evaluation.
if (features_df["is_outlier"] == -1).any():
    raise RuntimeError(
        "features_df already contains hidden labels (-1); "
        "re-run the data-loading cell before building the partial dataset."
    )

# Keep the fully labelled frame, then hide part of the labels for training.
ground_truth_df = features_df.copy()
features_df = semi_supervised_set(ground_truth_df)

# Ground-truth table: one row per `node_id`, exposed under the name `graph_id`,
# ordered by that id with a fresh 0..n-1 index.
ground_truth_df = (
    ground_truth_df.rename(columns={"node_id": "graph_id"})[["graph_id", "is_outlier"]]
    .sort_values("graph_id")
    .reset_index(drop=True)
)

# NOTE: from here on the path variables point at the partial dataset.
edge_file_path = "../data/ABCDO-partial/edges.csv"
node_graph_mapping_file_path = "../data/ABCDO-partial/graph_mapping.csv"
features_file_path = "../data/ABCDO-partial/features.csv"
ground_truth_labels_file_path = "../data/ABCDO-partial/ground_truth_labels.csv"

# Edges and mapping are written unchanged; only the feature labels are masked.
os.makedirs("../data/ABCDO-partial/", exist_ok=True)
edges_df.to_csv(edge_file_path, index=False)
mapping_df.to_csv(node_graph_mapping_file_path, index=False)
features_df.to_csv(features_file_path, index=False)
ground_truth_df.to_csv(ground_truth_labels_file_path, index=False)

In [4]:
from NEExT.collections import EgonetCollection
from NEExT.features import NodeFeatures, StructuralNodeFeatures
from NEExT.io import GraphIO
from NEExT.ml_models import MLModels

# Experiment settings.
target = "is_outlier"  # handed to the egonet builder as `egonet_target`
sample_size = 5  # not referenced anywhere in this cell

# Load the graph collection straight from the in-memory frames
# (`features_df` carries the partially hidden labels at this point).
graph_io = GraphIO()
graph_collection = graph_io.load_from_dfs(
    graph_type="igraph",
    edges_df=edges_df,
    node_graph_df=mapping_df,
    node_features_df=features_df,
)

# Disabled alternative: load the same inputs from the CSV files on disk.
# graph_collection = graph_io.read_from_csv(
#     edges_path=edge_file_path,
#     node_graph_mapping_path=node_graph_mapping_file_path,
#     node_features_path=features_file_path,
#     graph_type="igraph",
# )

# Derive the egonet collection from the loaded graphs.
# NOTE(review): `max_hop_length=1` presumably limits each egonet to direct
# neighbours, and `skip_features` presumably keeps `community_id` out of the
# egonet features — confirm against the NEExT documentation.
subgraph_collection = EgonetCollection()
subgraph_collection.create_egonets_from_graphs(
    graph_collection=graph_collection,
    egonet_algorithm="k_hop_egonet",
    egonet_target=target,
    max_hop_length=1,
    skip_features=["community_id"],
)

In [5]:
# Set up the two feature extractors over the egonet collection.
# Both objects are constructed before either `compute()` is called; that order
# is left exactly as written because both share `subgraph_collection`.
structural_node_features = StructuralNodeFeatures(
    graph_collection=subgraph_collection,
    feature_list=["all"],
    feature_vector_length=3,
    n_jobs=8,  # NOTE(review): presumably the number of parallel workers — confirm
)
node_features = NodeFeatures(
    subgraph_collection,
    feature_list=["random_community_feature"],
)

# Run both extractors; the results are passed to `EmbeddingBuilder` in the
# evaluation cell as `structural_features` and `features`.
structural_features = structural_node_features.compute()
features = node_features.compute()

Computing structural node features:   0%|          | 0/1000 [00:00<?, ?it/s]

In [27]:
from NEExT.builders import EmbeddingBuilder
from NEExT.ml_models import OutlierDetector, OutlierDataset
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Fit and score the outlier detector once per embedding strategy.
for s in ("structural_embedding",):
    emb_builder = EmbeddingBuilder(
        subgraph_collection,
        strategy=s,
        structural_features=structural_features,
        features=features,
    )
    # NOTE(review): the meaning of the two positional arguments (10, 6) is not
    # visible here — confirm against `EmbeddingBuilder.compute`.
    embeddings = emb_builder.compute(10, 6)

    # Disabled alternative: evaluate through the generic MLModels wrapper.
    # ml_models = MLModels(
    #     graph_collection=subgraph_collection,
    #     embedding=embeddings,
    #     model_type="semi-supervised",
    # )
    # results = ml_models.compute()
    # print(s, f"Model trained with average accuracy: {np.mean(results['accuracy']):.4f}")

    dataset = OutlierDataset(subgraph_collection, embeddings, standardize=False)
    detector = OutlierDetector().fit(dataset.X, dataset.y)

    # Score 1: predictions for the unlabeled graphs, joined with their true labels.
    out = detector.predict_full_df(dataset.unlabeled_graphs, dataset.X_unlabeled)
    is_scored = ground_truth_df["graph_id"].isin(out["graph_id"])
    out_unlab = out.merge(ground_truth_df[is_scored])
    out_unlab = out_unlab.sort_values("is_outlier", ascending=False)
    bl_acc = balanced_accuracy_score(out_unlab["is_outlier"], out_unlab["pred"])
    print(s, f"Model trained with average balanced accuracy: {bl_acc:.4f} on unknown nodes")

    # Score 2: predictions for every row of `dataset.X`; `out` and `bl_acc` are
    # rebound here, so after the loop they hold the entire-set results.
    # NOTE(review): this compares by position, which assumes `dataset.X` rows
    # follow the `graph_id` order of `ground_truth_df` — confirm.
    out = detector.predict(dataset.X)
    bl_acc = balanced_accuracy_score(ground_truth_df["is_outlier"], out)
    print(s, f"Model trained with average balanced accuracy: {bl_acc:.4f} on entire set")

structural_embedding Model trained with average balanced accuracy: 0.8000 on unknown nodes
structural_embedding Model trained with average balanced accuracy: 0.8516 on entire set
