In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from NEExT import NEExT

# Load (or reload) IPython's autoreload extension and select mode 2, which
# re-imports modules before each cell executes so edits to imported source
# files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Locations of the three CSV tables of the ABCDO-full dataset.
edge_file_path = "../data/ABCDO-full/edges.csv"
node_graph_mapping_file_path = "../data/ABCDO-full/graph_mapping.csv"
features_file_path = "../data/ABCDO-full/features.csv"

# Read the tables in order (edges, node/graph mapping, node features) and
# bind each one to its own DataFrame.
edges, mapping, _features = map(
    pd.read_csv,
    (edge_file_path, node_graph_mapping_file_path, features_file_path),
)

In [None]:
import os


def semi_supervised_set(df, col="is_outlier", hide_frac={0: 0.1, 1: 0.1}, seed=42):
    """Return a copy of ``df`` in which part of the labels in ``col`` is hidden.

    For every ``class -> fraction`` pair in ``hide_frac``, the function draws
    ``int(rows_of_that_class * fraction)`` rows without replacement among the
    rows currently labelled with that class and replaces their ``col`` value
    with ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the label column. It is not modified; the masking is
        applied to a copy.
    col : str
        Name of the label column.
    hide_frac : dict
        Maps a label value to the fraction of its rows to hide. Classes are
        processed in the dictionary's iteration order. The dictionary is only
        read, never modified.
    seed : int
        Seed of the private random generator used for the draws.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose hidden labels are set to ``-1``.
    """
    # A private legacy generator produces the same draws as
    # ``np.random.seed(seed)`` followed by ``np.random.choice`` would, but it
    # leaves NumPy's global random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Work on a copy so the caller's frame keeps its original labels.
    masked = df.copy()

    for _cls, frac in hide_frac.items():
        # Candidates are looked up on the working copy, so rows hidden for an
        # earlier class are no longer eligible for a later one.
        candidates = masked.index[masked[col] == _cls]
        n_hidden = int(len(candidates) * frac)
        hidden = rng.choice(candidates, size=n_hidden, replace=False)
        masked.loc[hidden, col] = -1

    return masked


# Replace part of the labels with -1 using the helper's default settings.
_features = semi_supervised_set(_features)

# Make sure the target directory exists before anything is written into it.
os.makedirs("../data/ABCDO-partial/", exist_ok=True)

# From here on the path variables refer to the partially-labelled copy of the
# dataset; the cells below read the data back through these names.
edge_file_path, node_graph_mapping_file_path, features_file_path = (
    "../data/ABCDO-partial/edges.csv",
    "../data/ABCDO-partial/graph_mapping.csv",
    "../data/ABCDO-partial/features.csv",
)

# Persist the three tables without the DataFrame index column.
edges.to_csv(edge_file_path, index=False)
mapping.to_csv(node_graph_mapping_file_path, index=False)
_features.to_csv(features_file_path, index=False)

In [4]:
from NEExT.collections import EgonetCollection
from NEExT.features import NodeFeatures, StructuralNodeFeatures
from NEExT.io import GraphIO
from NEExT.ml_models import MLModels

# Node attribute handed to the egonet builder as its target.
target = "is_outlier"
# Not referenced by the cells visible here; kept for any later use.
sample_size = 5

# Step 1: read the graphs from the three CSV files currently referenced by the
# path variables.
graph_io = GraphIO()
graph_collection = graph_io.read_from_csv(
    graph_type="igraph",
    edges_path=edge_file_path,
    node_graph_mapping_path=node_graph_mapping_file_path,
    node_features_path=features_file_path,
)

# Step 2: derive one-hop egonets from the loaded graphs, leaving the
# "community_id" feature out.
subgraph_collection = EgonetCollection()
subgraph_collection.create_egonets_from_graphs(
    graph_collection=graph_collection,
    egonet_algorithm="k_hop_egonet",
    max_hop_length=1,
    egonet_target=target,
    skip_features=["community_id"],
)

In [5]:
# Builder for structural node features over the egonet collection.
# NOTE(review): feature_list=["all"] presumably selects every structural
# feature NEExT offers, and n_jobs=8 presumably sets the number of parallel
# workers — confirm against the NEExT documentation.
structural_node_features = StructuralNodeFeatures(
    graph_collection=subgraph_collection,
    feature_list=["all"],
    feature_vector_length=3,
    n_jobs=8,
)
# Builder for the "random_community_feature" node feature on the same egonets.
node_features = NodeFeatures(
    subgraph_collection,
    feature_list=["random_community_feature"],
)

# Both builders are created before either compute() runs; keep this order
# unless it is confirmed that compute() does not alter the shared collection.
structural_features = structural_node_features.compute()
features = node_features.compute()

Computing structural node features:   0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
from NEExT.builders import EmbeddingBuilder

# Build embeddings and fit a semi-supervised model for each embedding strategy.
# Only "structural_embedding" is listed, so the loop body runs once; the names
# bound inside it (notably `embeddings`) stay available to the cells below.
for s in ["structural_embedding"]:
    emb_builder = EmbeddingBuilder(
        subgraph_collection,
        strategy=s,
        structural_features=structural_features,
        features=features,
    )
    # NOTE(review): the two positional arguments (10, 6) are not named here —
    # check EmbeddingBuilder.compute for their meaning before changing them.
    embeddings = emb_builder.compute(10, 6)

    ml_models = MLModels(
        graph_collection=subgraph_collection,
        embedding=embeddings,
        model_type="semi-supervised",
    )

    # `results` is not read by any cell visible here.
    results = ml_models.compute()
    # print(s, f"Model trained with average accuracy: {np.mean(results['accuracy']):.4f}")

In [94]:
from NEExT.ml_models import OutlierDetector


# Build an outlier detector from the egonet collection and the embeddings
# computed above, with standardization switched off.
detector = OutlierDetector(subgraph_collection, embeddings, standardize=False)
# The evaluation cell below reads the "graph_id" and "pred" columns of this frame.
out = detector.predict_full_df()


In [95]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Reload the feature table of the ABCDO-full dataset to score the predictions.
features_file_path = "../data/ABCDO-full/features.csv"
_features = pd.read_csv(features_file_path)

# Keep only the nodes that appear among the predictions, and expose the node id
# under the key name used by the prediction frame.
scored_nodes = _features["node_id"].isin(out["graph_id"])
ground_truth = _features[scored_nodes].rename(columns={"node_id": "graph_id"})

# Attach the reloaded columns to the predictions and list outliers first.
out = out.merge(ground_truth).sort_values("is_outlier", ascending=False)

# Cell output: balanced accuracy of the predictions against the reloaded labels.
balanced_accuracy_score(out["is_outlier"], out["pred"])

0.8210526315789474