In [2]:
import os
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from tqdm import tqdm

from NEExT import NEExT
from NEExT.builders import EmbeddingBuilder
from NEExT.collections import EgonetCollection
from NEExT.features import NodeFeatures, StructuralNodeFeatures
from NEExT.io import GraphIO
from NEExT.ml_models import MLModels, OutlierDataset, OutlierDetector

%reload_ext autoreload
%autoreload 2

In [3]:
def semi_supervised_set(features_df, col="is_outlier", hide_frac={0: 0.1, 1: 0.1}, seed=42):
    _features_df = features_df.copy()
    np.random.seed(seed)

    for _cls, frac in hide_frac.items():
        mask = _features_df[col] == _cls
        drop_indices = np.random.choice(_features_df[mask].index, size=int(len(_features_df[mask]) * frac), replace=False)
        _features_df.loc[drop_indices, col] = -1

    return _features_df


def initialize_graph(graph_data, frac):
    edges_df = pd.read_csv(graph_data["edge_file_path"])
    mapping_df = pd.read_csv(graph_data["node_graph_mapping_file_path"])
    features_df = pd.read_csv(graph_data["features_file_path"])

    ground_truth_df = features_df.copy()
    features_df = semi_supervised_set(ground_truth_df, hide_frac={0: frac[0], 1: frac[1]})
    ground_truth_df = (
        ground_truth_df.rename(columns={"node_id": "graph_id"})[["graph_id", "is_outlier"]].sort_values("graph_id").reset_index(drop=True)
    )
    return edges_df, mapping_df, features_df, ground_truth_df


def build_features(subgraph_collection, feature_vector_length, feature_list):
    structural_node_features = StructuralNodeFeatures(
        graph_collection=subgraph_collection,
        feature_list=["all"],
        feature_vector_length=feature_vector_length,
        n_jobs=8,
        show_progress=False,
    )
    node_features = NodeFeatures(
        subgraph_collection,
        feature_list=feature_list,
        show_progress=False,
    )
    structural_features = structural_node_features.compute()
    features = node_features.compute()
    return structural_features, features


def build_embeddings(subgraph_collection, structural_features, features, strategy, structural_embedding_dimension, feature_embedding_dimension):
    emb_builder = EmbeddingBuilder(
        subgraph_collection,
        strategy=strategy,
        structural_features=structural_features,
        features=features,
    )
    embeddings = emb_builder.compute(structural_embedding_dimension, feature_embedding_dimension)
    return embeddings

In [4]:
def build_outlier_detector(strategy='structural_embedding', structural_embedding_dimension=1, feature_embedding_dimension=1, top_k=1, threshold=0.0):
    embeddings = build_embeddings(
        subgraph_collection,
        structural_features,
        features,
        strategy=strategy,
        structural_embedding_dimension=structural_embedding_dimension,
        feature_embedding_dimension=feature_embedding_dimension,
    )
    dataset = OutlierDataset(subgraph_collection, embeddings, standardize=False)
    detector = OutlierDetector(top_k=top_k, threshold=threshold)
    return dataset, detector


def objective_outlier_detector(trial: optuna.Trial):
    params = dict(
        # strategy=trial.suggest_categorical(
        #     "strategy",
        #     ["structural_embedding",  
        #     #  "only_egonet_node_features", "separate_embedding", "combined_embedding", "merge_egonet_node_features",
        #      ],
        # ),
        structural_embedding_dimension=trial.suggest_int("structural_embedding_dimension", 1, 20),
        # feature_embedding_dimension=trial.suggest_int("feature_embedding_dimension", 1, 1),
        top_k=trial.suggest_int("top_k", 1, 20),
        threshold=trial.suggest_float("threshold", 0.0, .5, step=0.02),
    )

    dataset, detector = build_outlier_detector(**params)

    bal_accuracy = cross_val_score(
        detector,
        dataset.X_labeled,
        dataset.y_labeled,
        cv=StratifiedKFold(5),
        n_jobs=-1,
        scoring=make_scorer(balanced_accuracy_score),
    )
    return bal_accuracy.mean()

from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
class DumbModel(BaseEstimator):
    def fit(self, X, y):
        return self
    
    def predict(self, X):
        return np.ones(len(X))


def objective_knn(trial: optuna.Trial):
    params = dict(
        structural_embedding_dimension=trial.suggest_int("structural_embedding_dimension", 1, 20),
        n_neighbors=trial.suggest_int("n_neighbors", 1, 50),
    )

    embeddings = build_embeddings(
        subgraph_collection,
        structural_features,
        features,
        strategy='structural_embedding',
        structural_embedding_dimension=params['structural_embedding_dimension'],
        feature_embedding_dimension=1,
    )
    dataset = OutlierDataset(subgraph_collection, embeddings, standardize=False)
    detector = KNeighborsClassifier(n_neighbors=params['n_neighbors'])
    detector = DumbModel()
    bal_accuracy = cross_val_score(
        detector,
        dataset.X_labeled,
        dataset.y_labeled,
        cv=StratifiedKFold(5),
        n_jobs=-1,
        scoring=make_scorer(balanced_accuracy_score),
    )
    return bal_accuracy.mean()


import lightgbm as lgbm


def objective_lgbm(trial: optuna.Trial):
    params = dict(
        structural_embedding_dimension=trial.suggest_int("structural_embedding_dimension", 1, 20),
        radius=trial.suggest_float("radius", 0, 10),
    )

    embeddings = build_embeddings(
        subgraph_collection,
        structural_features,
        features,
        strategy='structural_embedding',
        structural_embedding_dimension=params['structural_embedding_dimension'],
        feature_embedding_dimension=1,
    )
    dataset = OutlierDataset(subgraph_collection, embeddings, standardize=False)
    detector = lgbm.LGBMClassifier(max_depth=30, min_data_in_leaf=1)
     
    bal_accuracy = cross_val_score(
        detector,
        dataset.X_labeled,
        dataset.y_labeled,
        cv=StratifiedKFold(5),
        n_jobs=-1,
        scoring=make_scorer(balanced_accuracy_score),
    )
    return bal_accuracy.mean()



def score_unlabeled_gt(build_model, ground_truth_df, study):
    dataset, detector = build_model(**study.best_trial.params)
    detector.fit(dataset.X_labeled, dataset.y_labeled)

    out = detector.predict_full_df(dataset.unlabeled_graphs, dataset.X_unlabeled)
    out_unlab = out.merge(ground_truth_df[ground_truth_df["graph_id"].isin(out["graph_id"])]).sort_values("is_outlier", ascending=False)
    bl_acc = balanced_accuracy_score(out_unlab["is_outlier"], out_unlab["pred"])
    return bl_acc

In [5]:
graph_io = GraphIO()
path = '../local/data/simulated'
name = 'abcdo_data_1000_200_0.1'
frac = (.8, .3)

graph_data = {
    'name': name,
    'edge_file_path': f'{path}/{name}/edges.csv',
    'node_graph_mapping_file_path': f'{path}/{name}/graph_mapping.csv',
    'features_file_path': f'{path}/{name}/features.csv',
    'skip_features': ['community_id', 'random_community_feature'],
    'feature_list':[],
    'target': 'is_outlier'
}
edges_df, mapping_df, features_df, ground_truth_df = initialize_graph(graph_data, frac)

In [6]:
graph_collection = graph_io.load_from_dfs(
    edges_df=edges_df,
    node_graph_df=mapping_df,
    node_features_df=features_df,
    graph_type="igraph",
)
subgraph_collection = EgonetCollection()
subgraph_collection.create_egonets_from_graphs(
    graph_collection=graph_collection,
    egonet_target=graph_data["target"],
    egonet_algorithm="k_hop_egonet",
    skip_features=graph_data["skip_features"],
    max_hop_length=1,
    # n_iterations=n_iterations,
    # resolution=resolution,
)
structural_features, features = build_features(subgraph_collection, feature_vector_length=6, feature_list=graph_data["feature_list"])

In [7]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective_outlier_detector, n_trials=50, n_jobs=5)

In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_knn, n_trials=50, n_jobs=5)

[I 2025-03-22 19:11:16,251] A new study created in memory with name: no-name-bba59da0-f6e4-46ca-919e-bf2efb94097e


[I 2025-03-22 19:11:24,595] Trial 1 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 1, 'n_neighbors': 49}. Best is trial 1 with value: 0.5.
[I 2025-03-22 19:11:24,921] Trial 0 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 3, 'n_neighbors': 42}. Best is trial 1 with value: 0.5.
[I 2025-03-22 19:11:25,002] Trial 2 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 9, 'n_neighbors': 36}. Best is trial 1 with value: 0.5.
[I 2025-03-22 19:11:25,549] Trial 3 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 17, 'n_neighbors': 7}. Best is trial 1 with value: 0.5.
[I 2025-03-22 19:11:25,569] Trial 5 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 16, 'n_neighbors': 38}. Best is trial 1 with value: 0.5.
[I 2025-03-22 19:11:26,007] Trial 6 finished with value: 0.5 and parameters: {'structural_embedding_dimension': 12, 'n_neighbors': 46}. Best is trial 1 with va

KeyboardInterrupt: 

In [9]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_lgbm, n_trials=50, n_jobs=5)

[I 2025-03-22 17:23:22,168] A new study created in memory with name: no-name-ddfc14c5-f49e-4928-9f21-be718cf79a34


[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.258738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 601
[LightGBM] [Info] Number of positive: 112, number of negative: 128
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.268798 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 656
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 11[LightGBM] [Info] Number of data points in the trai

[W 2025-03-22 17:34:50,434] Trial 1 failed with parameters: {'structural_embedding_dimension': 17, 'radius': 5.5560190797773785} because of the following error: KeyboardInterrupt().
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/quak/miniconda3/envs/neext/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
        ^^^^^^^^^^^
  File "/home/quak/miniconda3/envs/neext/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/quak/miniconda3/envs/neext/lib/python3.11/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/quak/miniconda3/envs/neext/lib/python3.11/site-packages/joblib/parallel.py", line 598, in <listcomp>
    return [fu

KeyboardInterrupt: 

In [None]:
# for each 

In [20]:
embeddings = build_embeddings(
    subgraph_collection,
    structural_features,
    features,
    strategy='structural_embedding',
    structural_embedding_dimension=3,
    feature_embedding_dimension=1,
)
dataset = OutlierDataset(subgraph_collection, embeddings, standardize=False)
model = RadiusNeighborsClassifier(10)

In [21]:
model.fit(dataset.X_labeled, dataset.y_labeled)

In [22]:
model.predict(dataset.X_unlabeled)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.