In [1]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import xgboost
from scipy.stats import wasserstein_distance
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from NEExT.NEExT import NEExT

In [2]:
def build_embedding(dataset_name, features=None, feat_vect_len=4):
    """Load a benchmark graph dataset with NEExT and build concatenated graph features.

    The three input CSV files (edges, node-to-graph mapping, graph labels) are read
    from the ``ugaf_experiments_data`` GitHub repository, so network access is required.

    Args:
        dataset_name: Name of the dataset folder under ``real_world_graphs`` in the
            repository (for example ``"NCI1"``).
        features: Optional iterable of feature names passed one by one to
            ``NEExT.compute_graph_feature``. Defaults to the eight features this
            notebook has always used.
        feat_vect_len: Value forwarded as ``feat_vect_len`` for every feature
            (default 4, the value previously hard-coded).

    Returns:
        A ``(df, feat_vect)`` tuple: a deep copy of ``nxt.graph_c.global_feature_vector``
        and a sliced copy of ``nxt.graph_c.global_feature_vector_cols``, taken after
        pooling with ``pool_method="concat"``.
    """
    if features is None:
        features = ["basic_expansion", "self_walk", "page_rank", "degree_centrality",
                    "closeness_centrality", "load_centrality", "eigenvector_centrality", "lsme"]

    # All three files live in the same folder; build the common prefix once.
    data_url = ("https://raw.githubusercontent.com/elmspace/ugaf_experiments_data/main/"
                "real_world_graphs/%s/processed_data/" % (dataset_name))
    edge_file = data_url + "edge_file.csv"
    graph_label_file = data_url + "graph_label_mapping_file.csv"
    node_graph_mapping_file = data_url + "node_graph_mapping_file.csv"

    nxt = NEExT(quiet_mode="off")
    nxt.load_data_from_csv(
        edge_file=edge_file,
        node_graph_mapping_file=node_graph_mapping_file,
        graph_label_file=graph_label_file,
    )

    for feat_name in features:
        nxt.compute_graph_feature(feat_name=feat_name, feat_vect_len=feat_vect_len)

    nxt.pool_graph_features(pool_method="concat")

    # Return copies so callers cannot accidentally mutate the NEExT object's state.
    df = nxt.graph_c.global_feature_vector.copy(deep=True)
    feat_vect = nxt.graph_c.global_feature_vector_cols[:]
    return df, feat_vect

In [3]:
def compute_feat_distance(data, feats, sample_size):
    """Compute per-feature Wasserstein distances between randomly paired graphs.

    Ordered pairs of two distinct graph ids are drawn at random from the unique
    values of ``data["graph_id"]`` until ``sample_size`` pairs exist. The same
    pairs are reused for every feature, so row ``k`` of the result always refers
    to the same two graphs.

    Args:
        data: DataFrame with a ``"graph_id"`` column and one column per entry of
            ``feats``.
        feats: Iterable of column names of ``data`` to compare.
        sample_size: Number of graph pairs to draw.

    Returns:
        DataFrame with one column per feature and one row per sampled pair. Each
        cell is ``scipy.stats.wasserstein_distance`` between the feature values of
        the rows belonging to the first and to the second graph of the pair.

    Raises:
        ValueError: If pairs are requested but ``data`` holds fewer than two
            distinct graph ids (previously this looped forever or failed inside
            ``random.sample``).
    """
    graph_ids = data["graph_id"].unique().tolist()

    if sample_size > 0 and len(graph_ids) < 2:
        raise ValueError("compute_feat_distance needs at least two distinct graph ids to form a pair")

    # random.sample(..., 2) yields two distinct ids directly, so no rejection loop is needed.
    graph_samples = []
    while len(graph_samples) < sample_size:
        graph_samples.append(random.sample(graph_ids, 2))

    # Slice the frame once per sampled graph instead of once per (feature, pair).
    # The sampled values ARE graph ids; they must not be used as positions in graph_ids.
    sampled_ids = {graph_id for pair in graph_samples for graph_id in pair}
    rows_by_graph = {graph_id: data[data["graph_id"] == graph_id] for graph_id in sampled_ids}

    distances = {}
    for feat in feats:
        distances[feat] = [
            wasserstein_distance(
                rows_by_graph[graph_id1][feat].tolist(),
                rows_by_graph[graph_id2][feat].tolist(),
            )
            for graph_id1, graph_id2 in graph_samples
        ]

    # Build the frame in one go rather than inserting columns one at a time.
    return pd.DataFrame(distances)

In [4]:
def select_features_with_variance_correlation(data, num_features, epsilon=1e-5):
    """Greedily rank features by high variance and low correlation with earlier picks.

    The first pick is the column with the largest variance. Every following pick
    maximises ``variance / (max_abs_corr + epsilon)``, where ``max_abs_corr`` is the
    largest absolute correlation (``Series.corr``) between the candidate and any
    already selected feature.

    Args:
        data: DataFrame whose columns are the candidate features.
        num_features: Maximum number of features to return. Values larger than the
            number of columns simply return every selectable column; values
            ``<= 0`` return an empty list.
        epsilon: Small constant added to the correlation to avoid division by zero.

    Returns:
        List of column names in selection order. Candidates are scanned in column
        order, so ties are always resolved the same way. Candidates whose score is
        NaN are never selected.
    """
    if num_features <= 0 or len(data.columns) == 0:
        return []

    # Calculate variance for each feature
    variances = data.var()

    # Step 1: Select the feature with the highest variance
    first_feature = variances.idxmax()
    selected_features = [first_feature]

    # Keep the column order (a set would make tie-breaking depend on hash order).
    remaining_features = [col for col in dict.fromkeys(data.columns) if col != first_feature]

    # Running maximum of |corr| against all selected features, per candidate. Only the
    # correlation with the most recently selected feature has to be computed each round.
    max_abs_corr = {}
    latest_selected = first_feature

    # Iteratively select subsequent features
    while len(selected_features) < num_features and remaining_features:
        best_feature = None
        best_score = float('-inf')

        for feature in remaining_features:
            running_max = abs(data[feature].corr(data[latest_selected]))
            if feature in max_abs_corr:
                # Previous value first: reproduces max() over the list in selection order.
                running_max = max(max_abs_corr[feature], running_max)
            max_abs_corr[feature] = running_max

            # Calculate the score (variance / max_correlation)
            score = variances[feature] / (running_max + epsilon)

            # Select the feature with the highest score
            if score > best_score:
                best_score = score
                best_feature = feature

        if best_feature is None:
            # Every remaining score is NaN; further rounds cannot select anything.
            break

        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        latest_selected = best_feature

    return selected_features

In [5]:
# Seed Python's RNG so the randomly sampled graph pairs, and therefore the feature
# ranking written out below, are reproducible across Restart & Run All.
random.seed(42)

feats, feat_vect = build_embedding("NCI1")
feat_dist = compute_feat_distance(feats, feat_vect, 500)
# Rank every feature: the number of candidates is the column count of feat_dist.
# len(feat_dist) would be the number of sampled pairs (rows), not features.
selected_feat = select_features_with_variance_correlation(feat_dist, feat_dist.shape[1])

Building subgraphs:: 100%|███████████████████████████████████████████████████████| 4110/4110 [00:01<00:00, 3702.81it/s]
Filtering graphs:: 100%|█████████████████████████████████████████████████████████| 4110/4110 [00:01<00:00, 2091.92it/s]
Resrting node indices:: 100%|███████████████████████████████████████████████████| 4110/4110 [00:00<00:00, 18331.77it/s]
Assigning graph labels:: 100%|█████████████████████████████████████████████████| 4110/4110 [00:00<00:00, 690150.91it/s]
Building features: 100%|█████████████████████████████████████████████████████████| 4110/4110 [00:02<00:00, 1852.53it/s]
Building features: 100%|█████████████████████████████████████████████████████████| 4110/4110 [00:02<00:00, 1999.73it/s]
Building features: 100%|██████████████████████████████████████████████████████████| 4110/4110 [00:06<00:00, 604.05it/s]
Building features: 100%|█████████████████████████████████████████████████████████| 4110/4110 [00:01<00:00, 2700.56it/s]
Building features: 100%|████████████████

In [16]:
# Drop every "feat_" marker from the selected column names so they read as plain feature names.
selected_feat = [feature_name.replace("feat_", "") for feature_name in selected_feat]

In [17]:
# One row per selected feature (in selection order), tagged with the dataset it came from.
feature_importance = pd.DataFrame({"features": selected_feat}).assign(dataset="NCI1")

In [18]:
# Create the output folder first so a fresh checkout does not fail on a missing ./data directory.
feat_importance_path = Path("./data/feat_importance.csv")
feat_importance_path.parent.mkdir(parents=True, exist_ok=True)
feature_importance.to_csv(feat_importance_path, index=False)

In [19]:
# Show the table of selected features (with its dataset tag) as the cell output.
feature_importance

Unnamed: 0,features,dataset
0,page_rank_0,NCI1
1,lsme_1,NCI1
2,page_rank_1,NCI1
3,degree_centrality_0,NCI1
4,degree_centrality_1,NCI1
5,page_rank_2,NCI1
6,closeness_centrality_1,NCI1
7,degree_centrality_2,NCI1
8,closeness_centrality_0,NCI1
9,page_rank_3,NCI1
