In [70]:
import numpy as np
import networkx as nx
# from karateclub import FeatherGraph

In [71]:
import os
import random
import torch
import numpy as np
# Fixed random seed for every RNG used in this notebook.
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int
        Value handed to each library's seeding function and written, as a
        string, to the ``PYTHONHASHSEED`` environment variable.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this presumably only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Trade cuDNN auto-tuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Fix every RNG seed to 0 before any stochastic step runs.
set_seed(0)

In [72]:
from data_process.create_latency import *
from tqdm import *
import torch 
import networkx as nx
import pickle 


def get_dataset_graph(dataset='201_acc'):
    """Build one NetworkX graph per architecture stored in the pickled dataset.

    Parameters
    ----------
    dataset : str
        Dataset name. Only ``'201_acc'`` is implemented. ``'nasbench101'`` and
        ``'nnlqp'`` are placeholders; they, like any other name, produce an
        empty list.

    Returns
    -------
    list of networkx.Graph
        One graph per entry of ``raw_data['key']``. Every node carries a
        ``'feature'`` attribute holding the row of the 7x7 identity matrix
        selected by ``features[j]`` (a one-hot vector when ``features[j]`` is
        an integer index).

    Notes
    -----
    The pickle path is fixed, so ``data/201_acc_data.pkl`` is read whatever
    ``dataset`` is.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # this with a trusted data file.
    # The context manager closes the file handle once the data is loaded.
    with open('data/201_acc_data.pkl', 'rb') as fh:
        raw_data = pickle.load(fh)
    all_keys = raw_data['key']

    all_graphs = []
    if dataset == '201_acc':
        # Build the identity matrix once instead of once per node.
        one_hot = np.eye(7, dtype=int)
        for key in tqdm(all_keys):
            # Architecture vector -> architecture string.
            arch_str = get_arch_str_from_arch_vector(key)
            adj_matrix, features = info2mat(arch_str)
            # Adjacency matrix -> graph.
            graph = nx.from_numpy_array(adj_matrix)
            # Attach node features; copy so nodes never share one buffer.
            for j in range(graph.number_of_nodes()):
                graph.nodes[j]['feature'] = one_hot[features[j]].copy()
            all_graphs.append(graph)
    elif dataset == 'nasbench101':
        pass  # not implemented yet
    elif dataset == 'nnlqp':
        pass  # not implemented yet

    return all_graphs

# Build the graph list using the default dataset name ('201_acc').
all_graphs = get_dataset_graph()

100%|██████████| 15284/15284 [00:00<00:00, 27004.47it/s]


In [73]:
from karateclub.graph_embedding.graph2vec import Graph2Vec
from sklearn.cluster import KMeans

def get_graph2vec_clusters(graphs, dimensions=64, epochs=10, n_clusters=5, random_state=42,
                           workers=4, attributed=False):
    """
    Embed a list of graphs with Graph2Vec.

    Despite its name, this function performs no clustering: it only fits
    Graph2Vec and returns the embedding matrix. Clustering is left to the
    caller.

    Parameters
    ----------
    graphs : list of networkx.Graph
        Graphs to embed. Must be non-empty.
    dimensions : int
        Embedding dimensionality.
    epochs : int
        Number of training epochs.
    n_clusters : int
        Unused. Kept only so existing calls keep working.
    random_state : int
        Seed forwarded to Graph2Vec.
        NOTE(review): with ``workers > 1`` the underlying training may still
        be non-deterministic; confirm if bit-exact runs are required.
    workers : int
        Number of parallel workers used by Graph2Vec.
    attributed : bool
        Forwarded to Graph2Vec.
        NOTE(review): per the karateclub documentation, node ``'feature'``
        attributes are only used when this is True; confirm whether the
        default of False is intended for graphs that carry such attributes.

    Returns
    -------
    embeddings : np.ndarray
        Whatever ``Graph2Vec.get_embedding()`` returns for the fitted graphs.

    Raises
    ------
    ValueError
        If ``graphs`` is empty.
    """
    # Fail early with a clear message instead of an obscure library error.
    if len(graphs) == 0:
        raise ValueError("get_graph2vec_clusters requires at least one graph")

    # Graph2Vec representation.
    model = Graph2Vec(
        dimensions=dimensions,
        workers=workers,
        epochs=epochs,
        seed=random_state,
        attributed=attributed,
    )
    model.fit(graphs)
    embeddings = model.get_embedding()

    return embeddings
# Embed every graph as a 32-dimensional vector (10 training epochs).
embeddings = get_graph2vec_clusters(all_graphs, dimensions=32, epochs=10)

In [74]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

def select_diverse_by_kmeans(embeddings: np.ndarray, n_clusters: int = 100, random_state=None):
    """Pick one representative row index per KMeans cluster.

    Rows are L2-normalised and clustered with KMeans. For each non-empty
    cluster, the index of the member closest (Euclidean) to the centre is kept.

    Parameters
    ----------
    embeddings : np.ndarray
        2-D array with one row per item.
    n_clusters : int
        Number of clusters, which is the maximum number of indices returned.
    random_state : int or None
        Forwarded to ``KMeans``. ``None`` (the default) keeps the previous
        behaviour of relying on the global NumPy RNG state.

    Returns
    -------
    np.ndarray
        Integer indices into ``embeddings``, ordered by cluster id. Empty
        clusters are skipped, so the result may hold fewer than
        ``n_clusters`` entries.

    Raises
    ------
    ValueError
        If ``embeddings`` is not 2-D, or ``n_clusters`` is outside
        ``[1, number of rows]``.
    """
    # Validate up front so the caller gets a message about *their* input.
    if np.ndim(embeddings) != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_samples, n_features); got ndim={np.ndim(embeddings)}"
        )
    n_samples = np.shape(embeddings)[0]
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            f"n_clusters must be between 1 and n_samples={n_samples}; got {n_clusters}"
        )

    # L2-normalise rows so clustering depends on direction, not magnitude.
    X = normalize(embeddings)
    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(X)
    centers = km.cluster_centers_

    # For each cluster keep the member nearest to its centre.
    selected_idx = []
    for c in range(n_clusters):
        idx = np.flatnonzero(labels == c)
        if idx.size == 0:
            continue
        dist = np.linalg.norm(X[idx] - centers[c], axis=1)
        selected_idx.append(idx[np.argmin(dist)])

    return np.array(selected_idx, dtype=int)

# Select up to 100 representative graphs, one per KMeans cluster.
cluster_idx = select_diverse_by_kmeans(embeddings, 100)

In [75]:
# Show the selected indices and how many were returned.
cluster_idx, cluster_idx.shape

(array([ 4244, 14530, 12736,  4703,  4510,  9333, 12370,  1880,  8418,
         9178,  6270,  4332,    80,  2295,  4788,  9066,  8048,  9771,
        13886,  6377,  8304,  4275,  7887,  8001, 14528,  9364,  2231,
         9717,  1384,  3471, 14314,  3921, 13735,  4881,   368,  2468,
         3641,  1430,  4087,  7770,  4338,  6622,  5943,  8776,  9460,
         4532, 13973, 13624, 13386,   461, 11621,  9796,   387,  8265,
        14597,  8530, 11594,  2649,  5782,  8501,   402,  5958,  2989,
        10472,  9669,  4358, 11892,  2290, 12660,  1492,  6931,  6816,
         2197,  1103, 15087,  1271,  1699,  1827, 14709, 13643, 14823,
         9647,  4347,  8973, 15082,  2495, 13249,  7719, 12829, 15278,
        14581,  9313, 11169,  8632,  3258,  5361,  1810,  3976, 11153,
         9851]),
 (100,))

In [76]:
# Re-seed so this cell starts from the same RNG state each time it is run.
set_seed(0)
def sample_method(method='ours', sample_num=100, dataset='201_acc'):
    """Choose ``sample_num`` graph indices from the dataset.

    Parameters
    ----------
    method : str
        ``'random'`` draws indices uniformly without replacement using the
        global NumPy RNG. ``'ours'`` embeds the graphs with Graph2Vec and
        keeps one representative per KMeans cluster.
    sample_num : int
        Number of indices requested. For ``'ours'`` this is the number of
        clusters.
    dataset : str
        Dataset name forwarded to ``get_dataset_graph``.

    Returns
    -------
    np.ndarray
        Selected indices into the dataset's graph list.

    Raises
    ------
    ValueError
        If ``method`` is not ``'random'`` or ``'ours'``.
    """
    # Reject unknown strategies before doing any expensive loading.
    if method not in ('random', 'ours'):
        raise ValueError(
            f"unknown sampling method {method!r}; expected 'random' or 'ours'"
        )

    # Load the graphs into a local for both strategies, so 'random' does not
    # depend on a notebook-level `all_graphs` variable being defined.
    graphs = get_dataset_graph(dataset=dataset)

    if method == 'random':
        return np.random.choice(len(graphs), sample_num, replace=False)

    embeddings = get_graph2vec_clusters(graphs, dimensions=32, epochs=10)
    return select_diverse_by_kmeans(embeddings, n_clusters=sample_num)

# Draw 100 sample indices with the clustering-based ('ours') strategy.
sample_idx = sample_method('ours', sample_num=100)
print('Done!')

100%|██████████| 15284/15284 [00:00<00:00, 19591.48it/s]


Done!


In [1]:
# Same sampling call, this time using sample_method imported from models.sampler.
# NOTE(review): this cell's execution count is 1, i.e. it appears to have been run
# in a fresh kernel; the import also shadows any notebook-level sample_method.
from models.sampler import sample_method
sample_idx = sample_method('ours', sample_num=100)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 15284/15284 [00:00<00:00, 21940.98it/s]
