# 库导入

In [1]:
# -*- coding: utf-8 -*-
import networkx as nx
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, matthews_corrcoef
)
import random
import os
import pandas as pd

# 待调用的函数

In [2]:
# 0. Evaluation-metric helper
def evaluate_metrics(y_true, y_scores):
    """Compute classification and ranking metrics for scored candidate edges.

    Scores are binarised with a data-dependent threshold: the median of the
    strictly positive scores, or 0 when no score is strictly positive. A score
    must be strictly greater than the threshold to be predicted as 1.
    ROC-AUC and average precision are computed from the raw scores and do not
    depend on the threshold.

    Args:
        y_true: Sequence of 0/1 ground-truth labels.
        y_scores: Sequence of numeric scores aligned with ``y_true``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1',
        'roc_auc', 'average_precision' and 'mcc'.
    """
    positive_scores = [s for s in y_scores if s > 0]
    # Guard on the filtered list itself. The previous `any(y_scores)` check
    # was true for negative-only scores as well, which sent an empty list to
    # np.median and produced a NaN threshold plus a RuntimeWarning.
    threshold = np.median(positive_scores) if positive_scores else 0
    y_pred = [int(score > threshold) for score in y_scores]
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_scores),
        'average_precision': average_precision_score(y_true, y_scores),
        'mcc': matthews_corrcoef(y_true, y_pred)
    }

# 4. Precompute structural features
def precompute_structural_features(G, edge_list, structural_features):
    """Compute the requested structural features for each edge in ``edge_list``.

    Neighbour sets and degrees are read from ``G`` once up front so that they
    are not recomputed per edge.

    Args:
        G: Graph object providing ``nodes()``, ``neighbors(n)`` and ``degree()``.
        edge_list: Iterable of ``(u, v)`` node pairs to score.
        structural_features: Sequence of feature codes; recognised codes are
            'CN', 'JC', 'AA', 'PA', 'HI' and 'RA'.

    Returns:
        dict mapping the sorted pair ``(u, v)`` to a float numpy array whose
        entries follow the order of ``structural_features``. Unrecognised
        codes yield 0.0 in their position.
    """
    adjacency = {node: set(G.neighbors(node)) for node in G.nodes()}
    degree_of = dict(G.degree())

    def _adamic_adar(a, b, shared, combined):
        # Sum of 1/log(deg(w)) over common neighbours; degree-1 nodes are
        # skipped because log(1) == 0.
        return sum(1 / np.log(degree_of[w]) for w in shared if degree_of[w] > 1)

    def _resource_allocation(a, b, shared, combined):
        # Sum of 1/deg(w) over common neighbours.
        if not shared:
            return 0.0
        return sum(1 / degree_of[w] for w in shared)

    def _placeholder(a, b, shared, combined):
        # Unknown feature codes are filled with 0 so positions stay aligned.
        return 0.0

    scorers = {
        # Common Neighbours: size of the shared neighbourhood.
        'CN': lambda a, b, shared, combined: len(shared),
        # Jaccard coefficient: shared / combined neighbourhood size.
        'JC': lambda a, b, shared, combined: (
            len(shared) / len(combined) if combined else 0.0
        ),
        'AA': _adamic_adar,
        # Preferential Attachment: product of the endpoint degrees.
        'PA': lambda a, b, shared, combined: degree_of[a] * degree_of[b],
        # 'HI' as defined in this file: absolute difference of endpoint degrees.
        'HI': lambda a, b, shared, combined: abs(degree_of[a] - degree_of[b]),
        'RA': _resource_allocation,
    }

    result = {}
    for first, second in edge_list:
        # Undirected edges are keyed by their sorted endpoint pair.
        a, b = sorted((first, second))
        nbrs_a, nbrs_b = adjacency[a], adjacency[b]
        shared = nbrs_a & nbrs_b
        combined = nbrs_a | nbrs_b
        row = [
            scorers.get(code, _placeholder)(a, b, shared, combined)
            for code in structural_features
        ]
        result[(a, b)] = np.array(row, dtype=float)
    return result

# 5. K-fold cross-validated link-prediction evaluation
def _sample_negative_edges(nodes, existing_edges, n_samples):
    """Draw ``n_samples`` distinct node pairs that are not edges of the graph.

    Args:
        nodes: List of all node identifiers.
        existing_edges: Set of sorted ``(u, v)`` tuples for every real edge.
        n_samples: Number of negative pairs to draw.

    Returns:
        List of sorted ``(u, v)`` tuples, in the order they were drawn.

    Raises:
        ValueError: If the graph has fewer than ``n_samples`` non-adjacent
            node pairs, in which case rejection sampling could never finish.
    """
    n_nodes = len(nodes)
    # Self-loops are not candidate pairs, so they must not reduce the count.
    n_real_edges = sum(1 for a, b in existing_edges if a != b)
    available = n_nodes * (n_nodes - 1) // 2 - n_real_edges
    if n_samples > available:
        raise ValueError(
            f"Cannot sample {n_samples} negative edges: only {available} "
            "non-adjacent node pairs exist in the graph."
        )

    sampled = []
    seen = set()  # O(1) duplicate check; `sampled` keeps the draw order
    while len(sampled) < n_samples:
        u, v = random.choice(nodes), random.choice(nodes)
        if u == v:
            continue
        pair = tuple(sorted((u, v)))
        if pair in existing_edges or pair in seen:
            continue
        seen.add(pair)
        sampled.append(pair)
    return sampled


def link_prediction_cross_validation(G, structural_features, n_splits=5):
    """Run a k-fold cross-validated link-prediction experiment on ``G``.

    For every fold, the held-out edges are the positive test samples, an
    equal number of randomly drawn non-edges are the negative test samples,
    and each structural feature is computed on the training sub-graph and
    used directly as the prediction score.

    Args:
        G: The full original graph (networkx.Graph).
        structural_features: Tuple or list of feature codes to evaluate,
            e.g. ``('CN', 'JC', 'AA', 'PA')``.
        n_splits: Number of cross-validation folds (default 5).

    Returns:
        dict mapping each feature code to a dict of metric name -> mean value
        over all folds, e.g. ``{'CN': {'accuracy': ..., ...}, 'JC': {...}}``.

    Raises:
        ValueError: If a fold needs more negative samples than there are
            non-adjacent node pairs in ``G``.
    """
    all_positive_edges = list(G.edges())
    kf = KFold(n_splits=n_splits, shuffle=True)
    # One list of per-fold metric dicts for every feature.
    fold_metrics = {feat: [] for feat in structural_features}

    # These depend only on G, so build them once instead of once per fold.
    existing_edges = {tuple(sorted(edge)) for edge in all_positive_edges}
    nodes = list(G.nodes())

    for fold, (train_idx, test_idx) in enumerate(kf.split(all_positive_edges), start=1):
        print(f"\n----- Fold {fold}/{n_splits} -----")
        # Split the positive edges into training and test sets.
        train_pos_edges = [all_positive_edges[i] for i in train_idx]
        test_pos_edges = [all_positive_edges[i] for i in test_idx]
        # Training sub-graph: all nodes, but only the training edges, so the
        # test edges cannot leak into the features.
        G_train = nx.Graph()
        G_train.add_nodes_from(G.nodes())
        G_train.add_edges_from(train_pos_edges)

        # Negative test samples: random non-edges of the original graph,
        # as many as there are positive test edges.
        test_neg_edges = _sample_negative_edges(
            nodes, existing_edges, len(test_pos_edges)
        )

        # Test set = positive edges followed by negative edges.
        test_edges = [tuple(sorted(e)) for e in test_pos_edges] + test_neg_edges
        # Features are computed on the training sub-graph only.
        features = precompute_structural_features(G_train, test_edges, structural_features)
        # Labels: 1 for positive edges, 0 for negative edges.
        y_true = [1] * len(test_pos_edges) + [0] * len(test_neg_edges)

        # Evaluate every feature as a stand-alone scorer on this fold.
        for idx, feat in enumerate(structural_features):
            scores = [features[edge][idx] for edge in test_edges]
            metrics = evaluate_metrics(y_true, scores)
            fold_metrics[feat].append(metrics)
            print(f"{feat} - 折{fold}结果: {metrics}")

    # Average every metric over the folds.
    avg_results = {}
    for feat, metrics_list in fold_metrics.items():
        avg_results[feat] = {}
        for metric_name in metrics_list[0].keys():
            values = [fold_result[metric_name] for fold_result in metrics_list]
            avg_results[feat][metric_name] = float(np.mean(values))
    return avg_results

# 主函数

In [14]:
######### Parameters #########
dataset_name = "facebook"  # swap in another dataset name as needed
edges_file_path = f"norm_dataset/{dataset_name}_edges.txt"
G = nx.read_edgelist(edges_file_path, nodetype=int)

# Structural features to evaluate; trim or extend this tuple as needed.
structural_features = ('RA','CN','JC','AA','PA','HI')
runs = 10  # number of repeated experiments

# Main run: repeat the cross-validation experiment and collect one row per run.
all_rows = []
for run in range(runs):
    run_label = run + 1
    print(f"\n=== 运行 {run_label}/{runs} ===")
    results = link_prediction_cross_validation(G, structural_features=structural_features)
    # Flatten {feature: {metric: value}} into "<feature>_<metric>" columns,
    # rounded to 4 decimal places.
    row_data = {
        f"{feat}_{metric}": round(value, 4)
        for feat, metrics in results.items()
        for metric, value in metrics.items()
    }
    all_rows.append(row_data)
    print(f"第 {run_label} 次运行结果: {row_data}")

# Save the results to an Excel file named after the dataset and feature list.
output_dir = f"results_baseline/{dataset_name}"
os.makedirs(output_dir, exist_ok=True)
param_str = "_".join(structural_features)
file_name = f"{dataset_name}_{param_str}.xlsx"
file_path = os.path.join(output_dir, file_name)

# Append to the existing workbook if there is one, otherwise start fresh.
df = pd.DataFrame(all_rows)
if os.path.exists(file_path):
    existing = pd.read_excel(file_path)
    df = pd.concat([existing, df], ignore_index=True)
df.to_excel(file_path, index=False)
print(f"✅ 实验结果已保存至: {file_name}")


=== 运行 1/10 ===

----- Fold 1/5 -----
RA - 折1结果: {'accuracy': 0.7777525925086417, 'precision': 0.997866937531742, 'recall': 0.5566951889839633, 'f1': 0.7146806343663611, 'roc_auc': 0.9930716010718619, 'average_precision': 0.9928660584721636, 'mcc': 0.6193209646044661}
CN - 折1结果: {'accuracy': 0.7733042443474811, 'precision': 0.9948696901292838, 'recall': 0.5494418314727716, 'f1': 0.7079180812616361, 'roc_auc': 0.9907601154953171, 'average_precision': 0.9889367785101024, 'mcc': 0.6113016954636298}
JC - 折1结果: {'accuracy': 0.7726242420808069, 'precision': 0.9958771387342816, 'recall': 0.5475151583838613, 'f1': 0.706570624154448, 'roc_auc': 0.989031885579359, 'average_precision': 0.9882194808824778, 'mcc': 0.6106362803185695}
AA - 折1结果: {'accuracy': 0.7765059216864056, 'precision': 0.9956323006602337, 'recall': 0.5554485181617272, 'f1': 0.7130801687763713, 'roc_auc': 0.9919310212513425, 'average_precision': 0.991239567685859, 'mcc': 0.6165411908165852}
PA - 折1结果: {'accuracy': 0.75100583668