In [3]:
import torch
import time
import h5py
import torch.version
import sys
import os
sys.path.append("..")

from data import get_database_path
if torch.cuda.is_available():
    print("CUDA is available")
    print("Torch version:", torch.version.cuda)
    print("Number of GPUs available:", torch.cuda.device_count())
    print("GPU name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available")

CUDA is available
Torch version: 11.8
Number of GPUs available: 1
GPU name: NVIDIA GeForce RTX 4070


In [4]:
database_path = get_database_path()
output_dim = [2,4,16,32]
embedding_points_file_name = {}
embedding_points_file = {}
embedding_points = {}
for dim in output_dim:
    embedding_points_file_name[dim] = "embedding_points_dim{}.h5".format(dim)
    embedding_points_file[dim] = os.path.join(database_path, embedding_points_file_name[dim])
    embedding_points[dim] = h5py.File(embedding_points_file[dim])

In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from falkon import Falkon
from falkon.kernels import GaussianKernel
from sklearn.metrics import accuracy_score, roc_auc_score

# 假设您已经加载了 embedding_points 字典

def balance_data(X_sm, y_sm, X_bsm, y_bsm):
    n_samples = X_bsm.shape[0]
    indices_sm = np.random.choice(X_sm.shape[0], size=n_samples, replace=False)
    sorted_indices_sm = np.sort(indices_sm)
    X_sm_downsampled = X_sm[sorted_indices_sm]
    y_sm_downsampled = y_sm[sorted_indices_sm]
    # 合并数据
    X_balanced = np.vstack([X_sm_downsampled, X_bsm])
    y_balanced = np.hstack([y_sm_downsampled, y_bsm])
    return X_balanced, y_balanced

def preprocess_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

def train_falkon_classifier(X_train, y_train, X_test, y_test, sigma=1.0, penalty=1e-6, M=500):
    y_train_reg = y_train.copy()
    y_train_reg[y_train == 0] = -1
    y_train_reg[y_train == 1] = 1
    kernel = GaussianKernel(sigma=sigma)
    model = Falkon(
        kernel=kernel,
        penalty=penalty,
        M=M,
        options={'use_cpu': False}
    )
    model.fit(X_train, y_train_reg)
    y_pred = model.predict(X_test)
    y_pred_labels = (y_pred >= 0).astype(int)
    accuracy = accuracy_score(y_test, y_pred_labels)
    auc = roc_auc_score(y_test, y_pred)
    return accuracy, auc

def plot_performance(results, bsm_event):
    dims = sorted(results.keys())
    accuracies = [results[dim][bsm_event]['accuracy'] for dim in dims]
    aucs = [results[dim][bsm_event]['auc'] for dim in dims]
    
    plt.figure(figsize=(10,6))
    plt.plot(dims, accuracies, marker='o', label='准确率')
    plt.plot(dims, aucs, marker='s', label='AUC')
    plt.xlabel('嵌入维度')
    plt.ylabel('性能指标')
    plt.title(f'BSM 信号 {bsm_event} 不同嵌入维度的分类性能')
    plt.legend()
    plt.grid(True)
    plt.show()

def main():
    embedding_dims = [2, 4, 8, 16, 32, 64]
    bsm_events = ['charged_Higgs', 'leptoquark', 'neutral_Higgs', 'neutral_boson']
    results = {}
    
    for dim in embedding_dims:
        results[dim] = {}
        for bsm_event in bsm_events:
            print(f'\n处理嵌入维度为 {dim}，BSM 信号为 {bsm_event} 的数据...')
            
            # 获取数据
            X_sm = embedding_points[dim]['SM']
            y_sm = np.zeros(X_sm.shape[0])
            
            X_bsm = embedding_points[dim][bsm_event]
            y_bsm = np.ones(X_bsm.shape[0])
            
            # 下采样 SM events
            X_balanced, y_balanced = balance_data(X_sm, y_sm, X_bsm, y_bsm)
            
            # 数据预处理
            X_train, X_test, y_train, y_test = preprocess_data(X_balanced, y_balanced)
            
            # 模型训练和评估
            accuracy, auc = train_falkon_classifier(X_train, y_train, X_test, y_test)
            print(f'维度 {dim}，信号 {bsm_event} 的测试准确率: {accuracy:.4f}, AUC: {auc:.4f}')
            results[dim][bsm_event] = {'accuracy': accuracy, 'auc': auc}
    
    # 结果分析
    for bsm_event in bsm_events:
        plot_performance(results, bsm_event)
    
if __name__ == '__main__':
    main()


处理嵌入维度为 2，BSM 信号为 charged_Higgs 的数据...
