In [1]:
import numpy as np
import torch
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel
from sklearn.metrics import mutual_info_score
from scipy.stats import entropy
from sklearn.decomposition import PCA
import os
import csv
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors



In [2]:
def load_pretrained_emb_weight(model, dataset):
    path = os.path.join('saved', model, dataset, 'pretrain.pth')
    saved = torch.load(path, map_location='cpu', weights_only=False)
    pretrained_emb_weight = saved['item_embedding.weight']
    return pretrained_emb_weight.detach().numpy()[1:]


def get_emb_all(dataset):
    model_list = ['sasrec', 'adrec', 'diffurec', 'dreamrec']
    emb_list = []
    for model in model_list:
        emb_list.append(load_pretrained_emb_weight(model, dataset))
    emb_list.append(np.random.normal(size=emb_list[0].shape))  # 随机高斯分布
    model_list.append('gaussian')
    print(f"Embedding shape: {emb_list[0].shape}")
    return emb_list, model_list

In [7]:
def norm(matrix):
    return (matrix - matrix.mean(axis=0)) / (matrix.std(axis=0) + 1e-9)


def compute_embedding_variance(embeddings):
    embeddings = embeddings
    return np.var(embeddings, axis=0)


def compute_cosine_similarity(embeddings):
    embeddings = embeddings
    return cosine_similarity(embeddings)


def compute_mutual_distances(embeddings):
    embeddings = embeddings
    distances = pdist(embeddings, metric='euclidean')
    return squareform(distances)


def compute_singular_value_spectrum(embeddings):
    embeddings = norm(embeddings)  # 标准化
    _, s, _ = np.linalg.svd(embeddings, full_matrices=False)
    return s


def singular_value_entropy(singular_values):
    normalized_s = singular_values / (np.sum(singular_values) + 1e-9)
    return entropy(normalized_s)


def singular_value_variance(singular_values):
    return np.var(singular_values)


def evaluate_singular_value(singular_values):
    return singular_value_entropy(singular_values), singular_value_variance(singular_values)


def isotropy_score(embeddings):
    embeddings = norm(embeddings)  # 标准化
    cov = np.cov(embeddings, rowvar=False)
    eigvals = np.linalg.eigvalsh(cov)
    return np.min(eigvals) / (np.max(eigvals) + 1e-9)


def centered_kernel_alignment(X, Y, gamma=1):
    X = norm(X)  # 标准化
    Y = norm(Y)  # 标准化
    K_X = rbf_kernel(X, gamma=gamma)
    K_Y = rbf_kernel(Y, gamma=gamma)
    K_X_centered = K_X - K_X.mean(axis=0) - K_X.mean(axis=1).reshape(-1, 1) + K_X.mean()
    K_Y_centered = K_Y - K_Y.mean(axis=0) - K_Y.mean(axis=1).reshape(-1, 1) + K_Y.mean()
    return np.trace(np.dot(K_X_centered.T, K_Y_centered)) / (
                np.linalg.norm(K_X_centered) * np.linalg.norm(K_Y_centered))


def kl_to_gaussian(embeddings):
    emb_norm = norm(embeddings)  # 标准化
    cov = np.cov(emb_norm, rowvar=False)
    d = embeddings.shape[1]
    kl = 0.5 * (np.trace(cov) - d - np.log(np.linalg.det(cov) + 1e-9))
    return kl


def mutual_info(embeddings):
    emb_norm = embeddings
    labels = np.arange(embeddings.shape[0])
    pca_proj = PCA(n_components=1).fit_transform(emb_norm).squeeze()
    hist, bin_edges = np.histogram(pca_proj, bins=10)
    bins = np.digitize(pca_proj, bin_edges[:-1])
    mi = mutual_info_score(labels, bins)
    return mi


def covariance_entropy(embeddings):
    emb_norm = embeddings 
    cov = np.cov(emb_norm, rowvar=False)
    eigvals = np.linalg.eigvalsh(cov)
    eigvals = np.maximum(eigvals, 1e-9)
    eigvals /= eigvals.sum()
    return entropy(eigvals)


# 2. 轮廓系数（Silhouette Score）
from sklearn.cluster import KMeans

# 计算轮廓系数（Silhouette Score）
def compute_silhouette_score(embeddings, n_clusters=16):
    """使用KMeans生成标签并计算轮廓系数"""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)  # 使用KMeans进行聚类，生成标签
    score = silhouette_score(embeddings, labels)  # 计算轮廓系数
    return score

In [8]:
import csv
import numpy as np

# List of datasets
datasets = ['baby',	'beauty', 'ml-100k', 'sports', 'toys', 'yelp']

# Collect all results
all_results = []

for dataset in datasets:
    embeddings_list, model_names = get_emb_all(dataset)
    for embeddings, model_name in zip(embeddings_list, model_names):
        print(f"====== Evaluating {model_name} embeddings for dataset {dataset} ======")
        
        # 基础指标
        variances = compute_embedding_variance(embeddings)
        embedding_variance = np.mean(variances)
        
        cos_similarity = compute_cosine_similarity(embeddings)
        cosine_similarity_score = np.mean(cos_similarity)
        
        mutual_distances = compute_mutual_distances(embeddings)
        mutual_distance_mean = np.mean(mutual_distances)
        
        # 奇异值谱指标
        singular_values = compute_singular_value_spectrum(embeddings)
        singular_value_top5 = singular_values[:5]
        
        sv_entropy, sv_variance = evaluate_singular_value(singular_values)
        
        # 空间各向同性
        iso_score = isotropy_score(embeddings)
        
        # 高级指标
        kl_score = kl_to_gaussian(embeddings)
        
        mi_score = mutual_info(embeddings)
        
        cov_entropy = covariance_entropy(embeddings)
        
        # 新添加的指标
        silhouette = compute_silhouette_score(embeddings)
        
        # 输出
        print(f"Embedding Variance: {embedding_variance:.3f}")
        print(f"Cosine Similarity: {cosine_similarity_score:.3f}")
        print(f"Mutual Distance mean: {mutual_distance_mean:.3f}")
        print(f"Top 5 Singular Values: {singular_value_top5}")
        print(f"Singular Value Entropy: {sv_entropy:.3f}, Singular Value Variance: {sv_variance:.3f}")
        print(f"Isotropy Score: {iso_score:.3f}")
        print(f"KL Divergence to Gaussian: {kl_score:.3f}")
        print(f"Mutual Information: {mi_score:.3f}")
        print(f"Covariance Matrix Entropy: {cov_entropy:.3f}")
        print(f"Silhouette Score: {silhouette:.3f}")
        print("________________________________\n")
        
        # 保存每个模型的评估结果
        all_results.append([dataset, model_name, round(embedding_variance, 3), round(cosine_similarity_score, 3), 
                            round(mutual_distance_mean, 3), str(np.round(singular_value_top5, 3)),
                            round(sv_entropy, 3), round(sv_variance, 3), round(iso_score, 3),
                            round(kl_score, 3), round(mi_score, 3), round(cov_entropy, 3), 
                            round(silhouette, 3)])




Embedding shape: (4731, 128)
Embedding Variance: 0.010
Cosine Similarity: 0.236
Mutual Distance mean: 1.589
Top 5 Singular Values: [84.73951 79.89948 79.4387  78.92072 78.59443]
Singular Value Entropy: 4.848, Singular Value Variance: 38.886
Isotropy Score: 0.235
KL Divergence to Gaussian: 1.086
Mutual Information: 1.829
Covariance Matrix Entropy: 4.836
Silhouette Score: 0.006
________________________________

Embedding Variance: 0.006
Cosine Similarity: 0.051
Mutual Distance mean: 1.259
Top 5 Singular Values: [247.87593 163.99637 145.34181 138.3336  130.0571 ]
Singular Value Entropy: 4.740, Singular Value Variance: 1017.591
Isotropy Score: 0.016
KL Divergence to Gaussian: 10.375
Mutual Information: 1.729
Covariance Matrix Entropy: 4.330
Silhouette Score: 0.021
________________________________

Embedding Variance: 0.252
Cosine Similarity: 0.002
Mutual Distance mean: 7.925
Top 5 Singular Values: [106.31131   90.61787   85.6862    81.52818   81.354614]
Singular Value Entropy: 4.845, Singu

In [9]:
def save_results_to_csv(results, filename="embedding_results.csv"):
    # Define header
    header = ["Dataset", "Model", "Embedding Variance", "Cosine Similarity", "Mutual Distance Mean",
              "Top 5 Singular Values", "Singular Value Entropy", "Singular Value Variance",
              "Isotropy Score", "KL Divergence to Gaussian", "Mutual Information", "Covariance Matrix Entropy", "Silhouette Score"]

    # Write results to CSV
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for row in results:
            writer.writerow(row)
    print(f"Results saved to {filename}")

In [10]:
# 保存结果到CSV文件
# Save results to CSV file
save_results_to_csv(all_results)

Results saved to embedding_results.csv
