# 抽样部分数据，可视化这些数据的相似度

## 一、抽样数据

In [2]:
import json
import numpy as np
import faiss
from tqdm import tqdm
import random

In [3]:
# JSONL file that the loading cell reads line by line; each line is parsed as
# JSON and its "embedding" field is collected.
input_file = "/home/sduu39/zHongchang/data/label/5_Human Computer Interaction/viewLabel5.jsonl"

In [4]:
# Read one JSON object per line and collect its "embedding" field as a NumPy
# vector. The result is a Python list of 1-D arrays.
sample_vectors = []
with open(input_file, "r", encoding='utf-8') as f:
    for line in tqdm(f):
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line at end of file)
            # instead of letting json.loads raise on an empty string.
            continue
        data = json.loads(line)
        sample_vectors.append(np.array(data["embedding"]))

131613it [00:20, 6577.36it/s]


## 归一化检查（确认向量已为单位范数）

In [5]:
# Stack the collected per-line vectors into a single NumPy array and display
# its shape as the cell output.
sample_vectors = np.array(sample_vectors)
sample_vectors.shape

(131613, 384)

In [6]:
# Cast the embeddings to float32, then print the L2 norm of the first vector as
# a quick sanity check that it is (approximately) unit length.
# NOTE(review): only row 0 is checked here; other rows are presumably
# normalised too — confirm if the later inner products are meant to be cosines.
sample_vectors = sample_vectors.astype(np.float32)
print(np.linalg.norm(sample_vectors[0]))

0.99999994


In [7]:
# Display the vector at index 1 to eyeball the raw embedding values.
sample_vectors[1]

array([ 2.30395701e-02, -4.03438509e-02,  1.46691035e-02, -1.54435439e-02,
        1.49138672e-02, -1.96768492e-02, -8.96444544e-02, -3.36187035e-02,
        2.70972755e-02,  1.14897929e-01, -3.62145640e-02, -5.76086380e-02,
       -4.56924224e-03,  2.75424245e-04,  8.84654969e-02, -2.60847025e-02,
        3.96507643e-02,  1.17651056e-02,  2.03225575e-02,  5.72025180e-02,
        8.97500739e-02, -8.34339706e-04,  1.26810269e-02,  9.79178119e-03,
       -3.66717838e-02,  5.05457595e-02, -2.59494912e-02,  6.32760748e-02,
        2.83195097e-02, -3.48956026e-02, -5.81027055e-03,  4.48822118e-02,
       -3.29613453e-03, -4.39762473e-02,  1.56366453e-02,  2.17087008e-02,
        2.39710719e-03, -1.32379467e-02, -8.16273838e-02, -6.72636181e-02,
       -8.25667381e-02,  6.08041212e-02,  2.26326697e-02,  1.63059141e-02,
        4.51813042e-02, -1.83505900e-02, -3.51723693e-02, -3.69665064e-02,
       -6.08245954e-02, -1.60998479e-02, -1.19929291e-01, -9.45299044e-02,
        2.46723406e-02, -

In [8]:
# Print the L2 norm of the first vector with 10 decimal places to see how far
# it deviates from exactly 1.0.
print(f"{np.linalg.norm(sample_vectors[0]):.10f}")

0.9999999404


## 计算两两相似度

In [9]:
SUBSET_SIZE = len(sample_vectors)  # size of the subset compared all-vs-all (here: every loaded vector)
# NOTE(review): this points at the "4_Computer Graphics" directory while
# input_file lives under "5_Human Computer Interaction" — confirm this is
# intended. OUTPUT_DIR is assigned again in the FAISS cell further down.
OUTPUT_DIR="/home/sduu39/zHongchang/data/label/4_Computer Graphics/"

In [None]:
import os

# Inner-product similarity for every unordered pair (i, j), i < j, among the
# first SUBSET_SIZE vectors, saved as a 1-D .npy array.
# Assumes the rows of sample_vectors are L2-normalised so that the inner
# product equals the cosine similarity — only row 0 is checked in this notebook.
#
# Materialising the full N x N matrix and np.triu_indices(N, k=1) needs O(N^2)
# memory: for N = 131613 that is roughly 69 GB for the float32 matrix plus
# roughly 69 GB for each of the two int64 index arrays. The matrix is therefore
# computed in row blocks and the upper-triangular values are streamed into a
# memory-mapped .npy file. Values are written row by row (i ascending, then j
# ascending), which is the same order np.triu_indices(N, k=1) produces.
# The intermediate names sim_mat / row_idx / col_idx are no longer created.
PAIRWISE_SIM_PATH = "/home/sduu39/zHongchang/data/label/5_Human Computer Interaction/pairwise_subset_similarities.npy"
ROW_BLOCK = 1024  # rows of the similarity matrix computed per step; lower it if memory is tight

subset = sample_vectors[:SUBSET_SIZE]
n_vectors = subset.shape[0]
n_pairs = n_vectors * (n_vectors - 1) // 2  # N*(N-1)/2 unordered pairs

if n_pairs == 0:
    # Fewer than two vectors: nothing to compare, and an empty file cannot be
    # memory-mapped, so save an empty array directly.
    vals = np.empty(0, dtype=subset.dtype)
    np.save(PAIRWISE_SIM_PATH, vals)
else:
    vals = np.lib.format.open_memmap(
        PAIRWISE_SIM_PATH, mode="w+", dtype=subset.dtype, shape=(n_pairs,)
    )
    write_pos = 0
    for row_start in range(0, n_vectors, ROW_BLOCK):
        row_stop = min(row_start + ROW_BLOCK, n_vectors)
        # Columns before row_start would all lie below the diagonal for these
        # rows, so only columns row_start.. are computed.
        block = subset[row_start:row_stop] @ subset[row_start:].T
        for local_row in range(row_stop - row_start):
            # Within the block the diagonal entry of this row sits at column
            # local_row; everything to its right is the strict upper triangle.
            upper = block[local_row, local_row + 1:]
            vals[write_pos:write_pos + upper.shape[0]] = upper
            write_pos += upper.shape[0]
    assert write_pos == n_pairs, "number of written pairs does not match N*(N-1)/2"
    vals.flush()

print("相似度对数 =", vals.shape[0])  # number of similarity pairs

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Load previously saved pairwise similarities and write their histogram
# (1000 bins over [0, 1]) to a PNG; the figure is closed afterwards rather
# than displayed inline.
vals = np.load("/home/sduu39/zHongchang/data/label/1_Computing Systems/pairwise_subset_similarities_sample100000.npy")

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(vals, bins=1000, range=(0, 1), color='steelblue', alpha=0.8)
ax.set_title("Pairwise Cosine Similarity (tau_050)")
ax.set_xlabel("Cosine similarity")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig("/home/sduu39/zHongchang/data/label/1_Computing Systems/pairwise_subset_distribution.png", dpi=600)
plt.close(fig)

In [50]:
# Pick QUERY_SIZE vectors from sample_vectors at random and look up each one's
# TOP_K nearest neighbours by inner product with an exact (flat) FAISS index.
TOP_K = 1000
QUERY_SIZE = 20000
QUERY_SEED = 0  # fixed seed so the sampled query set is identical on every run
OUTPUT_DIR = "/home/sduu39/zHongchang/data/plt"

d = sample_vectors.shape[1]
index = faiss.IndexFlatIP(d)
index.add(sample_vectors)

# Sampling without replacement cannot draw more items than exist.
n_queries = min(QUERY_SIZE, len(sample_vectors))
print(f"Searching Top-{TOP_K} similarities for {n_queries} random queries ...")
query_ids = np.random.default_rng(QUERY_SEED).choice(
    len(sample_vectors), n_queries, replace=False
)
queries = sample_vectors[query_ids]

D, I = index.search(queries, TOP_K)

# Remove each query's own entry from its result row. The self-match is not
# guaranteed to be in column 0: when duplicate (or near-duplicate) vectors tie
# on similarity, another vector can be ranked first, so the entry is located by
# id. If a row does not contain the query at all (more than TOP_K tied
# vectors), its last — lowest-scoring — hit is dropped instead so every row
# still ends up with TOP_K - 1 columns.
self_hit = I == query_ids[:, None]
self_hit[~self_hit.any(axis=1), -1] = True
keep = ~self_hit
D = D[keep].reshape(n_queries, TOP_K - 1)  # similarities, descending per row
I = I[keep].reshape(n_queries, TOP_K - 1)  # indices of the matching vectors



Searching Top-1000 similarities for 20000 random queries ...


In [51]:
# Persist the top-k search results. The output directory is created first so
# that np.save / savefig do not fail when it does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)
np.save(os.path.join(OUTPUT_DIR, "faiss_topk_similarities.npy"), D)
np.save(os.path.join(OUTPUT_DIR, "faiss_topk_indices.npy"), I)

# Histogram of the high-similarity part of the distribution: only values in
# [0.5, 1] are binned, anything below 0.5 is left out of the plot.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(D.ravel(), bins=1000, range=(0.5, 1), color='orange', alpha=0.8)
ax.set_title(f"FAISS Top-{TOP_K} Cosine Similarity Distribution")
ax.set_xlabel("Cosine similarity")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, f"faiss_top{TOP_K}_distribution.png"))
plt.close(fig)


# 统计得到的相似度

In [1]:
import numpy as np

FILE = "/home/sduu39/zHongchang/data/pl/label1Sim.npy"

# The data set is said to contain 80,000 nodes; work out how many entries a
# full N x N similarity matrix has with and without its diagonal, to compare
# against the length of the loaded array.
N = 80000
total_pairs_with_diag = N ** 2
total_pairs_no_diag = total_pairs_with_diag - N

# Memory-map the file (mmap_mode='r') instead of reading tens of GB into RAM.
vals = np.load(FILE, mmap_mode='r')  # 1D array

print("vals shape:", vals.shape, "dtype:", vals.dtype)
print("theoretical total pairs (with diag):   ", total_pairs_with_diag)
print("theoretical total pairs (no diag):     ", total_pairs_no_diag)

vals shape: (6399920000,) dtype: float32
theoretical total pairs (with diag):    6400000000
theoretical total pairs (no diag):      6399920000


In [2]:
# Similarity cut-offs; the counting cell tallies, for each value t, how many
# stored similarities satisfy sim >= t.
thresholds = [0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70]

In [3]:
# Number of elements pulled out of the memory-mapped array per step
# (raise or lower depending on available memory).
BLOCK = 10_000_000

n = vals.shape[0]
counts = dict.fromkeys(thresholds, 0)  # threshold -> number of values >= threshold
print("start counting, total elements:", n)

for lo in range(0, n, BLOCK):
    # Copy the current window into an ordinary in-memory ndarray.
    window = np.array(vals[lo:lo + BLOCK])
    # Each threshold is compared as a plain Python float, one pass per threshold.
    for cutoff in thresholds:
        counts[cutoff] = counts[cutoff] + int(np.sum(window >= cutoff))

print("\n=== Counts for directed pairs (i,j 都算) ===")
for cutoff in thresholds:
    print(f"sim >= {cutoff:.2f}: {counts[cutoff]:,d}")

# If vals is the full similarity matrix with the diagonal removed, both (i, j)
# and (j, i) are present, so the number of undirected pairs is about half.
print("\n=== Approx counts for undirected pairs (每对只算一次，大约除以2) ===")
for cutoff in thresholds:
    halved = counts[cutoff] // 2
    print(f"sim >= {cutoff:.2f}: ~{halved:,d} (unique pairs)")

start counting, total elements: 6399920000

=== Counts for directed pairs (i,j 都算) ===
sim >= 0.40: 76,348,418
sim >= 0.45: 29,977,910
sim >= 0.50: 11,128,200
sim >= 0.55: 3,946,422
sim >= 0.60: 1,354,572
sim >= 0.65: 458,780
sim >= 0.70: 158,070

=== Approx counts for undirected pairs (每对只算一次，大约除以2) ===
sim >= 0.40: ~38,174,209 (unique pairs)
sim >= 0.45: ~14,988,955 (unique pairs)
sim >= 0.50: ~5,564,100 (unique pairs)
sim >= 0.55: ~1,973,211 (unique pairs)
sim >= 0.60: ~677,286 (unique pairs)
sim >= 0.65: ~229,390 (unique pairs)
sim >= 0.70: ~79,035 (unique pairs)
