In [None]:
import pandas as pd
# Export the distinct metric names found in the dictionary workbook.
# NOTE: the source is an Excel workbook; the `csv_data` name is historical.
csv_data = pd.read_excel('../dictionary.xlsx')
metric_column = csv_data['metric'].dropna()  # ignore rows without a metric
unique_metrics = metric_column.unique().tolist()  # distinct values as a plain list
unique_metrics_df = pd.DataFrame({'Unique Metrics': unique_metrics})
unique_metrics_df.to_csv('unique_metrics.csv', index=False)
print("唯一值已保存为 unique_metrics.csv")

In [17]:
# TF-IDF向量化 精度较低，快
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the grouped records to filter. The encoding is stated explicitly so
# the read does not depend on the platform's default locale encoding.
with open('../json/grouped_data_full.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Load the keyword dictionary. NOTE: this is an Excel workbook, not a CSV
# file; the `csv_data` name is kept because other cells use the same name.
csv_data = pd.read_excel('../dictionary.xlsx')
unique_keys = csv_data['key'].dropna().unique().tolist()  # distinct, non-null keys

# key -> metric lookup, built once instead of re-scanning the frame for every
# match. Only the first row per key is kept, mirroring the previous
# `.loc[csv_data['key'] == key, 'metric'].iloc[0]` lookup.
key_to_metric = (
    csv_data.dropna(subset=['key'])
    .drop_duplicates(subset='key', keep='first')
    .set_index('key')['metric']
    .to_dict()
)

# The text to match on is each record's "metric" field.
json_metrics = [item["metric"] for item in json_data]

# Fit one TF-IDF vocabulary over both sides so their vectors are comparable.
vectorizer = TfidfVectorizer().fit(json_metrics + unique_keys)
json_vectors = vectorizer.transform(json_metrics)
key_vectors = vectorizer.transform(unique_keys)

# Minimum cosine similarity for a record to be kept.
similarity_threshold = 0.1
filtered_data = []

for i, json_metric in enumerate(json_metrics):
    # Similarity of this record's metric against every dictionary key.
    similarities = cosine_similarity(json_vectors[i], key_vectors).flatten()

    # The overall argmax is the best key; the record is kept only when that
    # best score reaches the threshold (same outcome as filtering first).
    best_match_index = int(similarities.argmax())
    # Plain Python float rather than a NumPy scalar, so JSON serialization
    # does not depend on NumPy scalar handling.
    best_similarity = float(similarities[best_match_index])

    if best_similarity >= similarity_threshold:
        best_key = unique_keys[best_match_index]

        # Keep the original record and annotate it with the match details.
        filtered_entry = json_data[i].copy()
        filtered_entry['key_word'] = best_key
        filtered_entry['standard_metric'] = key_to_metric[best_key]
        filtered_entry['similarities'] = best_similarity
        filtered_data.append(filtered_entry)

# Write the kept records; the completion message reports the real output path
# (it previously named a different file than the one written).
output_path = '../json/filtered_data_full.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, indent=4)

print(f"筛选完成，结果已保存为 {output_path}")


筛选完成，结果已保存为 filtered_data.json


In [19]:
# bert 嵌入 高精度基于语义，慢
import json
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # 用于进度条

# Load the BERT tokenizer and encoder from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings_with_progress(text_list, description="Calculating embeddings"):
    """Embed each text with BERT, showing a progress bar while doing so.

    Uses the notebook-level `tokenizer` and `model` objects.

    Args:
        text_list: Sequence of strings to embed; one forward pass per item.
        description: Label displayed on the tqdm progress bar.

    Returns:
        A list with one 1-D NumPy array per input text, taken from position 0
        (the first token) of the model's last hidden state.
    """
    embeddings = []
    for text in tqdm(text_list, desc=description):
        # Each call encodes a single sequence, so there is nothing to pad
        # against; padding to max_length only made every forward pass process
        # 128 positions. Truncation at 128 tokens is kept.
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**inputs)
        # Index the single batch row and first token explicitly instead of
        # squeeze()-ing away every size-1 axis.
        embedding = outputs.last_hidden_state[0, 0, :].numpy()
        embeddings.append(embedding)
    return embeddings

# Load the grouped records to filter. The encoding is stated explicitly so
# the read does not depend on the platform's default locale encoding.
with open('../json/grouped_data_full.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Load the keyword dictionary. NOTE: this is an Excel workbook, not a CSV
# file; the `csv_data` name is kept because other cells use the same name.
csv_data = pd.read_excel('../dictionary.xlsx')
target_labels = csv_data['key'].dropna().unique().tolist()  # distinct, non-null keys

# label -> metric lookup, built once instead of re-scanning the frame for
# every match. Only the first row per key is kept, mirroring the previous
# `.loc[csv_data['key'] == label, 'metric'].iloc[0]` lookup.
label_to_metric = (
    csv_data.dropna(subset=['key'])
    .drop_duplicates(subset='key', keep='first')
    .set_index('key')['metric']
    .to_dict()
)

# Embed each record's "metric" field and every dictionary key.
json_metrics = [item["metric"] for item in json_data]
json_embeddings = get_bert_embeddings_with_progress(json_metrics, "Calculating JSON metrics embeddings")
target_embeddings = get_bert_embeddings_with_progress(target_labels, "Calculating target label embeddings")

# Minimum cosine similarity for a record to be kept.
similarity_threshold = 0.7
filtered_data = []

# Compare every record's embedding with all label embeddings.
for i, json_metric in tqdm(enumerate(json_metrics), desc="Calculating similarities", total=len(json_metrics)):
    similarities = cosine_similarity([json_embeddings[i]], target_embeddings).flatten()
    max_similarity_index = similarities.argmax()
    max_similarity = similarities[max_similarity_index]

    # Keep the record only when its best-matching label clears the threshold.
    if max_similarity >= similarity_threshold:
        matched_label = target_labels[max_similarity_index]

        filtered_entry = json_data[i].copy()
        filtered_entry['key_word'] = matched_label
        filtered_entry['standard_metric'] = label_to_metric[matched_label]
        filtered_entry['similarities'] = float(max_similarity)  # NumPy scalar -> plain float for JSON
        filtered_data.append(filtered_entry)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must
# be opened as UTF-8 explicitly; otherwise the platform locale encoding is
# used and the write can fail or produce non-UTF-8 JSON.
with open('filtered_data.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=4)

print("筛选完成，结果已保存为 filtered_data.json")


Calculating JSON metrics embeddings: 100%|██████████| 15/15 [00:05<00:00,  2.77it/s]
Calculating target label embeddings: 100%|██████████| 434/434 [02:35<00:00,  2.79it/s]
Calculating similarities: 100%|██████████| 15/15 [00:00<00:00, 272.76it/s]

筛选完成，结果已保存为 filtered_data.json



