In [7]:
pip install pandas seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import json
import pandas as pd

# 定义根目录
root_dir = "output/xbox_infer"

# 用于存储整理后的数据
data = []



# 遍历根目录下的所有文件夹
for folder in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder)
    
    # 检查是否是文件夹
    if os.path.isdir(folder_path):
        # 定位all_metrics.jsonl文件
        metrics_file = os.path.join(folder_path, "all_metrics.jsonl")
        
        # 读取jsonl文件
        if os.path.exists(metrics_file):
            with open(metrics_file, 'r') as f:
                # 将metrics按模型和任务类型进行存储
                for line in f:
                    metric_data = json.loads(line)
                    model = folder  # 以文件夹名称作为模型名称
                    task_name = metric_data.get("task_name", "unknown_task")
                    metrics = metric_data.get("metrics", {})
                    for metric_name, metric_value in metrics.items():
                        if metric_name =="hit@5":
                            data.append([model, task_name, metric_name, metric_value])
                        if task_name in ["query2item", "sparse_query2item"]:
                            if metric_name =="coverage@5":
                                data.append([model, task_name, metric_name, metric_value])


# 创建DataFrame
df = pd.DataFrame(data, columns=["model", "task", "metric", "value"])

# 保存为CSV文件
output_csv_path = "output/all_models_metrics_filtered.csv"
df.to_csv(output_csv_path, index=False)

df.head()


Unnamed: 0,model,task,metric,value
0,bge_m3_1,user2item,hit@5,0.13898
1,bge_m3_1,gpt_summary,hit@5,0.105263
2,bge_m3_1,query2item,coverage@5,0.242179
3,bge_m3_1,sparse_query2item,coverage@5,0.406221
4,bge_m3_1,title2item,hit@5,0.994792


In [3]:
df

Unnamed: 0,model,task,metric,value
0,bge_m3_1,user2item,hit@5,0.13898
1,bge_m3_1,gpt_summary,hit@5,0.105263
2,bge_m3_1,query2item,coverage@5,0.242179
3,bge_m3_1,sparse_query2item,coverage@5,0.406221
4,bge_m3_1,title2item,hit@5,0.994792
5,bge_m3_1,item2item,hit@5,0.906528
6,bge_m3_1,queryuser2item,hit@5,0.87484
7,bge_m3_1,misspell2item,hit@5,0.941019
8,bge_m3_1,gpt_misspell,hit@5,0.0
9,bge_m3_1,gpt_summary_query,hit@5,0.111111


In [18]:
import os
import json  
import pandas as pd

root_dir = "/home/aiscuser/RecAI/RecLM-emb/data/xbox"
tasks_count = []

# 遍历 train 和 test 目录，读取每个任务的 jsonl 文件条数
for subfolder in ["train", "test"]:
    subfolder_path = os.path.join(root_dir, subfolder)
    if os.path.isdir(subfolder_path):
        for jsonl_file in os.listdir(subfolder_path):
            if jsonl_file.endswith(".jsonl"):
                file_path = os.path.join(subfolder_path, jsonl_file)
                with open(file_path, 'r') as f:
                    line_count = sum(1 for line in f)
                tasks_count.append([subfolder, jsonl_file.replace(".jsonl", ""), line_count])

# 将任务计数数据转换为 DataFrame
tasks_df = pd.DataFrame(tasks_count, columns=["subset", "task", "line_count"])
df = pd.read_csv("/home/aiscuser/RecAI/RecLM-emb/output/all_models_metrics_filtered.csv")

merged_df = pd.merge(df, tasks_df, how="left", on="task")

# 输出到 CSV 文件
output_csv_path = "output/merged_models_metrics.csv"
merged_df.to_csv(output_csv_path, index=False)

# 展示前几行数据
merged_df.head(15)


Unnamed: 0,model,task,metric,value,subset,line_count
0,bge-m3_v6,user2item,hit@5,0.15634,train,55152
1,bge-m3_v6,user2item,hit@5,0.15634,test,50000
2,bge-m3_v6,query2item,coverage@5,0.242882,train,1152
3,bge-m3_v6,query2item,coverage@5,0.242882,test,1138
4,bge-m3_v6,sparse_query2item,coverage@5,0.411528,test,1093
5,bge-m3_v6,title2item,hit@5,0.997396,train,2734
6,bge-m3_v6,title2item,hit@5,0.997396,test,384
7,bge-m3_v6,item2item,hit@5,0.961424,train,863
8,bge-m3_v6,item2item,hit@5,0.961424,test,674
9,bge-m3_v6,queryuser2item,hit@5,0.89348,train,12000


In [None]:
tasks_count

[['train', 'queryuser2item', 12000],
 ['train', 'title2item', 2685],
 ['train', 'relativequery2item', 380],
 ['train', 'negquery2item', 2127],
 ['train', 'item2item', 889],
 ['train', 'gpt_data_v2', 20741],
 ['train', 'user2item', 55343],
 ['train', 'misspell2item', 3840],
 ['train', 'query2item', 1152],
 ['train', 'gpt_data', 847],
 ['test', 'queryuser2item', 25000],
 ['test', 'title2item', 384],
 ['test', 'relativequery2item', 76],
 ['test', 'negquery2item', 4349],
 ['test', 'sparse_query2item', 1082],
 ['test', 'item2item', 665],
 ['test', 'user2item', 30000],
 ['test', 'gpt_summary_query', 454],
 ['test', 'misspell2item', 366],
 ['test', 'query2item', 1127],
 ['test', 'gpt_query', 1547],
 ['test', 'gpt_summary', 227],
 ['test', 'gpt_misspell', 137]]

In [21]:
# 使用 pivot_table 将数据转换为所需格式
pivot_df = merged_df.pivot_table(index='task', columns='subset', values='line_count', aggfunc='sum').reset_index()
pivot_df.rename(columns={'train': 'train_count', 'test': 'test_count'}, inplace=True)

# 遍历唯一的模型名称，创建每个模型的评分列
models = df['model'].unique()

# 用于存储每个模型的评分数据
model_scores_list = []

for model in models:
    model_scores = df[df['model'] == model][['task', 'metric', 'value']].drop_duplicates()

    # 保持 metric 列用于后续合并
    model_scores_pivot = model_scores.pivot_table(index=['task', 'metric'], values='value').reset_index()
    model_scores_pivot.columns = ['task', 'metric', f"{model}"]

    # 将每个模型的评分数据加入列表中
    model_scores_list.append(model_scores_pivot)

# 将所有模型的评分数据逐步合并到一个数据框中
merged_scores_df = model_scores_list[0]
for model_scores in model_scores_list[1:]:
    merged_scores_df = pd.merge(merged_scores_df, model_scores, on=['task', 'metric'], how='left')

# 将行数数据与模型评分合并
final_df = pd.merge(pivot_df, merged_scores_df, on='task', how='left')

# 保存为 CSV 文件
output_csv_path = "output/processed_metrics_summary.csv"
final_df.to_csv(output_csv_path, index=False)



In [25]:
final_df[final_df['task'].isin(['gpt_summary', 'user2item'])]

Unnamed: 0,task,test_count,train_count,metric,bge-m3_v6,bge-m3_v5_50_200k,bge-m3_history_only,e5-v1
2,gpt_summary,76.0,,hit@5,0.132308,0.105263,0.112308,0.124615
12,user2item,200000.0,220608.0,hit@5,0.15634,0.14462,0.14462,0.15856


In [22]:
final_df[["task","metric","bge-m3-base","bge-m3_v4"]] # ,"test_count","train_count"

KeyError: "['bge-m3-base', 'bge-m3_v4'] not in index"