In [1]:
import json

In [36]:
import json

def load_json_stream(file_path):
    """
    讀取一個包含多個連續JSON對象的文件。
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read().replace('}\n{', '},{')
        json_array_string = f"[{file_content}]"
        return json.loads(json_array_string)
    except Exception:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return [json.loads(line) for line in f if line.strip()]
        except json.JSONDecodeError as e:
            print(f"解析文件 '{file_path}' 時出錯: {e}")
            return None

def analyze_scenarios_by_id(file1_path, file2_path):
    """
    以 scenario ID 為基準進行匹配和分析，並記錄僅存在於單一文件中的案例ID。
    """
    data1 = load_json_stream(file1_path)
    data2 = load_json_stream(file2_path)

    if data1 is None or data2 is None:
        return None

    scenarios_f1 = {s['scenario']: s for s in data1}
    scenarios_f2 = {s['scenario']: s for s in data2}

    all_scenario_ids = set(scenarios_f1.keys()) | set(scenarios_f2.keys())
    
    # 初始化計數器和ID列表
    success_f1 = 0
    success_f2 = 0
    both_failed = 0
    only_in_f1_ids = []
    only_in_f2_ids = []

    for sid in all_scenario_ids:
        scenario1 = scenarios_f1.get(sid)
        scenario2 = scenarios_f2.get(sid)

        in_f1 = scenario1 is not None
        in_f2 = scenario2 is not None

        is_s1_success = False
        if in_f1:
            is_s1_success = all(not turn.get('validation_errors') 
                                for conv in scenario1.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s1_success:
                success_f1 += 1

        is_s2_success = False
        if in_f2:
            is_s2_success = all(not turn.get('validation_errors') 
                                for conv in scenario2.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s2_success:
                success_f2 += 1
        
        # --- 更新統計 ---
        if in_f1 and not in_f2:
            only_in_f1_ids.append(sid) # 收集ID
        elif not in_f1 and in_f2:
            only_in_f2_ids.append(sid) # 收集ID
        elif in_f1 and in_f2:
            if not is_s1_success and not is_s2_success:
                both_failed += 1

    return {
        "file1_path": file1_path,
        "file2_path": file2_path,
        "file1_total": len(scenarios_f1),
        "file2_total": len(scenarios_f2),
        "file1_success_count": success_f1,
        "file2_success_count": success_f2,
        "both_failed_count": both_failed,
        "total_unique_scenarios": len(all_scenario_ids),
        # 返回排序後的ID列表
        "only_in_file1_ids": sorted(only_in_f1_ids),
        "only_in_file2_ids": sorted(only_in_f2_ids),
    }

# --- 主程序 ---
# 将模型版本作为可调整的变量
model_version = "deepseek-v3-0324" 

file_pairs = {
    "Multiple": (f"{model_version}_results/BFCL_v3_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_multiple_{model_version}_results.json"),
    "Parallel": (f"{model_version}_results/BFCL_v3_parallel_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_{model_version}_results.json"),
    "Parallel Multiple": (f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results.json"),
    "Simple": (f"{model_version}_results/BFCL_v3_simple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_simple_{model_version}_results.json")
}

overall_total_scenarios = 0
overall_success_f1 = 0
overall_success_f2 = 0

category_results = {} # 用于存储每个分类的结果

# 遍歷每一對文件並打印分析結果
for group_name, (file1, file2) in file_pairs.items():
    results = analyze_scenarios_by_id(file1, file2)
    if results:
        # 累加總體統計
        overall_total_scenarios += results['total_unique_scenarios']
        overall_success_f1 += results['file1_success_count']
        overall_success_f2 += results['file2_success_count']

        category_results[group_name] = {
            "total": results['file1_total'], # 假设文件1和文件2的总案例数相同，或者以文件1为基准
            "success_f1": results['file1_success_count'],
            "success_f2": results['file2_success_count']
        }

# --- 打印格式化的結果 ---
print(f"模型版本: {model_version}")
print("\n---")
print("### 详细分类结果 ###")
# 按照 specified order: Simple, Multiple, Parallel, Parallel Multiple
output_order = ["Simple", "Multiple", "Parallel", "Parallel Multiple"]

for category in output_order:
    if category in category_results:
        res = category_results[category]
        print(f"**{category}**:")
        print(f"  文件1 (LlamaIndex): {res['success_f1']}/{res['total']}")
        print(f"  文件2 (原始): {res['success_f2']}/{res['total']}")
        print("---")


# --- 打印整體統計結果 ---
print("\n### 整體統計結果 ###")
print(f"整體總案例數: {overall_total_scenarios}")
print(f"整體成功數 (文件1 - LlamaIndex): {overall_success_f1}/1000")
print(f"整體成功數 (文件2 - 原始): {overall_success_f2}/1000")

# 计算并打印整体成功率
overall_accuracy_f1 = (overall_success_f1 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0
overall_accuracy_f2 = (overall_success_f2 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0

print(f"整體成功率 (文件1 - LlamaIndex): {overall_accuracy_f1:.2f}%")
print(f"整體成功率 (文件2 - 原始): {overall_accuracy_f2:.2f}%")

模型版本: deepseek-v3-0324

---
### 详细分类结果 ###
**Simple**:
  文件1 (LlamaIndex): 390/400
  文件2 (原始): 377/400
---
**Multiple**:
  文件1 (LlamaIndex): 189/200
  文件2 (原始): 191/200
---
**Parallel**:
  文件1 (LlamaIndex): 181/200
  文件2 (原始): 180/200
---
**Parallel Multiple**:
  文件1 (LlamaIndex): 179/200
  文件2 (原始): 175/200
---

### 整體統計結果 ###
整體總案例數: 1000
整體成功數 (文件1 - LlamaIndex): 939/1000
整體成功數 (文件2 - 原始): 923/1000
整體成功率 (文件1 - LlamaIndex): 93.90%
整體成功率 (文件2 - 原始): 92.30%


In [37]:
import json

def load_json_stream(file_path):
    """
    讀取一個包含多個連續JSON對象的文件。
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read().replace('}\n{', '},{')
        json_array_string = f"[{file_content}]"
        return json.loads(json_array_string)
    except Exception:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return [json.loads(line) for line in f if line.strip()]
        except json.JSONDecodeError as e:
            print(f"解析文件 '{file_path}' 時出錯: {e}")
            return None

def analyze_scenarios_by_id(file1_path, file2_path):
    """
    以 scenario ID 為基準進行匹配和分析，並記錄僅存在於單一文件中的案例ID。
    """
    data1 = load_json_stream(file1_path)
    data2 = load_json_stream(file2_path)

    if data1 is None or data2 is None:
        return None

    scenarios_f1 = {s['scenario']: s for s in data1}
    scenarios_f2 = {s['scenario']: s for s in data2}

    all_scenario_ids = set(scenarios_f1.keys()) | set(scenarios_f2.keys())
    
    # 初始化計數器和ID列表
    success_f1 = 0
    success_f2 = 0
    both_failed = 0
    only_in_f1_ids = []
    only_in_f2_ids = []

    for sid in all_scenario_ids:
        scenario1 = scenarios_f1.get(sid)
        scenario2 = scenarios_f2.get(sid)

        in_f1 = scenario1 is not None
        in_f2 = scenario2 is not None

        is_s1_success = False
        if in_f1:
            is_s1_success = all(not turn.get('validation_errors') 
                                for conv in scenario1.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s1_success:
                success_f1 += 1

        is_s2_success = False
        if in_f2:
            is_s2_success = all(not turn.get('validation_errors') 
                                for conv in scenario2.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s2_success:
                success_f2 += 1
        
        # --- 更新統計 ---
        if in_f1 and not in_f2:
            only_in_f1_ids.append(sid) # 收集ID
        elif not in_f1 and in_f2:
            only_in_f2_ids.append(sid) # 收集ID
        elif in_f1 and in_f2:
            if not is_s1_success and not is_s2_success:
                both_failed += 1

    return {
        "file1_path": file1_path,
        "file2_path": file2_path,
        "file1_total": len(scenarios_f1),
        "file2_total": len(scenarios_f2),
        "file1_success_count": success_f1,
        "file2_success_count": success_f2,
        "both_failed_count": both_failed,
        "total_unique_scenarios": len(all_scenario_ids),
        # 返回排序後的ID列表
        "only_in_file1_ids": sorted(only_in_f1_ids),
        "only_in_file2_ids": sorted(only_in_f2_ids),
    }

# --- 主程序 ---
# 将模型版本作为可调整的变量
model_version = "kimi-k2" 

file_pairs = {
    "Multiple": (f"{model_version}_results/BFCL_v3_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_multiple_{model_version}_results.json"),
    "Parallel": (f"{model_version}_results/BFCL_v3_parallel_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_{model_version}_results.json"),
    "Parallel Multiple": (f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results.json"),
    "Simple": (f"{model_version}_results/BFCL_v3_simple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_simple_{model_version}_results.json")
}

overall_total_scenarios = 0
overall_success_f1 = 0
overall_success_f2 = 0

category_results = {} # 用于存储每个分类的结果

# 遍歷每一對文件並打印分析結果
for group_name, (file1, file2) in file_pairs.items():
    results = analyze_scenarios_by_id(file1, file2)
    if results:
        # 累加總體統計
        overall_total_scenarios += results['total_unique_scenarios']
        overall_success_f1 += results['file1_success_count']
        overall_success_f2 += results['file2_success_count']

        category_results[group_name] = {
            "total": results['file1_total'], # 假设文件1和文件2的总案例数相同，或者以文件1为基准
            "success_f1": results['file1_success_count'],
            "success_f2": results['file2_success_count']
        }

# --- 打印格式化的結果 ---
print(f"模型版本: {model_version}")
print("\n---")
print("### 详细分类结果 ###")
# 按照 specified order: Simple, Multiple, Parallel, Parallel Multiple
output_order = ["Simple", "Multiple", "Parallel", "Parallel Multiple"]

for category in output_order:
    if category in category_results:
        res = category_results[category]
        print(f"**{category}**:")
        print(f"  文件1 (LlamaIndex): {res['success_f1']}/{res['total']}")
        print(f"  文件2 (原始): {res['success_f2']}/{res['total']}")
        print("---")


# --- 打印整體統計結果 ---
print("\n### 整體統計結果 ###")
print(f"整體總案例數: {overall_total_scenarios}")
print(f"整體成功數 (文件1 - LlamaIndex): {overall_success_f1}/1000")
print(f"整體成功數 (文件2 - 原始): {overall_success_f2}/1000")

# 计算并打印整体成功率
overall_accuracy_f1 = (overall_success_f1 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0
overall_accuracy_f2 = (overall_success_f2 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0

print(f"整體成功率 (文件1 - LlamaIndex): {overall_accuracy_f1:.2f}%")
print(f"整體成功率 (文件2 - 原始): {overall_accuracy_f2:.2f}%")

模型版本: kimi-k2

---
### 详细分类结果 ###
**Simple**:
  文件1 (LlamaIndex): 376/400
  文件2 (原始): 372/400
---
**Multiple**:
  文件1 (LlamaIndex): 181/200
  文件2 (原始): 183/200
---
**Parallel**:
  文件1 (LlamaIndex): 168/200
  文件2 (原始): 181/200
---
**Parallel Multiple**:
  文件1 (LlamaIndex): 170/200
  文件2 (原始): 174/200
---

### 整體統計結果 ###
整體總案例數: 1000
整體成功數 (文件1 - LlamaIndex): 895/1000
整體成功數 (文件2 - 原始): 910/1000
整體成功率 (文件1 - LlamaIndex): 89.50%
整體成功率 (文件2 - 原始): 91.00%


In [41]:
import json

def load_json_stream(file_path):
    """
    讀取一個包含多個連續JSON對象的文件。
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            file_content = f.read().replace('}\n{', '},{')
        json_array_string = f"[{file_content}]"
        return json.loads(json_array_string)
    except Exception:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return [json.loads(line) for line in f if line.strip()]
        except json.JSONDecodeError as e:
            print(f"解析文件 '{file_path}' 時出錯: {e}")
            return None

def analyze_scenarios_by_id(file1_path, file2_path):
    """
    以 scenario ID 為基準進行匹配和分析，並記錄僅存在於單一文件中的案例ID。
    """
    data1 = load_json_stream(file1_path)
    data2 = load_json_stream(file2_path)

    if data1 is None or data2 is None:
        return None

    scenarios_f1 = {s['scenario']: s for s in data1}
    scenarios_f2 = {s['scenario']: s for s in data2}

    all_scenario_ids = set(scenarios_f1.keys()) | set(scenarios_f2.keys())
    
    # 初始化計數器和ID列表
    success_f1 = 0
    success_f2 = 0
    both_failed = 0
    only_in_f1_ids = []
    only_in_f2_ids = []

    for sid in all_scenario_ids:
        scenario1 = scenarios_f1.get(sid)
        scenario2 = scenarios_f2.get(sid)

        in_f1 = scenario1 is not None
        in_f2 = scenario2 is not None

        is_s1_success = False
        if in_f1:
            is_s1_success = all(not turn.get('validation_errors') 
                                for conv in scenario1.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s1_success:
                success_f1 += 1

        is_s2_success = False
        if in_f2:
            is_s2_success = all(not turn.get('validation_errors') 
                                for conv in scenario2.get('conversations', []) 
                                for turn in conv.get('turns', []))
            if is_s2_success:
                success_f2 += 1
        
        # --- 更新統計 ---
        if in_f1 and not in_f2:
            only_in_f1_ids.append(sid) # 收集ID
        elif not in_f1 and in_f2:
            only_in_f2_ids.append(sid) # 收集ID
        elif in_f1 and in_f2:
            if not is_s1_success and not is_s2_success:
                both_failed += 1

    return {
        "file1_path": file1_path,
        "file2_path": file2_path,
        "file1_total": len(scenarios_f1),
        "file2_total": len(scenarios_f2),
        "file1_success_count": success_f1,
        "file2_success_count": success_f2,
        "both_failed_count": both_failed,
        "total_unique_scenarios": len(all_scenario_ids),
        # 返回排序後的ID列表
        "only_in_file1_ids": sorted(only_in_f1_ids),
        "only_in_file2_ids": sorted(only_in_f2_ids),
    }

# --- 主程序 ---
# 将模型版本作为可调整的变量
model_version = "gpt-4o" 

file_pairs = {
    "Multiple": (f"{model_version}_results/BFCL_v3_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_multiple_{model_version}_results.json"),
    "Parallel": (f"{model_version}_results/BFCL_v3_parallel_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_{model_version}_results.json"),
    "Parallel Multiple": (f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_parallel_multiple_{model_version}_results.json"),
    "Simple": (f"{model_version}_results/BFCL_v3_simple_{model_version}_results_llamaindex.json", f"{model_version}_results/BFCL_v3_simple_{model_version}_results.json")
}

overall_total_scenarios = 0
overall_success_f1 = 0
overall_success_f2 = 0

category_results = {} # 用于存储每个分类的结果

# 遍歷每一對文件並打印分析結果
for group_name, (file1, file2) in file_pairs.items():
    results = analyze_scenarios_by_id(file1, file2)
    if results:
        # 累加總體統計
        overall_total_scenarios += results['total_unique_scenarios']
        overall_success_f1 += results['file1_success_count']
        overall_success_f2 += results['file2_success_count']

        category_results[group_name] = {
            "total": results['file1_total'], # 假设文件1和文件2的总案例数相同，或者以文件1为基准
            "success_f1": results['file1_success_count'],
            "success_f2": results['file2_success_count'],
            "only_in_file1_ids": results['only_in_file1_ids'],
            "only_in_file2_ids": results['only_in_file2_ids']
        }

# --- 打印格式化的結果 ---
print(f"模型版本: {model_version}")
print("\n---")
print("### 详细分类结果 ###")
# 按照 specified order: Simple, Multiple, Parallel, Parallel Multiple
output_order = ["Simple", "Multiple", "Parallel", "Parallel Multiple"]

for category in output_order:
    if category in category_results:
        res = category_results[category]
        print(f"**{category}**:")
        print(f"  文件1 (LlamaIndex): {res['success_f1']}/{res['total']}")
        if res['only_in_file2_ids']: # 如果文件2中有文件1没有的案例，说明文件1漏了这些
            print(f"    文件1缺失的案例 ({len(res['only_in_file2_ids'])}个): {res['only_in_file2_ids']}")
        print(f"  文件2 (原始): {res['success_f2']}/{res['total']}")
        if res['only_in_file1_ids']: # 如果文件1中有文件2没有的案例，说明文件2漏了这些
            print(f"    文件2缺失的案例 ({len(res['only_in_file1_ids'])}个): {res['only_in_file1_ids']}")
        print("---")


# --- 打印整體統計結果 ---
print("\n### 整體統計結果 ###")
print(f"整體總案例數: {overall_total_scenarios}")
print(f"整體成功數 (文件1 - LlamaIndex): {overall_success_f1}")
print(f"整體成功數 (文件2 - 原始): {overall_success_f2}")

# 计算并打印整体成功率
overall_accuracy_f1 = (overall_success_f1 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0
overall_accuracy_f2 = (overall_success_f2 / overall_total_scenarios * 100) if overall_total_scenarios > 0 else 0

print(f"整體成功率 (文件1 - LlamaIndex): {overall_accuracy_f1:.2f}%")
print(f"整體成功率 (文件2 - 原始): {overall_accuracy_f2:.2f}%")

模型版本: gpt-4o

---
### 详细分类结果 ###
**Simple**:
  文件1 (LlamaIndex): 371/399
    文件1缺失的案例 (1个): ['simple_83']
  文件2 (原始): 363/399
---
**Multiple**:
  文件1 (LlamaIndex): 182/199
    文件1缺失的案例 (1个): ['multiple_5']
  文件2 (原始): 174/199
---
**Parallel**:
  文件1 (LlamaIndex): 185/199
    文件1缺失的案例 (1个): ['parallel_133']
  文件2 (原始): 157/199
---
**Parallel Multiple**:
  文件1 (LlamaIndex): 177/199
    文件1缺失的案例 (1个): ['parallel_multiple_63']
  文件2 (原始): 142/199
    文件2缺失的案例 (1个): ['parallel_multiple_194']
---

### 整體統計結果 ###
整體總案例數: 1000
整體成功數 (文件1 - LlamaIndex): 915
整體成功數 (文件2 - 原始): 836
整體成功率 (文件1 - LlamaIndex): 91.50%
整體成功率 (文件2 - 原始): 83.60%
