diff --git a/src/data/merbench_data.json b/src/data/merbench_data.json index 1a8c8c7..c39423e 100644 --- a/src/data/merbench_data.json +++ b/src/data/merbench_data.json @@ -1,15 +1,11 @@ { "stats": { - "total_runs": 879, - "models_evaluated": 16, + "total_runs": 1159, + "models_evaluated": 25, "test_cases": 3, "test_groups": ["easy", "hard", "medium"], - "providers": ["Amazon", "Google"], + "providers": ["Amazon", "Google", "OSS"], "models": [ - "bedrock:us.amazon.nova-lite-v1:0", - "bedrock:us.amazon.nova-micro-v1:0", - "bedrock:us.amazon.nova-premier-v1:0", - "bedrock:us.amazon.nova-pro-v1:0", "gemini-2.0-flash", "gemini-2.5-flash", "gemini-2.5-flash-lite", @@ -21,10 +17,23 @@ "gemini-2.5-pro", "gemini-2.5-pro-preview-03-25", "gemini-2.5-pro-preview-05-06", - "gemini-2.5-pro-preview-06-05" + "gemini-2.5-pro-preview-06-05", + "google/gemma-3-27b", + "gpt-oss-20b", + "llama-xlam-2-70b-fc-r", + "magistral-small-2509-mlx", + "qwen/qwen3-30b-a3b-2507", + "qwen3-30b-a3b-thinking-2507-mlx", + "qwen3-coder-30b-a3b-instruct-mlx", + "seed-oss-36b-instruct-mlx", + "us.amazon.nova-lite-v1:0", + "us.amazon.nova-micro-v1:0", + "us.amazon.nova-premier-v1:0", + "us.amazon.nova-pro-v1:0", + "xlam-2-32b-fc-r" ], - "total_cost": 21.312590574999998, - "avg_cost_per_run": 0.024246405659840726 + "total_cost": 21.591944374999997, + "avg_cost_per_run": 0.01862980532786885 }, "leaderboard": [ { @@ -94,15 +103,26 @@ "Provider": "Google" }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", - "Success_Rate": 5.0, - "Avg_Duration": 4.403742361633333, - "Avg_Tokens": 4974.583333333333, - "Avg_Cost": 0.0008166483333333334, - "Avg_Input_Cost": 0.00039106166666666675, - "Avg_Output_Cost": 0.0004255866666666667, - "Runs": 60, - "Provider": "Google" + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Success_Rate": 10.256410256410257, + "Avg_Duration": 92.27195387179488, + "Avg_Tokens": 8166.794871794872, + "Avg_Cost": 0.0017201651282051282, + "Avg_Input_Cost": 0.0002469353846153846, + "Avg_Output_Cost": 0.0014732297435897433, + "Runs": 39, + "Provider": "OSS" + }, + { + "Model": "seed-oss-36b-instruct-mlx", + "Success_Rate": 6.25, + "Avg_Duration": 396.959792375, + "Avg_Tokens": 3053.4375, + "Avg_Cost": 0.000937006875, + "Avg_Input_Cost": 0.00046868062499999996, + "Avg_Output_Cost": 0.00046832624999999995, + "Runs": 16, + "Provider": "OSS" }, { "Model": "gemini-2.5-flash-preview-05-20", @@ -115,6 +135,17 @@ "Runs": 60, "Provider": "Google" }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Success_Rate": 5.0, + "Avg_Duration": 4.403742361633333, + "Avg_Tokens": 4974.583333333333, + "Avg_Cost": 0.0008166483333333334, + "Avg_Input_Cost": 0.00039106166666666675, + "Avg_Output_Cost": 0.0004255866666666667, + "Runs": 60, + "Provider": "Google" + }, { "Model": "gemini-2.5-flash-preview-04-17", "Success_Rate": 4.444444444444445, @@ -127,7 +158,18 @@ "Provider": "Google" }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-flash-lite", + "Success_Rate": 3.3333333333333335, + "Avg_Duration": 5.9025559164666666, + "Avg_Tokens": 9506.68888888889, + "Avg_Cost": 0.0012744855555555557, + "Avg_Input_Cost": 0.00084273, + "Avg_Output_Cost": 0.0004317555555555556, + "Runs": 90, + "Provider": "Google" + }, + { + "Model": "us.amazon.nova-premier-v1:0", "Success_Rate": 3.3333333333333335, "Avg_Duration": 63.18500474063333, "Avg_Tokens": 9528.966666666667, @@ -138,29 +180,29 @@ "Provider": "Amazon" }, { - "Model": "gemini-2.5-flash-lite", - "Success_Rate": 3.3333333333333335, - "Avg_Duration": 5.9025559164666666, - "Avg_Tokens": 9506.68888888889, - "Avg_Cost": 0.0012744855555555557, - "Avg_Input_Cost": 0.00084273, - "Avg_Output_Cost": 0.0004317555555555556, - "Runs": 90, - "Provider": "Google" + "Model": "gpt-oss-20b", + "Success_Rate": 2.2222222222222223, + "Avg_Duration": 47.89863104444444, + "Avg_Tokens": 3896.0222222222224, + "Avg_Cost": 0.00024661955555555556, + "Avg_Input_Cost": 8.149733333333333e-5, + "Avg_Output_Cost": 0.00016512222222222223, + "Runs": 45, + "Provider": "OSS" }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "us.amazon.nova-pro-v1:0", "Success_Rate": 0.0, - "Avg_Duration": 24.5429114128, - "Avg_Tokens": 2799.3166666666666, - "Avg_Cost": 0.000247573, - "Avg_Input_Cost": 0.000141421, - "Avg_Output_Cost": 0.000106152, + "Avg_Duration": 49.53060242716666, + "Avg_Tokens": 678.15, + "Avg_Cost": 0.0008442, + "Avg_Input_Cost": 0.00044196000000000003, + "Avg_Output_Cost": 0.00040224, "Runs": 60, "Provider": "Amazon" }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "us.amazon.nova-micro-v1:0", "Success_Rate": 0.0, "Avg_Duration": 18.831003638516666, "Avg_Tokens": 1783.85, @@ -171,13 +213,13 @@ "Provider": "Amazon" }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "us.amazon.nova-lite-v1:0", "Success_Rate": 0.0, - "Avg_Duration": 49.53060242716666, - "Avg_Tokens": 678.15, - "Avg_Cost": 0.0008442, - "Avg_Input_Cost": 0.00044196000000000003, - "Avg_Output_Cost": 0.00040224, + "Avg_Duration": 24.5429114128, + "Avg_Tokens": 2799.3166666666666, + "Avg_Cost": 0.000247573, + "Avg_Input_Cost": 0.000141421, + "Avg_Output_Cost": 0.000106152, "Runs": 60, "Provider": "Amazon" }, @@ -193,62 +235,84 @@ "Provider": "Google" }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Success_Rate": 0.0, - "Avg_Duration": 5.680209022222223, - "Avg_Tokens": 5687.822222222222, - "Avg_Cost": 0.0010556422222222223, - "Avg_Input_Cost": 0.0004064955555555556, - "Avg_Output_Cost": 0.0006491466666666667, + "Avg_Duration": 120.44288773333334, + "Avg_Tokens": 6954.466666666666, + "Avg_Cost": 0.0007068297777777777, + "Avg_Input_Cost": 0.000521852, + "Avg_Output_Cost": 0.00018497777777777777, "Runs": 45, - "Provider": "Google" - } - ], - "pareto_data": [ + "Provider": "OSS" + }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Success_Rate": 0.0, - "Duration": 24.5429114128, - "total_tokens": 2799.3166666666666, - "total_cost": 0.000247573, - "input_cost": 0.000141421, - "output_cost": 0.000106152, - "Metric_request_tokens": 5237.814814814815, - "Metric_response_tokens": 982.8888888888889 + "Avg_Duration": 21.610165333333335, + "Avg_Tokens": 4383.355555555556, + "Avg_Cost": 0.00045898000000000005, + "Avg_Input_Cost": 0.00020111333333333334, + "Avg_Output_Cost": 0.0002578666666666667, + "Runs": 45, + "Provider": "OSS" }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "qwen/qwen3-30b-a3b-2507", "Success_Rate": 0.0, - "Duration": 18.831003638516666, - "total_tokens": 1783.85, - "total_cost": 9.043825000000001e-5, - "input_cost": 5.310025000000001e-5, - "output_cost": 3.733800000000001e-5, - "Metric_request_tokens": 7002.2307692307695, - "Metric_response_tokens": 1230.923076923077 + "Avg_Duration": 23.03898651111111, + "Avg_Tokens": 4277.977777777778, + "Avg_Cost": 0.0005597002222222222, + "Avg_Input_Cost": 0.00025939555555555554, + "Avg_Output_Cost": 0.00030030466666666663, + "Runs": 45, + "Provider": "OSS" }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Success_Rate": 3.3333333333333335, - "Duration": 63.18500474063333, - "total_tokens": 9528.966666666667, - "total_cost": 0.03564125, - "input_cost": 0.020867708333333335, - "output_cost": 0.014773541666666666, - "Metric_request_tokens": 17269.827586206895, - "Metric_response_tokens": 2445.2758620689656 + "Model": "magistral-small-2509-mlx", + "Success_Rate": 0.0, + "Avg_Duration": 582.1930898666667, + "Avg_Tokens": 4438.333333333333, + "Avg_Cost": 0.0041603000000000005, + "Avg_Input_Cost": 0.0012485999999999999, + "Avg_Output_Cost": 0.0029117, + "Runs": 15, + "Provider": "OSS" }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "llama-xlam-2-70b-fc-r", "Success_Rate": 0.0, - "Duration": 49.53060242716666, - "total_tokens": 678.15, - "total_cost": 0.0008442, - "input_cost": 0.00044196000000000003, - "output_cost": 0.00040224, - "Metric_request_tokens": 8286.75, - "Metric_response_tokens": 1885.5 + "Avg_Duration": 238.0563474, + "Avg_Tokens": 8591.266666666666, + "Avg_Cost": 0.0026572266666666663, + "Avg_Input_Cost": 0.00233784, + "Avg_Output_Cost": 0.0003193866666666667, + "Runs": 15, + "Provider": "OSS" + }, + { + "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Success_Rate": 0.0, + "Avg_Duration": 5.680209022222223, + "Avg_Tokens": 5687.822222222222, + "Avg_Cost": 0.0010556422222222223, + "Avg_Input_Cost": 0.0004064955555555556, + "Avg_Output_Cost": 0.0006491466666666667, + "Runs": 45, + "Provider": "Google" }, + { + "Model": "xlam-2-32b-fc-r", + "Success_Rate": 0.0, + "Avg_Duration": 171.86225006666666, + "Avg_Tokens": 8210.2, + "Avg_Cost": 0.000417768, + "Avg_Input_Cost": 0.0002986213333333333, + "Avg_Output_Cost": 0.00011914666666666667, + "Runs": 15, + "Provider": "OSS" + } + ], + "pareto_data": [ { "Model": "gemini-2.0-flash", "Success_Rate": 0.0, @@ -380,209 +444,220 @@ "output_cost": 0.03217803921568627, "Metric_request_tokens": 4894.078431372549, "Metric_response_tokens": 1306.6470588235295 + }, + { + "Model": "google/gemma-3-27b", + "Success_Rate": 0.0, + "Duration": 120.44288773333334, + "total_tokens": 6954.466666666666, + "total_cost": 0.0007068297777777777, + "input_cost": 0.000521852, + "output_cost": 0.00018497777777777777, + "Metric_request_tokens": 5798.355555555556, + "Metric_response_tokens": 1156.111111111111 + }, + { + "Model": "gpt-oss-20b", + "Success_Rate": 2.2222222222222223, + "Duration": 47.89863104444444, + "total_tokens": 3896.0222222222224, + "total_cost": 0.00024661955555555556, + "input_cost": 8.149733333333333e-5, + "output_cost": 0.00016512222222222223, + "Metric_request_tokens": 3704.4242424242425, + "Metric_response_tokens": 1608.3333333333333 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Success_Rate": 0.0, + "Duration": 238.0563474, + "total_tokens": 8591.266666666666, + "total_cost": 0.0026572266666666663, + "input_cost": 0.00233784, + "output_cost": 0.0003193866666666667, + "Metric_request_tokens": 7792.8, + "Metric_response_tokens": 798.4666666666667 + }, + { + "Model": "magistral-small-2509-mlx", + "Success_Rate": 0.0, + "Duration": 582.1930898666667, + "total_tokens": 4438.333333333333, + "total_cost": 0.0041603000000000005, + "input_cost": 0.0012485999999999999, + "output_cost": 0.0029117, + "Metric_request_tokens": 5351.142857142857, + "Metric_response_tokens": 4159.571428571428 + }, + { + "Model": "qwen/qwen3-30b-a3b-2507", + "Success_Rate": 0.0, + "Duration": 23.03898651111111, + "total_tokens": 4277.977777777778, + "total_cost": 0.0005597002222222222, + "input_cost": 0.00025939555555555554, + "output_cost": 0.00030030466666666663, + "Metric_request_tokens": 3242.4444444444443, + "Metric_response_tokens": 1035.5333333333333 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Success_Rate": 10.256410256410257, + "Duration": 92.27195387179488, + "total_tokens": 8166.794871794872, + "total_cost": 0.0017201651282051282, + "input_cost": 0.0002469353846153846, + "output_cost": 0.0014732297435897433, + "Metric_request_tokens": 3086.6923076923076, + "Metric_response_tokens": 5080.102564102564 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "Success_Rate": 0.0, + "Duration": 21.610165333333335, + "total_tokens": 4383.355555555556, + "total_cost": 0.00045898000000000005, + "input_cost": 0.00020111333333333334, + "output_cost": 0.0002578666666666667, + "Metric_request_tokens": 3351.8888888888887, + "Metric_response_tokens": 1031.4666666666667 + }, + { + "Model": "seed-oss-36b-instruct-mlx", + "Success_Rate": 6.25, + "Duration": 396.959792375, + "total_tokens": 3053.4375, + "total_cost": 0.000937006875, + "input_cost": 0.00046868062499999996, + "output_cost": 0.00046832624999999995, + "Metric_request_tokens": 4463.625, + "Metric_response_tokens": 1643.25 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Success_Rate": 0.0, + "Duration": 24.5429114128, + "total_tokens": 2799.3166666666666, + "total_cost": 0.000247573, + "input_cost": 0.000141421, + "output_cost": 0.000106152, + "Metric_request_tokens": 5237.814814814815, + "Metric_response_tokens": 982.8888888888889 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "Success_Rate": 0.0, + "Duration": 18.831003638516666, + "total_tokens": 1783.85, + "total_cost": 9.043825000000001e-5, + "input_cost": 5.310025000000001e-5, + "output_cost": 3.733800000000001e-5, + "Metric_request_tokens": 7002.2307692307695, + "Metric_response_tokens": 1230.923076923077 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Success_Rate": 3.3333333333333335, + "Duration": 63.18500474063333, + "total_tokens": 9528.966666666667, + "total_cost": 0.03564125, + "input_cost": 0.020867708333333335, + "output_cost": 0.014773541666666666, + "Metric_request_tokens": 17269.827586206895, + "Metric_response_tokens": 2445.2758620689656 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Success_Rate": 0.0, + "Duration": 49.53060242716666, + "total_tokens": 678.15, + "total_cost": 0.0008442, + "input_cost": 0.00044196000000000003, + "output_cost": 0.00040224, + "Metric_request_tokens": 8286.75, + "Metric_response_tokens": 1885.5 + }, + { + "Model": "xlam-2-32b-fc-r", + "Success_Rate": 0.0, + "Duration": 171.86225006666666, + "total_tokens": 8210.2, + "total_cost": 0.000417768, + "input_cost": 0.0002986213333333333, + "output_cost": 0.00011914666666666667, + "Metric_request_tokens": 8614.076923076924, + "Metric_response_tokens": 859.2307692307693 } ], "test_groups_data": [ { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.0-flash", "test_group": "easy", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.4, - "Score_UsedBothMCPTools": 0.275, - "total_cost": 0.000238407, - "input_cost": 0.00013509900000000002, - "output_cost": 0.00010330800000000001, - "total_tokens": 2682.1 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_cost": 0.00027638, + "input_cost": 7.36e-5, + "output_cost": 0.00020277999999999998, + "total_tokens": 1242.95 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.0-flash", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.3, - "Score_UsedBothMCPTools": 0.225, - "total_cost": 0.00015518999999999998, - "input_cost": 8.3562e-5, - "output_cost": 7.162799999999999e-5, - "total_tokens": 1691.15 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_cost": 0.00027488, + "input_cost": 7.36e-5, + "output_cost": 0.00020128000000000002, + "total_tokens": 1239.2 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.0-flash", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.65, - "Score_UsedBothMCPTools": 0.5, - "total_cost": 0.000349122, - "input_cost": 0.000205602, - "output_cost": 0.00014352, - "total_tokens": 4024.7 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.00031753, + "input_cost": 9.347e-5, + "output_cost": 0.00022406000000000002, + "total_tokens": 1494.85 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash", "test_group": "easy", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.4, - "Score_UsedBothMCPTools": 0.35, - "total_cost": 0.00014982100000000001, - "input_cost": 9.1595e-5, - "output_cost": 5.8226e-5, - "total_tokens": 3032.9 + "Score_MermaidDiagramValid": 0.4, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.6333333333333333, + "total_cost": 0.018718946666666667, + "input_cost": 0.00037073999999999994, + "output_cost": 0.01834820666666666, + "total_tokens": 8388.733333333334 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.15, - "Score_UsedBothMCPTools": 0.05, - "total_cost": 3.7059750000000004e-5, - "input_cost": 3.0283750000000003e-5, - "output_cost": 6.776e-6, - "total_tokens": 913.65 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.6666666666666666, + "total_cost": 0.012811633333333329, + "input_cost": 0.0004289, + "output_cost": 0.012382733333333333, + "total_tokens": 7064.533333333334 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.2, - "total_cost": 8.443400000000001e-5, - "input_cost": 3.7422e-5, - "output_cost": 4.7012e-5, - "total_tokens": 1405.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.6666666666666666, + "total_cost": 0.006741076666666666, + "input_cost": 0.00045869, + "output_cost": 0.006282386666666667, + "total_tokens": 5518.133333333333 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "test_group": "easy", - "Score_MermaidDiagramValid": 0.1, - "Score_UsageLimitNotExceeded": 0.9, - "Score_UsedBothMCPTools": 0.625, - "total_cost": 0.057880125000000004, - "input_cost": 0.0392845, - "output_cost": 0.018595624999999998, - "total_tokens": 17201.45 - }, - { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "test_group": "hard", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.65, - "Score_UsedBothMCPTools": 0.35, - "total_cost": 0.023277375, - "input_cost": 0.011096125, - "output_cost": 0.01218125, - "total_tokens": 5412.95 - }, - { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "test_group": "medium", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.75, - "Score_UsedBothMCPTools": 0.45, - "total_cost": 0.02576625, - "input_cost": 0.0122225, - "output_cost": 0.013543749999999998, - "total_tokens": 5972.5 - }, - { - "Model": "bedrock:us.amazon.nova-pro-v1:0", - "test_group": "easy", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.1, - "total_cost": 0.00161976, - "input_cost": 0.00086424, - "output_cost": 0.00075552, - "total_tokens": 1316.4 - }, - { - "Model": "bedrock:us.amazon.nova-pro-v1:0", - "test_group": "hard", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 - }, - { - "Model": "bedrock:us.amazon.nova-pro-v1:0", - "test_group": "medium", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.1, - "Score_UsedBothMCPTools": 0.1, - "total_cost": 0.0009128400000000002, - "input_cost": 0.00046164000000000003, - "output_cost": 0.0004512, - "total_tokens": 718.05 - }, - { - "Model": "gemini-2.0-flash", - "test_group": "easy", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027638, - "input_cost": 7.36e-5, - "output_cost": 0.00020277999999999998, - "total_tokens": 1242.95 - }, - { - "Model": "gemini-2.0-flash", - "test_group": "hard", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027488, - "input_cost": 7.36e-5, - "output_cost": 0.00020128000000000002, - "total_tokens": 1239.2 - }, - { - "Model": "gemini-2.0-flash", - "test_group": "medium", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.1, - "total_cost": 0.00031753, - "input_cost": 9.347e-5, - "output_cost": 0.00022406000000000002, - "total_tokens": 1494.85 - }, - { - "Model": "gemini-2.5-flash", - "test_group": "easy", - "Score_MermaidDiagramValid": 0.4, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6333333333333333, - "total_cost": 0.018718946666666667, - "input_cost": 0.00037073999999999994, - "output_cost": 0.01834820666666666, - "total_tokens": 8388.733333333334 - }, - { - "Model": "gemini-2.5-flash", - "test_group": "hard", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6666666666666666, - "total_cost": 0.012811633333333329, - "input_cost": 0.0004289, - "output_cost": 0.012382733333333333, - "total_tokens": 7064.533333333334 - }, - { - "Model": "gemini-2.5-flash", - "test_group": "medium", - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6666666666666666, - "total_cost": 0.006741076666666666, - "input_cost": 0.00045869, - "output_cost": 0.006282386666666667, - "total_tokens": 5518.133333333333 - }, - { - "Model": "gemini-2.5-flash-lite", + "Model": "gemini-2.5-flash-lite", "test_group": "easy", "Score_MermaidDiagramValid": 0.1, "Score_UsageLimitNotExceeded": 1.0, @@ -910,333 +985,607 @@ "input_cost": 0.005070441176470588, "output_cost": 0.03375529411764706, "total_tokens": 7431.882352941177 - } - ], - "failure_analysis_data": [ - { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Invalid Diagram": 60, - "MCP Tool Failure": 47, - "Usage Limit Exceeded": 33 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", - "Invalid Diagram": 60, - "MCP Tool Failure": 49, - "Usage Limit Exceeded": 45 + "Model": "google/gemma-3-27b", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.00064909, + "input_cost": 0.000522114, + "output_cost": 0.000126976, + "total_tokens": 6594.866666666667 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Invalid Diagram": 58, - "MCP Tool Failure": 32, - "Usage Limit Exceeded": 14 + "Model": "google/gemma-3-27b", + "test_group": "hard", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.0007507366666666666, + "input_cost": 0.0005309819999999999, + "output_cost": 0.00021975466666666668, + "total_tokens": 7273.266666666666 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", - "Invalid Diagram": 60, - "MCP Tool Failure": 56, - "Usage Limit Exceeded": 54 + "Model": "google/gemma-3-27b", + "test_group": "medium", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.0007206626666666666, + "input_cost": 0.00051246, + "output_cost": 0.00020820266666666666, + "total_tokens": 6995.266666666666 }, { - "Model": "gemini-2.0-flash", - "Invalid Diagram": 60, - "MCP Tool Failure": 58, - "Usage Limit Exceeded": 0 + "Model": "gpt-oss-20b", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.06666666666666667, + "Score_UsageLimitNotExceeded": 0.8666666666666667, + "Score_UsedBothMCPTools": 0.5333333333333333, + "total_cost": 0.00033799866666666664, + "input_cost": 0.00013822799999999998, + "output_cost": 0.00019977066666666666, + "total_tokens": 6034.533333333334 }, { - "Model": "gemini-2.5-flash", - "Invalid Diagram": 39, - "MCP Tool Failure": 31, - "Usage Limit Exceeded": 0 + "Model": "gpt-oss-20b", + "test_group": "hard", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.4666666666666667, + "Score_UsedBothMCPTools": 0.26666666666666666, + "total_cost": 0.00013997066666666667, + "input_cost": 3.162e-5, + "output_cost": 0.00010835066666666667, + "total_tokens": 1827.9333333333334 }, { - "Model": "gemini-2.5-flash-lite", - "Invalid Diagram": 87, - "MCP Tool Failure": 28, - "Usage Limit Exceeded": 0 + "Model": "gpt-oss-20b", + "test_group": "medium", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.8666666666666667, + "Score_UsedBothMCPTools": 0.5666666666666667, + "total_cost": 0.0002618893333333334, + "input_cost": 7.4644e-5, + "output_cost": 0.00018724533333333338, + "total_tokens": 3825.6 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", - "Invalid Diagram": 57, - "MCP Tool Failure": 30, - "Usage Limit Exceeded": 0 + "Model": "llama-xlam-2-70b-fc-r", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.9333333333333333, + "total_cost": 0.0026572266666666663, + "input_cost": 0.00233784, + "output_cost": 0.0003193866666666667, + "total_tokens": 8591.266666666666 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", - "Invalid Diagram": 45, - "MCP Tool Failure": 0, - "Usage Limit Exceeded": 0 + "Model": "magistral-small-2509-mlx", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.8666666666666667, + "Score_UsedBothMCPTools": 0.4666666666666667, + "total_cost": 0.0041603000000000005, + "input_cost": 0.0012485999999999999, + "output_cost": 0.0029117, + "total_tokens": 4438.333333333333 }, { - "Model": "gemini-2.5-flash-preview-04-17", - "Invalid Diagram": 43, - "MCP Tool Failure": 22, - "Usage Limit Exceeded": 1 + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.000562644, + "input_cost": 0.0002598453333333333, + "output_cost": 0.0003027986666666666, + "total_tokens": 4292.2 }, { - "Model": "gemini-2.5-flash-preview-05-20", - "Invalid Diagram": 57, - "MCP Tool Failure": 45, - "Usage Limit Exceeded": 0 + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "hard", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.0005579786666666666, + "input_cost": 0.00025931733333333333, + "output_cost": 0.0002986613333333333, + "total_tokens": 4271.333333333333 }, { - "Model": "gemini-2.5-flash-preview-09-2025", - "Invalid Diagram": 31, - "MCP Tool Failure": 19, - "Usage Limit Exceeded": 16 + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "medium", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.000558478, + "input_cost": 0.00025902399999999995, + "output_cost": 0.000299454, + "total_tokens": 4270.4 }, { - "Model": "gemini-2.5-pro", - "Invalid Diagram": 36, - "MCP Tool Failure": 3, - "Usage Limit Exceeded": 3 + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.23076923076923078, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.34615384615384615, + "total_cost": 0.001714123076923077, + "input_cost": 0.0002305723076923077, + "output_cost": 0.0014835507692307693, + "total_tokens": 7997.846153846154 }, { - "Model": "gemini-2.5-pro-preview-03-25", - "Invalid Diagram": 37, - "MCP Tool Failure": 10, - "Usage Limit Exceeded": 9 + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "hard", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.46153846153846156, + "total_cost": 0.0015948738461538461, + "input_cost": 0.0002721169230769231, + "output_cost": 0.0013227569230769229, + "total_tokens": 7962.692307692308 }, { - "Model": "gemini-2.5-pro-preview-05-06", - "Invalid Diagram": 33, - "MCP Tool Failure": 9, - "Usage Limit Exceeded": 4 + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "medium", + "Score_MermaidDiagramValid": 0.07692307692307693, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_cost": 0.0018514984615384613, + "input_cost": 0.00023811692307692308, + "output_cost": 0.0016133815384615385, + "total_tokens": 8539.846153846154 }, { - "Model": "gemini-2.5-pro-preview-06-05", - "Invalid Diagram": 36, - "MCP Tool Failure": 0, - "Usage Limit Exceeded": 0 - } - ], - "cost_breakdown_data": [ - { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "test_group": "easy", - "avg_total_cost": 0.000238, - "sum_total_cost": 0.004768, - "run_count": 20, - "avg_input_cost": 0.000135, - "sum_input_cost": 0.002702, - "avg_output_cost": 0.000103, - "sum_output_cost": 0.002066 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.00045768999999999996, + "input_cost": 0.00020134, + "output_cost": 0.00025634999999999997, + "total_tokens": 4381.066666666667 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "test_group": "hard", - "avg_total_cost": 0.000155, - "sum_total_cost": 0.003104, - "run_count": 20, - "avg_input_cost": 8.4e-5, - "sum_input_cost": 0.001671, - "avg_output_cost": 7.2e-5, - "sum_output_cost": 0.001433 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.00046003333333333337, + "input_cost": 0.00020100000000000003, + "output_cost": 0.00025903333333333336, + "total_tokens": 4386.133333333333 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "test_group": "medium", - "avg_total_cost": 0.000349, - "sum_total_cost": 0.006982, - "run_count": 20, - "avg_input_cost": 0.000206, - "sum_input_cost": 0.004112, - "avg_output_cost": 0.000144, - "sum_output_cost": 0.00287 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.00045921666666666667, + "input_cost": 0.00020100000000000003, + "output_cost": 0.00025821666666666666, + "total_tokens": 4382.866666666667 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "seed-oss-36b-instruct-mlx", "test_group": "easy", - "avg_total_cost": 0.00015, - "sum_total_cost": 0.002996, - "run_count": 20, - "avg_input_cost": 9.2e-5, - "sum_input_cost": 0.001832, - "avg_output_cost": 5.8e-5, - "sum_output_cost": 0.001165 - }, - { - "Model": "bedrock:us.amazon.nova-micro-v1:0", - "test_group": "hard", - "avg_total_cost": 3.7e-5, - "sum_total_cost": 0.000741, - "run_count": 20, - "avg_input_cost": 3e-5, - "sum_input_cost": 0.000606, - "avg_output_cost": 7e-6, - "sum_output_cost": 0.000136 + "Score_MermaidDiagramValid": 0.06666666666666667, + "Score_UsageLimitNotExceeded": 0.5333333333333333, + "Score_UsedBothMCPTools": 0.36666666666666664, + "total_cost": 0.000999474, + "input_cost": 0.0004999259999999999, + "output_cost": 0.0004995479999999999, + "total_tokens": 3257.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "seed-oss-36b-instruct-mlx", "test_group": "medium", - "avg_total_cost": 8.4e-5, - "sum_total_cost": 0.001689, - "run_count": 20, - "avg_input_cost": 3.7e-5, - "sum_input_cost": 0.000748, - "avg_output_cost": 4.7e-5, - "sum_output_cost": 0.00094 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0, + "total_tokens": 0.0 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "us.amazon.nova-lite-v1:0", "test_group": "easy", - "avg_total_cost": 0.05788, - "sum_total_cost": 1.157602, - "run_count": 20, - "avg_input_cost": 0.039284, - "sum_input_cost": 0.78569, - "avg_output_cost": 0.018596, - "sum_output_cost": 0.371912 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.4, + "Score_UsedBothMCPTools": 0.275, + "total_cost": 0.000238407, + "input_cost": 0.00013509900000000002, + "output_cost": 0.00010330800000000001, + "total_tokens": 2682.1 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "us.amazon.nova-lite-v1:0", "test_group": "hard", - "avg_total_cost": 0.023277, - "sum_total_cost": 0.465548, - "run_count": 20, - "avg_input_cost": 0.011096, - "sum_input_cost": 0.221922, - "avg_output_cost": 0.012181, - "sum_output_cost": 0.243625 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.3, + "Score_UsedBothMCPTools": 0.225, + "total_cost": 0.00015518999999999998, + "input_cost": 8.3562e-5, + "output_cost": 7.162799999999999e-5, + "total_tokens": 1691.15 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "us.amazon.nova-lite-v1:0", "test_group": "medium", - "avg_total_cost": 0.025766, - "sum_total_cost": 0.515325, - "run_count": 20, - "avg_input_cost": 0.012222, - "sum_input_cost": 0.24445, - "avg_output_cost": 0.013544, - "sum_output_cost": 0.270875 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.65, + "Score_UsedBothMCPTools": 0.5, + "total_cost": 0.000349122, + "input_cost": 0.000205602, + "output_cost": 0.00014352, + "total_tokens": 4024.7 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "us.amazon.nova-micro-v1:0", "test_group": "easy", - "avg_total_cost": 0.00162, - "sum_total_cost": 0.032395, - "run_count": 20, - "avg_input_cost": 0.000864, - "sum_input_cost": 0.017285, - "avg_output_cost": 0.000756, - "sum_output_cost": 0.01511 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.4, + "Score_UsedBothMCPTools": 0.35, + "total_cost": 0.00014982100000000001, + "input_cost": 9.1595e-5, + "output_cost": 5.8226e-5, + "total_tokens": 3032.9 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "us.amazon.nova-micro-v1:0", "test_group": "hard", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 20, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.15, + "Score_UsedBothMCPTools": 0.05, + "total_cost": 3.7059750000000004e-5, + "input_cost": 3.0283750000000003e-5, + "output_cost": 6.776e-6, + "total_tokens": 913.65 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "us.amazon.nova-micro-v1:0", "test_group": "medium", - "avg_total_cost": 0.000913, - "sum_total_cost": 0.018257, - "run_count": 20, - "avg_input_cost": 0.000462, - "sum_input_cost": 0.009233, - "avg_output_cost": 0.000451, - "sum_output_cost": 0.009024 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.2, + "Score_UsedBothMCPTools": 0.2, + "total_cost": 8.443400000000001e-5, + "input_cost": 3.7422e-5, + "output_cost": 4.7012e-5, + "total_tokens": 1405.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-premier-v1:0", "test_group": "easy", - "avg_total_cost": 0.000276, - "sum_total_cost": 0.005528, - "run_count": 20, - "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.001472, - "avg_output_cost": 0.000203, - "sum_output_cost": 0.004056 + "Score_MermaidDiagramValid": 0.1, + "Score_UsageLimitNotExceeded": 0.9, + "Score_UsedBothMCPTools": 0.625, + "total_cost": 0.057880125000000004, + "input_cost": 0.0392845, + "output_cost": 0.018595624999999998, + "total_tokens": 17201.45 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-premier-v1:0", "test_group": "hard", - "avg_total_cost": 0.000275, - "sum_total_cost": 0.005498, - "run_count": 20, - "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.001472, - "avg_output_cost": 0.000201, - "sum_output_cost": 0.004026 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.65, + "Score_UsedBothMCPTools": 0.35, + "total_cost": 0.023277375, + "input_cost": 0.011096125, + "output_cost": 0.01218125, + "total_tokens": 5412.95 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-premier-v1:0", "test_group": "medium", - "avg_total_cost": 0.000318, - "sum_total_cost": 0.006351, - "run_count": 20, - "avg_input_cost": 9.3e-5, - "sum_input_cost": 0.001869, - "avg_output_cost": 0.000224, - "sum_output_cost": 0.004481 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.75, + "Score_UsedBothMCPTools": 0.45, + "total_cost": 0.02576625, + "input_cost": 0.0122225, + "output_cost": 0.013543749999999998, + "total_tokens": 5972.5 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-pro-v1:0", "test_group": "easy", - "avg_total_cost": 0.018719, - "sum_total_cost": 0.280784, - "run_count": 15, - "avg_input_cost": 0.000371, - "sum_input_cost": 0.005561, - "avg_output_cost": 0.018348, - "sum_output_cost": 0.275223 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.2, + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.00161976, + "input_cost": 0.00086424, + "output_cost": 0.00075552, + "total_tokens": 1316.4 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-pro-v1:0", "test_group": "hard", - "avg_total_cost": 0.012812, - "sum_total_cost": 0.192174, - "run_count": 15, - "avg_input_cost": 0.000429, - "sum_input_cost": 0.006434, - "avg_output_cost": 0.012383, - "sum_output_cost": 0.185741 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0, + "total_tokens": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-pro-v1:0", "test_group": "medium", - "avg_total_cost": 0.006741, - "sum_total_cost": 0.101116, - "run_count": 15, - "avg_input_cost": 0.000459, - "sum_input_cost": 0.00688, - "avg_output_cost": 0.006282, - "sum_output_cost": 0.094236 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.1, + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.0009128400000000002, + "input_cost": 0.00046164000000000003, + "output_cost": 0.0004512, + "total_tokens": 718.05 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "xlam-2-32b-fc-r", "test_group": "easy", - "avg_total_cost": 0.00228, - "sum_total_cost": 0.068387, - "run_count": 30, - "avg_input_cost": 0.001847, - "sum_input_cost": 0.055402, - "avg_output_cost": 0.000433, - "sum_output_cost": 0.012984 + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.8666666666666667, + "Score_UsedBothMCPTools": 0.8666666666666667, + "total_cost": 0.000417768, + "input_cost": 0.0002986213333333333, + "output_cost": 0.00011914666666666667, + "total_tokens": 8210.2 + } + ], + "failure_analysis_data": [ + { + "Model": "gemini-2.0-flash", + "Invalid Diagram": 60, + "MCP Tool Failure": 58, + "Usage Limit Exceeded": 0 }, { - "Model": "gemini-2.5-flash-lite", - "test_group": "hard", - "avg_total_cost": 0.00075, - "sum_total_cost": 0.022494, - "run_count": 30, - "avg_input_cost": 0.000315, - "sum_input_cost": 0.009451, - "avg_output_cost": 0.000435, - "sum_output_cost": 0.013043 + "Model": "gemini-2.5-flash", + "Invalid Diagram": 39, + "MCP Tool Failure": 31, + "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-flash-lite", - "test_group": "medium", - "avg_total_cost": 0.000794, - "sum_total_cost": 0.023823, - "run_count": 30, + "Invalid Diagram": 87, + "MCP Tool Failure": 28, + "Usage Limit Exceeded": 0 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Invalid Diagram": 57, + "MCP Tool Failure": 30, + "Usage Limit Exceeded": 0 + }, + { + "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Invalid Diagram": 45, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Invalid Diagram": 43, + "MCP Tool Failure": 22, + "Usage Limit Exceeded": 1 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Invalid Diagram": 57, + "MCP Tool Failure": 45, + "Usage Limit Exceeded": 0 + }, + { + "Model": "gemini-2.5-flash-preview-09-2025", + "Invalid Diagram": 31, + "MCP Tool Failure": 19, + "Usage Limit Exceeded": 16 + }, + { + "Model": "gemini-2.5-pro", + "Invalid Diagram": 36, + "MCP Tool Failure": 3, + "Usage Limit Exceeded": 3 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Invalid Diagram": 37, + "MCP Tool Failure": 10, + "Usage Limit Exceeded": 9 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Invalid Diagram": 33, + "MCP Tool Failure": 9, + "Usage Limit Exceeded": 4 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Invalid Diagram": 36, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + }, + { + "Model": "google/gemma-3-27b", + "Invalid Diagram": 45, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + }, + { + "Model": "gpt-oss-20b", + "Invalid Diagram": 44, + "MCP Tool Failure": 37, + "Usage Limit Exceeded": 12 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Invalid Diagram": 15, + "MCP Tool Failure": 2, + "Usage Limit Exceeded": 0 + }, + { + "Model": "magistral-small-2509-mlx", + "Invalid Diagram": 15, + "MCP Tool Failure": 8, + "Usage Limit Exceeded": 2 + }, + { + "Model": "qwen/qwen3-30b-a3b-2507", + "Invalid Diagram": 45, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Invalid Diagram": 35, + "MCP Tool Failure": 33, + "Usage Limit Exceeded": 0 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "Invalid Diagram": 45, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + }, + { + "Model": "seed-oss-36b-instruct-mlx", + "Invalid Diagram": 15, + "MCP Tool Failure": 12, + "Usage Limit Exceeded": 8 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Invalid Diagram": 60, + "MCP Tool Failure": 47, + "Usage Limit Exceeded": 33 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "Invalid Diagram": 60, + "MCP Tool Failure": 49, + "Usage Limit Exceeded": 45 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Invalid Diagram": 58, + "MCP Tool Failure": 32, + "Usage Limit Exceeded": 14 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Invalid Diagram": 60, + "MCP Tool Failure": 56, + "Usage Limit Exceeded": 54 + }, + { + "Model": "xlam-2-32b-fc-r", + "Invalid Diagram": 15, + "MCP Tool Failure": 2, + "Usage Limit Exceeded": 2 + } + ], + "cost_breakdown_data": [ + { + "Model": "gemini-2.0-flash", + "test_group": "easy", + "avg_total_cost": 0.000276, + "sum_total_cost": 0.005528, + "run_count": 20, + "avg_input_cost": 7.4e-5, + "sum_input_cost": 0.001472, + "avg_output_cost": 0.000203, + "sum_output_cost": 0.004056 + }, + { + "Model": "gemini-2.0-flash", + "test_group": "hard", + "avg_total_cost": 0.000275, + "sum_total_cost": 0.005498, + "run_count": 20, + "avg_input_cost": 7.4e-5, + "sum_input_cost": 0.001472, + "avg_output_cost": 0.000201, + "sum_output_cost": 0.004026 + }, + { + "Model": "gemini-2.0-flash", + "test_group": "medium", + "avg_total_cost": 0.000318, + "sum_total_cost": 0.006351, + "run_count": 20, + "avg_input_cost": 9.3e-5, + "sum_input_cost": 0.001869, + "avg_output_cost": 0.000224, + "sum_output_cost": 0.004481 + }, + { + "Model": "gemini-2.5-flash", + "test_group": "easy", + "avg_total_cost": 0.018719, + "sum_total_cost": 0.280784, + "run_count": 15, + "avg_input_cost": 0.000371, + "sum_input_cost": 0.005561, + "avg_output_cost": 0.018348, + "sum_output_cost": 0.275223 + }, + { + "Model": "gemini-2.5-flash", + "test_group": "hard", + "avg_total_cost": 0.012812, + "sum_total_cost": 0.192174, + "run_count": 15, + "avg_input_cost": 0.000429, + "sum_input_cost": 0.006434, + "avg_output_cost": 0.012383, + "sum_output_cost": 0.185741 + }, + { + "Model": "gemini-2.5-flash", + "test_group": "medium", + "avg_total_cost": 0.006741, + "sum_total_cost": 0.101116, + "run_count": 15, + "avg_input_cost": 0.000459, + "sum_input_cost": 0.00688, + "avg_output_cost": 0.006282, + "sum_output_cost": 0.094236 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "easy", + "avg_total_cost": 0.00228, + "sum_total_cost": 0.068387, + "run_count": 30, + "avg_input_cost": 0.001847, + "sum_input_cost": 0.055402, + "avg_output_cost": 0.000433, + "sum_output_cost": 0.012984 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "hard", + "avg_total_cost": 0.00075, + "sum_total_cost": 0.022494, + "run_count": 30, + "avg_input_cost": 0.000315, + "sum_input_cost": 0.009451, + "avg_output_cost": 0.000435, + "sum_output_cost": 0.013043 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "medium", + "avg_total_cost": 0.000794, + "sum_total_cost": 0.023823, + "run_count": 30, "avg_input_cost": 0.000366, "sum_input_cost": 0.010992, "avg_output_cost": 0.000428, @@ -1538,62 +1887,414 @@ "sum_input_cost": 0.086198, "avg_output_cost": 0.033755, "sum_output_cost": 0.57384 - } - ], - "raw_data": [ + }, { - "Model": "gemini-2.5-pro-preview-06-05", - "Case": "fix_invalid_diagram_easy", + "Model": "google/gemma-3-27b", "test_group": "easy", - "Duration": 25.314810254, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5288.0, - "provider": "Google", - "Metric_request_tokens": 2999.0, - "Metric_response_tokens": 1180.0, - "total_cost": 0.026638749999999996, - "input_cost": 0.00374875, - "output_cost": 0.022889999999999997 + "avg_total_cost": 0.000649, + "sum_total_cost": 0.009736, + "run_count": 15, + "avg_input_cost": 0.000522, + "sum_input_cost": 0.007832, + "avg_output_cost": 0.000127, + "sum_output_cost": 0.001905 }, { - "Model": "gemini-2.5-pro-preview-06-05", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 29.164007858, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5244.0, - "provider": "Google", - "Metric_request_tokens": 3007.0, - "Metric_response_tokens": 1155.0, - "total_cost": 0.02612875, - "input_cost": 0.00375875, - "output_cost": 0.022369999999999998 + "Model": "google/gemma-3-27b", + "test_group": "hard", + "avg_total_cost": 0.000751, + "sum_total_cost": 0.011261, + "run_count": 15, + "avg_input_cost": 0.000531, + "sum_input_cost": 0.007965, + "avg_output_cost": 0.00022, + "sum_output_cost": 0.003296 }, { - "Model": "gemini-2.5-pro-preview-06-05", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 88.240292399, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 17808.0, - "provider": "Google", - "Metric_request_tokens": 11479.0, - "Metric_response_tokens": 2447.0, - "total_cost": 0.07763875, - "input_cost": 0.01434875, - "output_cost": 0.06329 + "Model": "google/gemma-3-27b", + "test_group": "medium", + "avg_total_cost": 0.000721, + "sum_total_cost": 0.01081, + "run_count": 15, + "avg_input_cost": 0.000512, + "sum_input_cost": 0.007687, + "avg_output_cost": 0.000208, + "sum_output_cost": 0.003123 }, { - "Model": "gemini-2.5-pro-preview-06-05", - "Case": "fix_invalid_diagram_easy", + "Model": "gpt-oss-20b", "test_group": "easy", - "Duration": 46.903850207, + "avg_total_cost": 0.000338, + "sum_total_cost": 0.00507, + "run_count": 15, + "avg_input_cost": 0.000138, + "sum_input_cost": 0.002073, + "avg_output_cost": 0.0002, + "sum_output_cost": 0.002997 + }, + { + "Model": "gpt-oss-20b", + "test_group": "hard", + "avg_total_cost": 0.00014, + "sum_total_cost": 0.0021, + "run_count": 15, + "avg_input_cost": 3.2e-5, + "sum_input_cost": 0.000474, + "avg_output_cost": 0.000108, + "sum_output_cost": 0.001625 + }, + { + "Model": "gpt-oss-20b", + "test_group": "medium", + "avg_total_cost": 0.000262, + "sum_total_cost": 0.003928, + "run_count": 15, + "avg_input_cost": 7.5e-5, + "sum_input_cost": 0.00112, + "avg_output_cost": 0.000187, + "sum_output_cost": 0.002809 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "test_group": "easy", + "avg_total_cost": 0.002657, + "sum_total_cost": 0.039858, + "run_count": 15, + "avg_input_cost": 0.002338, + "sum_input_cost": 0.035068, + "avg_output_cost": 0.000319, + "sum_output_cost": 0.004791 + }, + { + "Model": "magistral-small-2509-mlx", + "test_group": "easy", + "avg_total_cost": 0.00416, + "sum_total_cost": 0.062404, + "run_count": 15, + "avg_input_cost": 0.001249, + "sum_input_cost": 0.018729, + "avg_output_cost": 0.002912, + "sum_output_cost": 0.043676 + }, + { + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "easy", + "avg_total_cost": 0.000563, + "sum_total_cost": 0.00844, + "run_count": 15, + "avg_input_cost": 0.00026, + "sum_input_cost": 0.003898, + "avg_output_cost": 0.000303, + "sum_output_cost": 0.004542 + }, + { + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "hard", + "avg_total_cost": 0.000558, + "sum_total_cost": 0.00837, + "run_count": 15, + "avg_input_cost": 0.000259, + "sum_input_cost": 0.00389, + "avg_output_cost": 0.000299, + "sum_output_cost": 0.00448 + }, + { + "Model": "qwen/qwen3-30b-a3b-2507", + "test_group": "medium", + "avg_total_cost": 0.000558, + "sum_total_cost": 0.008377, + "run_count": 15, + "avg_input_cost": 0.000259, + "sum_input_cost": 0.003885, + "avg_output_cost": 0.000299, + "sum_output_cost": 0.004492 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "easy", + "avg_total_cost": 0.001714, + "sum_total_cost": 0.022284, + "run_count": 13, + "avg_input_cost": 0.000231, + "sum_input_cost": 0.002997, + "avg_output_cost": 0.001484, + "sum_output_cost": 0.019286 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "hard", + "avg_total_cost": 0.001595, + "sum_total_cost": 0.020733, + "run_count": 13, + "avg_input_cost": 0.000272, + "sum_input_cost": 0.003538, + "avg_output_cost": 0.001323, + "sum_output_cost": 0.017196 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "test_group": "medium", + "avg_total_cost": 0.001851, + "sum_total_cost": 0.024069, + "run_count": 13, + "avg_input_cost": 0.000238, + "sum_input_cost": 0.003096, + "avg_output_cost": 0.001613, + "sum_output_cost": 0.020974 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "test_group": "easy", + "avg_total_cost": 0.000458, + "sum_total_cost": 0.006865, + "run_count": 15, + "avg_input_cost": 0.000201, + "sum_input_cost": 0.00302, + "avg_output_cost": 0.000256, + "sum_output_cost": 0.003845 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "test_group": "hard", + "avg_total_cost": 0.00046, + "sum_total_cost": 0.0069, + "run_count": 15, + "avg_input_cost": 0.000201, + "sum_input_cost": 0.003015, + "avg_output_cost": 0.000259, + "sum_output_cost": 0.003886 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "test_group": "medium", + "avg_total_cost": 0.000459, + "sum_total_cost": 0.006888, + "run_count": 15, + "avg_input_cost": 0.000201, + "sum_input_cost": 0.003015, + "avg_output_cost": 0.000258, + "sum_output_cost": 0.003873 + }, + { + "Model": "seed-oss-36b-instruct-mlx", + "test_group": "easy", + "avg_total_cost": 0.000999, + "sum_total_cost": 0.014992, + "run_count": 15, + "avg_input_cost": 0.0005, + "sum_input_cost": 0.007499, + "avg_output_cost": 0.0005, + "sum_output_cost": 0.007493 + }, + { + "Model": "seed-oss-36b-instruct-mlx", + "test_group": "medium", + "avg_total_cost": 0.0, + "sum_total_cost": 0.0, + "run_count": 1, + "avg_input_cost": 0.0, + "sum_input_cost": 0.0, + "avg_output_cost": 0.0, + "sum_output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "test_group": "easy", + "avg_total_cost": 0.000238, + "sum_total_cost": 0.004768, + "run_count": 20, + "avg_input_cost": 0.000135, + "sum_input_cost": 0.002702, + "avg_output_cost": 0.000103, + "sum_output_cost": 0.002066 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "test_group": "hard", + "avg_total_cost": 0.000155, + "sum_total_cost": 0.003104, + "run_count": 20, + "avg_input_cost": 8.4e-5, + "sum_input_cost": 0.001671, + "avg_output_cost": 7.2e-5, + "sum_output_cost": 0.001433 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "test_group": "medium", + "avg_total_cost": 0.000349, + "sum_total_cost": 0.006982, + "run_count": 20, + "avg_input_cost": 0.000206, + "sum_input_cost": 0.004112, + "avg_output_cost": 0.000144, + "sum_output_cost": 0.00287 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "test_group": "easy", + "avg_total_cost": 0.00015, + "sum_total_cost": 0.002996, + "run_count": 20, + "avg_input_cost": 9.2e-5, + "sum_input_cost": 0.001832, + "avg_output_cost": 5.8e-5, + "sum_output_cost": 0.001165 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "test_group": "hard", + "avg_total_cost": 3.7e-5, + "sum_total_cost": 0.000741, + "run_count": 20, + "avg_input_cost": 3e-5, + "sum_input_cost": 0.000606, + "avg_output_cost": 7e-6, + "sum_output_cost": 0.000136 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "test_group": "medium", + "avg_total_cost": 8.4e-5, + "sum_total_cost": 0.001689, + "run_count": 20, + "avg_input_cost": 3.7e-5, + "sum_input_cost": 0.000748, + "avg_output_cost": 4.7e-5, + "sum_output_cost": 0.00094 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "test_group": "easy", + "avg_total_cost": 0.05788, + "sum_total_cost": 1.157602, + "run_count": 20, + "avg_input_cost": 0.039284, + "sum_input_cost": 0.78569, + "avg_output_cost": 0.018596, + "sum_output_cost": 0.371912 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "test_group": "hard", + "avg_total_cost": 0.023277, + "sum_total_cost": 0.465548, + "run_count": 20, + "avg_input_cost": 0.011096, + "sum_input_cost": 0.221922, + "avg_output_cost": 0.012181, + "sum_output_cost": 0.243625 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "test_group": "medium", + "avg_total_cost": 0.025766, + "sum_total_cost": 0.515325, + "run_count": 20, + "avg_input_cost": 0.012222, + "sum_input_cost": 0.24445, + "avg_output_cost": 0.013544, + "sum_output_cost": 0.270875 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "test_group": "easy", + "avg_total_cost": 0.00162, + "sum_total_cost": 0.032395, + "run_count": 20, + "avg_input_cost": 0.000864, + "sum_input_cost": 0.017285, + "avg_output_cost": 0.000756, + "sum_output_cost": 0.01511 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "test_group": "hard", + "avg_total_cost": 0.0, + "sum_total_cost": 0.0, + "run_count": 20, + "avg_input_cost": 0.0, + "sum_input_cost": 0.0, + "avg_output_cost": 0.0, + "sum_output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "test_group": "medium", + "avg_total_cost": 0.000913, + "sum_total_cost": 0.018257, + "run_count": 20, + "avg_input_cost": 0.000462, + "sum_input_cost": 0.009233, + "avg_output_cost": 0.000451, + "sum_output_cost": 0.009024 + }, + { + "Model": "xlam-2-32b-fc-r", + "test_group": "easy", + "avg_total_cost": 0.000418, + "sum_total_cost": 0.006267, + "run_count": 15, + "avg_input_cost": 0.000299, + "sum_input_cost": 0.004479, + "avg_output_cost": 0.000119, + "sum_output_cost": 0.001787 + } + ], + "raw_data": [ + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 25.314810254, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5288.0, + "provider": "Google", + "Metric_request_tokens": 2999.0, + "Metric_response_tokens": 1180.0, + "total_cost": 0.026638749999999996, + "input_cost": 0.00374875, + "output_cost": 0.022889999999999997 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.164007858, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5244.0, + "provider": "Google", + "Metric_request_tokens": 3007.0, + "Metric_response_tokens": 1155.0, + "total_cost": 0.02612875, + "input_cost": 0.00375875, + "output_cost": 0.022369999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 88.240292399, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 17808.0, + "provider": "Google", + "Metric_request_tokens": 11479.0, + "Metric_response_tokens": 2447.0, + "total_cost": 0.07763875, + "input_cost": 0.01434875, + "output_cost": 0.06329 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 46.903850207, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, @@ -1638,175 +2339,4239 @@ "output_cost": 0.030400000000000003 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 31.297773585, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5866.0, + "provider": "Google", + "Metric_request_tokens": 2999.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.03241875, + "input_cost": 0.00374875, + "output_cost": 0.02867 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 83.609651342, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 14672.0, + "provider": "Google", + "Metric_request_tokens": 7453.0, + "Metric_response_tokens": 1820.0, + "total_cost": 0.08150625, + "input_cost": 0.00931625, + "output_cost": 0.07219 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.065635183, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11351.0, + "provider": "Google", + "Metric_request_tokens": 6507.0, + "Metric_response_tokens": 1744.0, + "total_cost": 0.056573750000000006, + "input_cost": 0.00813375, + "output_cost": 0.048440000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.600817879, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5693.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.02974375, + "input_cost": 0.00388375, + "output_cost": 0.02586 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.96250988, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5849.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1190.0, + "total_cost": 0.03130375, + "input_cost": 0.00388375, + "output_cost": 0.02742 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 40.243743396, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6388.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03669375, + "input_cost": 0.00388375, + "output_cost": 0.03281 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.064979554, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5337.0, + "provider": "Google", + "Metric_request_tokens": 2999.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.027128749999999997, + "input_cost": 0.00374875, + "output_cost": 0.023379999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 86.028303837, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18472.0, + "provider": "Google", + "Metric_request_tokens": 11366.0, + "Metric_response_tokens": 2399.0, + "total_cost": 0.0852675, + "input_cost": 0.0142075, + "output_cost": 0.07106 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.780646607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5703.0, + "provider": "Google", + "Metric_request_tokens": 3007.0, + "Metric_response_tokens": 1133.0, + "total_cost": 0.030718750000000003, + "input_cost": 0.00375875, + "output_cost": 0.02696 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 90.311101073, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 197868.0, + "provider": "Google", + "Metric_request_tokens": 6385.0, + "Metric_response_tokens": 1480.0, + "total_cost": 1.9228112499999999, + "input_cost": 0.00798125, + "output_cost": 1.9148299999999998 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 142.709473835, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 59.031657247, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.541418498, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 125.222876626, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 36.623935689, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.148244504, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 72343.0, + "provider": "Google", + "Metric_request_tokens": 54413.0, + "Metric_response_tokens": 2191.0, + "total_cost": 0.24731625000000002, + "input_cost": 0.06801625, + "output_cost": 0.17930000000000001 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 123.194014333, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 184076.0, + "provider": "Google", + "Metric_request_tokens": 60324.0, + "Metric_response_tokens": 3883.0, + "total_cost": 1.312925, + "input_cost": 0.075405, + "output_cost": 1.23752 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 16.74454915, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 64.164246072, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 78296.0, + "provider": "Google", + "Metric_request_tokens": 56413.0, + "Metric_response_tokens": 3431.0, + "total_cost": 0.28934625, + "input_cost": 0.07051625, + "output_cost": 0.21883 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 42.140703106, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 91218.0, + "provider": "Google", + "Metric_request_tokens": 4519.0, + "Metric_response_tokens": 1342.0, + "total_cost": 0.8726387499999999, + "input_cost": 0.005648749999999999, + "output_cost": 0.8669899999999999 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 95.642718199, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 62.127767944, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 68184.0, + "provider": "Google", + "Metric_request_tokens": 56065.0, + "Metric_response_tokens": 3177.0, + "total_cost": 0.19127125, + "input_cost": 0.07008125, + "output_cost": 0.12118999999999999 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 112.559107875, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 112.154635461, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 67.505358126, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 96802.0, + "provider": "Google", + "Metric_request_tokens": 54862.0, + "Metric_response_tokens": 2603.0, + "total_cost": 0.4879775, + "input_cost": 0.0685775, + "output_cost": 0.4194 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.877179664, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 46.778307508, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18234.0, + "provider": "Google", + "Metric_request_tokens": 4004.0, + "Metric_response_tokens": 1315.0, + "total_cost": 0.147305, + "input_cost": 0.005005, + "output_cost": 0.14229999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 69.72560593, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 109719.0, + "provider": "Google", + "Metric_request_tokens": 39228.0, + "Metric_response_tokens": 1996.0, + "total_cost": 0.7539449999999999, + "input_cost": 0.049034999999999995, + "output_cost": 0.7049099999999999 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 107.98203716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 136.260362709, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 80.179718025, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 158325.0, + "provider": "Google", + "Metric_request_tokens": 3874.0, + "Metric_response_tokens": 1511.0, + "total_cost": 1.5493525000000001, + "input_cost": 0.0048425, + "output_cost": 1.54451 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 54.728993541, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 99.128847196, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 46.010087457, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 30663.0, + "provider": "Google", + "Metric_request_tokens": 3980.0, + "Metric_response_tokens": 1295.0, + "total_cost": 0.271805, + "input_cost": 0.004975, + "output_cost": 0.26683 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 90.896605166, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 249.374101535, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 115.493402964, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 155268.0, + "provider": "Google", + "Metric_request_tokens": 55991.0, + "Metric_response_tokens": 3221.0, + "total_cost": 1.06275875, + "input_cost": 0.06998875, + "output_cost": 0.99277 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 90.85619701, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 233.217439735, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 8.909464048, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.837458282, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.413123275, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.292221706, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 12.366546526, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3796.0, + "provider": "Google", + "Metric_request_tokens": 2723.0, + "Metric_response_tokens": 1073.0, + "total_cost": 0.0007015000000000001, + "input_cost": 0.0002723, + "output_cost": 0.0004292 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.458367757, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1242.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 506.0, + "total_cost": 0.00027600000000000004, + "input_cost": 7.36e-5, + "output_cost": 0.00020240000000000004 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.122025352, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.055406281, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3796.0, + "provider": "Google", + "Metric_request_tokens": 2723.0, + "Metric_response_tokens": 1073.0, + "total_cost": 0.0007015000000000001, + "input_cost": 0.0002723, + "output_cost": 0.0004292 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.581593788, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 8.904104594, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1256.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 520.0, + "total_cost": 0.0002816, + "input_cost": 7.36e-5, + "output_cost": 0.000208 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.709682467, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 7.48106499, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.313346779, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.285199703, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.759377617, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.905133416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.155831434, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.289127811, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.969590916, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.007718604, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.411889755, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.978789606, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.024806005, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.377116743, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.999520941, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.034895574, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.436441263, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.963080948, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.033675592, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.437838664, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.637950634, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 76294.0, + "provider": "Google", + "Metric_request_tokens": 15093.0, + "Metric_response_tokens": 2470.0, + "total_cost": 0.20930445, + "input_cost": 0.00226395, + "output_cost": 0.20704050000000002 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 31.479987348, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 40567.0, + "provider": "Google", + "Metric_request_tokens": 8696.0, + "Metric_response_tokens": 1852.0, + "total_cost": 0.10748210000000001, + "input_cost": 0.0013044, + "output_cost": 0.10617770000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.009089436, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1730.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0018219, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0017004 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.092670181, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 72731.0, + "provider": "Google", + "Metric_request_tokens": 7623.0, + "Metric_response_tokens": 1833.0, + "total_cost": 0.22370575, + "input_cost": 0.0011434499999999998, + "output_cost": 0.22256230000000002 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.495790935, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4759.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00946455, + "input_cost": 0.00025035, + "output_cost": 0.0092142 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.96490713, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5661.0, + "provider": "Google", + "Metric_request_tokens": 3007.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.0063383499999999995, + "input_cost": 0.00045105, + "output_cost": 0.0058873 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 54.049785353, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11479.0, + "provider": "Google", + "Metric_request_tokens": 4407.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.021993949999999998, + "input_cost": 0.00066105, + "output_cost": 0.0213329 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.902809368, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7171.0, + "provider": "Google", + "Metric_request_tokens": 3868.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.008738999999999999, + "input_cost": 0.0005802, + "output_cost": 0.008158799999999999 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 55.958218108, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 23354.0, + "provider": "Google", + "Metric_request_tokens": 3534.0, + "Metric_response_tokens": 1685.0, + "total_cost": 0.06501359999999999, + "input_cost": 0.0005300999999999999, + "output_cost": 0.06448349999999999 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 11.505478102, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 10428.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 526.0, + "total_cost": 0.03225575, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0321341 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 28.035491061, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 20717.0, + "provider": "Google", + "Metric_request_tokens": 6558.0, + "Metric_response_tokens": 1804.0, + "total_cost": 0.04530859999999999, + "input_cost": 0.0009837, + "output_cost": 0.044324899999999993 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 21.009636819, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22447.0, + "provider": "Google", + "Metric_request_tokens": 3902.0, + "Metric_response_tokens": 1189.0, + "total_cost": 0.0620447, + "input_cost": 0.0005853, + "output_cost": 0.061459400000000004 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 29.185720392, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 7.030499633, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 2109.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 17.0, + "total_cost": 0.0046187, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0044972 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 28.157089277, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7844.0, + "provider": "Google", + "Metric_request_tokens": 3868.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.011094499999999998, + "input_cost": 0.0005802, + "output_cost": 0.010514299999999999 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 16.271507577, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 23939.0, + "provider": "Google", + "Metric_request_tokens": 3060.0, + "Metric_response_tokens": 1164.0, + "total_cost": 0.0701599, + "input_cost": 0.00045899999999999994, + "output_cost": 0.0697009 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.337420676, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7649.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00540755, + "input_cost": 0.00080415, + "output_cost": 0.0046034 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.332825299, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2638.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00204105, + "input_cost": 0.00025035, + "output_cost": 0.0017907 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.406013529, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2617.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00195505, + "input_cost": 0.00025065, + "output_cost": 0.0017044 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.596515261, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2556.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00175405, + "input_cost": 0.00025035, + "output_cost": 0.0015037 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 10.369176736, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6932.0, + "provider": "Google", + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.00749205, + "input_cost": 0.00059715, + "output_cost": 0.0068949 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.68512552, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5402.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.01170255, + "input_cost": 0.00025065, + "output_cost": 0.011451900000000001 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 8.790923666, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2875.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00286475, + "input_cost": 0.00025035, + "output_cost": 0.0026144000000000002 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 12.593662815, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3738.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00588525, + "input_cost": 0.00025035, + "output_cost": 0.0056349 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 23.406116351, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 46807.0, + "provider": "Google", + "Metric_request_tokens": 4005.0, + "Metric_response_tokens": 1266.0, + "total_cost": 0.14673635, + "input_cost": 0.0006007499999999999, + "output_cost": 0.14613559999999998 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.594700624, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2449.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0013795500000000002, + "input_cost": 0.00025035, + "output_cost": 0.0011292000000000001 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 21.817208931, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 42068.0, + "provider": "Google", + "Metric_request_tokens": 3980.0, + "Metric_response_tokens": 1185.0, + "total_cost": 0.13046849999999996, + "input_cost": 0.000597, + "output_cost": 0.12987149999999997 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 14.751506297, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9535.0, + "provider": "Google", + "Metric_request_tokens": 3970.0, + "Metric_response_tokens": 1219.0, + "total_cost": 0.016537899999999998, + "input_cost": 0.0005954999999999999, + "output_cost": 0.0159424 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.279386281, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 25985.0, + "provider": "Google", + "Metric_request_tokens": 9113.0, + "Metric_response_tokens": 1804.0, + "total_cost": 0.055187349999999996, + "input_cost": 0.0013669499999999998, + "output_cost": 0.0538204 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 14.502334274, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7387.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.004490549999999999, + "input_cost": 0.00080415, + "output_cost": 0.0036864 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.444876941, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4284.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.627413202, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4291.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007897, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004808 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.704063431, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3774.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 683.0, + "total_cost": 0.0005823, + "input_cost": 0.00030910000000000003, + "output_cost": 0.0002732 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.447734786, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4282.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1191.0, + "total_cost": 0.0007855000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047640000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.576908765, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4271.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1180.0, + "total_cost": 0.0007811000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047200000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.032189281, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4612.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1476.0, + "total_cost": 0.0009040000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005904 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.472401128, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4274.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007832000000000001, + "input_cost": 0.0003088, + "output_cost": 0.00047440000000000004 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.732016304, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3778.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 671.0, + "total_cost": 0.0005791, + "input_cost": 0.0003107, + "output_cost": 0.0002684 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.756348604, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4571.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1435.0, + "total_cost": 0.0008876000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005740000000000001 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.71480991, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4318.0, + "provider": "Google", + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1214.0, + "total_cost": 0.000796, + "input_cost": 0.0003104, + "output_cost": 0.00048560000000000004 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.797644523, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3776.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 669.0, + "total_cost": 0.0005783, + "input_cost": 0.0003107, + "output_cost": 0.0002676 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.755809993, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4296.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1204.0, + "total_cost": 0.0007908, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0004816 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.602118065, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4273.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1185.0, + "total_cost": 0.0007828, + "input_cost": 0.0003088, + "output_cost": 0.00047400000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.466044834, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4312.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1205.0, + "total_cost": 0.0007927, + "input_cost": 0.0003107, + "output_cost": 0.000482 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.130914105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3861.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 725.0, + "total_cost": 0.0006036, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00029 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 12.230727772, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11014.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.024663, + "input_cost": 0.0004662, + "output_cost": 0.0241968 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 8.306374733, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2757.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00245175, + "input_cost": 0.00025035, + "output_cost": 0.0022014 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.595447784, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2698.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0022510499999999997, + "input_cost": 0.00025035, + "output_cost": 0.0020007 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.077756987, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8290.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.02181055, + "input_cost": 0.00025065, + "output_cost": 0.0215599 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 13.08787764, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12993.0, + "provider": "Google", + "Metric_request_tokens": 5355.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.024140349999999998, + "input_cost": 0.00080325, + "output_cost": 0.0233371 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 18.203576056, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.877005808, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2829.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00269705, + "input_cost": 0.00025065, + "output_cost": 0.0024464 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 6.691518112, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2766.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00248905, + "input_cost": 0.00025035, + "output_cost": 0.0022387 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.46408462, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2591.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018765499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016262 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 11.559526636, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 15503.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.047056049999999995, + "input_cost": 0.00025065, + "output_cost": 0.0468054 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 18.417513707, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8232.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.0074480499999999995, + "input_cost": 0.00080415, + "output_cost": 0.0066438999999999995 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 9.485176875, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2981.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00323575, + "input_cost": 0.00025035, + "output_cost": 0.0029854 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.271124626, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2722.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00232255, + "input_cost": 0.00025065, + "output_cost": 0.0020719000000000002 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 18.342354393, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 20909.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.051863950000000006, + "input_cost": 0.00080415, + "output_cost": 0.0510598 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 19.63725688, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19611.0, + "provider": "Google", + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.05186855, + "input_cost": 0.00059715, + "output_cost": 0.0512714 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 141.168154989, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 68346.0, + "provider": "Amazon", + "Metric_request_tokens": 64861.0, + "Metric_response_tokens": 3485.0, + "total_cost": 0.205715, + "input_cost": 0.1621525, + "output_cost": 0.0435625 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 66.419366072, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9777.0, + "provider": "Amazon", + "Metric_request_tokens": 7990.0, + "Metric_response_tokens": 1787.0, + "total_cost": 0.0423125, + "input_cost": 0.019975, + "output_cost": 0.0223375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 61.750421529, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.774519728, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6479.0, + "provider": "Amazon", + "Metric_request_tokens": 4710.0, + "Metric_response_tokens": 1769.0, + "total_cost": 0.0338875, + "input_cost": 0.011774999999999999, + "output_cost": 0.0221125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 61.257517708, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 14585.0, + "provider": "Amazon", + "Metric_request_tokens": 11771.0, + "Metric_response_tokens": 2814.0, + "total_cost": 0.06460250000000001, + "input_cost": 0.029427500000000002, + "output_cost": 0.035175 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 168.374540939, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22046.0, + "provider": "Amazon", + "Metric_request_tokens": 18606.0, + "Metric_response_tokens": 3440.0, + "total_cost": 0.089515, + "input_cost": 0.046515, + "output_cost": 0.043 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 81.523331201, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63619.0, + "provider": "Amazon", + "Metric_request_tokens": 60858.0, + "Metric_response_tokens": 2761.0, + "total_cost": 0.1866575, + "input_cost": 0.152145, + "output_cost": 0.0345125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 47.967601991, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6432.0, + "provider": "Amazon", + "Metric_request_tokens": 4754.0, + "Metric_response_tokens": 1678.0, + "total_cost": 0.03286, + "input_cost": 0.011885000000000001, + "output_cost": 0.020975 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 72.870020591, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19765.0, + "provider": "Amazon", + "Metric_request_tokens": 16317.0, + "Metric_response_tokens": 3448.0, + "total_cost": 0.08389250000000001, + "input_cost": 0.0407925, + "output_cost": 0.0431 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.794925723, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10125.0, + "provider": "Amazon", + "Metric_request_tokens": 8113.0, + "Metric_response_tokens": 2012.0, + "total_cost": 0.0454325, + "input_cost": 0.020282500000000002, + "output_cost": 0.02515 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 102.690218942, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.695561409, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6168.0, + "provider": "Amazon", + "Metric_request_tokens": 4543.0, + "Metric_response_tokens": 1625.0, + "total_cost": 0.031670000000000004, + "input_cost": 0.0113575, + "output_cost": 0.0203125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.951197833, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6002.0, + "provider": "Amazon", + "Metric_request_tokens": 4471.0, + "Metric_response_tokens": 1531.0, + "total_cost": 0.030315, + "input_cost": 0.0111775, + "output_cost": 0.0191375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 89.757139435, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 116.956568909, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 141.168154989, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 68346.0, + "provider": "Amazon", + "Metric_request_tokens": 64861.0, + "Metric_response_tokens": 3485.0, + "total_cost": 0.205715, + "input_cost": 0.1621525, + "output_cost": 0.0435625 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 66.419366072, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9777.0, + "provider": "Amazon", + "Metric_request_tokens": 7990.0, + "Metric_response_tokens": 1787.0, + "total_cost": 0.0423125, + "input_cost": 0.019975, + "output_cost": 0.0223375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 61.750421529, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.774519728, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6479.0, + "provider": "Amazon", + "Metric_request_tokens": 4710.0, + "Metric_response_tokens": 1769.0, + "total_cost": 0.0338875, + "input_cost": 0.011774999999999999, + "output_cost": 0.0221125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 61.257517708, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 14585.0, + "provider": "Amazon", + "Metric_request_tokens": 11771.0, + "Metric_response_tokens": 2814.0, + "total_cost": 0.06460250000000001, + "input_cost": 0.029427500000000002, + "output_cost": 0.035175 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 168.374540939, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22046.0, + "provider": "Amazon", + "Metric_request_tokens": 18606.0, + "Metric_response_tokens": 3440.0, + "total_cost": 0.089515, + "input_cost": 0.046515, + "output_cost": 0.043 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 81.523331201, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63619.0, + "provider": "Amazon", + "Metric_request_tokens": 60858.0, + "Metric_response_tokens": 2761.0, + "total_cost": 0.1866575, + "input_cost": 0.152145, + "output_cost": 0.0345125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 47.967601991, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6432.0, + "provider": "Amazon", + "Metric_request_tokens": 4754.0, + "Metric_response_tokens": 1678.0, + "total_cost": 0.03286, + "input_cost": 0.011885000000000001, + "output_cost": 0.020975 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 72.870020591, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19765.0, + "provider": "Amazon", + "Metric_request_tokens": 16317.0, + "Metric_response_tokens": 3448.0, + "total_cost": 0.08389250000000001, + "input_cost": 0.0407925, + "output_cost": 0.0431 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.794925723, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10125.0, + "provider": "Amazon", + "Metric_request_tokens": 8113.0, + "Metric_response_tokens": 2012.0, + "total_cost": 0.0454325, + "input_cost": 0.020282500000000002, + "output_cost": 0.02515 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 102.690218942, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.695561409, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6168.0, + "provider": "Amazon", + "Metric_request_tokens": 4543.0, + "Metric_response_tokens": 1625.0, + "total_cost": 0.031670000000000004, + "input_cost": 0.0113575, + "output_cost": 0.0203125 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.951197833, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6002.0, + "provider": "Amazon", + "Metric_request_tokens": 4471.0, + "Metric_response_tokens": 1531.0, + "total_cost": 0.030315, + "input_cost": 0.0111775, + "output_cost": 0.0191375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 89.757139435, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 116.956568909, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 61.788091838, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8566.0, + "provider": "Amazon", + "Metric_request_tokens": 6763.0, + "Metric_response_tokens": 1803.0, + "total_cost": 0.039444999999999994, + "input_cost": 0.0169075, + "output_cost": 0.0225375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 139.579026705, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 28485.0, + "provider": "Amazon", + "Metric_request_tokens": 24611.0, + "Metric_response_tokens": 3874.0, + "total_cost": 0.1099525, + "input_cost": 0.0615275, + "output_cost": 0.048424999999999996 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 102.15980072, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 116.141871727, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 19949.0, + "provider": "Amazon", + "Metric_request_tokens": 16646.0, + "Metric_response_tokens": 3303.0, + "total_cost": 0.08290249999999999, + "input_cost": 0.041615, + "output_cost": 0.0412875 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 59.31011804, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6590.0, + "provider": "Amazon", + "Metric_request_tokens": 4868.0, + "Metric_response_tokens": 1722.0, + "total_cost": 0.033695, + "input_cost": 0.01217, + "output_cost": 0.021525 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 153.57670155, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.652731978, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6372.0, + "provider": "Amazon", + "Metric_request_tokens": 4841.0, + "Metric_response_tokens": 1531.0, + "total_cost": 0.03124, + "input_cost": 0.012102499999999999, + "output_cost": 0.0191375 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 124.215302042, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22787.0, + "provider": "Amazon", + "Metric_request_tokens": 19271.0, + "Metric_response_tokens": 3516.0, + "total_cost": 0.0921275, + "input_cost": 0.0481775, + "output_cost": 0.04395 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 152.610399933, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 122.520721486, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 130.053796609, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 85.873660202, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12301.0, + "provider": "Amazon", + "Metric_request_tokens": 9837.0, + "Metric_response_tokens": 2464.0, + "total_cost": 0.0553925, + "input_cost": 0.0245925, + "output_cost": 0.0308 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 122.249576234, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.075055246, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.21642044, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 1.060936443, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.037051732, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.056416859, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 1.045000565, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.174768992, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.06139308, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 1.096773894, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.065730453, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.034951182, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 1.170030852, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.082285844, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.03252349, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 1.038807845, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 1.059020363, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 1.159144096, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 70.761430334, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.064655716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.286773631, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.848939949, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.4068963, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.579682246, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.490538016, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.381506192, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.014168316, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.619240014, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 46.101775188, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.682643889, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.294165416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.706066252, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.323483575, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 70.761430334, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.064655716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.286773631, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.848939949, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.4068963, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.579682246, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.490538016, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.381506192, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.014168316, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.619240014, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 46.101775188, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.682643889, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.294165416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.706066252, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.323483575, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.490865526, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.38614935, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7391.0, + "provider": "Amazon", + "Metric_request_tokens": 6014.0, + "Metric_response_tokens": 1377.0, + "total_cost": 0.009217600000000001, + "input_cost": 0.004811200000000001, + "output_cost": 0.0044064 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 19.587720037, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 35.779726991, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.898700324, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.120025907, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.342244267, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 51.294958536, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.891190365, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 47.666868754, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18401.0, + "provider": "Amazon", + "Metric_request_tokens": 15769.0, + "Metric_response_tokens": 2632.0, + "total_cost": 0.0210376, + "input_cost": 0.0126152, + "output_cost": 0.0084224 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 63.490425569, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 63.600602322, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.937322607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.340557151, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.150935132, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.966453918, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.698133211, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.007505821, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 65.783296981, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 63.681787497, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 64.71506337, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.909417244, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.945646339, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6970.0, + "provider": "Amazon", + "Metric_request_tokens": 5527.0, + "Metric_response_tokens": 1443.0, + "total_cost": 0.0090392, + "input_cost": 0.0044216, + "output_cost": 0.0046176 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 52.262290885, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 34.392486572, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7927.0, + "provider": "Amazon", + "Metric_request_tokens": 5837.0, + "Metric_response_tokens": 2090.0, + "total_cost": 0.011357599999999999, + "input_cost": 0.0046696, + "output_cost": 0.0066879999999999995 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 51.782173036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.199029621, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 56.303173032, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 45.050576092, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.036889105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.586487666, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.551409036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5238.0, + "provider": "Amazon", + "Metric_request_tokens": 4492.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.00044856000000000003, + "input_cost": 0.00026952000000000004, + "output_cost": 0.00017904 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.607065725, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 31.297773585, - "Score_MermaidDiagramValid": 1.0, + "Duration": 13.907358597, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5866.0, - "provider": "Google", - "Metric_request_tokens": 2999.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.03241875, - "input_cost": 0.00374875, - "output_cost": 0.02867 + "total_tokens": 5408.0, + "provider": "Amazon", + "Metric_request_tokens": 4043.0, + "Metric_response_tokens": 1365.0, + "total_cost": 0.0005701799999999999, + "input_cost": 0.00024257999999999997, + "output_cost": 0.00032759999999999994 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 83.609651342, + "Duration": 33.254325135, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 14672.0, - "provider": "Google", - "Metric_request_tokens": 7453.0, - "Metric_response_tokens": 1820.0, - "total_cost": 0.08150625, - "input_cost": 0.00931625, - "output_cost": 0.07219 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.065635183, + "Duration": 34.303524713, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11351.0, - "provider": "Google", - "Metric_request_tokens": 6507.0, - "Metric_response_tokens": 1744.0, - "total_cost": 0.056573750000000006, - "input_cost": 0.00813375, - "output_cost": 0.048440000000000004 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.600817879, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5693.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.02974375, - "input_cost": 0.00388375, - "output_cost": 0.02586 + "Duration": 38.439967155, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.96250988, - "Score_MermaidDiagramValid": 1.0, + "Duration": 14.821898825, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5849.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1190.0, - "total_cost": 0.03130375, - "input_cost": 0.00388375, - "output_cost": 0.02742 + "total_tokens": 9263.0, + "provider": "Amazon", + "Metric_request_tokens": 7867.0, + "Metric_response_tokens": 1396.0, + "total_cost": 0.0008070600000000001, + "input_cost": 0.00047202, + "output_cost": 0.00033504 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 40.243743396, + "Duration": 18.995154891, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6388.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.03669375, - "input_cost": 0.00388375, - "output_cost": 0.03281 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.064979554, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5337.0, - "provider": "Google", - "Metric_request_tokens": 2999.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.027128749999999997, - "input_cost": 0.00374875, - "output_cost": 0.023379999999999998 + "Duration": 69.995588077, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 86.028303837, + "Duration": 9.927372481, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18472.0, - "provider": "Google", - "Metric_request_tokens": 11366.0, - "Metric_response_tokens": 2399.0, - "total_cost": 0.0852675, - "input_cost": 0.0142075, - "output_cost": 0.07106 + "total_tokens": 5372.0, + "provider": "Amazon", + "Metric_request_tokens": 4550.0, + "Metric_response_tokens": 822.0, + "total_cost": 0.00047028, + "input_cost": 0.000273, + "output_cost": 0.00019728 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.780646607, + "Duration": 32.900333046, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5703.0, - "provider": "Google", - "Metric_request_tokens": 3007.0, - "Metric_response_tokens": 1133.0, - "total_cost": 0.030718750000000003, - "input_cost": 0.00375875, - "output_cost": 0.02696 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 90.311101073, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 197868.0, - "provider": "Google", - "Metric_request_tokens": 6385.0, - "Metric_response_tokens": 1480.0, - "total_cost": 1.9228112499999999, - "input_cost": 0.00798125, - "output_cost": 1.9148299999999998 + "Duration": 28.391986261, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 142.709473835, + "Duration": 5.633113428, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3619.0, + "provider": "Amazon", + "Metric_request_tokens": 3397.0, + "Metric_response_tokens": 222.0, + "total_cost": 0.00025709999999999996, + "input_cost": 0.00020381999999999998, + "output_cost": 5.328e-5 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.857710005, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.586487666, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1814,15 +6579,31 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.551409036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5238.0, + "provider": "Amazon", + "Metric_request_tokens": 4492.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.00044856000000000003, + "input_cost": 0.00026952000000000004, + "output_cost": 0.00017904 + }, + { + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 59.031657247, + "Duration": 24.607065725, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1830,31 +6611,31 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.541418498, + "Duration": 13.907358597, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5408.0, + "provider": "Amazon", + "Metric_request_tokens": 4043.0, + "Metric_response_tokens": 1365.0, + "total_cost": 0.0005701799999999999, + "input_cost": 0.00024257999999999997, + "output_cost": 0.00032759999999999994 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 125.222876626, + "Duration": 33.254325135, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1862,15 +6643,15 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 36.623935689, + "Duration": 34.303524713, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1878,47 +6659,47 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.148244504, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 72343.0, - "provider": "Google", - "Metric_request_tokens": 54413.0, - "Metric_response_tokens": 2191.0, - "total_cost": 0.24731625000000002, - "input_cost": 0.06801625, - "output_cost": 0.17930000000000001 + "Duration": 38.439967155, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 123.194014333, - "Score_MermaidDiagramValid": 1.0, + "Duration": 14.821898825, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 184076.0, - "provider": "Google", - "Metric_request_tokens": 60324.0, - "Metric_response_tokens": 3883.0, - "total_cost": 1.312925, - "input_cost": 0.075405, - "output_cost": 1.23752 + "total_tokens": 9263.0, + "provider": "Amazon", + "Metric_request_tokens": 7867.0, + "Metric_response_tokens": 1396.0, + "total_cost": 0.0008070600000000001, + "input_cost": 0.00047202, + "output_cost": 0.00033504 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 16.74454915, + "Duration": 18.995154891, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1926,47 +6707,47 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 64.164246072, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 78296.0, - "provider": "Google", - "Metric_request_tokens": 56413.0, - "Metric_response_tokens": 3431.0, - "total_cost": 0.28934625, - "input_cost": 0.07051625, - "output_cost": 0.21883 + "Duration": 69.995588077, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.140703106, + "Duration": 9.927372481, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 91218.0, - "provider": "Google", - "Metric_request_tokens": 4519.0, - "Metric_response_tokens": 1342.0, - "total_cost": 0.8726387499999999, - "input_cost": 0.005648749999999999, - "output_cost": 0.8669899999999999 + "total_tokens": 5372.0, + "provider": "Amazon", + "Metric_request_tokens": 4550.0, + "Metric_response_tokens": 822.0, + "total_cost": 0.00047028, + "input_cost": 0.000273, + "output_cost": 0.00019728 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 95.642718199, + "Duration": 32.900333046, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -1974,31 +6755,15 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 62.127767944, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 68184.0, - "provider": "Google", - "Metric_request_tokens": 56065.0, - "Metric_response_tokens": 3177.0, - "total_cost": 0.19127125, - "input_cost": 0.07008125, - "output_cost": 0.12118999999999999 - }, - { - "Model": "gemini-2.5-pro-preview-05-06", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 112.559107875, + "Duration": 28.391986261, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2006,15 +6771,31 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.633113428, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3619.0, + "provider": "Amazon", + "Metric_request_tokens": 3397.0, + "Metric_response_tokens": 222.0, + "total_cost": 0.00025709999999999996, + "input_cost": 0.00020381999999999998, + "output_cost": 5.328e-5 + }, + { + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 112.154635461, + "Duration": 15.857710005, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2022,31 +6803,63 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 67.505358126, - "Score_MermaidDiagramValid": 1.0, + "Duration": 10.13120079, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 96802.0, - "provider": "Google", - "Metric_request_tokens": 54862.0, - "Metric_response_tokens": 2603.0, - "total_cost": 0.4879775, - "input_cost": 0.0685775, - "output_cost": 0.4194 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4093.0, + "provider": "Amazon", + "Metric_request_tokens": 3436.0, + "Metric_response_tokens": 657.0, + "total_cost": 0.00036384, + "input_cost": 0.00020616, + "output_cost": 0.00015768 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.877179664, + "Duration": 10.202050303, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5241.0, + "provider": "Amazon", + "Metric_request_tokens": 4466.0, + "Metric_response_tokens": 775.0, + "total_cost": 0.00045396, + "input_cost": 0.00026796000000000003, + "output_cost": 0.000186 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 12.523508567, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4560.0, + "provider": "Amazon", + "Metric_request_tokens": 3455.0, + "Metric_response_tokens": 1105.0, + "total_cost": 0.0004725, + "input_cost": 0.0002073, + "output_cost": 0.0002652 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.301909151, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2054,63 +6867,79 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 46.778307508, + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.07952375, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18234.0, - "provider": "Google", - "Metric_request_tokens": 4004.0, - "Metric_response_tokens": 1315.0, - "total_cost": 0.147305, - "input_cost": 0.005005, - "output_cost": 0.14229999999999998 + "total_tokens": 9166.0, + "provider": "Amazon", + "Metric_request_tokens": 7880.0, + "Metric_response_tokens": 1286.0, + "total_cost": 0.00078144, + "input_cost": 0.0004728, + "output_cost": 0.00030864 + }, + { + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 20.832885038, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10426.0, + "provider": "Amazon", + "Metric_request_tokens": 8538.0, + "Metric_response_tokens": 1888.0, + "total_cost": 0.0009653999999999999, + "input_cost": 0.00051228, + "output_cost": 0.00045311999999999994 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 69.72560593, - "Score_MermaidDiagramValid": 1.0, + "Duration": 10.103307419, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 109719.0, - "provider": "Google", - "Metric_request_tokens": 39228.0, - "Metric_response_tokens": 1996.0, - "total_cost": 0.7539449999999999, - "input_cost": 0.049034999999999995, - "output_cost": 0.7049099999999999 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5207.0, + "provider": "Amazon", + "Metric_request_tokens": 4458.0, + "Metric_response_tokens": 749.0, + "total_cost": 0.00044724, + "input_cost": 0.00026748, + "output_cost": 0.00017976 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 107.98203716, + "Duration": 15.416841941, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7739.0, + "provider": "Amazon", + "Metric_request_tokens": 6387.0, + "Metric_response_tokens": 1352.0, + "total_cost": 0.0007076999999999999, + "input_cost": 0.00038322, + "output_cost": 0.00032448 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 136.260362709, + "Duration": 32.051268291, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2118,47 +6947,47 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 80.179718025, - "Score_MermaidDiagramValid": 1.0, + "Duration": 23.906853847, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 158325.0, - "provider": "Google", - "Metric_request_tokens": 3874.0, - "Metric_response_tokens": 1511.0, - "total_cost": 1.5493525000000001, - "input_cost": 0.0048425, - "output_cost": 1.54451 + "total_tokens": 19298.0, + "provider": "Amazon", + "Metric_request_tokens": 17687.0, + "Metric_response_tokens": 1611.0, + "total_cost": 0.00144786, + "input_cost": 0.00106122, + "output_cost": 0.00038664 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 54.728993541, + "Duration": 14.752534117, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 6073.0, + "provider": "Amazon", + "Metric_request_tokens": 4698.0, + "Metric_response_tokens": 1375.0, + "total_cost": 0.00061188, + "input_cost": 0.00028188, + "output_cost": 0.00032999999999999994 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 99.128847196, + "Duration": 58.670944361, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2166,31 +6995,31 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 46.010087457, + "Duration": 9.086971754, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 30663.0, - "provider": "Google", - "Metric_request_tokens": 3980.0, - "Metric_response_tokens": 1295.0, - "total_cost": 0.271805, - "input_cost": 0.004975, - "output_cost": 0.26683 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4078.0, + "provider": "Amazon", + "Metric_request_tokens": 3427.0, + "Metric_response_tokens": 651.0, + "total_cost": 0.00036186, + "input_cost": 0.00020562, + "output_cost": 0.00015623999999999998 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 90.896605166, + "Duration": 35.618876054, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2198,15 +7027,15 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 249.374101535, + "Duration": 20.22773949, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2214,31 +7043,31 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 115.493402964, - "Score_MermaidDiagramValid": 1.0, + "Duration": 9.717342214, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 155268.0, - "provider": "Google", - "Metric_request_tokens": 55991.0, - "Metric_response_tokens": 3221.0, - "total_cost": 1.06275875, - "input_cost": 0.06998875, - "output_cost": 0.99277 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4188.0, + "provider": "Amazon", + "Metric_request_tokens": 3424.0, + "Metric_response_tokens": 764.0, + "total_cost": 0.0003888, + "input_cost": 0.00020543999999999998, + "output_cost": 0.00018336 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 90.85619701, + "Duration": 36.005522453, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2246,15 +7075,15 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 233.217439735, + "Duration": 37.475340611, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2262,687 +7091,671 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.909464048, + "Duration": 15.113064255, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5962.0, + "provider": "Amazon", + "Metric_request_tokens": 4515.0, + "Metric_response_tokens": 1447.0, + "total_cost": 0.00061818, + "input_cost": 0.00027089999999999997, + "output_cost": 0.00034728 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.837458282, + "Duration": 17.63608912, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.413123275, + "Duration": 10.620195375, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5359.0, + "provider": "Amazon", + "Metric_request_tokens": 4536.0, + "Metric_response_tokens": 823.0, + "total_cost": 0.00046968, + "input_cost": 0.00027215999999999997, + "output_cost": 0.00019752 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.292221706, + "Duration": 57.336604821, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 - }, - { - "Model": "gemini-2.0-flash", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 12.366546526, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3796.0, - "provider": "Google", - "Metric_request_tokens": 2723.0, - "Metric_response_tokens": 1073.0, - "total_cost": 0.0007015000000000001, - "input_cost": 0.0002723, - "output_cost": 0.0004292 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.836315522, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.458367757, + "Duration": 5.83794607, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1242.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 506.0, - "total_cost": 0.00027600000000000004, - "input_cost": 7.36e-5, - "output_cost": 0.00020240000000000004 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3553.0, + "provider": "Amazon", + "Metric_request_tokens": 3358.0, + "Metric_response_tokens": 195.0, + "total_cost": 0.00024828, + "input_cost": 0.00020147999999999998, + "output_cost": 4.68e-5 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.122025352, + "Duration": 33.75772521, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.055406281, + "Duration": 10.1295691, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3796.0, - "provider": "Google", - "Metric_request_tokens": 2723.0, - "Metric_response_tokens": 1073.0, - "total_cost": 0.0007015000000000001, - "input_cost": 0.0002723, - "output_cost": 0.0004292 + "total_tokens": 5291.0, + "provider": "Amazon", + "Metric_request_tokens": 4491.0, + "Metric_response_tokens": 800.0, + "total_cost": 0.00046146, + "input_cost": 0.00026946, + "output_cost": 0.000192 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.581593788, + "Duration": 9.798962699, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4239.0, + "provider": "Amazon", + "Metric_request_tokens": 3490.0, + "Metric_response_tokens": 749.0, + "total_cost": 0.00038916, + "input_cost": 0.0002094, + "output_cost": 0.00017976 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.904104594, + "Duration": 35.330206914, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1256.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 520.0, - "total_cost": 0.0002816, - "input_cost": 7.36e-5, - "output_cost": 0.000208 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.709682467, + "Duration": 57.658972199, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-lite-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 7.48106499, + "Duration": 13.06782325, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5686.0, + "provider": "Amazon", + "Metric_request_tokens": 4477.0, + "Metric_response_tokens": 1209.0, + "total_cost": 0.00055878, + "input_cost": 0.00026862, + "output_cost": 0.00029016 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.313346779, + "Duration": 7.235511866, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4178.0, + "provider": "Amazon", + "Metric_request_tokens": 3433.0, + "Metric_response_tokens": 745.0, + "total_cost": 0.00022445500000000003, + "input_cost": 0.000120155, + "output_cost": 0.00010430000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.285199703, + "Duration": 10.032302433, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4888.0, + "provider": "Amazon", + "Metric_request_tokens": 3535.0, + "Metric_response_tokens": 1353.0, + "total_cost": 0.000313145, + "input_cost": 0.000123725, + "output_cost": 0.00018942 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.759377617, + "Duration": 16.138580876, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 2.905133416, + "Duration": 10.894268114, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7935.0, + "provider": "Amazon", + "Metric_request_tokens": 6523.0, + "Metric_response_tokens": 1412.0, + "total_cost": 0.00042598500000000004, + "input_cost": 0.00022830500000000002, + "output_cost": 0.00019768000000000002 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.155831434, + "Duration": 15.982702523, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 3.289127811, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Model": "us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 22.058090836, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 2.969590916, + "Duration": 17.691408682, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.007718604, + "Duration": 29.664497105, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.411889755, + "Duration": 17.96936611, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 2.978789606, + "Duration": 25.827149441, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.024806005, + "Duration": 13.347354317, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9162.0, + "provider": "Amazon", + "Metric_request_tokens": 7157.0, + "Metric_response_tokens": 2005.0, + "total_cost": 0.000531195, + "input_cost": 0.00025049500000000005, + "output_cost": 0.0002807 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.377116743, + "Duration": 5.111033105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 2.999520941, + "Duration": 26.272915809, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.034895574, + "Duration": 22.779003058, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.436441263, + "Duration": 19.910701039, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 2.963080948, + "Duration": 7.235511866, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4178.0, + "provider": "Amazon", + "Metric_request_tokens": 3433.0, + "Metric_response_tokens": 745.0, + "total_cost": 0.00022445500000000003, + "input_cost": 0.000120155, + "output_cost": 0.00010430000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.033675592, + "Duration": 10.032302433, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4888.0, + "provider": "Amazon", + "Metric_request_tokens": 3535.0, + "Metric_response_tokens": 1353.0, + "total_cost": 0.000313145, + "input_cost": 0.000123725, + "output_cost": 0.00018942 }, { - "Model": "gemini-2.0-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.437838664, + "Duration": 16.138580876, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, - "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.637950634, + "Duration": 10.894268114, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 76294.0, - "provider": "Google", - "Metric_request_tokens": 15093.0, - "Metric_response_tokens": 2470.0, - "total_cost": 0.20930445, - "input_cost": 0.00226395, - "output_cost": 0.20704050000000002 + "total_tokens": 7935.0, + "provider": "Amazon", + "Metric_request_tokens": 6523.0, + "Metric_response_tokens": 1412.0, + "total_cost": 0.00042598500000000004, + "input_cost": 0.00022830500000000002, + "output_cost": 0.00019768000000000002 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 31.479987348, + "Duration": 15.982702523, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 40567.0, - "provider": "Google", - "Metric_request_tokens": 8696.0, - "Metric_response_tokens": 1852.0, - "total_cost": 0.10748210000000001, - "input_cost": 0.0013044, - "output_cost": 0.10617770000000001 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.009089436, + "Duration": 22.058090836, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1730.0, - "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0018219, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0017004 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.092670181, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 72731.0, - "provider": "Google", - "Metric_request_tokens": 7623.0, - "Metric_response_tokens": 1833.0, - "total_cost": 0.22370575, - "input_cost": 0.0011434499999999998, - "output_cost": 0.22256230000000002 + "Duration": 17.691408682, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.495790935, + "Duration": 29.664497105, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4759.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00946455, - "input_cost": 0.00025035, - "output_cost": 0.0092142 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.96490713, + "Duration": 17.96936611, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5661.0, - "provider": "Google", - "Metric_request_tokens": 3007.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.0063383499999999995, - "input_cost": 0.00045105, - "output_cost": 0.0058873 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 54.049785353, + "Duration": 25.827149441, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11479.0, - "provider": "Google", - "Metric_request_tokens": 4407.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.021993949999999998, - "input_cost": 0.00066105, - "output_cost": 0.0213329 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.902809368, + "Duration": 13.347354317, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7171.0, - "provider": "Google", - "Metric_request_tokens": 3868.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.008738999999999999, - "input_cost": 0.0005802, - "output_cost": 0.008158799999999999 + "total_tokens": 9162.0, + "provider": "Amazon", + "Metric_request_tokens": 7157.0, + "Metric_response_tokens": 2005.0, + "total_cost": 0.000531195, + "input_cost": 0.00025049500000000005, + "output_cost": 0.0002807 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 55.958218108, + "Duration": 5.111033105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 23354.0, - "provider": "Google", - "Metric_request_tokens": 3534.0, - "Metric_response_tokens": 1685.0, - "total_cost": 0.06501359999999999, - "input_cost": 0.0005300999999999999, - "output_cost": 0.06448349999999999 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 11.505478102, + "Duration": 26.272915809, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 10428.0, - "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 526.0, - "total_cost": 0.03225575, - "input_cost": 0.00012164999999999999, - "output_cost": 0.0321341 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 28.035491061, + "Duration": 22.779003058, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 20717.0, - "provider": "Google", - "Metric_request_tokens": 6558.0, - "Metric_response_tokens": 1804.0, - "total_cost": 0.04530859999999999, - "input_cost": 0.0009837, - "output_cost": 0.044324899999999993 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 21.009636819, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 22447.0, - "provider": "Google", - "Metric_request_tokens": 3902.0, - "Metric_response_tokens": 1189.0, - "total_cost": 0.0620447, - "input_cost": 0.0005853, - "output_cost": 0.061459400000000004 - }, - { - "Model": "gemini-2.5-flash-preview-04-17", - "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 29.185720392, + "Duration": 19.910701039, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "Amazon", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -2950,1711 +7763,1775 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.709128752, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7655.0, + "provider": "Amazon", + "Metric_request_tokens": 6340.0, + "Metric_response_tokens": 1315.0, + "total_cost": 0.000406, + "input_cost": 0.00022190000000000003, + "output_cost": 0.0001841 + }, + { + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 7.030499633, + "Duration": 16.1569874, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 2109.0, - "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 17.0, - "total_cost": 0.0046187, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0044972 + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 28.157089277, + "Duration": 16.292243277, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7844.0, - "provider": "Google", - "Metric_request_tokens": 3868.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.011094499999999998, - "input_cost": 0.0005802, - "output_cost": 0.010514299999999999 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 16.271507577, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 23939.0, - "provider": "Google", - "Metric_request_tokens": 3060.0, - "Metric_response_tokens": 1164.0, - "total_cost": 0.0701599, - "input_cost": 0.00045899999999999994, - "output_cost": 0.0697009 + "Duration": 25.459357714, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.337420676, + "Duration": 24.24639963, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7649.0, - "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00540755, - "input_cost": 0.00080415, - "output_cost": 0.0046034 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.332825299, + "Duration": 27.127796199, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2638.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00204105, - "input_cost": 0.00025035, - "output_cost": 0.0017907 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.406013529, + "Duration": 7.515629417, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2617.0, - "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00195505, - "input_cost": 0.00025065, - "output_cost": 0.0017044 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4196.0, + "provider": "Amazon", + "Metric_request_tokens": 3478.0, + "Metric_response_tokens": 718.0, + "total_cost": 0.00022225, + "input_cost": 0.00012173000000000002, + "output_cost": 0.00010052 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.596515261, + "Duration": 22.272816724, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2556.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00175405, - "input_cost": 0.00025035, - "output_cost": 0.0015037 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 10.369176736, + "Duration": 27.155517751, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6932.0, - "provider": "Google", - "Metric_request_tokens": 3981.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.00749205, - "input_cost": 0.00059715, - "output_cost": 0.0068949 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.68512552, + "Duration": 21.478668781, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5402.0, - "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.01170255, - "input_cost": 0.00025065, - "output_cost": 0.011451900000000001 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 8.790923666, + "Duration": 16.68120615, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2875.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00286475, - "input_cost": 0.00025035, - "output_cost": 0.0026144000000000002 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 12.593662815, + "Duration": 17.854751274, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3738.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00588525, - "input_cost": 0.00025035, - "output_cost": 0.0056349 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 23.406116351, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 46807.0, - "provider": "Google", - "Metric_request_tokens": 4005.0, - "Metric_response_tokens": 1266.0, - "total_cost": 0.14673635, - "input_cost": 0.0006007499999999999, - "output_cost": 0.14613559999999998 + "Duration": 26.11028082, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.594700624, + "Duration": 13.019602175, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2449.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0013795500000000002, - "input_cost": 0.00025035, - "output_cost": 0.0011292000000000001 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 21.817208931, + "Duration": 18.122924172, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 42068.0, - "provider": "Google", - "Metric_request_tokens": 3980.0, - "Metric_response_tokens": 1185.0, - "total_cost": 0.13046849999999996, - "input_cost": 0.000597, - "output_cost": 0.12987149999999997 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 14.751506297, - "Score_MermaidDiagramValid": 1.0, + "Duration": 13.539048311, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9535.0, - "provider": "Google", - "Metric_request_tokens": 3970.0, - "Metric_response_tokens": 1219.0, - "total_cost": 0.016537899999999998, - "input_cost": 0.0005954999999999999, - "output_cost": 0.0159424 + "total_tokens": 18474.0, + "provider": "Amazon", + "Metric_request_tokens": 17520.0, + "Metric_response_tokens": 954.0, + "total_cost": 0.0007467600000000001, + "input_cost": 0.0006132, + "output_cost": 0.00013356000000000002 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.279386281, + "Duration": 25.096765367, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 25985.0, - "provider": "Google", - "Metric_request_tokens": 9113.0, - "Metric_response_tokens": 1804.0, - "total_cost": 0.055187349999999996, - "input_cost": 0.0013669499999999998, - "output_cost": 0.0538204 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 14.502334274, + "Duration": 15.169118995, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7387.0, - "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.004490549999999999, - "input_cost": 0.00080415, - "output_cost": 0.0036864 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.444876941, + "Duration": 9.241399938, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4284.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007872, - "input_cost": 0.0003088, - "output_cost": 0.00047840000000000003 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6107.0, + "provider": "Amazon", + "Metric_request_tokens": 5090.0, + "Metric_response_tokens": 1017.0, + "total_cost": 0.00032053, + "input_cost": 0.00017815000000000002, + "output_cost": 0.00014238 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.627413202, + "Duration": 29.228884007, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4291.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0007897, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004808 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.704063431, + "Duration": 27.533683609, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3774.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 683.0, - "total_cost": 0.0005823, - "input_cost": 0.00030910000000000003, - "output_cost": 0.0002732 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.447734786, + "Duration": 20.056999566, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4282.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1191.0, - "total_cost": 0.0007855000000000001, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047640000000000003 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.576908765, + "Duration": 15.667786223, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4271.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1180.0, - "total_cost": 0.0007811000000000001, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047200000000000003 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.032189281, + "Duration": 25.437752797, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4612.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1476.0, - "total_cost": 0.0009040000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0005904 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.472401128, + "Duration": 21.22790309, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4274.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1186.0, - "total_cost": 0.0007832000000000001, - "input_cost": 0.0003088, - "output_cost": 0.00047440000000000004 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 27.491055, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.986839409, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 29.143947476, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.732016304, + "Duration": 27.419551466, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3778.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 671.0, - "total_cost": 0.0005791, - "input_cost": 0.0003107, - "output_cost": 0.0002684 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "us.amazon.nova-micro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.756348604, + "Duration": 13.586402193, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4571.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1435.0, - "total_cost": 0.0008876000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0005740000000000001 + "total_tokens": 18273.0, + "provider": "Amazon", + "Metric_request_tokens": 17305.0, + "Metric_response_tokens": 968.0, + "total_cost": 0.000741195, + "input_cost": 0.0006056750000000001, + "output_cost": 0.00013552 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.71480991, + "Duration": 24.207496913, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4318.0, + "total_tokens": 5044.0, "provider": "Google", - "Metric_request_tokens": 3104.0, - "Metric_response_tokens": 1214.0, - "total_cost": 0.000796, - "input_cost": 0.0003104, - "output_cost": 0.00048560000000000004 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.023253749999999997, + "input_cost": 0.00388375, + "output_cost": 0.01937 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.797644523, + "Duration": 24.803203804, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3776.0, + "total_tokens": 5199.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 669.0, - "total_cost": 0.0005783, - "input_cost": 0.0003107, - "output_cost": 0.0002676 + "Metric_response_tokens": 1173.0, + "total_cost": 0.02480375, + "input_cost": 0.00388375, + "output_cost": 0.02092 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.755809993, + "Duration": 34.032753508, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4296.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6600.0, "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 1204.0, - "total_cost": 0.0007908, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0004816 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03881375, + "input_cost": 0.00388375, + "output_cost": 0.03493 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.602118065, + "Duration": 24.432732425, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4273.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5326.0, "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1185.0, - "total_cost": 0.0007828, - "input_cost": 0.0003088, - "output_cost": 0.00047400000000000003 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.02607375, + "input_cost": 0.00388375, + "output_cost": 0.02219 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.466044834, + "Duration": 27.33412975, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4312.0, + "total_tokens": 5806.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1205.0, - "total_cost": 0.0007927, - "input_cost": 0.0003107, - "output_cost": 0.000482 + "Metric_response_tokens": 1201.0, + "total_cost": 0.03087375, + "input_cost": 0.00388375, + "output_cost": 0.02699 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.130914105, + "Duration": 32.488361727, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3861.0, + "total_tokens": 5925.0, "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 725.0, - "total_cost": 0.0006036, - "input_cost": 0.00031360000000000003, - "output_cost": 0.00029 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03206375, + "input_cost": 0.00388375, + "output_cost": 0.028180000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 12.230727772, + "Duration": 26.454849392, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11014.0, + "total_tokens": 5526.0, "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.024663, - "input_cost": 0.0004662, - "output_cost": 0.0241968 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02807375, + "input_cost": 0.00388375, + "output_cost": 0.024190000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 8.306374733, - "Score_MermaidDiagramValid": 0.0, + "Duration": 40.878031462, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2757.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10420.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00245175, - "input_cost": 0.00025035, - "output_cost": 0.0022014 + "Metric_request_tokens": 6847.0, + "Metric_response_tokens": 1815.0, + "total_cost": 0.04428875, + "input_cost": 0.00855875, + "output_cost": 0.03573 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.595447784, + "Duration": 67.384048514, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2698.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 13422.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0022510499999999997, - "input_cost": 0.00025035, - "output_cost": 0.0020007 + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1785.0, + "total_cost": 0.07423875, + "input_cost": 0.00856875, + "output_cost": 0.06567 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.077756987, - "Score_MermaidDiagramValid": 0.0, + "Duration": 27.014404973, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 8290.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5406.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.02181055, - "input_cost": 0.00025065, - "output_cost": 0.0215599 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.026873749999999995, + "input_cost": 0.00388375, + "output_cost": 0.022989999999999997 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.08787764, + "Duration": 26.744050775, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12993.0, + "total_tokens": 5425.0, "provider": "Google", - "Metric_request_tokens": 5355.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.024140349999999998, - "input_cost": 0.00080325, - "output_cost": 0.0233371 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.027063750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023180000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.203576056, + "Duration": 39.712500277, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6614.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.03895375, + "input_cost": 0.00388375, + "output_cost": 0.035070000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.877005808, - "Score_MermaidDiagramValid": 0.0, + "Duration": 41.219766849, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2829.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7059.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00269705, - "input_cost": 0.00025065, - "output_cost": 0.0024464 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.04340375, + "input_cost": 0.00388375, + "output_cost": 0.03952 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.691518112, + "Duration": 24.833685412, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2766.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5205.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00248905, - "input_cost": 0.00025035, - "output_cost": 0.0022387 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.024863749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02098 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.46408462, + "Duration": 30.57386297, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2591.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5837.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0018765499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0016262 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.031183749999999996, + "input_cost": 0.00388375, + "output_cost": 0.027299999999999998 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 11.559526636, - "Score_MermaidDiagramValid": 0.0, + "Duration": 53.658760503, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 15503.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63389.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.047056049999999995, - "input_cost": 0.00025065, - "output_cost": 0.0468054 + "Metric_request_tokens": 58608.0, + "Metric_response_tokens": 2447.0, + "total_cost": 0.12107000000000001, + "input_cost": 0.07326, + "output_cost": 0.04781 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.417513707, + "Duration": 32.435081791, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8232.0, + "total_tokens": 6158.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.0074480499999999995, - "input_cost": 0.00080415, - "output_cost": 0.0066438999999999995 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.03439375, + "input_cost": 0.00388375, + "output_cost": 0.03051 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 9.485176875, + "Duration": 26.427531899, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2981.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5629.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00323575, - "input_cost": 0.00025035, - "output_cost": 0.0029854 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.029103749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02522 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.271124626, + "Duration": 33.541754196, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2722.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6253.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00232255, - "input_cost": 0.00025065, - "output_cost": 0.0020719000000000002 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1164.0, + "total_cost": 0.03534375, + "input_cost": 0.00388375, + "output_cost": 0.03146 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.342354393, + "Duration": 24.7685358, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 20909.0, + "total_tokens": 5160.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1158.0, - "total_cost": 0.051863950000000006, - "input_cost": 0.00080415, - "output_cost": 0.0510598 + "Metric_request_tokens": 3101.0, + "Metric_response_tokens": 1172.0, + "total_cost": 0.024466250000000002, + "input_cost": 0.00387625, + "output_cost": 0.02059 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.63725688, + "Duration": 25.723323767, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19611.0, + "total_tokens": 5428.0, "provider": "Google", - "Metric_request_tokens": 3981.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.05186855, - "input_cost": 0.00059715, - "output_cost": 0.0512714 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02709375, + "input_cost": 0.00388375, + "output_cost": 0.02321 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 141.168154989, - "Score_MermaidDiagramValid": 0.0, + "Duration": 26.441471015, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 68346.0, - "provider": "Amazon", - "Metric_request_tokens": 64861.0, - "Metric_response_tokens": 3485.0, - "total_cost": 0.205715, - "input_cost": 0.1621525, - "output_cost": 0.0435625 + "total_tokens": 5382.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.026633749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02275 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 66.419366072, + "Duration": 26.604628416, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9777.0, - "provider": "Amazon", - "Metric_request_tokens": 7990.0, - "Metric_response_tokens": 1787.0, - "total_cost": 0.0423125, - "input_cost": 0.019975, - "output_cost": 0.0223375 + "total_tokens": 5562.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.02843375, + "input_cost": 0.00388375, + "output_cost": 0.024550000000000002 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 61.750421529, + "Duration": 50.415529759, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11548.0, + "provider": "Google", + "Metric_request_tokens": 6858.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.055472499999999994, + "input_cost": 0.0085725, + "output_cost": 0.0469 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.774519728, + "Duration": 22.679703767, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6479.0, - "provider": "Amazon", - "Metric_request_tokens": 4710.0, - "Metric_response_tokens": 1769.0, - "total_cost": 0.0338875, - "input_cost": 0.011774999999999999, - "output_cost": 0.0221125 + "total_tokens": 5023.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02304375, + "input_cost": 0.00388375, + "output_cost": 0.01916 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 61.257517708, + "Duration": 25.725383571, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 14585.0, - "provider": "Amazon", - "Metric_request_tokens": 11771.0, - "Metric_response_tokens": 2814.0, - "total_cost": 0.06460250000000001, - "input_cost": 0.029427500000000002, - "output_cost": 0.035175 + "total_tokens": 5543.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.028243749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02436 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 168.374540939, + "Duration": 36.866564448, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 22046.0, - "provider": "Amazon", - "Metric_request_tokens": 18606.0, - "Metric_response_tokens": 3440.0, - "total_cost": 0.089515, - "input_cost": 0.046515, - "output_cost": 0.043 + "total_tokens": 6227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1133.0, + "total_cost": 0.03508375, + "input_cost": 0.00388375, + "output_cost": 0.0312 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 81.523331201, - "Score_MermaidDiagramValid": 1.0, + "Duration": 32.45911738, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63619.0, - "provider": "Amazon", - "Metric_request_tokens": 60858.0, - "Metric_response_tokens": 2761.0, - "total_cost": 0.1866575, - "input_cost": 0.152145, - "output_cost": 0.0345125 + "total_tokens": 5824.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.031053749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02717 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 47.967601991, + "Duration": 37.005075571, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6432.0, - "provider": "Amazon", - "Metric_request_tokens": 4754.0, - "Metric_response_tokens": 1678.0, - "total_cost": 0.03286, - "input_cost": 0.011885000000000001, - "output_cost": 0.020975 + "total_tokens": 6464.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.03745375, + "input_cost": 0.00388375, + "output_cost": 0.03357 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 72.870020591, + "Duration": 40.203596393, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19765.0, - "provider": "Amazon", - "Metric_request_tokens": 16317.0, - "Metric_response_tokens": 3448.0, - "total_cost": 0.08389250000000001, - "input_cost": 0.0407925, - "output_cost": 0.0431 + "total_tokens": 6719.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.04000375, + "input_cost": 0.00388375, + "output_cost": 0.03612 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.794925723, - "Score_MermaidDiagramValid": 0.0, + "Duration": 32.600817879, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10125.0, - "provider": "Amazon", - "Metric_request_tokens": 8113.0, - "Metric_response_tokens": 2012.0, - "total_cost": 0.0454325, - "input_cost": 0.020282500000000002, - "output_cost": 0.02515 + "total_tokens": 5693.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.02974375, + "input_cost": 0.00388375, + "output_cost": 0.02586 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 102.690218942, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 33.96250988, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5849.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1190.0, + "total_cost": 0.03130375, + "input_cost": 0.00388375, + "output_cost": 0.02742 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 27.695561409, + "Duration": 40.243743396, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6168.0, - "provider": "Amazon", - "Metric_request_tokens": 4543.0, - "Metric_response_tokens": 1625.0, - "total_cost": 0.031670000000000004, - "input_cost": 0.0113575, - "output_cost": 0.0203125 + "total_tokens": 6388.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03669375, + "input_cost": 0.00388375, + "output_cost": 0.03281 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.951197833, + "Duration": 18.620513207, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6002.0, - "provider": "Amazon", - "Metric_request_tokens": 4471.0, - "Metric_response_tokens": 1531.0, - "total_cost": 0.030315, - "input_cost": 0.0111775, - "output_cost": 0.0191375 + "total_tokens": 4699.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.019803750000000002, + "input_cost": 0.00388375, + "output_cost": 0.01592 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 89.757139435, + "Duration": 31.869262663, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5716.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.02997375, + "input_cost": 0.00388375, + "output_cost": 0.026090000000000002 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-06-05", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 116.956568909, + "Duration": 31.277196563, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5832.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.03113375, + "input_cost": 0.00388375, + "output_cost": 0.02725 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 141.168154989, + "Duration": 22.205452715, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 68346.0, - "provider": "Amazon", - "Metric_request_tokens": 64861.0, - "Metric_response_tokens": 3485.0, - "total_cost": 0.205715, - "input_cost": 0.1621525, - "output_cost": 0.0435625 + "total_tokens": 4960.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.022413750000000003, + "input_cost": 0.00388375, + "output_cost": 0.01853 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 66.419366072, + "Duration": 45.254514377, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9777.0, - "provider": "Amazon", - "Metric_request_tokens": 7990.0, - "Metric_response_tokens": 1787.0, - "total_cost": 0.0423125, - "input_cost": 0.019975, - "output_cost": 0.0223375 + "total_tokens": 7035.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.04316375, + "input_cost": 0.00388375, + "output_cost": 0.03928 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 61.750421529, + "Duration": 25.540973715, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5321.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.02602375, + "input_cost": 0.00388375, + "output_cost": 0.02214 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.774519728, - "Score_MermaidDiagramValid": 0.0, + "Duration": 29.203818188, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6479.0, - "provider": "Amazon", - "Metric_request_tokens": 4710.0, - "Metric_response_tokens": 1769.0, - "total_cost": 0.0338875, - "input_cost": 0.011774999999999999, - "output_cost": 0.0221125 + "total_tokens": 5909.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.031903749999999995, + "input_cost": 0.00388375, + "output_cost": 0.028019999999999996 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 61.257517708, + "Duration": 41.27588133, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 14585.0, - "provider": "Amazon", - "Metric_request_tokens": 11771.0, - "Metric_response_tokens": 2814.0, - "total_cost": 0.06460250000000001, - "input_cost": 0.029427500000000002, - "output_cost": 0.035175 + "total_tokens": 10620.0, + "provider": "Google", + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1805.0, + "total_cost": 0.04621875, + "input_cost": 0.00856875, + "output_cost": 0.03765 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 168.374540939, + "Duration": 68.89242445, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 22046.0, - "provider": "Amazon", - "Metric_request_tokens": 18606.0, - "Metric_response_tokens": 3440.0, - "total_cost": 0.089515, - "input_cost": 0.046515, - "output_cost": 0.043 + "total_tokens": 9389.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.06670375, + "input_cost": 0.00388375, + "output_cost": 0.06282 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 81.523331201, + "Duration": 34.105280135, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63619.0, - "provider": "Amazon", - "Metric_request_tokens": 60858.0, - "Metric_response_tokens": 2761.0, - "total_cost": 0.1866575, - "input_cost": 0.152145, - "output_cost": 0.0345125 + "total_tokens": 6279.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1280.0, + "total_cost": 0.035603749999999997, + "input_cost": 0.00388375, + "output_cost": 0.03172 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 47.967601991, + "Duration": 120.860207911, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6432.0, - "provider": "Amazon", - "Metric_request_tokens": 4754.0, - "Metric_response_tokens": 1678.0, - "total_cost": 0.03286, - "input_cost": 0.011885000000000001, - "output_cost": 0.020975 + "total_tokens": 13787.0, + "provider": "Google", + "Metric_request_tokens": 7717.0, + "Metric_response_tokens": 1789.0, + "total_cost": 0.07034625, + "input_cost": 0.00964625, + "output_cost": 0.0607 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 72.870020591, + "Duration": 29.936865091, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19765.0, - "provider": "Amazon", - "Metric_request_tokens": 16317.0, - "Metric_response_tokens": 3448.0, - "total_cost": 0.08389250000000001, - "input_cost": 0.0407925, - "output_cost": 0.0431 + "total_tokens": 5784.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03065375, + "input_cost": 0.00388375, + "output_cost": 0.026770000000000002 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.794925723, + "Duration": 27.396380686, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10125.0, - "provider": "Amazon", - "Metric_request_tokens": 8113.0, - "Metric_response_tokens": 2012.0, - "total_cost": 0.0454325, - "input_cost": 0.020282500000000002, - "output_cost": 0.02515 + "total_tokens": 5674.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.029562500000000002, + "input_cost": 0.0038824999999999997, + "output_cost": 0.02568 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 102.690218942, + "Duration": 23.581064657, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5037.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.023183750000000003, + "input_cost": 0.00388375, + "output_cost": 0.0193 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 27.695561409, + "Duration": 29.262028799, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6168.0, - "provider": "Amazon", - "Metric_request_tokens": 4543.0, - "Metric_response_tokens": 1625.0, - "total_cost": 0.031670000000000004, - "input_cost": 0.0113575, - "output_cost": 0.0203125 + "total_tokens": 5839.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.031203750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02732 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.951197833, - "Score_MermaidDiagramValid": 0.0, + "Duration": 53.601437466, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6002.0, - "provider": "Amazon", - "Metric_request_tokens": 4471.0, - "Metric_response_tokens": 1531.0, - "total_cost": 0.030315, - "input_cost": 0.0111775, - "output_cost": 0.0191375 + "total_tokens": 8093.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1193.0, + "total_cost": 0.05374375, + "input_cost": 0.00388375, + "output_cost": 0.04986 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 89.757139435, + "Duration": 23.554440753, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5308.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02589375, + "input_cost": 0.00388375, + "output_cost": 0.02201 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 116.956568909, + "Duration": 23.896141303, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.025083750000000002, + "input_cost": 0.00388375, + "output_cost": 0.0212 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 61.788091838, - "Score_MermaidDiagramValid": 0.0, + "Duration": 46.506492861, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8566.0, - "provider": "Amazon", - "Metric_request_tokens": 6763.0, - "Metric_response_tokens": 1803.0, - "total_cost": 0.039444999999999994, - "input_cost": 0.0169075, - "output_cost": 0.0225375 + "total_tokens": 10985.0, + "provider": "Google", + "Metric_request_tokens": 6854.0, + "Metric_response_tokens": 1829.0, + "total_cost": 0.0498775, + "input_cost": 0.0085675, + "output_cost": 0.04131 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 139.579026705, + "Duration": 31.379013607, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 28485.0, - "provider": "Amazon", - "Metric_request_tokens": 24611.0, - "Metric_response_tokens": 3874.0, - "total_cost": 0.1099525, - "input_cost": 0.0615275, - "output_cost": 0.048424999999999996 + "total_tokens": 5869.0, + "provider": "Google", + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.03153, + "input_cost": 0.0038799999999999998, + "output_cost": 0.02765 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 102.15980072, + "Duration": 23.397718403, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5142.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1136.0, + "total_cost": 0.02423375, + "input_cost": 0.00388375, + "output_cost": 0.02035 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 116.141871727, - "Score_MermaidDiagramValid": 0.0, + "Duration": 31.349962177, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 19949.0, - "provider": "Amazon", - "Metric_request_tokens": 16646.0, - "Metric_response_tokens": 3303.0, - "total_cost": 0.08290249999999999, - "input_cost": 0.041615, - "output_cost": 0.0412875 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6014.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.03295375, + "input_cost": 0.00388375, + "output_cost": 0.02907 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 59.31011804, + "Duration": 39.356080363, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6590.0, - "provider": "Amazon", - "Metric_request_tokens": 4868.0, - "Metric_response_tokens": 1722.0, - "total_cost": 0.033695, - "input_cost": 0.01217, - "output_cost": 0.021525 + "total_tokens": 6782.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1166.0, + "total_cost": 0.040633749999999996, + "input_cost": 0.00388375, + "output_cost": 0.03675 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 153.57670155, + "Duration": 26.725778378, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5165.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.02446375, + "input_cost": 0.00388375, + "output_cost": 0.02058 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.652731978, - "Score_MermaidDiagramValid": 0.0, + "Duration": 41.986268087, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6372.0, - "provider": "Amazon", - "Metric_request_tokens": 4841.0, - "Metric_response_tokens": 1531.0, - "total_cost": 0.03124, - "input_cost": 0.012102499999999999, - "output_cost": 0.0191375 + "total_tokens": 6768.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1183.0, + "total_cost": 0.04049375, + "input_cost": 0.00388375, + "output_cost": 0.036610000000000004 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 124.215302042, + "Duration": 29.098656784, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 22787.0, - "provider": "Amazon", - "Metric_request_tokens": 19271.0, - "Metric_response_tokens": 3516.0, - "total_cost": 0.0921275, - "input_cost": 0.0481775, - "output_cost": 0.04395 + "total_tokens": 5771.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.030523750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02664 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 152.610399933, + "Duration": 24.371314172, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5229.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.02510375, + "input_cost": 0.00388375, + "output_cost": 0.021220000000000003 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 122.520721486, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 23.463013826, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4976.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.022573749999999997, + "input_cost": 0.00388375, + "output_cost": 0.01869 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 130.053796609, + "Duration": 47.889786736, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7356.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.04637375, + "input_cost": 0.00388375, + "output_cost": 0.04249 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 85.873660202, + "Duration": 27.124752992, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12301.0, - "provider": "Amazon", - "Metric_request_tokens": 9837.0, - "Metric_response_tokens": 2464.0, - "total_cost": 0.0553925, - "input_cost": 0.0245925, - "output_cost": 0.0308 + "total_tokens": 5475.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.027563749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02368 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 122.249576234, + "Duration": 35.179937664, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6258.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 977.0, + "total_cost": 0.03539375, + "input_cost": 0.00388375, + "output_cost": 0.03151 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.075055246, + "Duration": 25.536239605, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5279.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.02560375, + "input_cost": 0.00388375, + "output_cost": 0.02172 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-05-06", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.21642044, + "Duration": 28.961691958, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5610.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1148.0, + "total_cost": 0.028913750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02503 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 1.060936443, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 26.18099707, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5581.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.028623750000000003, + "input_cost": 0.00388375, + "output_cost": 0.02474 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.037051732, + "Duration": 37.387479684, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6757.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.040383749999999996, + "input_cost": 0.00388375, + "output_cost": 0.0365 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.056416859, + "Duration": 41.910195327, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10585.0, + "provider": "Google", + "Metric_request_tokens": 6807.0, + "Metric_response_tokens": 1723.0, + "total_cost": 0.046288750000000004, + "input_cost": 0.008508749999999999, + "output_cost": 0.03778 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 1.045000565, + "Duration": 18.566825519, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4762.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02043375, + "input_cost": 0.00388375, + "output_cost": 0.01655 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.174768992, + "Duration": 35.952256562, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6120.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.03401375, + "input_cost": 0.00388375, + "output_cost": 0.03013 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.06139308, + "Duration": 23.562834473, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5167.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02448375, + "input_cost": 0.00388375, + "output_cost": 0.0206 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 1.096773894, - "Score_MermaidDiagramValid": 0.0, + "Duration": 26.116377335, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5461.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.027423750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023540000000000002 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.065730453, + "Duration": 42.19434694, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7549.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.04830375, + "input_cost": 0.00388375, + "output_cost": 0.04442 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.034951182, + "Duration": 30.915458356, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5835.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1148.0, + "total_cost": 0.031163749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02728 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 1.170030852, + "Duration": 26.201752563, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5374.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02655375, + "input_cost": 0.00388375, + "output_cost": 0.02267 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.082285844, + "Duration": 23.905880551, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5249.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.02530375, + "input_cost": 0.00388375, + "output_cost": 0.021419999999999998 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.03252349, + "Duration": 24.308699595, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5184.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.024653749999999995, + "input_cost": 0.00388375, + "output_cost": 0.020769999999999997 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 1.038807845, - "Score_MermaidDiagramValid": 0.0, + "Duration": 39.750523403, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6263.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.035443749999999996, + "input_cost": 0.00388375, + "output_cost": 0.03156 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 1.059020363, + "Duration": 65.476559976, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6971.0, + "provider": "Google", + "Metric_request_tokens": 3984.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.03485, + "input_cost": 0.00498, + "output_cost": 0.02987 }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 1.159144096, + "Duration": 72.910902918, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -4662,1215 +9539,1263 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 70.761430334, + "Duration": 43.602165928, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10530.0, + "provider": "Google", + "Metric_request_tokens": 6840.0, + "Metric_response_tokens": 1786.0, + "total_cost": 0.045450000000000004, + "input_cost": 0.00855, + "output_cost": 0.0369 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 50.064655716, + "Duration": 33.823405726, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6724.0, + "provider": "Google", + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03252, + "input_cost": 0.00496, + "output_cost": 0.02756 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 54.286773631, + "Duration": 30.694557094, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5928.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.032093750000000004, + "input_cost": 0.00388375, + "output_cost": 0.028210000000000002 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.848939949, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 28.020181574, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5702.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02983375, + "input_cost": 0.00388375, + "output_cost": 0.02595 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.4068963, + "Duration": 26.120680251, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5541.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02822375, + "input_cost": 0.00388375, + "output_cost": 0.02434 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.579682246, + "Duration": 51.401740722, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8168.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.05449375, + "input_cost": 0.00388375, + "output_cost": 0.05061 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 50.490538016, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 30.757611824, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5710.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.029913749999999996, + "input_cost": 0.00388375, + "output_cost": 0.026029999999999998 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 49.381506192, + "Duration": 34.511741088, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6547.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.03828375, + "input_cost": 0.00388375, + "output_cost": 0.0344 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.014168316, + "Duration": 33.667128837, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6142.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03423375, + "input_cost": 0.00388375, + "output_cost": 0.03035 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.619240014, + "Duration": 26.603688339, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.025083750000000002, + "input_cost": 0.00388375, + "output_cost": 0.0212 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.97060826, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11419.0, + "provider": "Google", + "Metric_request_tokens": 6836.0, + "Metric_response_tokens": 1793.0, + "total_cost": 0.05437500000000001, + "input_cost": 0.008545, + "output_cost": 0.04583 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 97.172731042, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 15969.0, + "provider": "Google", + "Metric_request_tokens": 6815.0, + "Metric_response_tokens": 1761.0, + "total_cost": 0.10005875000000002, + "input_cost": 0.00851875, + "output_cost": 0.09154000000000001 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.025055925, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8261.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.055423749999999994, + "input_cost": 0.00388375, + "output_cost": 0.051539999999999996 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 46.101775188, + "Duration": 32.071007326, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5793.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03074375, + "input_cost": 0.00388375, + "output_cost": 0.026860000000000002 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.682643889, + "Duration": 30.771411983, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5778.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1136.0, + "total_cost": 0.030593750000000003, + "input_cost": 0.00388375, + "output_cost": 0.02671 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.294165416, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 33.989499385, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6020.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.033013749999999994, + "input_cost": 0.00388375, + "output_cost": 0.029129999999999996 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.706066252, + "Duration": 26.15421085, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5364.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.026453749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02257 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.323483575, + "Duration": 35.682115651, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6187.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.03468375, + "input_cost": 0.00388375, + "output_cost": 0.030799999999999998 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 70.761430334, + "Duration": 3.16239572, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 50.064655716, + "Duration": 3.095654126, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 54.286773631, + "Duration": 3.573285539, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.848939949, + "Duration": 3.13363762, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.4068963, + "Duration": 3.075430478, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.579682246, + "Duration": 3.734203872, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 50.490538016, + "Duration": 3.207070397, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 49.381506192, + "Duration": 3.173871208, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.014168316, + "Duration": 3.445568835, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.619240014, + "Duration": 3.087279343, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 46.101775188, + "Duration": 3.14396244, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.682643889, + "Duration": 3.481531585, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1242.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 506.0, + "total_cost": 0.00027600000000000004, + "input_cost": 7.36e-5, + "output_cost": 0.00020240000000000004 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.294165416, + "Duration": 3.053321468, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.706066252, + "Duration": 3.041527691, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.323483575, + "Duration": 3.656550888, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.490865526, + "Duration": 3.232291255, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.38614935, + "Duration": 3.16402353, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7391.0, - "provider": "Amazon", - "Metric_request_tokens": 6014.0, - "Metric_response_tokens": 1377.0, - "total_cost": 0.009217600000000001, - "input_cost": 0.004811200000000001, - "output_cost": 0.0044064 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.587720037, + "Duration": 3.371460427, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.779726991, + "Duration": 3.306933591, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 50.898700324, + "Duration": 3.199269162, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.120025907, + "Duration": 3.715297386, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.342244267, + "Duration": 3.390987167, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1256.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 520.0, + "total_cost": 0.0002816, + "input_cost": 7.36e-5, + "output_cost": 0.000208 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 51.294958536, + "Duration": 3.009082393, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.891190365, + "Duration": 3.572387722, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 47.666868754, + "Duration": 3.097230117, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18401.0, - "provider": "Amazon", - "Metric_request_tokens": 15769.0, - "Metric_response_tokens": 2632.0, - "total_cost": 0.0210376, - "input_cost": 0.0126152, - "output_cost": 0.0084224 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 63.490425569, + "Duration": 3.342514555, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 63.600602322, + "Duration": 3.742365295, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 50.937322607, + "Duration": 3.291870391, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.340557151, + "Duration": 3.237470473, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1251.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 515.0, + "total_cost": 0.0002796, + "input_cost": 7.36e-5, + "output_cost": 0.00020600000000000002 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 54.150935132, + "Duration": 3.437132314, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.966453918, + "Duration": 5.205071529, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00200755, + "input_cost": 0.00025065, + "output_cost": 0.0017569 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 50.698133211, + "Duration": 9.27147079, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3259.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.00420005, + "input_cost": 0.00025035, + "output_cost": 0.0039497 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 54.007505821, + "Duration": 4.703598385, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2531.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016665500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0014162 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 65.783296981, + "Duration": 5.890864449, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2638.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 591.0, + "total_cost": 0.00192125, + "input_cost": 0.00025065, + "output_cost": 0.0016706 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 63.681787497, + "Duration": 5.237042556, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2595.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00189055, + "input_cost": 0.00025035, + "output_cost": 0.0016401999999999999 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 64.71506337, + "Duration": 5.402304393, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2648.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020760500000000003, + "input_cost": 0.00025035, + "output_cost": 0.0018257000000000002 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.909417244, + "Duration": 5.681153876, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2673.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00215105, + "input_cost": 0.00025065, + "output_cost": 0.0019004 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.945646339, + "Duration": 5.74444077, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6970.0, - "provider": "Amazon", - "Metric_request_tokens": 5527.0, - "Metric_response_tokens": 1443.0, - "total_cost": 0.0090392, - "input_cost": 0.0044216, - "output_cost": 0.0046176 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2683.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00219855, + "input_cost": 0.00025035, + "output_cost": 0.0019482 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 52.262290885, + "Duration": 35.715282683, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 15172.0, + "provider": "Google", + "Metric_request_tokens": 8909.0, + "Metric_response_tokens": 1841.0, + "total_cost": 0.01791795, + "input_cost": 0.00133635, + "output_cost": 0.0165816 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 34.392486572, - "Score_MermaidDiagramValid": 0.0, + "Duration": 19.026625667, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7927.0, - "provider": "Amazon", - "Metric_request_tokens": 5837.0, - "Metric_response_tokens": 2090.0, - "total_cost": 0.011357599999999999, - "input_cost": 0.0046696, - "output_cost": 0.0066879999999999995 + "total_tokens": 7184.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.00837945, + "input_cost": 0.00059535, + "output_cost": 0.0077840999999999995 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 51.782173036, + "Duration": 5.674318695, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2643.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00205855, + "input_cost": 0.00025035, + "output_cost": 0.0018082 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.199029621, + "Duration": 6.052854086, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2773.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00251355, + "input_cost": 0.00025035, + "output_cost": 0.0022632 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 56.303173032, + "Duration": 6.691234945, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2764.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0024695499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0022188999999999998 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 45.050576092, + "Duration": 10.540987174, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7199.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00383255, + "input_cost": 0.00080415, + "output_cost": 0.0030284 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.036889105, + "Duration": 5.512122765, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2665.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00213555, + "input_cost": 0.00025035, + "output_cost": 0.0018852 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.586487666, + "Duration": 6.02414223, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2737.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0023750499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0021244 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.551409036, + "Duration": 18.309828457, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5238.0, - "provider": "Amazon", - "Metric_request_tokens": 4492.0, - "Metric_response_tokens": 746.0, - "total_cost": 0.00044856000000000003, - "input_cost": 0.00026952000000000004, - "output_cost": 0.00017904 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7772.0, + "provider": "Google", + "Metric_request_tokens": 3983.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.01041375, + "input_cost": 0.00059745, + "output_cost": 0.0098163 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.607065725, + "Duration": 5.858011472, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2586.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018590500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0016087 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.907358597, + "Duration": 5.887542438, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5408.0, - "provider": "Amazon", - "Metric_request_tokens": 4043.0, - "Metric_response_tokens": 1365.0, - "total_cost": 0.0005701799999999999, - "input_cost": 0.00024257999999999997, - "output_cost": 0.00032759999999999994 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2715.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00229805, + "input_cost": 0.00025065, + "output_cost": 0.0020474 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.254325135, + "Duration": 5.607493865, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2660.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00211225, + "input_cost": 0.00025035, + "output_cost": 0.0018619 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 34.303524713, + "Duration": 5.350370934, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020200500000000002, + "input_cost": 0.00025035, + "output_cost": 0.0017697000000000001 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 38.439967155, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 13.369507864, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6433.0, + "provider": "Google", + "Metric_request_tokens": 3970.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0057969, + "input_cost": 0.0005954999999999999, + "output_cost": 0.0052014 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.821898825, + "Duration": 5.427887401, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9263.0, - "provider": "Amazon", - "Metric_request_tokens": 7867.0, - "Metric_response_tokens": 1396.0, - "total_cost": 0.0008070600000000001, - "input_cost": 0.00047202, - "output_cost": 0.00033504 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2640.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00204805, + "input_cost": 0.00025035, + "output_cost": 0.0017977 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.995154891, + "Duration": 6.886046113, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2860.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00281805, + "input_cost": 0.00025035, + "output_cost": 0.0025677 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 69.995588077, + "Duration": 6.190617486, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2704.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00225955, + "input_cost": 0.00025065, + "output_cost": 0.0020089 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.927372481, + "Duration": 12.313419785, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5372.0, - "provider": "Amazon", - "Metric_request_tokens": 4550.0, - "Metric_response_tokens": 822.0, - "total_cost": 0.00047028, - "input_cost": 0.000273, - "output_cost": 0.00019728 + "total_tokens": 7265.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00406355, + "input_cost": 0.00080415, + "output_cost": 0.0032594 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.900333046, + "Duration": 14.12684929, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6606.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00642025, + "input_cost": 0.00059535, + "output_cost": 0.0058249 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 28.391986261, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 11.416052215, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5051.0, + "provider": "Google", + "Metric_request_tokens": 3061.0, + "Metric_response_tokens": 1169.0, + "total_cost": 0.00403405, + "input_cost": 0.00045914999999999997, + "output_cost": 0.0035749 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.633113428, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3619.0, - "provider": "Amazon", - "Metric_request_tokens": 3397.0, - "Metric_response_tokens": 222.0, - "total_cost": 0.00025709999999999996, - "input_cost": 0.00020381999999999998, - "output_cost": 5.328e-5 + "Duration": 5.042913232, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2542.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00170505, + "input_cost": 0.00025035, + "output_cost": 0.0014547 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.857710005, + "Duration": 5.803645705, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2732.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0023700499999999994, + "input_cost": 0.00025035, + "output_cost": 0.0021196999999999995 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.586487666, + "Duration": 24.280603396, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -5878,63 +10803,63 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.551409036, + "Duration": 21.444646242, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5238.0, - "provider": "Amazon", - "Metric_request_tokens": 4492.0, - "Metric_response_tokens": 746.0, - "total_cost": 0.00044856000000000003, - "input_cost": 0.00026952000000000004, - "output_cost": 0.00017904 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7586.0, + "provider": "Google", + "Metric_request_tokens": 3679.0, + "Metric_response_tokens": 722.0, + "total_cost": 0.012132549999999999, + "input_cost": 0.00055185, + "output_cost": 0.0115807 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.607065725, + "Duration": 14.103361899, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7319.0, + "provider": "Google", + "Metric_request_tokens": 4540.0, + "Metric_response_tokens": 1218.0, + "total_cost": 0.006875300000000001, + "input_cost": 0.000681, + "output_cost": 0.006194300000000001 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.907358597, - "Score_MermaidDiagramValid": 0.0, + "Duration": 26.086590308, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5408.0, - "provider": "Amazon", - "Metric_request_tokens": 4043.0, - "Metric_response_tokens": 1365.0, - "total_cost": 0.0005701799999999999, - "input_cost": 0.00024257999999999997, - "output_cost": 0.00032759999999999994 + "total_tokens": 9499.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.016481950000000002, + "input_cost": 0.00059535, + "output_cost": 0.0158866 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.254325135, + "Duration": 24.1841836, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -5942,79 +10867,47 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 34.303524713, + "Duration": 17.981969969, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7548.0, + "provider": "Google", + "Metric_request_tokens": 3630.0, + "Metric_response_tokens": 1695.0, + "total_cost": 0.009342000000000001, + "input_cost": 0.0005445, + "output_cost": 0.008797500000000001 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 38.439967155, + "Duration": 20.823357766, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9790.0, + "provider": "Google", + "Metric_request_tokens": 5066.0, + "Metric_response_tokens": 1729.0, + "total_cost": 0.0122798, + "input_cost": 0.0007599, + "output_cost": 0.0115199 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.821898825, + "Duration": 25.462973093, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9263.0, - "provider": "Amazon", - "Metric_request_tokens": 7867.0, - "Metric_response_tokens": 1396.0, - "total_cost": 0.0008070600000000001, - "input_cost": 0.00047202, - "output_cost": 0.00033504 - }, - { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 18.995154891, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 - }, - { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 69.995588077, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6022,47 +10915,47 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 9.927372481, + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 35.722264378, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5372.0, - "provider": "Amazon", - "Metric_request_tokens": 4550.0, - "Metric_response_tokens": 822.0, - "total_cost": 0.00047028, - "input_cost": 0.000273, - "output_cost": 0.00019728 + "total_tokens": 16420.0, + "provider": "Google", + "Metric_request_tokens": 8000.0, + "Metric_response_tokens": 1807.0, + "total_cost": 0.0254297, + "input_cost": 0.0012, + "output_cost": 0.0242297 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 32.900333046, + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.71420281, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 1869.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 111.0, + "total_cost": 0.0035027500000000002, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0033811 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 28.391986261, + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.548778955, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6070,31 +10963,15 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 5.633113428, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3619.0, - "provider": "Amazon", - "Metric_request_tokens": 3397.0, - "Metric_response_tokens": 222.0, - "total_cost": 0.00025709999999999996, - "input_cost": 0.00020381999999999998, - "output_cost": 5.328e-5 - }, - { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.857710005, + "Duration": 18.459759081, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6102,63 +10979,63 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.13120079, + "Duration": 35.081946937, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4093.0, - "provider": "Amazon", - "Metric_request_tokens": 3436.0, - "Metric_response_tokens": 657.0, - "total_cost": 0.00036384, - "input_cost": 0.00020616, - "output_cost": 0.00015768 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11019.0, + "provider": "Google", + "Metric_request_tokens": 4018.0, + "Metric_response_tokens": 1243.0, + "total_cost": 0.021501500000000003, + "input_cost": 0.0006027, + "output_cost": 0.020898800000000002 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.202050303, + "Duration": 26.112757598, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5241.0, - "provider": "Amazon", - "Metric_request_tokens": 4466.0, - "Metric_response_tokens": 775.0, - "total_cost": 0.00045396, - "input_cost": 0.00026796000000000003, - "output_cost": 0.000186 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12440.0, + "provider": "Google", + "Metric_request_tokens": 6832.0, + "Metric_response_tokens": 1782.0, + "total_cost": 0.015484999999999999, + "input_cost": 0.0010248, + "output_cost": 0.0144602 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 12.523508567, + "Duration": 15.507230852, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4560.0, - "provider": "Amazon", - "Metric_request_tokens": 3455.0, - "Metric_response_tokens": 1105.0, - "total_cost": 0.0004725, - "input_cost": 0.0002073, - "output_cost": 0.0002652 + "total_tokens": 4880.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00988225, + "input_cost": 0.00025035, + "output_cost": 0.0096319 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.301909151, + "Duration": 22.268203361, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6166,79 +11043,95 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.07952375, + "Duration": 29.657737725, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9166.0, - "provider": "Amazon", - "Metric_request_tokens": 7880.0, - "Metric_response_tokens": 1286.0, - "total_cost": 0.00078144, - "input_cost": 0.0004728, - "output_cost": 0.00030864 + "total_tokens": 8725.0, + "provider": "Google", + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.013843, + "input_cost": 0.0005952, + "output_cost": 0.013247799999999999 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 20.832885038, + "Duration": 17.856923641, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10426.0, - "provider": "Amazon", - "Metric_request_tokens": 8538.0, - "Metric_response_tokens": 1888.0, - "total_cost": 0.0009653999999999999, - "input_cost": 0.00051228, - "output_cost": 0.00045311999999999994 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.103307419, + "Duration": 22.406209617, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5207.0, - "provider": "Amazon", - "Metric_request_tokens": 4458.0, - "Metric_response_tokens": 749.0, - "total_cost": 0.00044724, - "input_cost": 0.00026748, - "output_cost": 0.00017976 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9355.0, + "provider": "Google", + "Metric_request_tokens": 4545.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01405995, + "input_cost": 0.0006817500000000001, + "output_cost": 0.0133782 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.416841941, + "Duration": 29.67447829, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7739.0, - "provider": "Amazon", - "Metric_request_tokens": 6387.0, - "Metric_response_tokens": 1352.0, - "total_cost": 0.0007076999999999999, - "input_cost": 0.00038322, - "output_cost": 0.00032448 + "total_tokens": 9321.0, + "provider": "Google", + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01876495, + "input_cost": 0.00046575, + "output_cost": 0.018299199999999998 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.051268291, + "Duration": 4.125237979, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1709.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0017484000000000002, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0016269000000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.319621961, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6246,47 +11139,79 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.088759709, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 22.178054671, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8629.0, + "provider": "Google", + "Metric_request_tokens": 3965.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.01352575, + "input_cost": 0.0005947499999999999, + "output_cost": 0.012931 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 23.906853847, + "Duration": 10.029780959, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19298.0, - "provider": "Amazon", - "Metric_request_tokens": 17687.0, - "Metric_response_tokens": 1611.0, - "total_cost": 0.00144786, - "input_cost": 0.00106122, - "output_cost": 0.00038664 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 3107.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 536.0, + "total_cost": 0.006603250000000001, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0064816000000000006 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.752534117, + "Duration": 42.923902196, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 6073.0, - "provider": "Amazon", - "Metric_request_tokens": 4698.0, - "Metric_response_tokens": 1375.0, - "total_cost": 0.00061188, - "input_cost": 0.00028188, - "output_cost": 0.00032999999999999994 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 30481.0, + "provider": "Google", + "Metric_request_tokens": 21012.0, + "Metric_response_tokens": 3073.0, + "total_cost": 0.0273816, + "input_cost": 0.0031517999999999997, + "output_cost": 0.0242298 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 58.670944361, + "Duration": 39.222518221, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6294,47 +11219,47 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 9.086971754, + "Duration": 9.631045956, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4078.0, - "provider": "Amazon", - "Metric_request_tokens": 3427.0, - "Metric_response_tokens": 651.0, - "total_cost": 0.00036186, - "input_cost": 0.00020562, - "output_cost": 0.00015623999999999998 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 2892.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 625.0, + "total_cost": 0.00559265, + "input_cost": 0.00012164999999999999, + "output_cost": 0.005471 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 35.618876054, + "Duration": 6.647730144, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 2292.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0037889, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0036674 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 20.22773949, + "Duration": 35.71455716, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6342,575 +11267,575 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 9.717342214, + "Duration": 5.476345308, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4188.0, - "provider": "Amazon", - "Metric_request_tokens": 3424.0, - "Metric_response_tokens": 764.0, - "total_cost": 0.0003888, - "input_cost": 0.00020543999999999998, - "output_cost": 0.00018336 + "total_tokens": 2659.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00210205, + "input_cost": 0.00025065, + "output_cost": 0.0018514 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 36.005522453, + "Duration": 6.346005219, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2718.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00232105, + "input_cost": 0.00025035, + "output_cost": 0.0020707 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 37.475340611, + "Duration": 4.836977527, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2539.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00169455, + "input_cost": 0.00025035, + "output_cost": 0.0014441999999999999 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 15.113064255, + "Duration": 6.672693551, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5962.0, - "provider": "Amazon", - "Metric_request_tokens": 4515.0, - "Metric_response_tokens": 1447.0, - "total_cost": 0.00061818, - "input_cost": 0.00027089999999999997, - "output_cost": 0.00034728 + "total_tokens": 2929.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00304705, + "input_cost": 0.00025065, + "output_cost": 0.0027964 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 17.63608912, + "Duration": 14.662619685, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6896.0, + "provider": "Google", + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.00736605, + "input_cost": 0.00059715, + "output_cost": 0.0067689 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 10.620195375, + "Duration": 5.188614707, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5359.0, - "provider": "Amazon", - "Metric_request_tokens": 4536.0, - "Metric_response_tokens": 823.0, - "total_cost": 0.00046968, - "input_cost": 0.00027215999999999997, - "output_cost": 0.00019752 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2611.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.0019320499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016817 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 57.336604821, + "Duration": 6.5603659, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2756.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00244155, + "input_cost": 0.00025065, + "output_cost": 0.0021909 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.836315522, + "Duration": 6.128366075, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2798.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00259525, + "input_cost": 0.00025035, + "output_cost": 0.0023449 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.83794607, + "Duration": 4.944774864, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3553.0, - "provider": "Amazon", - "Metric_request_tokens": 3358.0, - "Metric_response_tokens": 195.0, - "total_cost": 0.00024828, - "input_cost": 0.00020147999999999998, - "output_cost": 4.68e-5 + "total_tokens": 2557.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00175755, + "input_cost": 0.00025035, + "output_cost": 0.0015072 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.75772521, + "Duration": 5.76290555, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2696.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0022315499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0019809 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.1295691, + "Duration": 11.161597903, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5291.0, - "provider": "Amazon", - "Metric_request_tokens": 4491.0, - "Metric_response_tokens": 800.0, - "total_cost": 0.00046146, - "input_cost": 0.00026946, - "output_cost": 0.000192 + "total_tokens": 7403.0, + "provider": "Google", + "Metric_request_tokens": 5349.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.00460415, + "input_cost": 0.00080235, + "output_cost": 0.0038018 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 9.798962699, + "Duration": 6.529419098, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4239.0, - "provider": "Amazon", - "Metric_request_tokens": 3490.0, - "Metric_response_tokens": 749.0, - "total_cost": 0.00038916, - "input_cost": 0.0002094, - "output_cost": 0.00017976 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2839.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0027445499999999997, + "input_cost": 0.00025035, + "output_cost": 0.0024942 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.330206914, + "Duration": 8.540685767, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3173.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0039010499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0036504 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 57.658972199, + "Duration": 5.922996591, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2729.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0023595499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0021092 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 13.06782325, + "Duration": 6.741784844, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5686.0, - "provider": "Amazon", - "Metric_request_tokens": 4477.0, - "Metric_response_tokens": 1209.0, - "total_cost": 0.00055878, - "input_cost": 0.00026862, - "output_cost": 0.00029016 + "total_tokens": 2900.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00295805, + "input_cost": 0.00025035, + "output_cost": 0.0027077 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.235511866, + "Duration": 7.055616236, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4178.0, - "provider": "Amazon", - "Metric_request_tokens": 3433.0, - "Metric_response_tokens": 745.0, - "total_cost": 0.00022445500000000003, - "input_cost": 0.000120155, - "output_cost": 0.00010430000000000001 + "total_tokens": 2809.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0026270499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0023764 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.032302433, + "Duration": 21.365465445, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4888.0, - "provider": "Amazon", - "Metric_request_tokens": 3535.0, - "Metric_response_tokens": 1353.0, - "total_cost": 0.000313145, - "input_cost": 0.000123725, - "output_cost": 0.00018942 + "total_tokens": 9022.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.01021305, + "input_cost": 0.00080415, + "output_cost": 0.0094089 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 16.138580876, + "Duration": 5.080146417, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2513.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00160355, + "input_cost": 0.00025035, + "output_cost": 0.0013532 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.894268114, - "Score_MermaidDiagramValid": 0.0, + "Duration": 25.419348211, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7935.0, - "provider": "Amazon", - "Metric_request_tokens": 6523.0, - "Metric_response_tokens": 1412.0, - "total_cost": 0.00042598500000000004, - "input_cost": 0.00022830500000000002, - "output_cost": 0.00019768000000000002 + "total_tokens": 10935.0, + "provider": "Google", + "Metric_request_tokens": 5358.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.016854800000000003, + "input_cost": 0.0008037, + "output_cost": 0.016051100000000002 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.982702523, + "Duration": 4.279733312, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2443.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00135855, + "input_cost": 0.00025035, + "output_cost": 0.0011082 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 22.058090836, + "Duration": 5.584990518, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2690.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00221725, + "input_cost": 0.00025035, + "output_cost": 0.0019669 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 17.691408682, + "Duration": 7.12928071, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2925.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00303305, + "input_cost": 0.00025065, + "output_cost": 0.0027824 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.664497105, + "Duration": 23.894062173, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11489.0, + "provider": "Google", + "Metric_request_tokens": 6861.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.01198975, + "input_cost": 0.00102915, + "output_cost": 0.010960600000000001 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.96936611, + "Duration": 9.010575503, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3355.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00455055, + "input_cost": 0.00025035, + "output_cost": 0.0043002000000000005 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.827149441, + "Duration": 5.778800106, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2693.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0022210499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0019703999999999998 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.347354317, + "Duration": 8.167258931, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9162.0, - "provider": "Amazon", - "Metric_request_tokens": 7157.0, - "Metric_response_tokens": 2005.0, - "total_cost": 0.000531195, - "input_cost": 0.00025049500000000005, - "output_cost": 0.0002807 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3303.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 550.0, + "total_cost": 0.004374349999999999, + "input_cost": 0.00025035, + "output_cost": 0.004123999999999999 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.111033105, + "Duration": 4.731809965, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2458.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00141105, + "input_cost": 0.00025035, + "output_cost": 0.0011607 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.272915809, + "Duration": 5.421789254, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2578.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 568.0, + "total_cost": 0.0017779500000000001, + "input_cost": 0.00025065, + "output_cost": 0.0015273 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.779003058, + "Duration": 4.939170129, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2535.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016805499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0014302 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.910701039, + "Duration": 5.085231652, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2550.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0017330499999999999, + "input_cost": 0.00025035, + "output_cost": 0.0014827 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.235511866, - "Score_MermaidDiagramValid": 0.0, + "Duration": 12.230727772, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4178.0, - "provider": "Amazon", - "Metric_request_tokens": 3433.0, - "Metric_response_tokens": 745.0, - "total_cost": 0.00022445500000000003, - "input_cost": 0.000120155, - "output_cost": 0.00010430000000000001 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11014.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.024663, + "input_cost": 0.0004662, + "output_cost": 0.0241968 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.032302433, + "Duration": 8.306374733, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4888.0, - "provider": "Amazon", - "Metric_request_tokens": 3535.0, - "Metric_response_tokens": 1353.0, - "total_cost": 0.000313145, - "input_cost": 0.000123725, - "output_cost": 0.00018942 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2757.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00245175, + "input_cost": 0.00025035, + "output_cost": 0.0022014 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 16.138580876, + "Duration": 6.595447784, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2698.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0022510499999999997, + "input_cost": 0.00025035, + "output_cost": 0.0020007 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.894268114, + "Duration": 6.077756987, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7935.0, - "provider": "Amazon", - "Metric_request_tokens": 6523.0, - "Metric_response_tokens": 1412.0, - "total_cost": 0.00042598500000000004, - "input_cost": 0.00022830500000000002, - "output_cost": 0.00019768000000000002 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8290.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.02181055, + "input_cost": 0.00025065, + "output_cost": 0.0215599 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.982702523, + "Duration": 13.08787764, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12993.0, + "provider": "Google", + "Metric_request_tokens": 5355.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.024140349999999998, + "input_cost": 0.00080325, + "output_cost": 0.0233371 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 22.058090836, + "Duration": 18.203576056, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Amazon", + "provider": "Google", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -6918,1914 +11843,1914 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 17.691408682, + "Duration": 7.877005808, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2829.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00269705, + "input_cost": 0.00025065, + "output_cost": 0.0024464 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.664497105, + "Duration": 6.691518112, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2766.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00248905, + "input_cost": 0.00025035, + "output_cost": 0.0022387 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.96936611, + "Duration": 5.46408462, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2591.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018765499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016262 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.827149441, + "Duration": 11.559526636, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 15503.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.047056049999999995, + "input_cost": 0.00025065, + "output_cost": 0.0468054 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.347354317, + "Duration": 18.417513707, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9162.0, - "provider": "Amazon", - "Metric_request_tokens": 7157.0, - "Metric_response_tokens": 2005.0, - "total_cost": 0.000531195, - "input_cost": 0.00025049500000000005, - "output_cost": 0.0002807 + "total_tokens": 8232.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.0074480499999999995, + "input_cost": 0.00080415, + "output_cost": 0.0066438999999999995 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.111033105, + "Duration": 9.485176875, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2981.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00323575, + "input_cost": 0.00025035, + "output_cost": 0.0029854 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.272915809, + "Duration": 6.271124626, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2722.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00232255, + "input_cost": 0.00025065, + "output_cost": 0.0020719000000000002 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.779003058, + "Duration": 18.342354393, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 20909.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.051863950000000006, + "input_cost": 0.00080415, + "output_cost": 0.0510598 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.910701039, + "Duration": 19.63725688, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19611.0, + "provider": "Google", + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.05186855, + "input_cost": 0.00059715, + "output_cost": 0.0512714 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.709128752, + "Duration": 4.517046246, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7655.0, - "provider": "Amazon", - "Metric_request_tokens": 6340.0, - "Metric_response_tokens": 1315.0, - "total_cost": 0.000406, - "input_cost": 0.00022190000000000003, - "output_cost": 0.0001841 + "total_tokens": 4330.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0008002, + "input_cost": 0.0003106, + "output_cost": 0.0004896000000000001 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.1569874, + "Duration": 4.366856197, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4300.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.0007876000000000001, + "input_cost": 0.0003108, + "output_cost": 0.0004768 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 16.292243277, + "Duration": 3.691253093, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3771.0, + "provider": "Google", + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 666.0, + "total_cost": 0.0005769, + "input_cost": 0.0003105, + "output_cost": 0.0002664 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.459357714, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 4.948628294, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 50365.0, + "provider": "Google", + "Metric_request_tokens": 49186.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0053902, + "input_cost": 0.0049186, + "output_cost": 0.00047159999999999997 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.24639963, + "Duration": 4.397124725, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4273.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.0007825, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004736 }, - { - "Model": "bedrock:us.amazon.nova-micro-v1:0", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 27.127796199, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.246158175, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4269.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1177.0, + "total_cost": 0.0007800000000000001, + "input_cost": 0.00030920000000000003, + "output_cost": 0.00047080000000000006 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.515629417, + "Duration": 4.461461037, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4196.0, - "provider": "Amazon", - "Metric_request_tokens": 3478.0, - "Metric_response_tokens": 718.0, - "total_cost": 0.00022225, - "input_cost": 0.00012173000000000002, - "output_cost": 0.00010052 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4313.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0007985000000000002, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004896000000000001 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.272816724, + "Duration": 3.826108098, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3829.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 693.0, + "total_cost": 0.0005908, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0002772 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 27.155517751, + "Duration": 3.812971395, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3771.0, + "provider": "Google", + "Metric_request_tokens": 3103.0, + "Metric_response_tokens": 668.0, + "total_cost": 0.0005775, + "input_cost": 0.0003103, + "output_cost": 0.0002672 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 21.478668781, + "Duration": 4.463204247, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4292.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007850000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00047440000000000004 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.68120615, + "Duration": 4.422216874, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6650.0, + "provider": "Google", + "Metric_request_tokens": 5423.0, + "Metric_response_tokens": 1227.0, + "total_cost": 0.0010331, + "input_cost": 0.0005423, + "output_cost": 0.0004908 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.854751274, + "Duration": 4.450171262, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4287.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007875, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047840000000000003 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.11028082, + "Duration": 4.562724232, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4284.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.019602175, + "Duration": 4.407190932, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4378.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1242.0, + "total_cost": 0.0008104000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004968 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.122924172, + "Duration": 3.620992741, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3766.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 675.0, + "total_cost": 0.0005791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00027 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.539048311, + "Duration": 5.472499546, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18474.0, - "provider": "Amazon", - "Metric_request_tokens": 17520.0, - "Metric_response_tokens": 954.0, - "total_cost": 0.0007467600000000001, - "input_cost": 0.0006132, - "output_cost": 0.00013356000000000002 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4297.0, + "provider": "Google", + "Metric_request_tokens": 3090.0, + "Metric_response_tokens": 1207.0, + "total_cost": 0.0007918000000000001, + "input_cost": 0.00030900000000000003, + "output_cost": 0.0004828 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 25.096765367, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 - }, - { - "Model": "bedrock:us.amazon.nova-micro-v1:0", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 15.169118995, + "Duration": 4.473251097, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4322.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007970000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00048640000000000006 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.574995535, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4266.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.0007791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047000000000000004 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 9.241399938, + "Duration": 4.094999263, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6107.0, - "provider": "Amazon", - "Metric_request_tokens": 5090.0, - "Metric_response_tokens": 1017.0, - "total_cost": 0.00032053, - "input_cost": 0.00017815000000000002, - "output_cost": 0.00014238 + "total_tokens": 3858.0, + "provider": "Google", + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 721.0, + "total_cost": 0.0006021, + "input_cost": 0.00031370000000000004, + "output_cost": 0.0002884 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.228884007, + "Duration": 4.658386722, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3778.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 689.0, + "total_cost": 0.0005845000000000001, + "input_cost": 0.00030890000000000003, + "output_cost": 0.00027560000000000003 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 27.533683609, + "Duration": 4.893656632, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4357.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1221.0, + "total_cost": 0.0008020000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004884 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 20.056999566, + "Duration": 5.236565242, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3883.0, + "provider": "Google", + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.0006121000000000001, + "input_cost": 0.00031370000000000004, + "output_cost": 0.00029840000000000004 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.667786223, + "Duration": 4.485359369, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4324.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007972000000000001, + "input_cost": 0.0003108, + "output_cost": 0.00048640000000000006 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 25.437752797, + "Duration": 3.734365673, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3791.0, + "provider": "Google", + "Metric_request_tokens": 3110.0, + "Metric_response_tokens": 681.0, + "total_cost": 0.0005834, + "input_cost": 0.000311, + "output_cost": 0.0002724 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 21.22790309, + "Duration": 4.500398738, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4304.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007952, + "input_cost": 0.0003088, + "output_cost": 0.00048640000000000006 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 27.491055, + "Duration": 4.124787172, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3850.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 761.0, + "total_cost": 0.0006133, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0003044 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.986839409, + "Duration": 3.98193398, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3903.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 767.0, + "total_cost": 0.0006204000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00030680000000000003 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 29.143947476, + "Duration": 4.48101111, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4290.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007896, + "input_cost": 0.0003088, + "output_cost": 0.0004808 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 27.419551466, + "Duration": 4.916593879, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4543.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1454.0, + "total_cost": 0.0008905, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0005816 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 13.586402193, + "Duration": 3.879042448, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18273.0, - "provider": "Amazon", - "Metric_request_tokens": 17305.0, - "Metric_response_tokens": 968.0, - "total_cost": 0.000741195, - "input_cost": 0.0006056750000000001, - "output_cost": 0.00013552 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3885.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 793.0, + "total_cost": 0.0006264, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0003172 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.207496913, - "Score_MermaidDiagramValid": 1.0, + "Duration": 4.444876941, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5044.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4284.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1158.0, - "total_cost": 0.023253749999999997, - "input_cost": 0.00388375, - "output_cost": 0.01937 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.803203804, + "Duration": 4.627413202, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5199.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4291.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02480375, - "input_cost": 0.00388375, - "output_cost": 0.02092 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007897, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 34.032753508, + "Duration": 3.704063431, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6600.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3774.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.03881375, - "input_cost": 0.00388375, - "output_cost": 0.03493 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 683.0, + "total_cost": 0.0005823, + "input_cost": 0.00030910000000000003, + "output_cost": 0.0002732 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.432732425, + "Duration": 4.447734786, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5326.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4282.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.02607375, - "input_cost": 0.00388375, - "output_cost": 0.02219 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1191.0, + "total_cost": 0.0007855000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047640000000000003 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 27.33412975, + "Duration": 4.576908765, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5806.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4271.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1201.0, - "total_cost": 0.03087375, - "input_cost": 0.00388375, - "output_cost": 0.02699 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1180.0, + "total_cost": 0.0007811000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047200000000000003 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.488361727, + "Duration": 5.032189281, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5925.0, + "total_tokens": 4612.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.03206375, - "input_cost": 0.00388375, - "output_cost": 0.028180000000000004 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1476.0, + "total_cost": 0.0009040000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005904 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.454849392, - "Score_MermaidDiagramValid": 1.0, + "Duration": 4.472401128, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5526.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4274.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.02807375, - "input_cost": 0.00388375, - "output_cost": 0.024190000000000003 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007832000000000001, + "input_cost": 0.0003088, + "output_cost": 0.00047440000000000004 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 40.878031462, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.732016304, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10420.0, + "total_tokens": 3778.0, "provider": "Google", - "Metric_request_tokens": 6847.0, - "Metric_response_tokens": 1815.0, - "total_cost": 0.04428875, - "input_cost": 0.00855875, - "output_cost": 0.03573 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 671.0, + "total_cost": 0.0005791, + "input_cost": 0.0003107, + "output_cost": 0.0002684 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 67.384048514, + "Duration": 4.756348604, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 13422.0, + "total_tokens": 4571.0, "provider": "Google", - "Metric_request_tokens": 6855.0, - "Metric_response_tokens": 1785.0, - "total_cost": 0.07423875, - "input_cost": 0.00856875, - "output_cost": 0.06567 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1435.0, + "total_cost": 0.0008876000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005740000000000001 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.014404973, + "Duration": 4.71480991, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5406.0, + "total_tokens": 4318.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.026873749999999995, - "input_cost": 0.00388375, - "output_cost": 0.022989999999999997 + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1214.0, + "total_cost": 0.000796, + "input_cost": 0.0003104, + "output_cost": 0.00048560000000000004 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.744050775, + "Duration": 3.797644523, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5425.0, + "total_tokens": 3776.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.027063750000000004, - "input_cost": 0.00388375, - "output_cost": 0.023180000000000003 + "Metric_response_tokens": 669.0, + "total_cost": 0.0005783, + "input_cost": 0.0003107, + "output_cost": 0.0002676 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.712500277, + "Duration": 4.755809993, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6614.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4296.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.03895375, - "input_cost": 0.00388375, - "output_cost": 0.035070000000000004 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1204.0, + "total_cost": 0.0007908, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0004816 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 41.219766849, - "Score_MermaidDiagramValid": 1.0, + "Duration": 4.602118065, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7059.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4273.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.04340375, - "input_cost": 0.00388375, - "output_cost": 0.03952 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1185.0, + "total_cost": 0.0007828, + "input_cost": 0.0003088, + "output_cost": 0.00047400000000000003 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.833685412, + "Duration": 4.466044834, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5205.0, + "total_tokens": 4312.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.024863749999999997, - "input_cost": 0.00388375, - "output_cost": 0.02098 + "Metric_response_tokens": 1205.0, + "total_cost": 0.0007927, + "input_cost": 0.0003107, + "output_cost": 0.000482 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.57386297, + "Duration": 4.130914105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5837.0, + "total_tokens": 3861.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.031183749999999996, - "input_cost": 0.00388375, - "output_cost": 0.027299999999999998 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 725.0, + "total_cost": 0.0006036, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00029 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.658760503, + "Duration": 19.530914159, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63389.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4267.0, "provider": "Google", - "Metric_request_tokens": 58608.0, - "Metric_response_tokens": 2447.0, - "total_cost": 0.12107000000000001, - "input_cost": 0.07326, - "output_cost": 0.04781 + "Metric_request_tokens": 3077.0, + "Metric_response_tokens": 1190.0, + "total_cost": 0.0007837, + "input_cost": 0.0003077, + "output_cost": 0.0004760000000000001 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 32.435081791, + "Duration": 5.086535765, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6158.0, + "total_tokens": 4261.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1150.0, - "total_cost": 0.03439375, - "input_cost": 0.00388375, - "output_cost": 0.03051 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1167.0, + "total_cost": 0.0007762000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.00046680000000000007 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 26.427531899, + "Duration": 5.168974023, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5629.0, + "total_tokens": 4317.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.029103749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02522 + "Metric_request_tokens": 3128.0, + "Metric_response_tokens": 1189.0, + "total_cost": 0.0007884000000000001, + "input_cost": 0.0003128, + "output_cost": 0.0004756 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.541754196, + "Duration": 4.200968076, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6253.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3763.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1164.0, - "total_cost": 0.03534375, - "input_cost": 0.00388375, - "output_cost": 0.03146 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 687.0, + "total_cost": 0.0005824000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.0002748 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.7685358, + "Duration": 4.04668023, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5160.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3772.0, "provider": "Google", - "Metric_request_tokens": 3101.0, - "Metric_response_tokens": 1172.0, - "total_cost": 0.024466250000000002, - "input_cost": 0.00387625, - "output_cost": 0.02059 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 694.0, + "total_cost": 0.0005854, + "input_cost": 0.0003078, + "output_cost": 0.00027759999999999997 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 25.723323767, + "Duration": 5.559231565, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5428.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4278.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02709375, - "input_cost": 0.00388375, - "output_cost": 0.02321 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1200.0, + "total_cost": 0.0007878, + "input_cost": 0.0003078, + "output_cost": 0.00047999999999999996 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.441471015, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.227105654, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5382.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4262.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.026633749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02275 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007820000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.00047440000000000004 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.604628416, + "Duration": 5.281790633, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5562.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4259.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.02843375, - "input_cost": 0.00388375, - "output_cost": 0.024550000000000002 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1181.0, + "total_cost": 0.0007802, + "input_cost": 0.0003078, + "output_cost": 0.0004724 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.415529759, + "Duration": 4.292229758, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11548.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3794.0, "provider": "Google", - "Metric_request_tokens": 6858.0, - "Metric_response_tokens": 1806.0, - "total_cost": 0.055472499999999994, - "input_cost": 0.0085725, - "output_cost": 0.0469 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 714.0, + "total_cost": 0.0005936, + "input_cost": 0.000308, + "output_cost": 0.0002856 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.679703767, + "Duration": 4.415614026, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5023.0, + "total_tokens": 3801.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.02304375, - "input_cost": 0.00388375, - "output_cost": 0.01916 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 707.0, + "total_cost": 0.0005922, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0002828 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 25.725383571, + "Duration": 4.140784542, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5543.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3750.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.028243749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02436 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 672.0, + "total_cost": 0.0005766, + "input_cost": 0.0003078, + "output_cost": 0.0002688 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 36.866564448, + "Duration": 4.949271176, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6227.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4281.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1133.0, - "total_cost": 0.03508375, - "input_cost": 0.00388375, - "output_cost": 0.0312 + "Metric_request_tokens": 3081.0, + "Metric_response_tokens": 1200.0, + "total_cost": 0.0007880999999999999, + "input_cost": 0.0003081, + "output_cost": 0.00047999999999999996 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.45911738, + "Duration": 4.131322822, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5824.0, + "total_tokens": 3759.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.031053749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02717 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 667.0, + "total_cost": 0.000576, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0002668 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.005075571, + "Duration": 15.856126981, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6464.0, + "total_tokens": 21095.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.03745375, - "input_cost": 0.00388375, - "output_cost": 0.03357 + "Metric_request_tokens": 18585.0, + "Metric_response_tokens": 2510.0, + "total_cost": 0.0028625000000000005, + "input_cost": 0.0018585000000000001, + "output_cost": 0.0010040000000000001 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 40.203596393, + "Duration": 5.093578255, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6719.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4305.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.04000375, - "input_cost": 0.00388375, - "output_cost": 0.03612 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 1225.0, + "total_cost": 0.000798, + "input_cost": 0.000308, + "output_cost": 0.00049 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.600817879, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.10008024, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5693.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4271.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.02974375, - "input_cost": 0.00388375, - "output_cost": 0.02586 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1193.0, + "total_cost": 0.0007850000000000001, + "input_cost": 0.0003078, + "output_cost": 0.00047720000000000005 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.96250988, - "Score_MermaidDiagramValid": 1.0, + "Duration": 4.996482216, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5849.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4254.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1190.0, - "total_cost": 0.03130375, - "input_cost": 0.00388375, - "output_cost": 0.02742 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.0007781999999999999, + "input_cost": 0.0003078, + "output_cost": 0.0004704 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 40.243743396, + "Duration": 4.381453313, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6388.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3873.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.03669375, - "input_cost": 0.00388375, - "output_cost": 0.03281 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 793.0, + "total_cost": 0.0006252, + "input_cost": 0.000308, + "output_cost": 0.0003172 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 18.620513207, + "Duration": 4.576133277, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4699.0, + "total_tokens": 49852.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.019803750000000002, - "input_cost": 0.00388375, - "output_cost": 0.01592 + "Metric_request_tokens": 49187.0, + "Metric_response_tokens": 665.0, + "total_cost": 0.0051847, + "input_cost": 0.004918700000000001, + "output_cost": 0.000266 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 31.869262663, + "Duration": 4.075267329, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5716.0, + "total_tokens": 3813.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.02997375, - "input_cost": 0.00388375, - "output_cost": 0.026090000000000002 + "Metric_request_tokens": 3123.0, + "Metric_response_tokens": 690.0, + "total_cost": 0.0005882999999999999, + "input_cost": 0.0003123, + "output_cost": 0.000276 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 31.277196563, + "Duration": 4.984536161, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5832.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4251.0, "provider": "Google", - "Metric_request_tokens": 3107.0, + "Metric_request_tokens": 3078.0, "Metric_response_tokens": 1173.0, - "total_cost": 0.03113375, - "input_cost": 0.00388375, - "output_cost": 0.02725 + "total_cost": 0.000777, + "input_cost": 0.0003078, + "output_cost": 0.0004692 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.205452715, + "Duration": 5.060489677, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4960.0, + "total_tokens": 4378.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1187.0, - "total_cost": 0.022413750000000003, - "input_cost": 0.00388375, - "output_cost": 0.01853 + "Metric_request_tokens": 3125.0, + "Metric_response_tokens": 1253.0, + "total_cost": 0.0008137000000000001, + "input_cost": 0.00031250000000000006, + "output_cost": 0.0005012 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 45.254514377, + "Duration": 4.046059014, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7035.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3742.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1153.0, - "total_cost": 0.04316375, - "input_cost": 0.00388375, - "output_cost": 0.03928 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 664.0, + "total_cost": 0.0005734, + "input_cost": 0.0003078, + "output_cost": 0.0002656 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 25.540973715, + "Duration": 4.931260288, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5321.0, + "total_tokens": 4296.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.02602375, - "input_cost": 0.00388375, - "output_cost": 0.02214 + "Metric_request_tokens": 3095.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.0007899000000000001, + "input_cost": 0.00030950000000000004, + "output_cost": 0.0004804 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 29.203818188, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.324805262, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5909.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4264.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.031903749999999995, - "input_cost": 0.00388375, - "output_cost": 0.028019999999999996 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.0007828000000000002, + "input_cost": 0.00030760000000000005, + "output_cost": 0.00047520000000000006 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 41.27588133, + "Duration": 5.532431761, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10620.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4482.0, "provider": "Google", - "Metric_request_tokens": 6855.0, - "Metric_response_tokens": 1805.0, - "total_cost": 0.04621875, - "input_cost": 0.00856875, - "output_cost": 0.03765 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1404.0, + "total_cost": 0.0008694000000000001, + "input_cost": 0.0003078, + "output_cost": 0.0005616000000000001 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 68.89242445, + "Duration": 4.878045252, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9389.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4315.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.06670375, - "input_cost": 0.00388375, - "output_cost": 0.06282 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 1235.0, + "total_cost": 0.000802, + "input_cost": 0.000308, + "output_cost": 0.000494 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 34.105280135, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.37411601, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6279.0, + "total_tokens": 4277.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1280.0, - "total_cost": 0.035603749999999997, - "input_cost": 0.00388375, - "output_cost": 0.03172 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1183.0, + "total_cost": 0.0007826, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004732 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 120.860207911, + "Duration": 4.198357553, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 13787.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3773.0, "provider": "Google", - "Metric_request_tokens": 7717.0, - "Metric_response_tokens": 1789.0, - "total_cost": 0.07034625, - "input_cost": 0.00964625, - "output_cost": 0.0607 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 695.0, + "total_cost": 0.0005858, + "input_cost": 0.0003078, + "output_cost": 0.000278 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.936865091, + "Duration": 5.122080385, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5784.0, + "total_tokens": 4309.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1157.0, - "total_cost": 0.03065375, - "input_cost": 0.00388375, - "output_cost": 0.026770000000000002 + "Metric_request_tokens": 3097.0, + "Metric_response_tokens": 1212.0, + "total_cost": 0.0007945000000000001, + "input_cost": 0.0003097, + "output_cost": 0.0004848 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.396380686, + "Duration": 5.670149233, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5674.0, + "total_tokens": 6618.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.029562500000000002, - "input_cost": 0.0038824999999999997, - "output_cost": 0.02568 + "Metric_request_tokens": 5390.0, + "Metric_response_tokens": 1228.0, + "total_cost": 0.0010302, + "input_cost": 0.000539, + "output_cost": 0.0004912 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.581064657, + "Duration": 5.358885211, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5037.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4265.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.023183750000000003, - "input_cost": 0.00388375, - "output_cost": 0.0193 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.0007826, + "input_cost": 0.0003078, + "output_cost": 0.0004748 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.262028799, + "Duration": 5.529175968, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5839.0, + "total_tokens": 4276.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1150.0, - "total_cost": 0.031203750000000002, - "input_cost": 0.00388375, - "output_cost": 0.02732 + "Metric_request_tokens": 3097.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0007813, + "input_cost": 0.0003097, + "output_cost": 0.00047159999999999997 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.601437466, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.318584704, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8093.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4259.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1193.0, - "total_cost": 0.05374375, - "input_cost": 0.00388375, - "output_cost": 0.04986 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1181.0, + "total_cost": 0.0007802, + "input_cost": 0.0003078, + "output_cost": 0.0004724 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.554440753, + "Duration": 6.129762473, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5308.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4518.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02589375, - "input_cost": 0.00388375, - "output_cost": 0.02201 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1440.0, + "total_cost": 0.0008838, + "input_cost": 0.0003078, + "output_cost": 0.000576 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.896141303, + "Duration": 5.357050738, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5227.0, + "total_tokens": 4260.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.025083750000000002, - "input_cost": 0.00388375, - "output_cost": 0.0212 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1166.0, + "total_cost": 0.0007758000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004664 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 46.506492861, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.448035659, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10985.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4277.0, "provider": "Google", - "Metric_request_tokens": 6854.0, - "Metric_response_tokens": 1829.0, - "total_cost": 0.0498775, - "input_cost": 0.0085675, - "output_cost": 0.04131 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.0007880000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.0004804 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 31.379013607, + "Duration": 5.454605744, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5869.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4260.0, "provider": "Google", - "Metric_request_tokens": 3104.0, - "Metric_response_tokens": 1195.0, - "total_cost": 0.03153, - "input_cost": 0.0038799999999999998, - "output_cost": 0.02765 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1182.0, + "total_cost": 0.0007806, + "input_cost": 0.0003078, + "output_cost": 0.00047280000000000005 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.397718403, + "Duration": 4.712239535, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5142.0, + "total_tokens": 3832.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1136.0, - "total_cost": 0.02423375, - "input_cost": 0.00388375, - "output_cost": 0.02035 + "Metric_request_tokens": 3127.0, + "Metric_response_tokens": 705.0, + "total_cost": 0.0005947000000000001, + "input_cost": 0.0003127, + "output_cost": 0.000282 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 31.349962177, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.044155716, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6014.0, + "total_tokens": 4299.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.03295375, - "input_cost": 0.00388375, - "output_cost": 0.02907 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1207.0, + "total_cost": 0.0007920000000000001, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0004828 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 39.356080363, + "Duration": 4.750485612, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6782.0, + "total_tokens": 3843.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1166.0, - "total_cost": 0.040633749999999996, - "input_cost": 0.00388375, - "output_cost": 0.03675 + "Metric_request_tokens": 3125.0, + "Metric_response_tokens": 718.0, + "total_cost": 0.0005997000000000001, + "input_cost": 0.00031250000000000006, + "output_cost": 0.0002872 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 26.725778378, + "Duration": 5.14357062, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5165.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4283.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1195.0, - "total_cost": 0.02446375, - "input_cost": 0.00388375, - "output_cost": 0.02058 + "Metric_request_tokens": 3081.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007888999999999999, + "input_cost": 0.0003081, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 41.986268087, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.678468457, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6768.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3756.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1183.0, - "total_cost": 0.04049375, - "input_cost": 0.00388375, - "output_cost": 0.036610000000000004 + "Metric_request_tokens": 3077.0, + "Metric_response_tokens": 679.0, + "total_cost": 0.0005793, + "input_cost": 0.0003077, + "output_cost": 0.00027160000000000004 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.098656784, + "Duration": 5.395852675, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5771.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4255.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.030523750000000002, - "input_cost": 0.00388375, - "output_cost": 0.02664 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1177.0, + "total_cost": 0.0007786000000000001, + "input_cost": 0.0003078, + "output_cost": 0.00047080000000000006 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.371314172, + "Duration": 5.920805734, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5229.0, + "total_tokens": 4262.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.02510375, - "input_cost": 0.00388375, - "output_cost": 0.021220000000000003 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.0007766000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004672 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 23.463013826, + "Duration": 30.535216, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4976.0, + "total_tokens": 65104.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.022573749999999997, - "input_cost": 0.00388375, - "output_cost": 0.01869 + "Metric_request_tokens": 59569.0, + "Metric_response_tokens": 5535.0, + "total_cost": 0.040898199999999996, + "input_cost": 0.0178707, + "output_cost": 0.0230275 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 47.889786736, + "Duration": 41.27852, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7356.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.04637375, - "input_cost": 0.00388375, - "output_cost": 0.04249 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 27.124752992, + "Duration": 36.451985, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5475.0, + "total_tokens": 15437.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.027563749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02368 + "Metric_request_tokens": 7985.0, + "Metric_response_tokens": 7452.0, + "total_cost": 0.035218, + "input_cost": 0.0023955, + "output_cost": 0.0328225 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.179937664, - "Score_MermaidDiagramValid": 0.0, + "Duration": 30.921618, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6258.0, + "total_tokens": 65447.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 977.0, - "total_cost": 0.03539375, - "input_cost": 0.00388375, - "output_cost": 0.03151 + "Metric_request_tokens": 59634.0, + "Metric_response_tokens": 5813.0, + "total_cost": 0.0422752, + "input_cost": 0.0178902, + "output_cost": 0.024385 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 25.536239605, + "Duration": 43.039527, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5279.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.02560375, - "input_cost": 0.00388375, - "output_cost": 0.02172 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 28.961691958, + "Duration": 40.613476, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5610.0, + "total_tokens": 21723.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1148.0, - "total_cost": 0.028913750000000002, - "input_cost": 0.00388375, - "output_cost": 0.02503 + "Metric_request_tokens": 13502.0, + "Metric_response_tokens": 8221.0, + "total_cost": 0.039043100000000004, + "input_cost": 0.0040506, + "output_cost": 0.0349925 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.18099707, + "Duration": 30.789959, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5581.0, + "total_tokens": 65265.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.028623750000000003, - "input_cost": 0.00388375, - "output_cost": 0.02474 + "Metric_request_tokens": 59678.0, + "Metric_response_tokens": 5587.0, + "total_cost": 0.041150900000000004, + "input_cost": 0.0179034, + "output_cost": 0.0232475 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.387479684, + "Duration": 20.254133, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6757.0, + "total_tokens": 7397.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.040383749999999996, - "input_cost": 0.00388375, - "output_cost": 0.0365 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 4189.0, + "total_cost": 0.018834899999999998, + "input_cost": 0.0009624, + "output_cost": 0.0178725 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 41.910195327, + "Duration": 39.96252, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10585.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 6807.0, - "Metric_response_tokens": 1723.0, - "total_cost": 0.046288750000000004, - "input_cost": 0.008508749999999999, - "output_cost": 0.03778 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 18.566825519, - "Score_MermaidDiagramValid": 0.0, + "Duration": 28.708043, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4762.0, + "total_tokens": 65454.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.02043375, - "input_cost": 0.00388375, - "output_cost": 0.01655 + "Metric_request_tokens": 60032.0, + "Metric_response_tokens": 5422.0, + "total_cost": 0.0404296, + "input_cost": 0.0180096, + "output_cost": 0.022420000000000002 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 35.952256562, + "Duration": 37.960577, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6120.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.03401375, - "input_cost": 0.00388375, - "output_cost": 0.03013 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.562834473, + "Duration": 35.53418, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5167.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02448375, - "input_cost": 0.00388375, - "output_cost": 0.0206 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.116377335, - "Score_MermaidDiagramValid": 1.0, + "Duration": 16.319646, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5461.0, + "total_tokens": 6271.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1175.0, - "total_cost": 0.027423750000000004, - "input_cost": 0.00388375, - "output_cost": 0.023540000000000002 + "Metric_request_tokens": 3217.0, + "Metric_response_tokens": 3054.0, + "total_cost": 0.013285100000000001, + "input_cost": 0.0009650999999999999, + "output_cost": 0.012320000000000001 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.19434694, + "Duration": 34.242218, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7549.0, + "total_tokens": 14979.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.04830375, - "input_cost": 0.00388375, - "output_cost": 0.04442 + "Metric_request_tokens": 8039.0, + "Metric_response_tokens": 6940.0, + "total_cost": 0.0325442, + "input_cost": 0.0024116999999999997, + "output_cost": 0.0301325 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.915458356, + "Duration": 29.800008, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5835.0, + "total_tokens": 19299.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1148.0, - "total_cost": 0.031163749999999997, - "input_cost": 0.00388375, - "output_cost": 0.02728 + "Metric_request_tokens": 13473.0, + "Metric_response_tokens": 5826.0, + "total_cost": 0.0268194, + "input_cost": 0.0040419, + "output_cost": 0.0227775 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.201752563, - "Score_MermaidDiagramValid": 0.0, + "Duration": 30.071548, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5374.0, + "total_tokens": 65106.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.02655375, - "input_cost": 0.00388375, - "output_cost": 0.02267 + "Metric_request_tokens": 59409.0, + "Metric_response_tokens": 5697.0, + "total_cost": 0.0416952, + "input_cost": 0.017822699999999997, + "output_cost": 0.0238725 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.905880551, + "Duration": 42.437322, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5249.0, + "total_tokens": 27727.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.02530375, - "input_cost": 0.00388375, - "output_cost": 0.021419999999999998 + "Metric_request_tokens": 19471.0, + "Metric_response_tokens": 8256.0, + "total_cost": 0.039598799999999997, + "input_cost": 0.0058413, + "output_cost": 0.033757499999999996 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.308699595, + "Duration": 31.305007, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5184.0, + "total_tokens": 19697.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1153.0, - "total_cost": 0.024653749999999995, - "input_cost": 0.00388375, - "output_cost": 0.020769999999999997 + "Metric_request_tokens": 13501.0, + "Metric_response_tokens": 6196.0, + "total_cost": 0.0289778, + "input_cost": 0.0040503, + "output_cost": 0.024927500000000002 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 39.750523403, + "Duration": 25.683554, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6263.0, + "total_tokens": 64108.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.035443749999999996, - "input_cost": 0.00388375, - "output_cost": 0.03156 + "Metric_request_tokens": 59439.0, + "Metric_response_tokens": 4669.0, + "total_cost": 0.0364617, + "input_cost": 0.0178317, + "output_cost": 0.01863 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 65.476559976, + "Duration": 35.585454, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6971.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3984.0, - "Metric_response_tokens": 1170.0, - "total_cost": 0.03485, - "input_cost": 0.00498, - "output_cost": 0.02987 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 72.910902918, + "Duration": 36.908135, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -8838,2300 +13763,2300 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 43.602165928, - "Score_MermaidDiagramValid": 0.0, + "Duration": 22.669943, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10530.0, + "total_tokens": 63800.0, "provider": "Google", - "Metric_request_tokens": 6840.0, - "Metric_response_tokens": 1786.0, - "total_cost": 0.045450000000000004, - "input_cost": 0.00855, - "output_cost": 0.0369 + "Metric_request_tokens": 59409.0, + "Metric_response_tokens": 4391.0, + "total_cost": 0.03520519999999999, + "input_cost": 0.017822699999999997, + "output_cost": 0.0173825 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.823405726, - "Score_MermaidDiagramValid": 0.0, + "Duration": 48.34871, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6724.0, + "total_tokens": 23572.0, "provider": "Google", - "Metric_request_tokens": 3968.0, - "Metric_response_tokens": 1157.0, - "total_cost": 0.03252, - "input_cost": 0.00496, - "output_cost": 0.02756 + "Metric_request_tokens": 13446.0, + "Metric_response_tokens": 10126.0, + "total_cost": 0.048568799999999995, + "input_cost": 0.0040338, + "output_cost": 0.044535 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.694557094, + "Duration": 24.075586, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5928.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7125.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.032093750000000004, - "input_cost": 0.00388375, - "output_cost": 0.028210000000000002 + "Metric_request_tokens": 1685.0, + "Metric_response_tokens": 5440.0, + "total_cost": 0.0263205, + "input_cost": 0.0005055, + "output_cost": 0.025815 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 28.020181574, + "Duration": 23.746025, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5702.0, + "total_tokens": 63216.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.02983375, - "input_cost": 0.00388375, - "output_cost": 0.02595 + "Metric_request_tokens": 59438.0, + "Metric_response_tokens": 3778.0, + "total_cost": 0.0320064, + "input_cost": 0.017831399999999997, + "output_cost": 0.014175 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.120680251, + "Duration": 38.982162, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5541.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02822375, - "input_cost": 0.00388375, - "output_cost": 0.02434 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.401740722, + "Duration": 34.937334, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8168.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.05449375, - "input_cost": 0.00388375, - "output_cost": 0.05061 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.757611824, + "Duration": 23.290948, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5710.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.029913749999999996, - "input_cost": 0.00388375, - "output_cost": 0.026029999999999998 + "total_tokens": 63494.0, + "provider": "Google", + "Metric_request_tokens": 59240.0, + "Metric_response_tokens": 4254.0, + "total_cost": 0.034337, + "input_cost": 0.017772, + "output_cost": 0.016565 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 34.511741088, + "Duration": 36.192303, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6547.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.03828375, - "input_cost": 0.00388375, - "output_cost": 0.0344 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 33.667128837, + "Duration": 39.181344, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6142.0, + "total_tokens": 20917.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.03423375, - "input_cost": 0.00388375, - "output_cost": 0.03035 + "Metric_request_tokens": 13424.0, + "Metric_response_tokens": 7493.0, + "total_cost": 0.0354947, + "input_cost": 0.0040272, + "output_cost": 0.031467499999999995 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.603688339, - "Score_MermaidDiagramValid": 0.0, + "Duration": 30.817341, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5227.0, + "total_tokens": 70272.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.025083750000000002, - "input_cost": 0.00388375, - "output_cost": 0.0212 + "Metric_request_tokens": 64504.0, + "Metric_response_tokens": 5768.0, + "total_cost": 0.0419462, + "input_cost": 0.019351200000000002, + "output_cost": 0.022594999999999997 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 49.97060826, + "Duration": 37.161836, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11419.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 6836.0, - "Metric_response_tokens": 1793.0, - "total_cost": 0.05437500000000001, - "input_cost": 0.008545, - "output_cost": 0.04583 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 97.172731042, + "Duration": 30.427731, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 15969.0, + "total_tokens": 19318.0, "provider": "Google", - "Metric_request_tokens": 6815.0, - "Metric_response_tokens": 1761.0, - "total_cost": 0.10005875000000002, - "input_cost": 0.00851875, - "output_cost": 0.09154000000000001 + "Metric_request_tokens": 13508.0, + "Metric_response_tokens": 5810.0, + "total_cost": 0.026967400000000002, + "input_cost": 0.004052399999999999, + "output_cost": 0.022915 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.025055925, + "Duration": 25.665431, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8261.0, + "total_tokens": 9023.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.055423749999999994, - "input_cost": 0.00388375, - "output_cost": 0.051539999999999996 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 4936.0, + "total_cost": 0.0229061, + "input_cost": 0.0012261000000000001, + "output_cost": 0.021679999999999998 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 32.071007326, + "Duration": 39.694036, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5793.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.03074375, - "input_cost": 0.00388375, - "output_cost": 0.026860000000000002 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.771411983, + "Duration": 2.110298, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5778.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1136.0, - "total_cost": 0.030593750000000003, - "input_cost": 0.00388375, - "output_cost": 0.02671 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.989499385, + "Duration": 29.774367, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6020.0, + "total_tokens": 65029.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.033013749999999994, - "input_cost": 0.00388375, - "output_cost": 0.029129999999999996 + "Metric_request_tokens": 59634.0, + "Metric_response_tokens": 5395.0, + "total_cost": 0.0401827, + "input_cost": 0.0178902, + "output_cost": 0.0222925 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.15421085, + "Duration": 41.44486, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5364.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.026453749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02257 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.682115651, + "Duration": 32.833202, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6187.0, + "total_tokens": 19792.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.03468375, - "input_cost": 0.00388375, - "output_cost": 0.030799999999999998 + "Metric_request_tokens": 13455.0, + "Metric_response_tokens": 6337.0, + "total_cost": 0.029679000000000004, + "input_cost": 0.0040365, + "output_cost": 0.025642500000000002 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.16239572, - "Score_MermaidDiagramValid": 0.0, + "Duration": 31.837097, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 64441.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 59551.0, + "Metric_response_tokens": 4890.0, + "total_cost": 0.0376353, + "input_cost": 0.0178653, + "output_cost": 0.01977 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.095654126, + "Duration": 45.30594, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.573285539, + "Duration": 33.692928, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.13363762, + "Duration": 6.23502, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.075430478, + "Duration": 37.989877, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.734203872, + "Duration": 40.060316, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 21124.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 13386.0, + "Metric_response_tokens": 7738.0, + "total_cost": 0.0367308, + "input_cost": 0.0040158, + "output_cost": 0.032715 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.207070397, + "Duration": 5.215447, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5446.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 1360.0, + "total_cost": 0.0009526000000000002, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0005440000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.173871208, + "Duration": 4.900561, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4506.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3213.0, + "Metric_response_tokens": 1293.0, + "total_cost": 0.0008385000000000002, + "input_cost": 0.00032130000000000006, + "output_cost": 0.0005172000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.445568835, + "Duration": 4.647086, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4558.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1348.0, + "total_cost": 0.0008602, + "input_cost": 0.00032100000000000005, + "output_cost": 0.0005392 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.087279343, + "Duration": 5.251769, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5431.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 1345.0, + "total_cost": 0.0009466000000000001, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0005380000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.14396244, + "Duration": 16.549062, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18304.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 11757.0, + "Metric_response_tokens": 6547.0, + "total_cost": 0.0037945, + "input_cost": 0.0011757, + "output_cost": 0.0026188 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.481531585, + "Duration": 4.801133, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1242.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4563.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 506.0, - "total_cost": 0.00027600000000000004, - "input_cost": 7.36e-5, - "output_cost": 0.00020240000000000004 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1355.0, + "total_cost": 0.0008628, + "input_cost": 0.0003208, + "output_cost": 0.0005420000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.053321468, + "Duration": 6.511676, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5825.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1738.0, + "total_cost": 0.0011039, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0006952 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.041527691, + "Duration": 5.176256, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5376.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1301.0, + "total_cost": 0.0009279000000000001, + "input_cost": 0.0004075, + "output_cost": 0.0005204000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.656550888, + "Duration": 4.872751, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4539.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3206.0, + "Metric_response_tokens": 1333.0, + "total_cost": 0.0008538, + "input_cost": 0.00032060000000000004, + "output_cost": 0.0005332000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.232291255, + "Duration": 6.15609, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5888.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1801.0, + "total_cost": 0.0011291, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0007204000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.16402353, + "Duration": 4.889272, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5350.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1268.0, + "total_cost": 0.0009154, + "input_cost": 0.0004082, + "output_cost": 0.0005072000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.371460427, + "Duration": 5.012574, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5370.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 4077.0, + "Metric_response_tokens": 1293.0, + "total_cost": 0.0009249000000000001, + "input_cost": 0.0004077, + "output_cost": 0.0005172000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.306933591, + "Duration": 5.579592, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4841.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 3217.0, + "Metric_response_tokens": 1624.0, + "total_cost": 0.0009713, + "input_cost": 0.0003217, + "output_cost": 0.0006496000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.199269162, + "Duration": 4.866885, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5346.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1264.0, + "total_cost": 0.0009138, + "input_cost": 0.0004082, + "output_cost": 0.0005056 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.715297386, + "Duration": 4.845275, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5343.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1268.0, + "total_cost": 0.0009147000000000001, + "input_cost": 0.0004075, + "output_cost": 0.0005072000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.390987167, + "Duration": 6.136979, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1256.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5861.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 520.0, - "total_cost": 0.0002816, - "input_cost": 7.36e-5, - "output_cost": 0.000208 + "Metric_request_tokens": 4090.0, + "Metric_response_tokens": 1771.0, + "total_cost": 0.0011174000000000002, + "input_cost": 0.000409, + "output_cost": 0.0007084000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.009082393, + "Duration": 5.112432, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5400.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1318.0, + "total_cost": 0.0009354000000000001, + "input_cost": 0.0004082, + "output_cost": 0.0005272 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.572387722, + "Duration": 5.028322, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4481.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1273.0, + "total_cost": 0.00083, + "input_cost": 0.0003208, + "output_cost": 0.0005092 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.097230117, + "Duration": 6.260043, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5714.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1627.0, + "total_cost": 0.0010595000000000001, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0006508 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.342514555, + "Duration": 4.62823, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4480.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3213.0, + "Metric_response_tokens": 1267.0, + "total_cost": 0.0008281, + "input_cost": 0.00032130000000000006, + "output_cost": 0.0005068 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.742365295, + "Duration": 4.569272, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4509.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3206.0, + "Metric_response_tokens": 1303.0, + "total_cost": 0.0008418, + "input_cost": 0.00032060000000000004, + "output_cost": 0.0005212 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.291870391, + "Duration": 5.411835, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5643.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 1557.0, + "total_cost": 0.0010314, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0006228000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.237470473, + "Duration": 5.118662, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1251.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4696.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 515.0, - "total_cost": 0.0002796, - "input_cost": 7.36e-5, - "output_cost": 0.00020600000000000002 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1488.0, + "total_cost": 0.000916, + "input_cost": 0.0003208, + "output_cost": 0.0005952 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.437132314, + "Duration": 4.876956, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5328.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1253.0, + "total_cost": 0.0009087, + "input_cost": 0.0004075, + "output_cost": 0.0005012 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.205071529, + "Duration": 5.165924, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2632.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5628.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00200755, - "input_cost": 0.00025065, - "output_cost": 0.0017569 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1541.0, + "total_cost": 0.0010251000000000001, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0006164 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.27147079, + "Duration": 4.339767, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3259.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4434.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 557.0, - "total_cost": 0.00420005, - "input_cost": 0.00025035, - "output_cost": 0.0039497 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1226.0, + "total_cost": 0.0008112000000000001, + "input_cost": 0.0003208, + "output_cost": 0.0004904000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.703598385, + "Duration": 5.04924, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2531.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4630.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0016665500000000001, - "input_cost": 0.00025035, - "output_cost": 0.0014162 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1422.0, + "total_cost": 0.0008896, + "input_cost": 0.0003208, + "output_cost": 0.0005688000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.890864449, + "Duration": 5.26326, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2638.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5715.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 591.0, - "total_cost": 0.00192125, - "input_cost": 0.00025065, - "output_cost": 0.0016706 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1628.0, + "total_cost": 0.0010599000000000001, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0006512000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.237042556, + "Duration": 5.144055, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2595.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5510.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00189055, - "input_cost": 0.00025035, - "output_cost": 0.0016401999999999999 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1435.0, + "total_cost": 0.0009815000000000002, + "input_cost": 0.0004075, + "output_cost": 0.0005740000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.402304393, + "Duration": 4.860893, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2648.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5344.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0020760500000000003, - "input_cost": 0.00025035, - "output_cost": 0.0018257000000000002 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1269.0, + "total_cost": 0.0009151, + "input_cost": 0.0004075, + "output_cost": 0.0005076 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.681153876, + "Duration": 5.738872, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2673.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5531.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00215105, - "input_cost": 0.00025065, - "output_cost": 0.0019004 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1444.0, + "total_cost": 0.0009863, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0005776 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.74444077, + "Duration": 5.025443, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2683.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4496.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00219855, - "input_cost": 0.00025035, - "output_cost": 0.0019482 + "Metric_request_tokens": 3206.0, + "Metric_response_tokens": 1290.0, + "total_cost": 0.0008366000000000001, + "input_cost": 0.00032060000000000004, + "output_cost": 0.000516 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.715282683, + "Duration": 8.195003, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 15172.0, + "total_tokens": 5789.0, "provider": "Google", - "Metric_request_tokens": 8909.0, - "Metric_response_tokens": 1841.0, - "total_cost": 0.01791795, - "input_cost": 0.00133635, - "output_cost": 0.0165816 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 2581.0, + "total_cost": 0.0013532, + "input_cost": 0.0003208, + "output_cost": 0.0010324 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.026625667, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.925279, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7184.0, + "total_tokens": 5623.0, "provider": "Google", - "Metric_request_tokens": 3969.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.00837945, - "input_cost": 0.00059535, - "output_cost": 0.0077840999999999995 + "Metric_request_tokens": 4087.0, + "Metric_response_tokens": 1536.0, + "total_cost": 0.0010231, + "input_cost": 0.00040870000000000007, + "output_cost": 0.0006144000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.674318695, + "Duration": 5.235465, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2643.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5293.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00205855, - "input_cost": 0.00025035, - "output_cost": 0.0018082 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1211.0, + "total_cost": 0.0008926000000000001, + "input_cost": 0.0004082, + "output_cost": 0.00048440000000000006 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.052854086, + "Duration": 8.433263, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2773.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10652.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00251355, - "input_cost": 0.00025035, - "output_cost": 0.0022632 + "Metric_request_tokens": 7846.0, + "Metric_response_tokens": 2806.0, + "total_cost": 0.001907, + "input_cost": 0.0007846000000000001, + "output_cost": 0.0011224 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.691234945, + "Duration": 5.637824, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2764.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5585.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0024695499999999996, - "input_cost": 0.00025065, - "output_cost": 0.0022188999999999998 + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 1499.0, + "total_cost": 0.0010082000000000001, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0005996 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.540987174, + "Duration": 4.961917, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7199.0, + "total_tokens": 5394.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00383255, - "input_cost": 0.00080415, - "output_cost": 0.0030284 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1312.0, + "total_cost": 0.000933, + "input_cost": 0.0004082, + "output_cost": 0.0005248000000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.512122765, + "Duration": 4.664187, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2665.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5329.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00213555, - "input_cost": 0.00025035, - "output_cost": 0.0018852 + "Metric_request_tokens": 4075.0, + "Metric_response_tokens": 1254.0, + "total_cost": 0.0009090999999999999, + "input_cost": 0.0004075, + "output_cost": 0.0005015999999999999 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.02414223, + "Duration": 5.58097, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2737.0, - "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0023750499999999996, - "input_cost": 0.00025065, - "output_cost": 0.0021244 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5603.0, + "provider": "Google", + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 1517.0, + "total_cost": 0.0010154, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0006068 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.309828457, + "Duration": 4.893051, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7772.0, + "total_tokens": 5331.0, "provider": "Google", - "Metric_request_tokens": 3983.0, - "Metric_response_tokens": 1188.0, - "total_cost": 0.01041375, - "input_cost": 0.00059745, - "output_cost": 0.0098163 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1249.0, + "total_cost": 0.0009078000000000001, + "input_cost": 0.0004082, + "output_cost": 0.0004996 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.858011472, + "Duration": 6.497755, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2586.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5531.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0018590500000000001, - "input_cost": 0.00025035, - "output_cost": 0.0016087 + "Metric_request_tokens": 3206.0, + "Metric_response_tokens": 2325.0, + "total_cost": 0.0012506, + "input_cost": 0.00032060000000000004, + "output_cost": 0.0009299999999999999 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.887542438, + "Duration": 8.960675, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2715.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7000.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00229805, - "input_cost": 0.00025065, - "output_cost": 0.0020474 + "Metric_request_tokens": 4086.0, + "Metric_response_tokens": 2914.0, + "total_cost": 0.0015742, + "input_cost": 0.00040860000000000007, + "output_cost": 0.0011656 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.607493865, + "Duration": 4.787697, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2660.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5354.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00211225, - "input_cost": 0.00025035, - "output_cost": 0.0018619 + "Metric_request_tokens": 4082.0, + "Metric_response_tokens": 1272.0, + "total_cost": 0.0009170000000000001, + "input_cost": 0.0004082, + "output_cost": 0.0005088 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite-preview-09-2025", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.350370934, + "Duration": 4.830706, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2632.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5382.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0020200500000000002, - "input_cost": 0.00025035, - "output_cost": 0.0017697000000000001 + "Metric_request_tokens": 4079.0, + "Metric_response_tokens": 1303.0, + "total_cost": 0.0009291, + "input_cost": 0.00040790000000000005, + "output_cost": 0.0005212 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.369507864, - "Score_MermaidDiagramValid": 1.0, + "Duration": 8.399291, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6433.0, + "total_tokens": 55783.0, "provider": "Google", - "Metric_request_tokens": 3970.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.0057969, - "input_cost": 0.0005954999999999999, - "output_cost": 0.0052014 + "Metric_request_tokens": 54588.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.0059368, + "input_cost": 0.0054588, + "output_cost": 0.00047800000000000007 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.427887401, + "Duration": 5.992778, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2640.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3911.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00204805, - "input_cost": 0.00025035, - "output_cost": 0.0017977 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 700.0, + "total_cost": 0.0006011, + "input_cost": 0.0003211, + "output_cost": 0.00028000000000000003 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.886046113, + "Duration": 6.646862, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2860.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4409.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00281805, - "input_cost": 0.00025035, - "output_cost": 0.0025677 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1200.0, + "total_cost": 0.0008009, + "input_cost": 0.00032090000000000005, + "output_cost": 0.00047999999999999996 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.190617486, - "Score_MermaidDiagramValid": 0.0, + "Duration": 6.466695, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2704.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4443.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00225955, - "input_cost": 0.00025065, - "output_cost": 0.0020089 + "Metric_request_tokens": 3218.0, + "Metric_response_tokens": 1225.0, + "total_cost": 0.0008118, + "input_cost": 0.0003218, + "output_cost": 0.00049 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 12.313419785, + "Duration": 6.38993, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7265.0, + "total_tokens": 4398.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00406355, - "input_cost": 0.00080415, - "output_cost": 0.0032594 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.0007959, + "input_cost": 0.0003211, + "output_cost": 0.0004748 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 14.12684929, + "Duration": 6.511308, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6606.0, + "total_tokens": 4408.0, "provider": "Google", - "Metric_request_tokens": 3969.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00642025, - "input_cost": 0.00059535, - "output_cost": 0.0058249 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1197.0, + "total_cost": 0.0007999, + "input_cost": 0.0003211, + "output_cost": 0.0004788 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 11.416052215, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.636605, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5051.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3902.0, "provider": "Google", - "Metric_request_tokens": 3061.0, - "Metric_response_tokens": 1169.0, - "total_cost": 0.00403405, - "input_cost": 0.00045914999999999997, - "output_cost": 0.0035749 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 683.0, + "total_cost": 0.0005951000000000001, + "input_cost": 0.0003219, + "output_cost": 0.0002732 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.042913232, + "Duration": 5.076843, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2542.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3891.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00170505, - "input_cost": 0.00025035, - "output_cost": 0.0014547 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 680.0, + "total_cost": 0.0005931, + "input_cost": 0.0003211, + "output_cost": 0.00027200000000000005 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.803645705, + "Duration": 6.715582, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2732.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4411.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0023700499999999994, - "input_cost": 0.00025035, - "output_cost": 0.0021196999999999995 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0008017, + "input_cost": 0.00032090000000000005, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.280603396, + "Duration": 6.772443, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 55774.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 54589.0, + "Metric_response_tokens": 1185.0, + "total_cost": 0.0059329, + "input_cost": 0.0054589, + "output_cost": 0.00047400000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.444646242, + "Duration": 5.253199, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7586.0, + "total_tokens": 3888.0, "provider": "Google", - "Metric_request_tokens": 3679.0, - "Metric_response_tokens": 722.0, - "total_cost": 0.012132549999999999, - "input_cost": 0.00055185, - "output_cost": 0.0115807 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 678.0, + "total_cost": 0.0005922000000000001, + "input_cost": 0.00032100000000000005, + "output_cost": 0.00027120000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 14.103361899, + "Duration": 6.156194, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7319.0, + "total_tokens": 4408.0, "provider": "Google", - "Metric_request_tokens": 4540.0, - "Metric_response_tokens": 1218.0, - "total_cost": 0.006875300000000001, - "input_cost": 0.000681, - "output_cost": 0.006194300000000001 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.0008002, + "input_cost": 0.00032100000000000005, + "output_cost": 0.00047920000000000005 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.086590308, - "Score_MermaidDiagramValid": 1.0, + "Duration": 7.078822, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9499.0, + "total_tokens": 55769.0, "provider": "Google", - "Metric_request_tokens": 3969.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.016481950000000002, - "input_cost": 0.00059535, - "output_cost": 0.0158866 + "Metric_request_tokens": 54588.0, + "Metric_response_tokens": 1181.0, + "total_cost": 0.005931199999999999, + "input_cost": 0.0054588, + "output_cost": 0.0004724 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.1841836, + "Duration": 6.132538, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4416.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1205.0, + "total_cost": 0.0008031, + "input_cost": 0.0003211, + "output_cost": 0.000482 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.981969969, + "Duration": 6.352614, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7548.0, + "total_tokens": 4402.0, "provider": "Google", - "Metric_request_tokens": 3630.0, - "Metric_response_tokens": 1695.0, - "total_cost": 0.009342000000000001, - "input_cost": 0.0005445, - "output_cost": 0.008797500000000001 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1194.0, + "total_cost": 0.0007984, + "input_cost": 0.0003208, + "output_cost": 0.0004776 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 20.823357766, + "Duration": 7.218354, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9790.0, + "total_tokens": 55785.0, "provider": "Google", - "Metric_request_tokens": 5066.0, - "Metric_response_tokens": 1729.0, - "total_cost": 0.0122798, - "input_cost": 0.0007599, - "output_cost": 0.0115199 + "Metric_request_tokens": 54589.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0059373, + "input_cost": 0.0054589, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 25.462973093, + "Duration": 7.11734, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4410.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0008016, + "input_cost": 0.0003208, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.722264378, + "Duration": 5.290457, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 16420.0, + "total_tokens": 3901.0, "provider": "Google", - "Metric_request_tokens": 8000.0, - "Metric_response_tokens": 1807.0, - "total_cost": 0.0254297, - "input_cost": 0.0012, - "output_cost": 0.0242297 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 693.0, + "total_cost": 0.000598, + "input_cost": 0.0003208, + "output_cost": 0.0002772 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.71420281, + "Duration": 7.43899, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1869.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 55785.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 111.0, - "total_cost": 0.0035027500000000002, - "input_cost": 0.00012164999999999999, - "output_cost": 0.0033811 + "Metric_request_tokens": 54589.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0059373, + "input_cost": 0.0054589, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.548778955, + "Duration": 4.882767, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3923.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 712.0, + "total_cost": 0.0006058999999999999, + "input_cost": 0.0003211, + "output_cost": 0.0002848 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.459759081, + "Duration": 5.189799, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3913.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 705.0, + "total_cost": 0.0006028, + "input_cost": 0.0003208, + "output_cost": 0.000282 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.081946937, + "Duration": 6.863021, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11019.0, + "total_tokens": 55806.0, "provider": "Google", - "Metric_request_tokens": 4018.0, - "Metric_response_tokens": 1243.0, - "total_cost": 0.021501500000000003, - "input_cost": 0.0006027, - "output_cost": 0.020898800000000002 + "Metric_request_tokens": 54588.0, + "Metric_response_tokens": 1218.0, + "total_cost": 0.005946, + "input_cost": 0.0054588, + "output_cost": 0.00048719999999999997 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.112757598, + "Duration": 6.779658, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12440.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4381.0, "provider": "Google", - "Metric_request_tokens": 6832.0, - "Metric_response_tokens": 1782.0, - "total_cost": 0.015484999999999999, - "input_cost": 0.0010248, - "output_cost": 0.0144602 + "Metric_request_tokens": 3194.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.0007942, + "input_cost": 0.0003194, + "output_cost": 0.0004748 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.507230852, + "Duration": 6.355412, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4880.0, - "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00988225, - "input_cost": 0.00025035, - "output_cost": 0.0096319 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4410.0, + "provider": "Google", + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.0008013, + "input_cost": 0.00032090000000000005, + "output_cost": 0.0004804 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.268203361, + "Duration": 4.824842, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4007.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3318.0, + "Metric_response_tokens": 689.0, + "total_cost": 0.0006074000000000001, + "input_cost": 0.00033180000000000004, + "output_cost": 0.00027560000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.657737725, + "Duration": 6.020729, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8725.0, + "total_tokens": 4409.0, "provider": "Google", - "Metric_request_tokens": 3968.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.013843, - "input_cost": 0.0005952, - "output_cost": 0.013247799999999999 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.0008003000000000001, + "input_cost": 0.0003211, + "output_cost": 0.00047920000000000005 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.856923641, + "Duration": 6.344559, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4406.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1197.0, + "total_cost": 0.0007997, + "input_cost": 0.00032090000000000005, + "output_cost": 0.0004788 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.406209617, + "Duration": 6.45661, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9355.0, + "total_tokens": 55777.0, "provider": "Google", - "Metric_request_tokens": 4545.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.01405995, - "input_cost": 0.0006817500000000001, - "output_cost": 0.0133782 + "Metric_request_tokens": 54589.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.0059341, + "input_cost": 0.0054589, + "output_cost": 0.00047520000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.67447829, + "Duration": 6.430851, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9321.0, + "total_tokens": 4400.0, "provider": "Google", - "Metric_request_tokens": 3105.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.01876495, - "input_cost": 0.00046575, - "output_cost": 0.018299199999999998 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1189.0, + "total_cost": 0.0007967, + "input_cost": 0.0003211, + "output_cost": 0.0004756 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.125237979, + "Duration": 6.245205, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1709.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4399.0, "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0017484000000000002, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0016269000000000001 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.0007963, + "input_cost": 0.0003211, + "output_cost": 0.00047520000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.319621961, + "Duration": 6.328961, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4429.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3218.0, + "Metric_response_tokens": 1211.0, + "total_cost": 0.0008062000000000001, + "input_cost": 0.0003218, + "output_cost": 0.00048440000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.088759709, + "Duration": 7.136504, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4408.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1197.0, + "total_cost": 0.0007999, + "input_cost": 0.0003211, + "output_cost": 0.0004788 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 22.178054671, + "Duration": 6.226974, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8629.0, + "total_tokens": 4384.0, "provider": "Google", - "Metric_request_tokens": 3965.0, - "Metric_response_tokens": 1170.0, - "total_cost": 0.01352575, - "input_cost": 0.0005947499999999999, - "output_cost": 0.012931 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.0007911999999999999, + "input_cost": 0.0003208, + "output_cost": 0.0004704 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.029780959, + "Duration": 6.927981, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 3107.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 55786.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 536.0, - "total_cost": 0.006603250000000001, - "input_cost": 0.00012164999999999999, - "output_cost": 0.0064816000000000006 + "Metric_request_tokens": 54589.0, + "Metric_response_tokens": 1197.0, + "total_cost": 0.0059377, + "input_cost": 0.0054589, + "output_cost": 0.0004788 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.923902196, + "Duration": 5.433176, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 30481.0, + "total_tokens": 4405.0, "provider": "Google", - "Metric_request_tokens": 21012.0, - "Metric_response_tokens": 3073.0, - "total_cost": 0.0273816, - "input_cost": 0.0031517999999999997, - "output_cost": 0.0242298 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007993000000000001, + "input_cost": 0.00032090000000000005, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.222518221, + "Duration": 6.001402, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4409.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.0008003000000000001, + "input_cost": 0.0003211, + "output_cost": 0.00047920000000000005 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 9.631045956, - "Score_MermaidDiagramValid": 0.0, + "Duration": 6.185808, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 2892.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4489.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 625.0, - "total_cost": 0.00559265, - "input_cost": 0.00012164999999999999, - "output_cost": 0.005471 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1270.0, + "total_cost": 0.0008299000000000002, + "input_cost": 0.0003219, + "output_cost": 0.0005080000000000001 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.647730144, + "Duration": 6.921527, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 2292.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4467.0, "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0037889, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0036674 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 1258.0, + "total_cost": 0.0008241, + "input_cost": 0.00032090000000000005, + "output_cost": 0.0005032 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.71455716, + "Duration": 6.136839, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4409.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1199.0, + "total_cost": 0.0008006, + "input_cost": 0.00032100000000000005, + "output_cost": 0.0004796 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.476345308, + "Duration": 6.47105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2659.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4427.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00210205, - "input_cost": 0.00025065, - "output_cost": 0.0018514 + "Metric_request_tokens": 3218.0, + "Metric_response_tokens": 1209.0, + "total_cost": 0.0008054000000000001, + "input_cost": 0.0003218, + "output_cost": 0.00048360000000000005 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.346005219, + "Duration": 6.204263, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2718.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4412.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00232105, - "input_cost": 0.00025035, - "output_cost": 0.0020707 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0008018000000000001, + "input_cost": 0.00032100000000000005, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.836977527, + "Duration": 4.909809, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2539.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3889.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00169455, - "input_cost": 0.00025035, - "output_cost": 0.0014441999999999999 + "Metric_request_tokens": 3208.0, + "Metric_response_tokens": 681.0, + "total_cost": 0.0005932, + "input_cost": 0.0003208, + "output_cost": 0.0002724 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.672693551, + "Duration": 6.411182, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2929.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4421.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00304705, - "input_cost": 0.00025065, - "output_cost": 0.0027964 + "Metric_request_tokens": 3218.0, + "Metric_response_tokens": 1203.0, + "total_cost": 0.0008030000000000001, + "input_cost": 0.0003218, + "output_cost": 0.00048120000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.662619685, + "Duration": 5.180393, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6896.0, + "total_tokens": 3940.0, "provider": "Google", - "Metric_request_tokens": 3981.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.00736605, - "input_cost": 0.00059715, - "output_cost": 0.0067689 + "Metric_request_tokens": 3211.0, + "Metric_response_tokens": 729.0, + "total_cost": 0.0006127000000000001, + "input_cost": 0.0003211, + "output_cost": 0.00029160000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.188614707, + "Duration": 5.239312, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2611.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4025.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 557.0, - "total_cost": 0.0019320499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0016817 + "Metric_request_tokens": 3209.0, + "Metric_response_tokens": 816.0, + "total_cost": 0.0006473000000000001, + "input_cost": 0.00032090000000000005, + "output_cost": 0.0003264 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.5603659, + "Duration": 21.349486, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2756.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4849.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00244155, - "input_cost": 0.00025065, - "output_cost": 0.0021909 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1630.0, + "total_cost": 0.024883750000000003, + "input_cost": 0.00402375, + "output_cost": 0.020860000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.128366075, + "Duration": 23.964467, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2798.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5597.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00259525, - "input_cost": 0.00025035, - "output_cost": 0.0023449 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2387.0, + "total_cost": 0.0402125, + "input_cost": 0.0040125000000000004, + "output_cost": 0.036199999999999996 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.944774864, + "Duration": 20.409625, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2557.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5199.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00175755, - "input_cost": 0.00025035, - "output_cost": 0.0015072 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1989.0, + "total_cost": 0.0320625, + "input_cost": 0.0040125000000000004, + "output_cost": 0.028050000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.76290555, - "Score_MermaidDiagramValid": 0.0, + "Duration": 35.195388, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2696.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 60998.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0022315499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0019809 + "Metric_request_tokens": 57624.0, + "Metric_response_tokens": 3374.0, + "total_cost": 0.12099, + "input_cost": 0.07203, + "output_cost": 0.048960000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 11.161597903, + "Duration": 25.042637, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7403.0, + "total_tokens": 5164.0, "provider": "Google", - "Metric_request_tokens": 5349.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.00460415, - "input_cost": 0.00080235, - "output_cost": 0.0038018 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1954.0, + "total_cost": 0.0315525, + "input_cost": 0.0040125000000000004, + "output_cost": 0.02754 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.529419098, + "Duration": 16.228586, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2839.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4880.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0027445499999999997, - "input_cost": 0.00025035, - "output_cost": 0.0024942 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1670.0, + "total_cost": 0.025682500000000004, + "input_cost": 0.0040125000000000004, + "output_cost": 0.021670000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.540685767, + "Duration": 22.416002, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3173.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5374.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0039010499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0036504 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 2155.0, + "total_cost": 0.035353749999999996, + "input_cost": 0.00402375, + "output_cost": 0.03133 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.922996591, + "Duration": 34.89023, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2729.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6708.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0023595499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0021092 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 3498.0, + "total_cost": 0.062262500000000005, + "input_cost": 0.0040125000000000004, + "output_cost": 0.05825 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.741784844, + "Duration": 18.19492, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2900.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5057.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00295805, - "input_cost": 0.00025035, - "output_cost": 0.0027077 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1847.0, + "total_cost": 0.029242500000000005, + "input_cost": 0.0040125000000000004, + "output_cost": 0.025230000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.055616236, - "Score_MermaidDiagramValid": 0.0, + "Duration": 45.85591, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2809.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 62754.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0026270499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0023764 + "Metric_request_tokens": 58599.0, + "Metric_response_tokens": 4155.0, + "total_cost": 0.13779875, + "input_cost": 0.07324875, + "output_cost": 0.06455 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.365465445, + "Duration": 21.491191, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9022.0, + "total_tokens": 4990.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.01021305, - "input_cost": 0.00080415, - "output_cost": 0.0094089 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1780.0, + "total_cost": 0.027902500000000004, + "input_cost": 0.0040125000000000004, + "output_cost": 0.02389 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.080146417, + "Duration": 25.118851, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2513.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5685.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00160355, - "input_cost": 0.00025035, - "output_cost": 0.0013532 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2475.0, + "total_cost": 0.041842500000000005, + "input_cost": 0.0040125000000000004, + "output_cost": 0.03783 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.419348211, - "Score_MermaidDiagramValid": 1.0, + "Duration": 19.464079, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10935.0, + "total_tokens": 5126.0, "provider": "Google", - "Metric_request_tokens": 5358.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.016854800000000003, - "input_cost": 0.0008037, - "output_cost": 0.016051100000000002 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1907.0, + "total_cost": 0.03042375, + "input_cost": 0.00402375, + "output_cost": 0.0264 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.279733312, + "Duration": 24.405728, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2443.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5297.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00135855, - "input_cost": 0.00025035, - "output_cost": 0.0011082 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2087.0, + "total_cost": 0.0340425, + "input_cost": 0.0040125000000000004, + "output_cost": 0.03003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.584990518, + "Duration": 67.674063, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2690.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00221725, - "input_cost": 0.00025035, - "output_cost": 0.0019669 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.12928071, - "Score_MermaidDiagramValid": 0.0, + "Duration": 21.024953, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2925.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5326.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00303305, - "input_cost": 0.00025065, - "output_cost": 0.0027824 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 2107.0, + "total_cost": 0.03420375, + "input_cost": 0.00402375, + "output_cost": 0.03018 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.894062173, + "Duration": 34.71675, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11489.0, + "total_tokens": 6803.0, "provider": "Google", - "Metric_request_tokens": 6861.0, - "Metric_response_tokens": 1806.0, - "total_cost": 0.01198975, - "input_cost": 0.00102915, - "output_cost": 0.010960600000000001 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 3593.0, + "total_cost": 0.0641625, + "input_cost": 0.0040125000000000004, + "output_cost": 0.06015 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 9.010575503, + "Duration": 18.749296, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3355.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5129.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00455055, - "input_cost": 0.00025035, - "output_cost": 0.0043002000000000005 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1919.0, + "total_cost": 0.0307425, + "input_cost": 0.0040125000000000004, + "output_cost": 0.02673 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.778800106, - "Score_MermaidDiagramValid": 0.0, + "Duration": 36.35102, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2693.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 62398.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0022210499999999996, - "input_cost": 0.00025065, - "output_cost": 0.0019703999999999998 + "Metric_request_tokens": 59103.0, + "Metric_response_tokens": 3295.0, + "total_cost": 0.12099875000000002, + "input_cost": 0.07387875, + "output_cost": 0.04712 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 8.167258931, + "Duration": 47.352071, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3303.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 16984.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 550.0, - "total_cost": 0.004374349999999999, - "input_cost": 0.00025035, - "output_cost": 0.004123999999999999 + "Metric_request_tokens": 12617.0, + "Metric_response_tokens": 4367.0, + "total_cost": 0.07872125, + "input_cost": 0.01577125, + "output_cost": 0.06295 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.731809965, + "Duration": 80.205807, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2458.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00141105, - "input_cost": 0.00025035, - "output_cost": 0.0011607 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 5.421789254, - "Score_MermaidDiagramValid": 0.0, + "test_group": "easy", + "Duration": 33.973289, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2578.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 61926.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 568.0, - "total_cost": 0.0017779500000000001, - "input_cost": 0.00025065, - "output_cost": 0.0015273 + "Metric_request_tokens": 58756.0, + "Metric_response_tokens": 3170.0, + "total_cost": 0.118345, + "input_cost": 0.07344500000000001, + "output_cost": 0.0449 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.939170129, + "Duration": 17.276555, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2535.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4913.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0016805499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0014302 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1703.0, + "total_cost": 0.0265325, + "input_cost": 0.0040125000000000004, + "output_cost": 0.022520000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.085231652, + "Duration": 22.695808, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2550.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5513.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0017330499999999999, - "input_cost": 0.00025035, - "output_cost": 0.0014827 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2303.0, + "total_cost": 0.038402500000000006, + "input_cost": 0.0040125000000000004, + "output_cost": 0.034390000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 12.230727772, + "Duration": 25.120604, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11014.0, + "total_tokens": 5607.0, "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.024663, - "input_cost": 0.0004662, - "output_cost": 0.0241968 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 2388.0, + "total_cost": 0.03999375, + "input_cost": 0.00402375, + "output_cost": 0.03597 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 8.306374733, + "Duration": 30.955354, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2757.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6230.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00245175, - "input_cost": 0.00025035, - "output_cost": 0.0022014 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 3020.0, + "total_cost": 0.0528825, + "input_cost": 0.0040125000000000004, + "output_cost": 0.04887 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.595447784, + "Duration": 73.925452, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2698.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 21279.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0022510499999999997, - "input_cost": 0.00025035, - "output_cost": 0.0020007 + "Metric_request_tokens": 13494.0, + "Metric_response_tokens": 7785.0, + "total_cost": 0.14814750000000002, + "input_cost": 0.0168675, + "output_cost": 0.13128 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.077756987, + "Duration": 29.693847, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 8290.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6429.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.02181055, - "input_cost": 0.00025065, - "output_cost": 0.0215599 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 3210.0, + "total_cost": 0.05648375, + "input_cost": 0.00402375, + "output_cost": 0.05246 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.08787764, + "Duration": 25.564584, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12993.0, + "total_tokens": 5542.0, "provider": "Google", - "Metric_request_tokens": 5355.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.024140349999999998, - "input_cost": 0.00080325, - "output_cost": 0.0233371 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2332.0, + "total_cost": 0.0389525, + "input_cost": 0.0040125000000000004, + "output_cost": 0.03494 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.203576056, + "Duration": 94.248736, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Google", @@ -11142,3983 +16067,4143 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.877005808, + "Duration": 18.571851, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2829.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5030.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00269705, - "input_cost": 0.00025065, - "output_cost": 0.0024464 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1811.0, + "total_cost": 0.02850375, + "input_cost": 0.00402375, + "output_cost": 0.024480000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.691518112, + "Duration": 21.144001, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2766.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5212.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00248905, - "input_cost": 0.00025035, - "output_cost": 0.0022387 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2002.0, + "total_cost": 0.032322500000000004, + "input_cost": 0.0040125000000000004, + "output_cost": 0.028310000000000002 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.46408462, + "Duration": 36.342038, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2591.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10769.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0018765499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0016262 + "Metric_request_tokens": 7141.0, + "Metric_response_tokens": 3628.0, + "total_cost": 0.06363625, + "input_cost": 0.00892625, + "output_cost": 0.05471 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 11.559526636, + "Duration": 17.376388, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 15503.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4866.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.047056049999999995, - "input_cost": 0.00025065, - "output_cost": 0.0468054 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1647.0, + "total_cost": 0.02522375, + "input_cost": 0.00402375, + "output_cost": 0.0212 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.417513707, + "Duration": 22.373867, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8232.0, + "total_tokens": 5470.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.0074480499999999995, - "input_cost": 0.00080415, - "output_cost": 0.0066438999999999995 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2260.0, + "total_cost": 0.037672500000000005, + "input_cost": 0.0040125000000000004, + "output_cost": 0.03366 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 9.485176875, + "Duration": 64.529238, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2981.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 26102.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00323575, - "input_cost": 0.00025035, - "output_cost": 0.0029854 + "Metric_request_tokens": 19633.0, + "Metric_response_tokens": 6469.0, + "total_cost": 0.12325124999999999, + "input_cost": 0.02454125, + "output_cost": 0.09870999999999999 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 6.271124626, - "Score_MermaidDiagramValid": 0.0, + "test_group": "easy", + "Duration": 31.845862, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2722.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 60411.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00232255, - "input_cost": 0.00025065, - "output_cost": 0.0020719000000000002 + "Metric_request_tokens": 57457.0, + "Metric_response_tokens": 2954.0, + "total_cost": 0.11237125, + "input_cost": 0.07182125, + "output_cost": 0.040549999999999996 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.342354393, + "Duration": 25.463547, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 20909.0, + "total_tokens": 5763.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1158.0, - "total_cost": 0.051863950000000006, - "input_cost": 0.00080415, - "output_cost": 0.0510598 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 2553.0, + "total_cost": 0.043362500000000005, + "input_cost": 0.0040125000000000004, + "output_cost": 0.03935 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.63725688, + "Duration": 45.089712, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19611.0, + "total_tokens": 12047.0, "provider": "Google", - "Metric_request_tokens": 3981.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.05186855, - "input_cost": 0.00059715, - "output_cost": 0.0512714 + "Metric_request_tokens": 7141.0, + "Metric_response_tokens": 4906.0, + "total_cost": 0.08917624999999998, + "input_cost": 0.00892625, + "output_cost": 0.08024999999999999 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.517046246, + "Duration": 18.56191, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4330.0, + "total_tokens": 5052.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1224.0, - "total_cost": 0.0008002, - "input_cost": 0.0003106, - "output_cost": 0.0004896000000000001 + "Metric_request_tokens": 3219.0, + "Metric_response_tokens": 1833.0, + "total_cost": 0.028943749999999997, + "input_cost": 0.00402375, + "output_cost": 0.024919999999999998 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.366856197, + "Duration": 18.501743, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4300.0, + "total_tokens": 5018.0, "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.0007876000000000001, - "input_cost": 0.0003108, - "output_cost": 0.0004768 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 1808.0, + "total_cost": 0.0284625, + "input_cost": 0.0040125000000000004, + "output_cost": 0.02445 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.691253093, + "Duration": 29.826137, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3771.0, + "total_tokens": 6374.0, "provider": "Google", - "Metric_request_tokens": 3105.0, - "Metric_response_tokens": 666.0, - "total_cost": 0.0005769, - "input_cost": 0.0003105, - "output_cost": 0.0002664 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 3164.0, + "total_cost": 0.05558250000000001, + "input_cost": 0.0040125000000000004, + "output_cost": 0.051570000000000005 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.948628294, + "Duration": 20.088304, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 50365.0, + "total_tokens": 5329.0, "provider": "Google", - "Metric_request_tokens": 49186.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.0053902, - "input_cost": 0.0049186, - "output_cost": 0.00047159999999999997 + "Metric_request_tokens": 3212.0, + "Metric_response_tokens": 2117.0, + "total_cost": 0.034464999999999996, + "input_cost": 0.004015, + "output_cost": 0.030449999999999998 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.397124725, - "Score_MermaidDiagramValid": 0.0, + "Duration": 66.934286, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4273.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 75854.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.0007825, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004736 + "Metric_request_tokens": 70352.0, + "Metric_response_tokens": 5502.0, + "total_cost": 0.16721, + "input_cost": 0.08793999999999999, + "output_cost": 0.07927000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-pro", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.246158175, + "Duration": 31.963868, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4269.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6444.0, "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 1177.0, - "total_cost": 0.0007800000000000001, - "input_cost": 0.00030920000000000003, - "output_cost": 0.00047080000000000006 + "Metric_request_tokens": 3210.0, + "Metric_response_tokens": 3234.0, + "total_cost": 0.0569825, + "input_cost": 0.0040125000000000004, + "output_cost": 0.052969999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.461461037, + "Duration": 23.114865, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4313.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1224.0, - "total_cost": 0.0007985000000000002, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004896000000000001 + "total_tokens": 3405.0, + "provider": "OSS", + "Metric_request_tokens": 2254.0, + "Metric_response_tokens": 1151.0, + "total_cost": 0.00022876, + "input_cost": 6.761999999999999e-5, + "output_cost": 0.00016114 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.826108098, + "Duration": 86.989557, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3829.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 693.0, - "total_cost": 0.0005908, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0002772 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.812971395, + "Duration": 62.664778, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3771.0, - "provider": "Google", - "Metric_request_tokens": 3103.0, - "Metric_response_tokens": 668.0, - "total_cost": 0.0005775, - "input_cost": 0.0003103, - "output_cost": 0.0002672 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.463204247, + "Duration": 76.102599, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4292.0, - "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1186.0, - "total_cost": 0.0007850000000000001, - "input_cost": 0.0003106, - "output_cost": 0.00047440000000000004 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.422216874, + "Duration": 53.880214, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6650.0, - "provider": "Google", - "Metric_request_tokens": 5423.0, - "Metric_response_tokens": 1227.0, - "total_cost": 0.0010331, - "input_cost": 0.0005423, - "output_cost": 0.0004908 + "total_tokens": 7985.0, + "provider": "OSS", + "Metric_request_tokens": 5249.0, + "Metric_response_tokens": 2736.0, + "total_cost": 0.0005405100000000001, + "input_cost": 0.00015747, + "output_cost": 0.0003830400000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.450171262, + "Duration": 81.052113, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4287.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007875, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047840000000000003 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.562724232, + "Duration": 26.28867, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4284.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007872, - "input_cost": 0.0003088, - "output_cost": 0.00047840000000000003 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3644.0, + "provider": "OSS", + "Metric_request_tokens": 2218.0, + "Metric_response_tokens": 1426.0, + "total_cost": 0.00026618, + "input_cost": 6.654e-5, + "output_cost": 0.00019964 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.407190932, + "Duration": 14.429584, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4378.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1242.0, - "total_cost": 0.0008104000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0004968 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2502.0, + "provider": "OSS", + "Metric_request_tokens": 1753.0, + "Metric_response_tokens": 749.0, + "total_cost": 0.00015745, + "input_cost": 5.2589999999999996e-5, + "output_cost": 0.00010486000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.620992741, + "Duration": 27.643668, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3766.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 675.0, - "total_cost": 0.0005791, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00027 + "total_tokens": 3414.0, + "provider": "OSS", + "Metric_request_tokens": 1890.0, + "Metric_response_tokens": 1524.0, + "total_cost": 0.00027006, + "input_cost": 5.6699999999999996e-5, + "output_cost": 0.00021336 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.472499546, + "Duration": 101.379545, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4297.0, - "provider": "Google", - "Metric_request_tokens": 3090.0, - "Metric_response_tokens": 1207.0, - "total_cost": 0.0007918000000000001, - "input_cost": 0.00030900000000000003, - "output_cost": 0.0004828 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 24774.0, + "provider": "OSS", + "Metric_request_tokens": 20505.0, + "Metric_response_tokens": 4269.0, + "total_cost": 0.00121281, + "input_cost": 0.00061515, + "output_cost": 0.0005976600000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.473251097, + "Duration": 13.188534, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4322.0, - "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007970000000000001, - "input_cost": 0.0003106, - "output_cost": 0.00048640000000000006 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2395.0, + "provider": "OSS", + "Metric_request_tokens": 1713.0, + "Metric_response_tokens": 682.0, + "total_cost": 0.00014687, + "input_cost": 5.1389999999999994e-5, + "output_cost": 9.548000000000001e-5 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.574995535, + "Duration": 27.1305, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4266.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1175.0, - "total_cost": 0.0007791, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047000000000000004 + "total_tokens": 3468.0, + "provider": "OSS", + "Metric_request_tokens": 1976.0, + "Metric_response_tokens": 1492.0, + "total_cost": 0.00026816000000000003, + "input_cost": 5.9279999999999995e-5, + "output_cost": 0.00020888000000000002 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.094999263, + "Duration": 85.02272, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3858.0, - "provider": "Google", - "Metric_request_tokens": 3137.0, - "Metric_response_tokens": 721.0, - "total_cost": 0.0006021, - "input_cost": 0.00031370000000000004, - "output_cost": 0.0002884 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.658386722, + "Duration": 39.296536, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3778.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 689.0, - "total_cost": 0.0005845000000000001, - "input_cost": 0.00030890000000000003, - "output_cost": 0.00027560000000000003 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6425.0, + "provider": "OSS", + "Metric_request_tokens": 4463.0, + "Metric_response_tokens": 1962.0, + "total_cost": 0.00040857, + "input_cost": 0.00013388999999999998, + "output_cost": 0.0002746800000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.893656632, + "Duration": 40.194942, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4357.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1221.0, - "total_cost": 0.0008020000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0004884 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4222.0, + "provider": "OSS", + "Metric_request_tokens": 1974.0, + "Metric_response_tokens": 2248.0, + "total_cost": 0.00037394000000000007, + "input_cost": 5.922e-5, + "output_cost": 0.00031472000000000005 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.236565242, + "Duration": 33.429674, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3883.0, - "provider": "Google", - "Metric_request_tokens": 3137.0, - "Metric_response_tokens": 746.0, - "total_cost": 0.0006121000000000001, - "input_cost": 0.00031370000000000004, - "output_cost": 0.00029840000000000004 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4239.0, + "provider": "OSS", + "Metric_request_tokens": 2420.0, + "Metric_response_tokens": 1819.0, + "total_cost": 0.00032726, + "input_cost": 7.259999999999999e-5, + "output_cost": 0.00025466000000000003 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.485359369, + "Duration": 41.898096, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4324.0, - "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007972000000000001, - "input_cost": 0.0003108, - "output_cost": 0.00048640000000000006 + "total_tokens": 8870.0, + "provider": "OSS", + "Metric_request_tokens": 6781.0, + "Metric_response_tokens": 2089.0, + "total_cost": 0.00049589, + "input_cost": 0.00020343, + "output_cost": 0.00029246000000000003 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.734365673, + "Duration": 93.890976, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3791.0, - "provider": "Google", - "Metric_request_tokens": 3110.0, - "Metric_response_tokens": 681.0, - "total_cost": 0.0005834, - "input_cost": 0.000311, - "output_cost": 0.0002724 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.500398738, + "Duration": 16.815703, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4304.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007952, - "input_cost": 0.0003088, - "output_cost": 0.00048640000000000006 + "total_tokens": 2737.0, + "provider": "OSS", + "Metric_request_tokens": 1861.0, + "Metric_response_tokens": 876.0, + "total_cost": 0.00017847000000000002, + "input_cost": 5.583e-5, + "output_cost": 0.00012264 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.124787172, + "Duration": 21.848935, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3850.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 761.0, - "total_cost": 0.0006133, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0003044 + "total_tokens": 3050.0, + "provider": "OSS", + "Metric_request_tokens": 1872.0, + "Metric_response_tokens": 1178.0, + "total_cost": 0.00022108000000000002, + "input_cost": 5.616e-5, + "output_cost": 0.00016492 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.98193398, + "Duration": 21.689143, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3903.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 767.0, - "total_cost": 0.0006204000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.00030680000000000003 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2890.0, + "provider": "OSS", + "Metric_request_tokens": 1704.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.00021716, + "input_cost": 5.112e-5, + "output_cost": 0.00016604 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.48101111, + "Duration": 33.564541, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4290.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0007896, - "input_cost": 0.0003088, - "output_cost": 0.0004808 + "total_tokens": 3979.0, + "provider": "OSS", + "Metric_request_tokens": 2128.0, + "Metric_response_tokens": 1851.0, + "total_cost": 0.00032298000000000006, + "input_cost": 6.384e-5, + "output_cost": 0.00025914000000000004 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.916593879, + "Duration": 23.284016, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4543.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1454.0, - "total_cost": 0.0008905, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0005816 + "total_tokens": 3161.0, + "provider": "OSS", + "Metric_request_tokens": 1903.0, + "Metric_response_tokens": 1258.0, + "total_cost": 0.00023321000000000003, + "input_cost": 5.709e-5, + "output_cost": 0.00017612000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.879042448, + "Duration": 63.669357, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3885.0, - "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 793.0, - "total_cost": 0.0006264, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0003172 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7992.0, + "provider": "OSS", + "Metric_request_tokens": 4631.0, + "Metric_response_tokens": 3361.0, + "total_cost": 0.00060947, + "input_cost": 0.00013892999999999997, + "output_cost": 0.00047054000000000003 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.444876941, + "Duration": 16.628259, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4284.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007872, - "input_cost": 0.0003088, - "output_cost": 0.00047840000000000003 + "total_tokens": 2799.0, + "provider": "OSS", + "Metric_request_tokens": 1935.0, + "Metric_response_tokens": 864.0, + "total_cost": 0.00017901, + "input_cost": 5.805e-5, + "output_cost": 0.00012096000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.627413202, + "Duration": 42.467932, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4291.0, - "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0007897, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004808 + "total_tokens": 4251.0, + "provider": "OSS", + "Metric_request_tokens": 1866.0, + "Metric_response_tokens": 2385.0, + "total_cost": 0.00038988000000000006, + "input_cost": 5.5979999999999996e-5, + "output_cost": 0.00033390000000000004 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.704063431, + "Duration": 14.836333, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3774.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 683.0, - "total_cost": 0.0005823, - "input_cost": 0.00030910000000000003, - "output_cost": 0.0002732 + "total_tokens": 2629.0, + "provider": "OSS", + "Metric_request_tokens": 1869.0, + "Metric_response_tokens": 760.0, + "total_cost": 0.00016247, + "input_cost": 5.607e-5, + "output_cost": 0.00010640000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.447734786, + "Duration": 35.948723, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4282.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1191.0, - "total_cost": 0.0007855000000000001, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047640000000000003 + "total_tokens": 4000.0, + "provider": "OSS", + "Metric_request_tokens": 2001.0, + "Metric_response_tokens": 1999.0, + "total_cost": 0.00033989000000000003, + "input_cost": 6.0030000000000006e-5, + "output_cost": 0.00027986 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.576908765, + "Duration": 24.475566, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4271.0, - "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1180.0, - "total_cost": 0.0007811000000000001, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047200000000000003 + "total_tokens": 3323.0, + "provider": "OSS", + "Metric_request_tokens": 2000.0, + "Metric_response_tokens": 1323.0, + "total_cost": 0.00024522000000000004, + "input_cost": 6e-5, + "output_cost": 0.00018522000000000002 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.032189281, + "Duration": 100.357682, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4612.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1476.0, - "total_cost": 0.0009040000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0005904 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.472401128, + "Duration": 13.107324, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4274.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1186.0, - "total_cost": 0.0007832000000000001, - "input_cost": 0.0003088, - "output_cost": 0.00047440000000000004 + "total_tokens": 2448.0, + "provider": "OSS", + "Metric_request_tokens": 1777.0, + "Metric_response_tokens": 671.0, + "total_cost": 0.00014725, + "input_cost": 5.3309999999999996e-5, + "output_cost": 9.394000000000001e-5 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.732016304, + "Duration": 90.91154, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3778.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 671.0, - "total_cost": 0.0005791, - "input_cost": 0.0003107, - "output_cost": 0.0002684 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.756348604, + "Duration": 105.653916, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4571.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1435.0, - "total_cost": 0.0008876000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0005740000000000001 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.71480991, - "Score_MermaidDiagramValid": 1.0, + "Duration": 70.854361, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4318.0, - "provider": "Google", - "Metric_request_tokens": 3104.0, - "Metric_response_tokens": 1214.0, - "total_cost": 0.000796, - "input_cost": 0.0003104, - "output_cost": 0.00048560000000000004 + "total_tokens": 22143.0, + "provider": "OSS", + "Metric_request_tokens": 19546.0, + "Metric_response_tokens": 2597.0, + "total_cost": 0.0009499599999999999, + "input_cost": 0.00058638, + "output_cost": 0.00036358 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.797644523, + "Duration": 53.741469, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3776.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 669.0, - "total_cost": 0.0005783, - "input_cost": 0.0003107, - "output_cost": 0.0002676 + "total_tokens": 7049.0, + "provider": "OSS", + "Metric_request_tokens": 4218.0, + "Metric_response_tokens": 2831.0, + "total_cost": 0.0005228800000000001, + "input_cost": 0.00012654000000000002, + "output_cost": 0.00039634000000000007 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.755809993, + "Duration": 92.663208, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4296.0, - "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 1204.0, - "total_cost": 0.0007908, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0004816 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.602118065, + "Duration": 14.223046, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4273.0, - "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1185.0, - "total_cost": 0.0007828, - "input_cost": 0.0003088, - "output_cost": 0.00047400000000000003 + "total_tokens": 2539.0, + "provider": "OSS", + "Metric_request_tokens": 1801.0, + "Metric_response_tokens": 738.0, + "total_cost": 0.00015735000000000003, + "input_cost": 5.403e-5, + "output_cost": 0.00010332000000000002 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.466044834, + "Duration": 17.30169, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4312.0, - "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1205.0, - "total_cost": 0.0007927, - "input_cost": 0.0003107, - "output_cost": 0.000482 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2771.0, + "provider": "OSS", + "Metric_request_tokens": 1865.0, + "Metric_response_tokens": 906.0, + "total_cost": 0.00018279000000000003, + "input_cost": 5.595e-5, + "output_cost": 0.00012684000000000003 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.130914105, + "Duration": 123.391272, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3861.0, - "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 725.0, - "total_cost": 0.0006036, - "input_cost": 0.00031360000000000003, - "output_cost": 0.00029 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.530914159, + "Duration": 49.694416, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4267.0, - "provider": "Google", - "Metric_request_tokens": 3077.0, - "Metric_response_tokens": 1190.0, - "total_cost": 0.0007837, - "input_cost": 0.0003077, - "output_cost": 0.0004760000000000001 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11075.0, + "provider": "OSS", + "Metric_request_tokens": 8736.0, + "Metric_response_tokens": 2339.0, + "total_cost": 0.00058954, + "input_cost": 0.00026208000000000004, + "output_cost": 0.00032746 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.086535765, + "Duration": 15.808679, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4261.0, - "provider": "Google", - "Metric_request_tokens": 3094.0, - "Metric_response_tokens": 1167.0, - "total_cost": 0.0007762000000000001, - "input_cost": 0.00030940000000000004, - "output_cost": 0.00046680000000000007 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2663.0, + "provider": "OSS", + "Metric_request_tokens": 1835.0, + "Metric_response_tokens": 828.0, + "total_cost": 0.00017097, + "input_cost": 5.5049999999999996e-5, + "output_cost": 0.00011592 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.168974023, + "Duration": 109.244423, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4317.0, - "provider": "Google", - "Metric_request_tokens": 3128.0, - "Metric_response_tokens": 1189.0, - "total_cost": 0.0007884000000000001, - "input_cost": 0.0003128, - "output_cost": 0.0004756 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.200968076, + "Duration": 15.576859, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3763.0, - "provider": "Google", - "Metric_request_tokens": 3076.0, - "Metric_response_tokens": 687.0, - "total_cost": 0.0005824000000000001, - "input_cost": 0.00030760000000000005, - "output_cost": 0.0002748 + "total_tokens": 2736.0, + "provider": "OSS", + "Metric_request_tokens": 1932.0, + "Metric_response_tokens": 804.0, + "total_cost": 0.00017052000000000001, + "input_cost": 5.7959999999999994e-5, + "output_cost": 0.00011256000000000002 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.04668023, + "Duration": 20.871071, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3772.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 694.0, - "total_cost": 0.0005854, - "input_cost": 0.0003078, - "output_cost": 0.00027759999999999997 + "total_tokens": 2939.0, + "provider": "OSS", + "Metric_request_tokens": 1804.0, + "Metric_response_tokens": 1135.0, + "total_cost": 0.00021302000000000002, + "input_cost": 5.412e-5, + "output_cost": 0.0001589 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "gpt-oss-20b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.559231565, + "Duration": 19.211362, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4278.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1200.0, - "total_cost": 0.0007878, - "input_cost": 0.0003078, - "output_cost": 0.00047999999999999996 + "total_tokens": 2804.0, + "provider": "OSS", + "Metric_request_tokens": 1766.0, + "Metric_response_tokens": 1038.0, + "total_cost": 0.00019830000000000002, + "input_cost": 5.298e-5, + "output_cost": 0.00014532000000000002 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.227105654, + "Duration": 27.093119, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4262.0, - "provider": "Google", - "Metric_request_tokens": 3076.0, - "Metric_response_tokens": 1186.0, - "total_cost": 0.0007820000000000001, - "input_cost": 0.00030760000000000005, - "output_cost": 0.00047440000000000004 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4399.0, + "provider": "OSS", + "Metric_request_tokens": 3355.0, + "Metric_response_tokens": 1044.0, + "total_cost": 0.0004623, + "input_cost": 0.0002013, + "output_cost": 0.000261 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.281790633, + "Duration": 22.077698, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4259.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1181.0, - "total_cost": 0.0007802, - "input_cost": 0.0003078, - "output_cost": 0.0004724 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4378.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1028.0, + "total_cost": 0.000458, + "input_cost": 0.000201, + "output_cost": 0.000257 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.292229758, + "Duration": 21.577635, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3794.0, - "provider": "Google", - "Metric_request_tokens": 3080.0, - "Metric_response_tokens": 714.0, - "total_cost": 0.0005936, - "input_cost": 0.000308, - "output_cost": 0.0002856 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4383.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1033.0, + "total_cost": 0.00045925, + "input_cost": 0.000201, + "output_cost": 0.00025825 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 23.232638, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4430.0, + "provider": "OSS", + "Metric_request_tokens": 3355.0, + "Metric_response_tokens": 1075.0, + "total_cost": 0.00047004999999999996, + "input_cost": 0.0002013, + "output_cost": 0.00026875 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.75844, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4371.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1021.0, + "total_cost": 0.00045625, + "input_cost": 0.000201, + "output_cost": 0.00025525 + }, + { + "Model": "qwen3-coder-30b-a3b-instruct-mlx", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 21.329475, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4359.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1009.0, + "total_cost": 0.00045325, + "input_cost": 0.000201, + "output_cost": 0.00025225 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.415614026, + "Duration": 20.775547, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3801.0, - "provider": "Google", - "Metric_request_tokens": 3094.0, - "Metric_response_tokens": 707.0, - "total_cost": 0.0005922, - "input_cost": 0.00030940000000000004, - "output_cost": 0.0002828 + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.140784542, + "Duration": 21.736423, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3750.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 672.0, - "total_cost": 0.0005766, - "input_cost": 0.0003078, - "output_cost": 0.0002688 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4393.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1043.0, + "total_cost": 0.00046175000000000003, + "input_cost": 0.000201, + "output_cost": 0.00026075 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.949271176, + "Duration": 21.949257, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4281.0, - "provider": "Google", - "Metric_request_tokens": 3081.0, - "Metric_response_tokens": 1200.0, - "total_cost": 0.0007880999999999999, - "input_cost": 0.0003081, - "output_cost": 0.00047999999999999996 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4417.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1067.0, + "total_cost": 0.00046775, + "input_cost": 0.000201, + "output_cost": 0.00026675 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.131322822, + "Duration": 21.274582, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3759.0, - "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 667.0, - "total_cost": 0.000576, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0002668 + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.856126981, + "Duration": 21.893145, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 21095.0, - "provider": "Google", - "Metric_request_tokens": 18585.0, - "Metric_response_tokens": 2510.0, - "total_cost": 0.0028625000000000005, - "input_cost": 0.0018585000000000001, - "output_cost": 0.0010040000000000001 + "total_tokens": 4397.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1047.0, + "total_cost": 0.00046275, + "input_cost": 0.000201, + "output_cost": 0.00026175 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.093578255, + "Duration": 21.346579, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4305.0, - "provider": "Google", - "Metric_request_tokens": 3080.0, - "Metric_response_tokens": 1225.0, - "total_cost": 0.000798, - "input_cost": 0.000308, - "output_cost": 0.00049 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4360.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1010.0, + "total_cost": 0.0004535, + "input_cost": 0.000201, + "output_cost": 0.0002525 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.10008024, + "Duration": 20.946719, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4271.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1193.0, - "total_cost": 0.0007850000000000001, - "input_cost": 0.0003078, - "output_cost": 0.00047720000000000005 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.996482216, + "Duration": 21.315159, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4254.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.0007781999999999999, - "input_cost": 0.0003078, - "output_cost": 0.0004704 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4392.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1042.0, + "total_cost": 0.0004615, + "input_cost": 0.000201, + "output_cost": 0.0002605 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.381453313, + "Duration": 21.647562, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3873.0, - "provider": "Google", - "Metric_request_tokens": 3080.0, - "Metric_response_tokens": 793.0, - "total_cost": 0.0006252, - "input_cost": 0.000308, - "output_cost": 0.0003172 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4390.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1040.0, + "total_cost": 0.000461, + "input_cost": 0.000201, + "output_cost": 0.00026 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.576133277, + "Duration": 20.506917, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 49852.0, - "provider": "Google", - "Metric_request_tokens": 49187.0, - "Metric_response_tokens": 665.0, - "total_cost": 0.0051847, - "input_cost": 0.004918700000000001, - "output_cost": 0.000266 + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.075267329, + "Duration": 21.197962, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3813.0, - "provider": "Google", - "Metric_request_tokens": 3123.0, - "Metric_response_tokens": 690.0, - "total_cost": 0.0005882999999999999, - "input_cost": 0.0003123, - "output_cost": 0.000276 + "total_tokens": 4371.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1021.0, + "total_cost": 0.00045625, + "input_cost": 0.000201, + "output_cost": 0.00025525 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.984536161, + "Duration": 21.534489, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4251.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.000777, - "input_cost": 0.0003078, - "output_cost": 0.0004692 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4404.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1054.0, + "total_cost": 0.0004645, + "input_cost": 0.000201, + "output_cost": 0.0002635 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.060489677, + "Duration": 21.494483, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4378.0, - "provider": "Google", - "Metric_request_tokens": 3125.0, - "Metric_response_tokens": 1253.0, - "total_cost": 0.0008137000000000001, - "input_cost": 0.00031250000000000006, - "output_cost": 0.0005012 + "total_tokens": 4392.0, + "provider": "OSS", + "Metric_request_tokens": 3355.0, + "Metric_response_tokens": 1037.0, + "total_cost": 0.00046055, + "input_cost": 0.0002013, + "output_cost": 0.00025925 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.046059014, + "Duration": 21.831706, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3742.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 664.0, - "total_cost": 0.0005734, - "input_cost": 0.0003078, - "output_cost": 0.0002656 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4408.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1058.0, + "total_cost": 0.0004655, + "input_cost": 0.000201, + "output_cost": 0.0002645 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.931260288, + "Duration": 21.607156, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4296.0, - "provider": "Google", - "Metric_request_tokens": 3095.0, - "Metric_response_tokens": 1201.0, - "total_cost": 0.0007899000000000001, - "input_cost": 0.00030950000000000004, - "output_cost": 0.0004804 + "total_tokens": 4407.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1057.0, + "total_cost": 0.00046525, + "input_cost": 0.000201, + "output_cost": 0.00026425 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.324805262, + "Duration": 20.817056, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4264.0, - "provider": "Google", - "Metric_request_tokens": 3076.0, - "Metric_response_tokens": 1188.0, - "total_cost": 0.0007828000000000002, - "input_cost": 0.00030760000000000005, - "output_cost": 0.00047520000000000006 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.532431761, + "Duration": 20.611934, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4482.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1404.0, - "total_cost": 0.0008694000000000001, - "input_cost": 0.0003078, - "output_cost": 0.0005616000000000001 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4338.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 988.0, + "total_cost": 0.000448, + "input_cost": 0.000201, + "output_cost": 0.000247 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.878045252, + "Duration": 21.084036, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4315.0, - "provider": "Google", - "Metric_request_tokens": 3080.0, - "Metric_response_tokens": 1235.0, - "total_cost": 0.000802, - "input_cost": 0.000308, - "output_cost": 0.000494 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4377.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.00045775, + "input_cost": 0.000201, + "output_cost": 0.00025675 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.37411601, + "Duration": 20.520816, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4277.0, - "provider": "Google", - "Metric_request_tokens": 3094.0, - "Metric_response_tokens": 1183.0, - "total_cost": 0.0007826, - "input_cost": 0.00030940000000000004, - "output_cost": 0.0004732 + "total_tokens": 4347.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 991.0, + "total_cost": 0.00044910999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.00024775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.198357553, + "Duration": 21.533163, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3773.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 695.0, - "total_cost": 0.0005858, - "input_cost": 0.0003078, - "output_cost": 0.000278 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4401.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1051.0, + "total_cost": 0.00046375, + "input_cost": 0.000201, + "output_cost": 0.00026275 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.122080385, + "Duration": 21.145521, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4309.0, - "provider": "Google", - "Metric_request_tokens": 3097.0, - "Metric_response_tokens": 1212.0, - "total_cost": 0.0007945000000000001, - "input_cost": 0.0003097, - "output_cost": 0.0004848 + "total_tokens": 4379.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1029.0, + "total_cost": 0.00045825, + "input_cost": 0.000201, + "output_cost": 0.00025725 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.670149233, + "Duration": 22.110844, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6618.0, - "provider": "Google", - "Metric_request_tokens": 5390.0, - "Metric_response_tokens": 1228.0, - "total_cost": 0.0010302, - "input_cost": 0.000539, - "output_cost": 0.0004912 + "total_tokens": 4440.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 1084.0, + "total_cost": 0.00047235999999999996, + "input_cost": 0.00020135999999999998, + "output_cost": 0.000271 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.358885211, + "Duration": 21.705767, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4265.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1187.0, - "total_cost": 0.0007826, - "input_cost": 0.0003078, - "output_cost": 0.0004748 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4398.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1048.0, + "total_cost": 0.00046300000000000003, + "input_cost": 0.000201, + "output_cost": 0.000262 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.529175968, + "Duration": 21.788662, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4276.0, - "provider": "Google", - "Metric_request_tokens": 3097.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.0007813, - "input_cost": 0.0003097, - "output_cost": 0.00047159999999999997 + "total_tokens": 4410.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1060.0, + "total_cost": 0.000466, + "input_cost": 0.000201, + "output_cost": 0.000265 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.318584704, + "Duration": 20.540309, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4259.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1181.0, - "total_cost": 0.0007802, - "input_cost": 0.0003078, - "output_cost": 0.0004724 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4345.0, + "provider": "OSS", + "Metric_request_tokens": 3355.0, + "Metric_response_tokens": 990.0, + "total_cost": 0.00044879999999999996, + "input_cost": 0.0002013, + "output_cost": 0.0002475 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.129762473, + "Duration": 21.983455, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4518.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1440.0, - "total_cost": 0.0008838, - "input_cost": 0.0003078, - "output_cost": 0.000576 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4422.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1072.0, + "total_cost": 0.000469, + "input_cost": 0.000201, + "output_cost": 0.000268 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.357050738, + "Duration": 21.368936, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4260.0, - "provider": "Google", - "Metric_request_tokens": 3094.0, - "Metric_response_tokens": 1166.0, - "total_cost": 0.0007758000000000001, - "input_cost": 0.00030940000000000004, - "output_cost": 0.0004664 + "total_tokens": 4387.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1037.0, + "total_cost": 0.00046025, + "input_cost": 0.000201, + "output_cost": 0.00025925 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.448035659, + "Duration": 21.488281, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4277.0, - "provider": "Google", - "Metric_request_tokens": 3076.0, - "Metric_response_tokens": 1201.0, - "total_cost": 0.0007880000000000001, - "input_cost": 0.00030760000000000005, - "output_cost": 0.0004804 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4402.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.00046286, + "input_cost": 0.00020135999999999998, + "output_cost": 0.0002615 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.454605744, + "Duration": 21.18801, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4260.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1182.0, - "total_cost": 0.0007806, - "input_cost": 0.0003078, - "output_cost": 0.00047280000000000005 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4381.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1031.0, + "total_cost": 0.00045875, + "input_cost": 0.000201, + "output_cost": 0.00025775 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.712239535, + "Duration": 20.850931, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3832.0, - "provider": "Google", - "Metric_request_tokens": 3127.0, - "Metric_response_tokens": 705.0, - "total_cost": 0.0005947000000000001, - "input_cost": 0.0003127, - "output_cost": 0.000282 + "total_tokens": 4360.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1010.0, + "total_cost": 0.0004535, + "input_cost": 0.000201, + "output_cost": 0.0002525 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.044155716, + "Duration": 21.871692, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4299.0, - "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 1207.0, - "total_cost": 0.0007920000000000001, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0004828 + "total_tokens": 4396.0, + "provider": "OSS", + "Metric_request_tokens": 3355.0, + "Metric_response_tokens": 1041.0, + "total_cost": 0.00046155, + "input_cost": 0.0002013, + "output_cost": 0.00026025 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.750485612, + "Duration": 21.375532, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3843.0, - "provider": "Google", - "Metric_request_tokens": 3125.0, - "Metric_response_tokens": 718.0, - "total_cost": 0.0005997000000000001, - "input_cost": 0.00031250000000000006, - "output_cost": 0.0002872 + "total_tokens": 4362.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1012.0, + "total_cost": 0.00045400000000000003, + "input_cost": 0.000201, + "output_cost": 0.000253 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.14357062, + "Duration": 21.108107, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4283.0, - "provider": "Google", - "Metric_request_tokens": 3081.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0007888999999999999, - "input_cost": 0.0003081, - "output_cost": 0.0004808 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4356.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1006.0, + "total_cost": 0.0004525, + "input_cost": 0.000201, + "output_cost": 0.0002515 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.678468457, + "Duration": 22.047728, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3756.0, - "provider": "Google", - "Metric_request_tokens": 3077.0, - "Metric_response_tokens": 679.0, - "total_cost": 0.0005793, - "input_cost": 0.0003077, - "output_cost": 0.00027160000000000004 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4412.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 1056.0, + "total_cost": 0.00046536, + "input_cost": 0.00020135999999999998, + "output_cost": 0.000264 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.395852675, + "Duration": 21.85754, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4255.0, - "provider": "Google", - "Metric_request_tokens": 3078.0, - "Metric_response_tokens": 1177.0, - "total_cost": 0.0007786000000000001, - "input_cost": 0.0003078, - "output_cost": 0.00047080000000000006 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4393.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1043.0, + "total_cost": 0.00046175000000000003, + "input_cost": 0.000201, + "output_cost": 0.00026075 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.920805734, + "Duration": 21.670747, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4262.0, - "provider": "Google", - "Metric_request_tokens": 3094.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.0007766000000000001, - "input_cost": 0.00030940000000000004, - "output_cost": 0.0004672 + "total_tokens": 4388.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1038.0, + "total_cost": 0.0004605, + "input_cost": 0.000201, + "output_cost": 0.0002595 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.535216, - "Score_MermaidDiagramValid": 1.0, + "Duration": 22.248981, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65104.0, - "provider": "Google", - "Metric_request_tokens": 59569.0, - "Metric_response_tokens": 5535.0, - "total_cost": 0.040898199999999996, - "input_cost": 0.0178707, - "output_cost": 0.0230275 + "total_tokens": 4418.0, + "provider": "OSS", + "Metric_request_tokens": 3356.0, + "Metric_response_tokens": 1062.0, + "total_cost": 0.00046686, + "input_cost": 0.00020135999999999998, + "output_cost": 0.0002655 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 41.27852, + "Duration": 20.912118, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4338.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 988.0, + "total_cost": 0.000448, + "input_cost": 0.000201, + "output_cost": 0.000247 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-coder-30b-a3b-instruct-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 36.451985, + "Duration": 22.500583, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 15437.0, - "provider": "Google", - "Metric_request_tokens": 7985.0, - "Metric_response_tokens": 7452.0, - "total_cost": 0.035218, - "input_cost": 0.0023955, - "output_cost": 0.0328225 + "total_tokens": 4415.0, + "provider": "OSS", + "Metric_request_tokens": 3350.0, + "Metric_response_tokens": 1065.0, + "total_cost": 0.00046725, + "input_cost": 0.000201, + "output_cost": 0.00026625 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.921618, - "Score_MermaidDiagramValid": 1.0, + "Duration": 75.093294, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65447.0, - "provider": "Google", - "Metric_request_tokens": 59634.0, - "Metric_response_tokens": 5813.0, - "total_cost": 0.0422752, - "input_cost": 0.0178902, - "output_cost": 0.024385 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 5063.0, + "provider": "OSS", + "Metric_request_tokens": 946.0, + "Metric_response_tokens": 4117.0, + "total_cost": 0.00126961, + "input_cost": 7.568000000000001e-5, + "output_cost": 0.00119393 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 43.039527, + "Duration": 64.347627, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8340.0, + "provider": "OSS", + "Metric_request_tokens": 4732.0, + "Metric_response_tokens": 3608.0, + "total_cost": 0.00142488, + "input_cost": 0.00037856, + "output_cost": 0.0010463199999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 40.613476, + "Duration": 70.688606, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 21723.0, - "provider": "Google", - "Metric_request_tokens": 13502.0, - "Metric_response_tokens": 8221.0, - "total_cost": 0.039043100000000004, - "input_cost": 0.0040506, - "output_cost": 0.0349925 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8421.0, + "provider": "OSS", + "Metric_request_tokens": 4450.0, + "Metric_response_tokens": 3971.0, + "total_cost": 0.0015075899999999998, + "input_cost": 0.000356, + "output_cost": 0.0011515899999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.789959, - "Score_MermaidDiagramValid": 1.0, + "Duration": 76.378719, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65265.0, - "provider": "Google", - "Metric_request_tokens": 59678.0, - "Metric_response_tokens": 5587.0, - "total_cost": 0.041150900000000004, - "input_cost": 0.0179034, - "output_cost": 0.0232475 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 9017.0, + "provider": "OSS", + "Metric_request_tokens": 4985.0, + "Metric_response_tokens": 4032.0, + "total_cost": 0.00156808, + "input_cost": 0.0003988, + "output_cost": 0.00116928 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 20.254133, + "Duration": 96.249934, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7397.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 4189.0, - "total_cost": 0.018834899999999998, - "input_cost": 0.0009624, - "output_cost": 0.0178725 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7904.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 5384.0, + "total_cost": 0.00176296, + "input_cost": 0.00020160000000000002, + "output_cost": 0.00156136 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.96252, + "Duration": 62.571898, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 4471.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 3526.0, + "total_cost": 0.00109814, + "input_cost": 7.56e-5, + "output_cost": 0.00102254 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 28.708043, - "Score_MermaidDiagramValid": 1.0, + "Duration": 74.067002, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65454.0, - "provider": "Google", - "Metric_request_tokens": 60032.0, - "Metric_response_tokens": 5422.0, - "total_cost": 0.0404296, - "input_cost": 0.0180096, - "output_cost": 0.022420000000000002 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 5069.0, + "provider": "OSS", + "Metric_request_tokens": 946.0, + "Metric_response_tokens": 4123.0, + "total_cost": 0.0012713499999999999, + "input_cost": 7.568000000000001e-5, + "output_cost": 0.0011956699999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.960577, + "Duration": 58.966852, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 4281.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 3336.0, + "total_cost": 0.0010430399999999999, + "input_cost": 7.56e-5, + "output_cost": 0.00096744 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 64.642366, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 4596.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 3651.0, + "total_cost": 0.0011343899999999999, + "input_cost": 7.56e-5, + "output_cost": 0.00105879 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 93.608722, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7632.0, + "provider": "OSS", + "Metric_request_tokens": 2521.0, + "Metric_response_tokens": 5111.0, + "total_cost": 0.00168387, + "input_cost": 0.00020167999999999998, + "output_cost": 0.00148219 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 172.656315, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 15148.0, + "provider": "OSS", + "Metric_request_tokens": 5670.0, + "Metric_response_tokens": 9478.0, + "total_cost": 0.0032022199999999995, + "input_cost": 0.00045359999999999997, + "output_cost": 0.00274862 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.53418, + "Duration": 72.972157, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 6641.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 4121.0, + "total_cost": 0.00139669, + "input_cost": 0.00020160000000000002, + "output_cost": 0.0011950899999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 16.319646, + "Duration": 51.826864, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6271.0, - "provider": "Google", - "Metric_request_tokens": 3217.0, - "Metric_response_tokens": 3054.0, - "total_cost": 0.013285100000000001, - "input_cost": 0.0009650999999999999, - "output_cost": 0.012320000000000001 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 3895.0, + "provider": "OSS", + "Metric_request_tokens": 946.0, + "Metric_response_tokens": 2949.0, + "total_cost": 0.00093089, + "input_cost": 7.568000000000001e-5, + "output_cost": 0.0008552099999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 34.242218, + "Duration": 43.867205, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 14979.0, - "provider": "Google", - "Metric_request_tokens": 8039.0, - "Metric_response_tokens": 6940.0, - "total_cost": 0.0325442, - "input_cost": 0.0024116999999999997, - "output_cost": 0.0301325 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 3448.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 2503.0, + "total_cost": 0.00080147, + "input_cost": 7.56e-5, + "output_cost": 0.00072587 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.800008, + "Duration": 120.628645, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19299.0, - "provider": "Google", - "Metric_request_tokens": 13473.0, - "Metric_response_tokens": 5826.0, - "total_cost": 0.0268194, - "input_cost": 0.0040419, - "output_cost": 0.0227775 + "total_tokens": 10726.0, + "provider": "OSS", + "Metric_request_tokens": 4095.0, + "Metric_response_tokens": 6631.0, + "total_cost": 0.00225059, + "input_cost": 0.0003276, + "output_cost": 0.0019229899999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.071548, + "Duration": 69.246324, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65106.0, - "provider": "Google", - "Metric_request_tokens": 59409.0, - "Metric_response_tokens": 5697.0, - "total_cost": 0.0416952, - "input_cost": 0.017822699999999997, - "output_cost": 0.0238725 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8669.0, + "provider": "OSS", + "Metric_request_tokens": 4804.0, + "Metric_response_tokens": 3865.0, + "total_cost": 0.00150517, + "input_cost": 0.00038432, + "output_cost": 0.0011208499999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.437322, + "Duration": 126.350713, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 27727.0, - "provider": "Google", - "Metric_request_tokens": 19471.0, - "Metric_response_tokens": 8256.0, - "total_cost": 0.039598799999999997, - "input_cost": 0.0058413, - "output_cost": 0.033757499999999996 + "total_tokens": 10996.0, + "provider": "OSS", + "Metric_request_tokens": 4095.0, + "Metric_response_tokens": 6901.0, + "total_cost": 0.0023288899999999997, + "input_cost": 0.0003276, + "output_cost": 0.0020012899999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 31.305007, + "Duration": 97.80328, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19697.0, - "provider": "Google", - "Metric_request_tokens": 13501.0, - "Metric_response_tokens": 6196.0, - "total_cost": 0.0289778, - "input_cost": 0.0040503, - "output_cost": 0.024927500000000002 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7944.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 5424.0, + "total_cost": 0.0017745600000000001, + "input_cost": 0.00020160000000000002, + "output_cost": 0.00157296 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.683554, - "Score_MermaidDiagramValid": 1.0, + "Duration": 176.133619, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 64108.0, - "provider": "Google", - "Metric_request_tokens": 59439.0, - "Metric_response_tokens": 4669.0, - "total_cost": 0.0364617, - "input_cost": 0.0178317, - "output_cost": 0.01863 + "total_tokens": 16825.0, + "provider": "OSS", + "Metric_request_tokens": 7331.0, + "Metric_response_tokens": 9494.0, + "total_cost": 0.00333974, + "input_cost": 0.00058648, + "output_cost": 0.00275326 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 35.585454, + "Duration": 140.076518, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11905.0, + "provider": "OSS", + "Metric_request_tokens": 4095.0, + "Metric_response_tokens": 7810.0, + "total_cost": 0.0025924999999999998, + "input_cost": 0.0003276, + "output_cost": 0.0022649 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 36.908135, + "Duration": 60.884128, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 4427.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 3482.0, + "total_cost": 0.00108538, + "input_cost": 7.56e-5, + "output_cost": 0.0010097799999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.669943, - "Score_MermaidDiagramValid": 1.0, + "Duration": 57.921719, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63800.0, - "provider": "Google", - "Metric_request_tokens": 59409.0, - "Metric_response_tokens": 4391.0, - "total_cost": 0.03520519999999999, - "input_cost": 0.017822699999999997, - "output_cost": 0.0173825 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7036.0, + "provider": "OSS", + "Metric_request_tokens": 3819.0, + "Metric_response_tokens": 3217.0, + "total_cost": 0.0012384499999999999, + "input_cost": 0.00030552, + "output_cost": 0.0009329299999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 48.34871, - "Score_MermaidDiagramValid": 1.0, + "Duration": 108.932358, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 23572.0, - "provider": "Google", - "Metric_request_tokens": 13446.0, - "Metric_response_tokens": 10126.0, - "total_cost": 0.048568799999999995, - "input_cost": 0.0040338, - "output_cost": 0.044535 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8516.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 5996.0, + "total_cost": 0.0019404399999999999, + "input_cost": 0.00020160000000000002, + "output_cost": 0.0017388399999999997 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.075586, + "Duration": 73.64142, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 7125.0, - "provider": "Google", - "Metric_request_tokens": 1685.0, - "Metric_response_tokens": 5440.0, - "total_cost": 0.0263205, - "input_cost": 0.0005055, - "output_cost": 0.025815 + "total_tokens": 9063.0, + "provider": "OSS", + "Metric_request_tokens": 4987.0, + "Metric_response_tokens": 4076.0, + "total_cost": 0.001581, + "input_cost": 0.00039896, + "output_cost": 0.0011820399999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 23.746025, - "Score_MermaidDiagramValid": 1.0, + "Duration": 116.082796, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63216.0, - "provider": "Google", - "Metric_request_tokens": 59438.0, - "Metric_response_tokens": 3778.0, - "total_cost": 0.0320064, - "input_cost": 0.017831399999999997, - "output_cost": 0.014175 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 7385.0, + "provider": "OSS", + "Metric_request_tokens": 946.0, + "Metric_response_tokens": 6439.0, + "total_cost": 0.00194299, + "input_cost": 7.568000000000001e-5, + "output_cost": 0.0018673099999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 38.982162, + "Duration": 100.111411, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8100.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 5580.0, + "total_cost": 0.0018197999999999999, + "input_cost": 0.00020160000000000002, + "output_cost": 0.0016181999999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 34.937334, + "Duration": 75.741348, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8646.0, + "provider": "OSS", + "Metric_request_tokens": 4581.0, + "Metric_response_tokens": 4065.0, + "total_cost": 0.00154533, + "input_cost": 0.00036648000000000003, + "output_cost": 0.00117885 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 23.290948, + "Duration": 108.791997, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63494.0, - "provider": "Google", - "Metric_request_tokens": 59240.0, - "Metric_response_tokens": 4254.0, - "total_cost": 0.034337, - "input_cost": 0.017772, - "output_cost": 0.016565 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 10052.0, + "provider": "OSS", + "Metric_request_tokens": 4236.0, + "Metric_response_tokens": 5816.0, + "total_cost": 0.00202552, + "input_cost": 0.00033888, + "output_cost": 0.0016866399999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 36.192303, + "Duration": 59.848516, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7838.0, + "provider": "OSS", + "Metric_request_tokens": 4667.0, + "Metric_response_tokens": 3171.0, + "total_cost": 0.00129295, + "input_cost": 0.00037336000000000004, + "output_cost": 0.00091959 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.181344, + "Duration": 63.032341, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 20917.0, - "provider": "Google", - "Metric_request_tokens": 13424.0, - "Metric_response_tokens": 7493.0, - "total_cost": 0.0354947, - "input_cost": 0.0040272, - "output_cost": 0.031467499999999995 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7797.0, + "provider": "OSS", + "Metric_request_tokens": 4472.0, + "Metric_response_tokens": 3325.0, + "total_cost": 0.00132201, + "input_cost": 0.00035776000000000004, + "output_cost": 0.0009642499999999999 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.817341, - "Score_MermaidDiagramValid": 1.0, + "Duration": 111.114922, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8587.0, + "provider": "OSS", + "Metric_request_tokens": 2521.0, + "Metric_response_tokens": 6066.0, + "total_cost": 0.00196082, + "input_cost": 0.00020167999999999998, + "output_cost": 0.00175914 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 134.089581, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 9874.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 7354.0, + "total_cost": 0.00233426, + "input_cost": 0.00020160000000000002, + "output_cost": 0.0021326599999999998 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 133.001412, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 70272.0, - "provider": "Google", - "Metric_request_tokens": 64504.0, - "Metric_response_tokens": 5768.0, - "total_cost": 0.0419462, - "input_cost": 0.019351200000000002, - "output_cost": 0.022594999999999997 + "total_tokens": 13555.0, + "provider": "OSS", + "Metric_request_tokens": 6322.0, + "Metric_response_tokens": 7233.0, + "total_cost": 0.00260333, + "input_cost": 0.0005057600000000001, + "output_cost": 0.00209757 + }, + { + "Model": "qwen3-30b-a3b-thinking-2507-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 73.846683, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 5045.0, + "provider": "OSS", + "Metric_request_tokens": 946.0, + "Metric_response_tokens": 4099.0, + "total_cost": 0.00126439, + "input_cost": 7.568000000000001e-5, + "output_cost": 0.00118871 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.161836, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Duration": 128.260322, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 9609.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 7089.0, + "total_cost": 0.00225741, + "input_cost": 0.00020160000000000002, + "output_cost": 0.00205581 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.427731, + "Duration": 113.896917, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19318.0, - "provider": "Google", - "Metric_request_tokens": 13508.0, - "Metric_response_tokens": 5810.0, - "total_cost": 0.026967400000000002, - "input_cost": 0.004052399999999999, - "output_cost": 0.022915 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8764.0, + "provider": "OSS", + "Metric_request_tokens": 2520.0, + "Metric_response_tokens": 6244.0, + "total_cost": 0.00201236, + "input_cost": 0.00020160000000000002, + "output_cost": 0.00181076 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.665431, + "Duration": 130.652418, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9023.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 4936.0, - "total_cost": 0.0229061, - "input_cost": 0.0012261000000000001, - "output_cost": 0.021679999999999998 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 9697.0, + "provider": "OSS", + "Metric_request_tokens": 2521.0, + "Metric_response_tokens": 7176.0, + "total_cost": 0.00228272, + "input_cost": 0.00020167999999999998, + "output_cost": 0.0020810399999999997 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 39.694036, + "Duration": 74.239428, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "total_tokens": 5059.0, + "provider": "OSS", + "Metric_request_tokens": 945.0, + "Metric_response_tokens": 4114.0, + "total_cost": 0.00126866, + "input_cost": 7.56e-5, + "output_cost": 0.00119306 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen3-30b-a3b-thinking-2507-mlx", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 2.110298, + "Duration": 66.339824, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 8464.0, + "provider": "OSS", + "Metric_request_tokens": 4917.0, + "Metric_response_tokens": 3547.0, + "total_cost": 0.0014219899999999997, + "input_cost": 0.00039336000000000004, + "output_cost": 0.0010286299999999998 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 29.774367, - "Score_MermaidDiagramValid": 1.0, + "Duration": 44.589993, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 65029.0, - "provider": "Google", - "Metric_request_tokens": 59634.0, - "Metric_response_tokens": 5395.0, - "total_cost": 0.0401827, - "input_cost": 0.0178902, - "output_cost": 0.0222925 + "total_tokens": 4284.0, + "provider": "OSS", + "Metric_request_tokens": 3248.0, + "Metric_response_tokens": 1036.0, + "total_cost": 0.00056028, + "input_cost": 0.00025984, + "output_cost": 0.00030043999999999996 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 41.44486, + "Duration": 22.39458, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4276.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1039.0, + "total_cost": 0.00056027, + "input_cost": 0.00025896, + "output_cost": 0.00030131 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.833202, + "Duration": 22.181891, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 19792.0, - "provider": "Google", - "Metric_request_tokens": 13455.0, - "Metric_response_tokens": 6337.0, - "total_cost": 0.029679000000000004, - "input_cost": 0.0040365, - "output_cost": 0.025642500000000002 + "total_tokens": 4265.0, + "provider": "OSS", + "Metric_request_tokens": 3240.0, + "Metric_response_tokens": 1025.0, + "total_cost": 0.00055645, + "input_cost": 0.0002592, + "output_cost": 0.00029725 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 31.837097, - "Score_MermaidDiagramValid": 1.0, + "Duration": 22.551817, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 64441.0, - "provider": "Google", - "Metric_request_tokens": 59551.0, - "Metric_response_tokens": 4890.0, - "total_cost": 0.0376353, - "input_cost": 0.0178653, - "output_cost": 0.01977 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 45.30594, + "Duration": 22.115199, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4262.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1024.0, + "total_cost": 0.000556, + "input_cost": 0.00025904, + "output_cost": 0.00029696 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 33.692928, + "Duration": 22.563389, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4286.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1044.0, + "total_cost": 0.0005621199999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00030275999999999995 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.23502, + "Duration": 22.684495, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4299.0, + "provider": "OSS", + "Metric_request_tokens": 3250.0, + "Metric_response_tokens": 1049.0, + "total_cost": 0.0005642099999999999, + "input_cost": 0.00026, + "output_cost": 0.00030420999999999996 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.989877, + "Duration": 23.310987, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, - "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4260.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1023.0, + "total_cost": 0.00055563, + "input_cost": 0.00025896, + "output_cost": 0.00029667 }, { - "Model": "gemini-2.5-flash-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 40.060316, + "Duration": 22.3945, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 21124.0, - "provider": "Google", - "Metric_request_tokens": 13386.0, - "Metric_response_tokens": 7738.0, - "total_cost": 0.0367308, - "input_cost": 0.0040158, - "output_cost": 0.032715 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.215447, + "Duration": 22.6648, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5446.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 1360.0, - "total_cost": 0.0009526000000000002, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0005440000000000001 + "total_tokens": 4299.0, + "provider": "OSS", + "Metric_request_tokens": 3250.0, + "Metric_response_tokens": 1049.0, + "total_cost": 0.0005642099999999999, + "input_cost": 0.00026, + "output_cost": 0.00030420999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.900561, + "Duration": 22.80001, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4506.0, - "provider": "Google", - "Metric_request_tokens": 3213.0, - "Metric_response_tokens": 1293.0, - "total_cost": 0.0008385000000000002, - "input_cost": 0.00032130000000000006, - "output_cost": 0.0005172000000000001 + "total_tokens": 4276.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1039.0, + "total_cost": 0.00056027, + "input_cost": 0.00025896, + "output_cost": 0.00030131 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.647086, + "Duration": 22.441483, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4558.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1348.0, - "total_cost": 0.0008602, - "input_cost": 0.00032100000000000005, - "output_cost": 0.0005392 + "total_tokens": 4265.0, + "provider": "OSS", + "Metric_request_tokens": 3240.0, + "Metric_response_tokens": 1025.0, + "total_cost": 0.00055645, + "input_cost": 0.0002592, + "output_cost": 0.00029725 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.251769, + "Duration": 23.025514, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5431.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 1345.0, - "total_cost": 0.0009466000000000001, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0005380000000000001 + "total_tokens": 4299.0, + "provider": "OSS", + "Metric_request_tokens": 3250.0, + "Metric_response_tokens": 1049.0, + "total_cost": 0.0005642099999999999, + "input_cost": 0.00026, + "output_cost": 0.00030420999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.549062, + "Duration": 22.23461, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 18304.0, - "provider": "Google", - "Metric_request_tokens": 11757.0, - "Metric_response_tokens": 6547.0, - "total_cost": 0.0037945, - "input_cost": 0.0011757, - "output_cost": 0.0026188 + "total_tokens": 4262.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1024.0, + "total_cost": 0.000556, + "input_cost": 0.00025904, + "output_cost": 0.00029696 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.801133, + "Duration": 23.283626, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4563.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1355.0, - "total_cost": 0.0008628, - "input_cost": 0.0003208, - "output_cost": 0.0005420000000000001 + "total_tokens": 4282.0, + "provider": "OSS", + "Metric_request_tokens": 3240.0, + "Metric_response_tokens": 1042.0, + "total_cost": 0.00056138, + "input_cost": 0.0002592, + "output_cost": 0.00030218 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.511676, + "Duration": 22.782429, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5825.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1738.0, - "total_cost": 0.0011039, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0006952 + "total_tokens": 4284.0, + "provider": "OSS", + "Metric_request_tokens": 3248.0, + "Metric_response_tokens": 1036.0, + "total_cost": 0.00056028, + "input_cost": 0.00025984, + "output_cost": 0.00030043999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.176256, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5376.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1301.0, - "total_cost": 0.0009279000000000001, - "input_cost": 0.0004075, - "output_cost": 0.0005204000000000001 + "Duration": 23.201191, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4266.0, + "provider": "OSS", + "Metric_request_tokens": 3239.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.00055695, + "input_cost": 0.00025912, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.872751, + "Duration": 23.498245, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4539.0, - "provider": "Google", - "Metric_request_tokens": 3206.0, - "Metric_response_tokens": 1333.0, - "total_cost": 0.0008538, - "input_cost": 0.00032060000000000004, - "output_cost": 0.0005332000000000001 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.15609, + "Duration": 22.858138, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5888.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1801.0, - "total_cost": 0.0011291, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0007204000000000001 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.889272, + "Duration": 22.228085, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5350.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1268.0, - "total_cost": 0.0009154, - "input_cost": 0.0004082, - "output_cost": 0.0005072000000000001 + "total_tokens": 4260.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1023.0, + "total_cost": 0.00055563, + "input_cost": 0.00025896, + "output_cost": 0.00029667 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.012574, + "Duration": 22.213242, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5370.0, - "provider": "Google", - "Metric_request_tokens": 4077.0, - "Metric_response_tokens": 1293.0, - "total_cost": 0.0009249000000000001, - "input_cost": 0.0004077, - "output_cost": 0.0005172000000000001 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.579592, + "Duration": 22.570688, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4841.0, - "provider": "Google", - "Metric_request_tokens": 3217.0, - "Metric_response_tokens": 1624.0, - "total_cost": 0.0009713, - "input_cost": 0.0003217, - "output_cost": 0.0006496000000000001 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.866885, + "Duration": 22.157159, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5346.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1264.0, - "total_cost": 0.0009138, - "input_cost": 0.0004082, - "output_cost": 0.0005056 + "total_tokens": 4262.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1024.0, + "total_cost": 0.000556, + "input_cost": 0.00025904, + "output_cost": 0.00029696 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.845275, + "Duration": 22.256411, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5343.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1268.0, - "total_cost": 0.0009147000000000001, - "input_cost": 0.0004075, - "output_cost": 0.0005072000000000001 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.136979, + "Duration": 22.529486, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5861.0, - "provider": "Google", - "Metric_request_tokens": 4090.0, - "Metric_response_tokens": 1771.0, - "total_cost": 0.0011174000000000002, - "input_cost": 0.000409, - "output_cost": 0.0007084000000000001 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.112432, + "Duration": 22.262842, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5400.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1318.0, - "total_cost": 0.0009354000000000001, - "input_cost": 0.0004082, - "output_cost": 0.0005272 + "total_tokens": 4262.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1024.0, + "total_cost": 0.000556, + "input_cost": 0.00025904, + "output_cost": 0.00029696 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.028322, + "Duration": 22.180974, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4481.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1273.0, - "total_cost": 0.00083, - "input_cost": 0.0003208, - "output_cost": 0.0005092 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.260043, + "Duration": 22.633818, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5714.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1627.0, - "total_cost": 0.0010595000000000001, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0006508 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.62823, + "Duration": 22.45287, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4480.0, - "provider": "Google", - "Metric_request_tokens": 3213.0, - "Metric_response_tokens": 1267.0, - "total_cost": 0.0008281, - "input_cost": 0.00032130000000000006, - "output_cost": 0.0005068 + "total_tokens": 4278.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1040.0, + "total_cost": 0.0005606399999999999, + "input_cost": 0.00025904, + "output_cost": 0.00030159999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.569272, + "Duration": 22.353297, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4509.0, - "provider": "Google", - "Metric_request_tokens": 3206.0, - "Metric_response_tokens": 1303.0, - "total_cost": 0.0008418, - "input_cost": 0.00032060000000000004, - "output_cost": 0.0005212 + "total_tokens": 4265.0, + "provider": "OSS", + "Metric_request_tokens": 3240.0, + "Metric_response_tokens": 1025.0, + "total_cost": 0.00055645, + "input_cost": 0.0002592, + "output_cost": 0.00029725 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.411835, + "Duration": 22.318229, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5643.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 1557.0, - "total_cost": 0.0010314, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0006228000000000001 + "total_tokens": 4284.0, + "provider": "OSS", + "Metric_request_tokens": 3248.0, + "Metric_response_tokens": 1036.0, + "total_cost": 0.00056028, + "input_cost": 0.00025984, + "output_cost": 0.00030043999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.118662, + "Duration": 22.521446, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4696.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1488.0, - "total_cost": 0.000916, - "input_cost": 0.0003208, - "output_cost": 0.0005952 + "total_tokens": 4278.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1040.0, + "total_cost": 0.0005606399999999999, + "input_cost": 0.00025904, + "output_cost": 0.00030159999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.876956, + "Duration": 22.149108, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5328.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1253.0, - "total_cost": 0.0009087, - "input_cost": 0.0004075, - "output_cost": 0.0005012 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.165924, + "Duration": 22.549563, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5628.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1541.0, - "total_cost": 0.0010251000000000001, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0006164 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.339767, + "Duration": 23.723442, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4434.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1226.0, - "total_cost": 0.0008112000000000001, - "input_cost": 0.0003208, - "output_cost": 0.0004904000000000001 + "total_tokens": 4278.0, + "provider": "OSS", + "Metric_request_tokens": 3238.0, + "Metric_response_tokens": 1040.0, + "total_cost": 0.0005606399999999999, + "input_cost": 0.00025904, + "output_cost": 0.00030159999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.04924, + "Duration": 22.185447, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4630.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1422.0, - "total_cost": 0.0008896, - "input_cost": 0.0003208, - "output_cost": 0.0005688000000000001 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.26326, + "Duration": 22.548306, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5715.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1628.0, - "total_cost": 0.0010599000000000001, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0006512000000000001 + "total_tokens": 4293.0, + "provider": "OSS", + "Metric_request_tokens": 3247.0, + "Metric_response_tokens": 1046.0, + "total_cost": 0.0005631, + "input_cost": 0.00025976, + "output_cost": 0.00030334 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.144055, + "Duration": 22.48174, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5510.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1435.0, - "total_cost": 0.0009815000000000002, - "input_cost": 0.0004075, - "output_cost": 0.0005740000000000001 + "total_tokens": 4284.0, + "provider": "OSS", + "Metric_request_tokens": 3240.0, + "Metric_response_tokens": 1044.0, + "total_cost": 0.00056196, + "input_cost": 0.0002592, + "output_cost": 0.00030275999999999995 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.860893, + "Duration": 22.228258, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5344.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1269.0, - "total_cost": 0.0009151, - "input_cost": 0.0004075, - "output_cost": 0.0005076 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.738872, + "Duration": 22.477072, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5531.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1444.0, - "total_cost": 0.0009863, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0005776 + "total_tokens": 4284.0, + "provider": "OSS", + "Metric_request_tokens": 3248.0, + "Metric_response_tokens": 1036.0, + "total_cost": 0.00056028, + "input_cost": 0.00025984, + "output_cost": 0.00030043999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.025443, + "Duration": 22.405864, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4496.0, - "provider": "Google", - "Metric_request_tokens": 3206.0, - "Metric_response_tokens": 1290.0, - "total_cost": 0.0008366000000000001, - "input_cost": 0.00032060000000000004, - "output_cost": 0.000516 + "total_tokens": 4276.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1039.0, + "total_cost": 0.00056027, + "input_cost": 0.00025896, + "output_cost": 0.00030131 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 8.195003, + "Duration": 22.489404, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5789.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 2581.0, - "total_cost": 0.0013532, - "input_cost": 0.0003208, - "output_cost": 0.0010324 + "total_tokens": 4286.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1044.0, + "total_cost": 0.0005621199999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00030275999999999995 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.925279, + "Duration": 22.589341, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5623.0, - "provider": "Google", - "Metric_request_tokens": 4087.0, - "Metric_response_tokens": 1536.0, - "total_cost": 0.0010231, - "input_cost": 0.00040870000000000007, - "output_cost": 0.0006144000000000001 + "total_tokens": 4299.0, + "provider": "OSS", + "Metric_request_tokens": 3250.0, + "Metric_response_tokens": 1049.0, + "total_cost": 0.0005642099999999999, + "input_cost": 0.00026, + "output_cost": 0.00030420999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.235465, + "Duration": 22.555942, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5293.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1211.0, - "total_cost": 0.0008926000000000001, - "input_cost": 0.0004082, - "output_cost": 0.00048440000000000006 + "total_tokens": 4276.0, + "provider": "OSS", + "Metric_request_tokens": 3237.0, + "Metric_response_tokens": 1039.0, + "total_cost": 0.00056027, + "input_cost": 0.00025896, + "output_cost": 0.00030131 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "qwen/qwen3-30b-a3b-2507", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 8.433263, + "Duration": 22.115462, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10652.0, - "provider": "Google", - "Metric_request_tokens": 7846.0, - "Metric_response_tokens": 2806.0, - "total_cost": 0.001907, - "input_cost": 0.0007846000000000001, - "output_cost": 0.0011224 + "total_tokens": 4269.0, + "provider": "OSS", + "Metric_request_tokens": 3242.0, + "Metric_response_tokens": 1027.0, + "total_cost": 0.0005571899999999999, + "input_cost": 0.00025936000000000004, + "output_cost": 0.00029782999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.637824, + "Duration": 106.689721, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5585.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 1499.0, - "total_cost": 0.0010082000000000001, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0005996 + "total_tokens": 6511.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 820.0, + "total_cost": 0.00064339, + "input_cost": 0.0005121899999999999, + "output_cost": 0.0001312 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.961917, + "Duration": 135.103735, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5394.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1312.0, - "total_cost": 0.000933, - "input_cost": 0.0004082, - "output_cost": 0.0005248000000000001 + "total_tokens": 7064.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1370.0, + "total_cost": 0.00073166, + "input_cost": 0.00051246, + "output_cost": 0.0002192 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.664187, + "Duration": 131.253167, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5329.0, - "provider": "Google", - "Metric_request_tokens": 4075.0, - "Metric_response_tokens": 1254.0, - "total_cost": 0.0009090999999999999, - "input_cost": 0.0004075, - "output_cost": 0.0005015999999999999 + "total_tokens": 7028.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1334.0, + "total_cost": 0.0007258999999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021344 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.58097, + "Duration": 96.853991, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5603.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 1517.0, - "total_cost": 0.0010154, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0006068 + "total_tokens": 7014.0, + "provider": "OSS", + "Metric_request_tokens": 6242.0, + "Metric_response_tokens": 772.0, + "total_cost": 0.0006853, + "input_cost": 0.00056178, + "output_cost": 0.00012352 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.893051, + "Duration": 133.779222, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5331.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1249.0, - "total_cost": 0.0009078000000000001, - "input_cost": 0.0004082, - "output_cost": 0.0004996 + "total_tokens": 7047.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1353.0, + "total_cost": 0.00072894, + "input_cost": 0.00051246, + "output_cost": 0.00021648000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.497755, + "Duration": 93.312005, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5531.0, - "provider": "Google", - "Metric_request_tokens": 3206.0, - "Metric_response_tokens": 2325.0, - "total_cost": 0.0012506, - "input_cost": 0.00032060000000000004, - "output_cost": 0.0009299999999999999 + "total_tokens": 6475.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 781.0, + "total_cost": 0.00063742, + "input_cost": 0.00051246, + "output_cost": 0.00012496 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.960675, + "Duration": 93.907787, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7000.0, - "provider": "Google", - "Metric_request_tokens": 4086.0, - "Metric_response_tokens": 2914.0, - "total_cost": 0.0015742, - "input_cost": 0.00040860000000000007, - "output_cost": 0.0011656 + "total_tokens": 6486.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 795.0, + "total_cost": 0.0006393899999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.0001272 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.787697, + "Duration": 129.138106, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5354.0, - "provider": "Google", - "Metric_request_tokens": 4082.0, - "Metric_response_tokens": 1272.0, - "total_cost": 0.0009170000000000001, - "input_cost": 0.0004082, - "output_cost": 0.0005088 + "total_tokens": 7000.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1306.0, + "total_cost": 0.00072142, + "input_cost": 0.00051246, + "output_cost": 0.00020896 }, { - "Model": "gemini-2.5-flash-lite-preview-09-2025", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.830706, + "Duration": 128.769237, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5382.0, - "provider": "Google", - "Metric_request_tokens": 4079.0, - "Metric_response_tokens": 1303.0, - "total_cost": 0.0009291, - "input_cost": 0.00040790000000000005, - "output_cost": 0.0005212 + "total_tokens": 6992.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1298.0, + "total_cost": 0.00072014, + "input_cost": 0.00051246, + "output_cost": 0.00020768 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.399291, + "Duration": 92.960291, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55783.0, - "provider": "Google", - "Metric_request_tokens": 54588.0, - "Metric_response_tokens": 1195.0, - "total_cost": 0.0059368, - "input_cost": 0.0054588, - "output_cost": 0.00047800000000000007 + "total_tokens": 6456.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 765.0, + "total_cost": 0.00063459, + "input_cost": 0.0005121899999999999, + "output_cost": 0.0001224 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.992778, + "Duration": 130.396503, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3911.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 700.0, - "total_cost": 0.0006011, - "input_cost": 0.0003211, - "output_cost": 0.00028000000000000003 + "total_tokens": 7017.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1323.0, + "total_cost": 0.00072414, + "input_cost": 0.00051246, + "output_cost": 0.00021168 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.646862, + "Duration": 131.657691, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4409.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1200.0, - "total_cost": 0.0008009, - "input_cost": 0.00032090000000000005, - "output_cost": 0.00047999999999999996 + "total_tokens": 7032.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1338.0, + "total_cost": 0.0007265399999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021408 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.466695, - "Score_MermaidDiagramValid": 1.0, + "Duration": 93.323413, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4443.0, - "provider": "Google", - "Metric_request_tokens": 3218.0, - "Metric_response_tokens": 1225.0, - "total_cost": 0.0008118, - "input_cost": 0.0003218, - "output_cost": 0.00049 + "total_tokens": 6478.0, + "provider": "OSS", + "Metric_request_tokens": 5690.0, + "Metric_response_tokens": 788.0, + "total_cost": 0.0006381799999999999, + "input_cost": 0.0005120999999999999, + "output_cost": 0.00012607999999999999 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.38993, + "Duration": 130.551754, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4398.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1187.0, - "total_cost": 0.0007959, - "input_cost": 0.0003211, - "output_cost": 0.0004748 + "total_tokens": 7020.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1326.0, + "total_cost": 0.0007246199999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021216 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.511308, + "Duration": 134.125181, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4408.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1197.0, - "total_cost": 0.0007999, - "input_cost": 0.0003211, - "output_cost": 0.0004788 + "total_tokens": 7026.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1332.0, + "total_cost": 0.00072558, + "input_cost": 0.00051246, + "output_cost": 0.00021312000000000002 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.636605, + "Duration": 96.120786, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3902.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 683.0, - "total_cost": 0.0005951000000000001, - "input_cost": 0.0003219, - "output_cost": 0.0002732 + "total_tokens": 6518.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 827.0, + "total_cost": 0.0006445099999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00013232 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.076843, + "Duration": 92.852892, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3891.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 680.0, - "total_cost": 0.0005931, - "input_cost": 0.0003211, - "output_cost": 0.00027200000000000005 + "total_tokens": 6475.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 781.0, + "total_cost": 0.00063742, + "input_cost": 0.00051246, + "output_cost": 0.00012496 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.715582, + "Duration": 130.821102, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4411.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0008017, - "input_cost": 0.00032090000000000005, - "output_cost": 0.0004808 + "total_tokens": 7019.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1325.0, + "total_cost": 0.0007244599999999999, + "input_cost": 0.00051246, + "output_cost": 0.000212 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.772443, + "Duration": 95.015366, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55774.0, - "provider": "Google", - "Metric_request_tokens": 54589.0, - "Metric_response_tokens": 1185.0, - "total_cost": 0.0059329, - "input_cost": 0.0054589, - "output_cost": 0.00047400000000000003 + "total_tokens": 6502.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 811.0, + "total_cost": 0.00064195, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012976 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.253199, + "Duration": 130.521105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3888.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 678.0, - "total_cost": 0.0005922000000000001, - "input_cost": 0.00032100000000000005, - "output_cost": 0.00027120000000000003 + "total_tokens": 7019.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1325.0, + "total_cost": 0.0007244599999999999, + "input_cost": 0.00051246, + "output_cost": 0.000212 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.156194, + "Duration": 130.625672, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4408.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.0008002, - "input_cost": 0.00032100000000000005, - "output_cost": 0.00047920000000000005 + "total_tokens": 7021.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1327.0, + "total_cost": 0.00072478, + "input_cost": 0.00051246, + "output_cost": 0.00021232 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.078822, + "Duration": 93.892455, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55769.0, - "provider": "Google", - "Metric_request_tokens": 54588.0, - "Metric_response_tokens": 1181.0, - "total_cost": 0.005931199999999999, - "input_cost": 0.0054588, - "output_cost": 0.0004724 + "total_tokens": 6487.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 796.0, + "total_cost": 0.0006395499999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012736 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.132538, + "Duration": 132.023228, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4416.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1205.0, - "total_cost": 0.0008031, - "input_cost": 0.0003211, - "output_cost": 0.000482 + "total_tokens": 7022.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1328.0, + "total_cost": 0.00072494, + "input_cost": 0.00051246, + "output_cost": 0.00021248 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.352614, + "Duration": 130.705318, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4402.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1194.0, - "total_cost": 0.0007984, - "input_cost": 0.0003208, - "output_cost": 0.0004776 + "total_tokens": 7022.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1328.0, + "total_cost": 0.00072494, + "input_cost": 0.00051246, + "output_cost": 0.00021248 }, - { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_easy", - "test_group": "easy", - "Duration": 7.218354, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55785.0, - "provider": "Google", - "Metric_request_tokens": 54589.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0059373, - "input_cost": 0.0054589, - "output_cost": 0.00047840000000000003 + { + "Model": "google/gemma-3-27b", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 95.077056, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6505.0, + "provider": "OSS", + "Metric_request_tokens": 5693.0, + "Metric_response_tokens": 812.0, + "total_cost": 0.00064229, + "input_cost": 0.00051237, + "output_cost": 0.00012992 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 7.11734, + "Duration": 133.879, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4410.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0008016, - "input_cost": 0.0003208, - "output_cost": 0.0004808 + "total_tokens": 7066.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1372.0, + "total_cost": 0.0007319799999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021951999999999999 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.290457, + "Duration": 128.580821, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3901.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 693.0, - "total_cost": 0.000598, - "input_cost": 0.0003208, - "output_cost": 0.0002772 + "total_tokens": 6992.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1298.0, + "total_cost": 0.00072014, + "input_cost": 0.00051246, + "output_cost": 0.00020768 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.43899, + "Duration": 97.960615, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55785.0, - "provider": "Google", - "Metric_request_tokens": 54589.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0059373, - "input_cost": 0.0054589, - "output_cost": 0.00047840000000000003 + "total_tokens": 7024.0, + "provider": "OSS", + "Metric_request_tokens": 6242.0, + "Metric_response_tokens": 782.0, + "total_cost": 0.0006869, + "input_cost": 0.00056178, + "output_cost": 0.00012512 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.882767, + "Duration": 131.07977, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3923.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 712.0, - "total_cost": 0.0006058999999999999, - "input_cost": 0.0003211, - "output_cost": 0.0002848 + "total_tokens": 7027.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1333.0, + "total_cost": 0.0007257399999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021328 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.189799, + "Duration": 130.312166, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3913.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 705.0, - "total_cost": 0.0006028, - "input_cost": 0.0003208, - "output_cost": 0.000282 + "total_tokens": 7017.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1323.0, + "total_cost": 0.00072414, + "input_cost": 0.00051246, + "output_cost": 0.00021168 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.863021, + "Duration": 94.397851, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55806.0, - "provider": "Google", - "Metric_request_tokens": 54588.0, - "Metric_response_tokens": 1218.0, - "total_cost": 0.005946, - "input_cost": 0.0054588, - "output_cost": 0.00048719999999999997 + "total_tokens": 6479.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 788.0, + "total_cost": 0.0006382699999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012607999999999999 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.779658, + "Duration": 130.946353, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4381.0, - "provider": "Google", - "Metric_request_tokens": 3194.0, - "Metric_response_tokens": 1187.0, - "total_cost": 0.0007942, - "input_cost": 0.0003194, - "output_cost": 0.0004748 + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7016.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1322.0, + "total_cost": 0.0007239799999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021152 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.355412, + "Duration": 131.627735, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4410.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1201.0, - "total_cost": 0.0008013, - "input_cost": 0.00032090000000000005, - "output_cost": 0.0004804 + "total_tokens": 7033.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1339.0, + "total_cost": 0.0007266999999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021423999999999998 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.824842, + "Duration": 99.097172, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4007.0, - "provider": "Google", - "Metric_request_tokens": 3318.0, - "Metric_response_tokens": 689.0, - "total_cost": 0.0006074000000000001, - "input_cost": 0.00033180000000000004, - "output_cost": 0.00027560000000000003 + "total_tokens": 7027.0, + "provider": "OSS", + "Metric_request_tokens": 6242.0, + "Metric_response_tokens": 785.0, + "total_cost": 0.00068738, + "input_cost": 0.00056178, + "output_cost": 0.0001256 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.020729, + "Duration": 130.803492, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4409.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.0008003000000000001, - "input_cost": 0.0003211, - "output_cost": 0.00047920000000000005 + "total_tokens": 7021.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1327.0, + "total_cost": 0.00072478, + "input_cost": 0.00051246, + "output_cost": 0.00021232 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.344559, + "Duration": 134.279998, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4406.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1197.0, - "total_cost": 0.0007997, - "input_cost": 0.00032090000000000005, - "output_cost": 0.0004788 + "total_tokens": 7064.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1370.0, + "total_cost": 0.00073166, + "input_cost": 0.00051246, + "output_cost": 0.0002192 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.45661, + "Duration": 94.222779, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55777.0, - "provider": "Google", - "Metric_request_tokens": 54589.0, - "Metric_response_tokens": 1188.0, - "total_cost": 0.0059341, - "input_cost": 0.0054589, - "output_cost": 0.00047520000000000006 + "total_tokens": 6487.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 796.0, + "total_cost": 0.0006395499999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012736 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.430851, + "Duration": 132.05352, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4400.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1189.0, - "total_cost": 0.0007967, - "input_cost": 0.0003211, - "output_cost": 0.0004756 + "total_tokens": 7020.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1326.0, + "total_cost": 0.0007246199999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021216 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.245205, + "Duration": 95.574459, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4399.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1188.0, - "total_cost": 0.0007963, - "input_cost": 0.0003211, - "output_cost": 0.00047520000000000006 + "total_tokens": 6513.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 819.0, + "total_cost": 0.0006435, + "input_cost": 0.00051246, + "output_cost": 0.00013104 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.328961, + "Duration": 94.656839, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4429.0, - "provider": "Google", - "Metric_request_tokens": 3218.0, - "Metric_response_tokens": 1211.0, - "total_cost": 0.0008062000000000001, - "input_cost": 0.0003218, - "output_cost": 0.00048440000000000006 + "total_tokens": 6494.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 803.0, + "total_cost": 0.0006406699999999999, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012848 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 7.136504, + "Duration": 133.471015, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4408.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1197.0, - "total_cost": 0.0007999, - "input_cost": 0.0003211, - "output_cost": 0.0004788 + "total_tokens": 7058.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1364.0, + "total_cost": 0.0007306999999999999, + "input_cost": 0.00051246, + "output_cost": 0.00021824 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.226974, + "Duration": 282.170394, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4384.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.0007911999999999999, - "input_cost": 0.0003208, - "output_cost": 0.0004704 + "total_tokens": 11876.0, + "provider": "OSS", + "Metric_request_tokens": 8781.0, + "Metric_response_tokens": 3095.0, + "total_cost": 0.00128549, + "input_cost": 0.00079029, + "output_cost": 0.0004952 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.927981, + "Duration": 93.236155, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 55786.0, - "provider": "Google", - "Metric_request_tokens": 54589.0, - "Metric_response_tokens": 1197.0, - "total_cost": 0.0059377, - "input_cost": 0.0054589, - "output_cost": 0.0004788 + "total_tokens": 6455.0, + "provider": "OSS", + "Metric_request_tokens": 5691.0, + "Metric_response_tokens": 764.0, + "total_cost": 0.00063443, + "input_cost": 0.0005121899999999999, + "output_cost": 0.00012224 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.433176, + "Duration": 133.413174, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4405.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007993000000000001, - "input_cost": 0.00032090000000000005, - "output_cost": 0.00047840000000000003 + "total_tokens": 7057.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1363.0, + "total_cost": 0.00073054, + "input_cost": 0.00051246, + "output_cost": 0.00021808000000000003 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "google/gemma-3-27b", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.001402, + "Duration": 128.689856, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4409.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 1198.0, - "total_cost": 0.0008003000000000001, - "input_cost": 0.0003211, - "output_cost": 0.00047920000000000005 + "total_tokens": 6989.0, + "provider": "OSS", + "Metric_request_tokens": 5694.0, + "Metric_response_tokens": 1295.0, + "total_cost": 0.00071966, + "input_cost": 0.00051246, + "output_cost": 0.0002072 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "magistral-small-2509-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.185808, - "Score_MermaidDiagramValid": 1.0, + "Duration": 221.478146, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4489.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1270.0, - "total_cost": 0.0008299000000000002, - "input_cost": 0.0003219, - "output_cost": 0.0005080000000000001 + "total_tokens": 6400.0, + "provider": "OSS", + "Metric_request_tokens": 4338.0, + "Metric_response_tokens": 2062.0, + "total_cost": 0.005262, + "input_cost": 0.002169, + "output_cost": 0.0030930000000000003 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 6.921527, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 931.613421, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4467.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 1258.0, - "total_cost": 0.0008241, - "input_cost": 0.00032090000000000005, - "output_cost": 0.0005032 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 6.136839, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 953.253205, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4409.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1199.0, - "total_cost": 0.0008006, - "input_cost": 0.00032100000000000005, - "output_cost": 0.0004796 + "total_tokens": 15129.0, + "provider": "OSS", + "Metric_request_tokens": 6813.0, + "Metric_response_tokens": 8316.0, + "total_cost": 0.0158805, + "input_cost": 0.0034065, + "output_cost": 0.012474 + }, + { + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 560.604626, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 679.858581, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "magistral-small-2509-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.47105, + "Duration": 232.473946, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4427.0, - "provider": "Google", - "Metric_request_tokens": 3218.0, - "Metric_response_tokens": 1209.0, - "total_cost": 0.0008054000000000001, - "input_cost": 0.0003218, - "output_cost": 0.00048360000000000005 + "total_tokens": 5861.0, + "provider": "OSS", + "Metric_request_tokens": 3764.0, + "Metric_response_tokens": 2097.0, + "total_cost": 0.0050275, + "input_cost": 0.001882, + "output_cost": 0.0031455 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 6.204263, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 616.020088, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4412.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1202.0, - "total_cost": 0.0008018000000000001, - "input_cost": 0.00032100000000000005, - "output_cost": 0.0004808 + "total_tokens": 12554.0, + "provider": "OSS", + "Metric_request_tokens": 6841.0, + "Metric_response_tokens": 5713.0, + "total_cost": 0.01199, + "input_cost": 0.0034205, + "output_cost": 0.0085695 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 4.909809, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 670.158492, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3889.0, - "provider": "Google", - "Metric_request_tokens": 3208.0, - "Metric_response_tokens": 681.0, - "total_cost": 0.0005932, - "input_cost": 0.0003208, - "output_cost": 0.0002724 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", + "Model": "magistral-small-2509-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.411182, + "Duration": 736.851506, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4421.0, - "provider": "Google", - "Metric_request_tokens": 3218.0, - "Metric_response_tokens": 1203.0, - "total_cost": 0.0008030000000000001, - "input_cost": 0.0003218, - "output_cost": 0.00048120000000000004 + "total_tokens": 14963.0, + "provider": "OSS", + "Metric_request_tokens": 8225.0, + "Metric_response_tokens": 6738.0, + "total_cost": 0.0142195, + "input_cost": 0.0041125, + "output_cost": 0.010107 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 5.180393, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 651.311046, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3940.0, - "provider": "Google", - "Metric_request_tokens": 3211.0, - "Metric_response_tokens": 729.0, - "total_cost": 0.0006127000000000001, - "input_cost": 0.0003211, - "output_cost": 0.00029160000000000004 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash-lite", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 5.239312, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 790.215271, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4025.0, - "provider": "Google", - "Metric_request_tokens": 3209.0, - "Metric_response_tokens": 816.0, - "total_cost": 0.0006473000000000001, - "input_cost": 0.00032090000000000005, - "output_cost": 0.0003264 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "magistral-small-2509-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 21.349486, + "Duration": 161.134766, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4849.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1630.0, - "total_cost": 0.024883750000000003, - "input_cost": 0.00402375, - "output_cost": 0.020860000000000004 + "total_tokens": 5225.0, + "provider": "OSS", + "Metric_request_tokens": 3719.0, + "Metric_response_tokens": 1506.0, + "total_cost": 0.0041185, + "input_cost": 0.0018595, + "output_cost": 0.0022589999999999997 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 23.964467, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 669.20072, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5597.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2387.0, - "total_cost": 0.0402125, - "input_cost": 0.0040125000000000004, - "output_cost": 0.036199999999999996 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 20.409625, + "Model": "magistral-small-2509-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 575.884068, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5199.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1989.0, - "total_cost": 0.0320625, - "input_cost": 0.0040125000000000004, - "output_cost": 0.028050000000000002 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "magistral-small-2509-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.195388, - "Score_MermaidDiagramValid": 1.0, + "Duration": 282.838466, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 60998.0, - "provider": "Google", - "Metric_request_tokens": 57624.0, - "Metric_response_tokens": 3374.0, - "total_cost": 0.12099, - "input_cost": 0.07203, - "output_cost": 0.048960000000000004 + "total_tokens": 6443.0, + "provider": "OSS", + "Metric_request_tokens": 3758.0, + "Metric_response_tokens": 2685.0, + "total_cost": 0.0059065, + "input_cost": 0.001879, + "output_cost": 0.0040275 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 25.042637, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 222.118945, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5164.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1954.0, - "total_cost": 0.0315525, - "input_cost": 0.0040125000000000004, - "output_cost": 0.02754 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4939.0, + "provider": "OSS", + "Metric_request_tokens": 3432.0, + "Metric_response_tokens": 1507.0, + "total_cost": 0.00157971, + "input_cost": 0.00072072, + "output_cost": 0.00085899 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 16.228586, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 34.573083, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4880.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1670.0, - "total_cost": 0.025682500000000004, - "input_cost": 0.0040125000000000004, - "output_cost": 0.021670000000000002 + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1173.0, + "provider": "OSS", + "Metric_request_tokens": 904.0, + "Metric_response_tokens": 269.0, + "total_cost": 0.00034316999999999996, + "input_cost": 0.00018983999999999998, + "output_cost": 0.00015332999999999997 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.416002, + "Duration": 616.861841, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5374.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 2155.0, - "total_cost": 0.035353749999999996, - "input_cost": 0.00402375, - "output_cost": 0.03133 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 34.89023, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 260.045248, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6708.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 3498.0, - "total_cost": 0.062262500000000005, - "input_cost": 0.0040125000000000004, - "output_cost": 0.05825 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5633.0, + "provider": "OSS", + "Metric_request_tokens": 3735.0, + "Metric_response_tokens": 1898.0, + "total_cost": 0.0018662099999999997, + "input_cost": 0.00078435, + "output_cost": 0.0010818599999999998 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 18.19492, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 472.29148, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5057.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1847.0, - "total_cost": 0.029242500000000005, - "input_cost": 0.0040125000000000004, - "output_cost": 0.025230000000000002 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 45.85591, + "Duration": 264.236543, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 62754.0, - "provider": "Google", - "Metric_request_tokens": 58599.0, - "Metric_response_tokens": 4155.0, - "total_cost": 0.13779875, - "input_cost": 0.07324875, - "output_cost": 0.06455 + "total_tokens": 8195.0, + "provider": "OSS", + "Metric_request_tokens": 6275.0, + "Metric_response_tokens": 1920.0, + "total_cost": 0.00241215, + "input_cost": 0.00131775, + "output_cost": 0.0010944 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.491191, - "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4990.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1780.0, - "total_cost": 0.027902500000000004, - "input_cost": 0.0040125000000000004, - "output_cost": 0.02389 - }, - { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 25.118851, + "Duration": 1111.227472, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5685.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2475.0, - "total_cost": 0.041842500000000005, - "input_cost": 0.0040125000000000004, - "output_cost": 0.03783 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.464079, + "Duration": 372.39031, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5126.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1907.0, - "total_cost": 0.03042375, - "input_cost": 0.00402375, - "output_cost": 0.0264 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 24.405728, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 245.610536, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5297.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2087.0, - "total_cost": 0.0340425, - "input_cost": 0.0040125000000000004, - "output_cost": 0.03003 + "total_tokens": 6720.0, + "provider": "OSS", + "Metric_request_tokens": 4952.0, + "Metric_response_tokens": 1768.0, + "total_cost": 0.0020476799999999996, + "input_cost": 0.00103992, + "output_cost": 0.00100776 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 67.674063, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 343.635407, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "OSS", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -15126,95 +20211,95 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 21.024953, - "Score_MermaidDiagramValid": 1.0, + "Duration": 223.604289, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5326.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 2107.0, - "total_cost": 0.03420375, - "input_cost": 0.00402375, - "output_cost": 0.03018 + "total_tokens": 5995.0, + "provider": "OSS", + "Metric_request_tokens": 4405.0, + "Metric_response_tokens": 1590.0, + "total_cost": 0.0018313499999999998, + "input_cost": 0.00092505, + "output_cost": 0.0009063 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 34.71675, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 617.463283, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6803.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 3593.0, - "total_cost": 0.0641625, - "input_cost": 0.0040125000000000004, - "output_cost": 0.06015 + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 18.749296, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 215.414808, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5129.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1919.0, - "total_cost": 0.0307425, - "input_cost": 0.0040125000000000004, - "output_cost": 0.02673 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4941.0, + "provider": "OSS", + "Metric_request_tokens": 3407.0, + "Metric_response_tokens": 1534.0, + "total_cost": 0.0015898499999999999, + "input_cost": 0.00071547, + "output_cost": 0.0008743799999999999 }, { - "Model": "gemini-2.5-pro", + "Model": "seed-oss-36b-instruct-mlx", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 36.35102, - "Score_MermaidDiagramValid": 1.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 62398.0, - "provider": "Google", - "Metric_request_tokens": 59103.0, - "Metric_response_tokens": 3295.0, - "total_cost": 0.12099875000000002, - "input_cost": 0.07387875, - "output_cost": 0.04712 + "Duration": 531.784582, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 47.352071, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 387.793645, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 16984.0, - "provider": "Google", - "Metric_request_tokens": 12617.0, - "Metric_response_tokens": 4367.0, - "total_cost": 0.07872125, - "input_cost": 0.01577125, - "output_cost": 0.06295 + "total_tokens": 11259.0, + "provider": "OSS", + "Metric_request_tokens": 8599.0, + "Metric_response_tokens": 2660.0, + "total_cost": 0.00332199, + "input_cost": 0.00180579, + "output_cost": 0.0015161999999999999 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 80.205807, + "Model": "seed-oss-36b-instruct-mlx", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 432.305206, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "OSS", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -15222,143 +20307,175 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "xlam-2-32b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.973289, - "Score_MermaidDiagramValid": 1.0, + "Duration": 211.541775, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 61926.0, - "provider": "Google", - "Metric_request_tokens": 58756.0, - "Metric_response_tokens": 3170.0, - "total_cost": 0.118345, - "input_cost": 0.07344500000000001, - "output_cost": 0.0449 + "total_tokens": 11338.0, + "provider": "OSS", + "Metric_request_tokens": 10242.0, + "Metric_response_tokens": 1096.0, + "total_cost": 0.00058504, + "input_cost": 0.00040968, + "output_cost": 0.00017536 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 17.276555, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 128.883973, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4913.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1703.0, - "total_cost": 0.0265325, - "input_cost": 0.0040125000000000004, - "output_cost": 0.022520000000000002 + "total_tokens": 7333.0, + "provider": "OSS", + "Metric_request_tokens": 6720.0, + "Metric_response_tokens": 613.0, + "total_cost": 0.00036688000000000004, + "input_cost": 0.00026880000000000003, + "output_cost": 9.808000000000001e-5 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 22.695808, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 201.307022, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5513.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2303.0, - "total_cost": 0.038402500000000006, - "input_cost": 0.0040125000000000004, - "output_cost": 0.034390000000000004 + "total_tokens": 7840.0, + "provider": "OSS", + "Metric_request_tokens": 6726.0, + "Metric_response_tokens": 1114.0, + "total_cost": 0.00044728, + "input_cost": 0.00026904, + "output_cost": 0.00017824 }, { - "Model": "gemini-2.5-pro", + "Model": "xlam-2-32b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.120604, - "Score_MermaidDiagramValid": 1.0, + "Duration": 149.400551, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5607.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 2388.0, - "total_cost": 0.03999375, - "input_cost": 0.00402375, - "output_cost": 0.03597 + "total_tokens": 10970.0, + "provider": "OSS", + "Metric_request_tokens": 10237.0, + "Metric_response_tokens": 733.0, + "total_cost": 0.00052676, + "input_cost": 0.00040948, + "output_cost": 0.00011728 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 30.955354, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 201.478213, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6230.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 3020.0, - "total_cost": 0.0528825, - "input_cost": 0.0040125000000000004, - "output_cost": 0.04887 + "total_tokens": 11328.0, + "provider": "OSS", + "Metric_request_tokens": 10236.0, + "Metric_response_tokens": 1092.0, + "total_cost": 0.00058416, + "input_cost": 0.00040944, + "output_cost": 0.00017472 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 73.925452, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 196.62274, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "OSS", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 131.57768, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7346.0, + "provider": "OSS", + "Metric_request_tokens": 6726.0, + "Metric_response_tokens": 620.0, + "total_cost": 0.00036824000000000004, + "input_cost": 0.00026904, + "output_cost": 9.92e-5 + }, + { + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 199.37541, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 21279.0, - "provider": "Google", - "Metric_request_tokens": 13494.0, - "Metric_response_tokens": 7785.0, - "total_cost": 0.14814750000000002, - "input_cost": 0.0168675, - "output_cost": 0.13128 + "total_tokens": 11329.0, + "provider": "OSS", + "Metric_request_tokens": 10235.0, + "Metric_response_tokens": 1094.0, + "total_cost": 0.00058444, + "input_cost": 0.0004094, + "output_cost": 0.00017503999999999998 }, { - "Model": "gemini-2.5-pro", + "Model": "xlam-2-32b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 29.693847, + "Duration": 137.219709, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6429.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 3210.0, - "total_cost": 0.05648375, - "input_cost": 0.00402375, - "output_cost": 0.05246 + "total_tokens": 7388.0, + "provider": "OSS", + "Metric_request_tokens": 6726.0, + "Metric_response_tokens": 662.0, + "total_cost": 0.00037496000000000003, + "input_cost": 0.00026904, + "output_cost": 0.00010592000000000002 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 25.564584, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 128.485932, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5542.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2332.0, - "total_cost": 0.0389525, - "input_cost": 0.0040125000000000004, - "output_cost": 0.03494 + "total_tokens": 7330.0, + "provider": "OSS", + "Metric_request_tokens": 6718.0, + "Metric_response_tokens": 612.0, + "total_cost": 0.00036664, + "input_cost": 0.00026872, + "output_cost": 9.792000000000001e-5 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 94.248736, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 213.403265, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, - "provider": "Google", + "provider": "OSS", "Metric_request_tokens": 0.0, "Metric_response_tokens": 0.0, "total_cost": 0.0, @@ -15366,244 +20483,308 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro", + "Model": "xlam-2-32b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 18.571851, + "Duration": 135.140454, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5030.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1811.0, - "total_cost": 0.02850375, - "input_cost": 0.00402375, - "output_cost": 0.024480000000000002 + "total_tokens": 7384.0, + "provider": "OSS", + "Metric_request_tokens": 6726.0, + "Metric_response_tokens": 658.0, + "total_cost": 0.00037432000000000003, + "input_cost": 0.00026904, + "output_cost": 0.00010528 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 21.144001, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 201.065759, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5212.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2002.0, - "total_cost": 0.032322500000000004, - "input_cost": 0.0040125000000000004, - "output_cost": 0.028310000000000002 + "total_tokens": 11328.0, + "provider": "OSS", + "Metric_request_tokens": 10237.0, + "Metric_response_tokens": 1091.0, + "total_cost": 0.00058404, + "input_cost": 0.00040948, + "output_cost": 0.00017456 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 36.342038, + "Model": "xlam-2-32b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 197.966458, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10769.0, - "provider": "Google", - "Metric_request_tokens": 7141.0, - "Metric_response_tokens": 3628.0, - "total_cost": 0.06363625, - "input_cost": 0.00892625, - "output_cost": 0.05471 + "total_tokens": 11304.0, + "provider": "OSS", + "Metric_request_tokens": 10219.0, + "Metric_response_tokens": 1085.0, + "total_cost": 0.0005823600000000001, + "input_cost": 0.00040876000000000004, + "output_cost": 0.00017360000000000002 }, { - "Model": "gemini-2.5-pro", + "Model": "xlam-2-32b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 17.376388, + "Duration": 144.46481, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4866.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1647.0, - "total_cost": 0.02522375, - "input_cost": 0.00402375, - "output_cost": 0.0212 + "total_tokens": 10935.0, + "provider": "OSS", + "Metric_request_tokens": 10235.0, + "Metric_response_tokens": 700.0, + "total_cost": 0.0005214, + "input_cost": 0.0004094, + "output_cost": 0.000112 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 22.373867, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 345.412931, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5470.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2260.0, - "total_cost": 0.037672500000000005, - "input_cost": 0.0040125000000000004, - "output_cost": 0.03366 + "total_tokens": 9427.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 1055.0, + "total_cost": 0.0029335999999999997, + "input_cost": 0.0025115999999999997, + "output_cost": 0.000422 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 64.529238, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 299.73699, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 26102.0, - "provider": "Google", - "Metric_request_tokens": 19633.0, - "Metric_response_tokens": 6469.0, - "total_cost": 0.12325124999999999, - "input_cost": 0.02454125, - "output_cost": 0.09870999999999999 + "total_tokens": 9465.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 1093.0, + "total_cost": 0.0029487999999999997, + "input_cost": 0.0025115999999999997, + "output_cost": 0.0004372 }, { - "Model": "gemini-2.5-pro", + "Model": "llama-xlam-2-70b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 31.845862, - "Score_MermaidDiagramValid": 1.0, + "Duration": 206.267984, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 60411.0, - "provider": "Google", - "Metric_request_tokens": 57457.0, - "Metric_response_tokens": 2954.0, - "total_cost": 0.11237125, - "input_cost": 0.07182125, - "output_cost": 0.040549999999999996 + "total_tokens": 9033.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 661.0, + "total_cost": 0.002776, + "input_cost": 0.0025115999999999997, + "output_cost": 0.00026440000000000003 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 25.463547, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 209.753547, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5763.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 2553.0, - "total_cost": 0.043362500000000005, - "input_cost": 0.0040125000000000004, - "output_cost": 0.03935 + "total_tokens": 9054.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 682.0, + "total_cost": 0.0027844, + "input_cost": 0.0025115999999999997, + "output_cost": 0.0002728 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 45.089712, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 309.397481, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12047.0, - "provider": "Google", - "Metric_request_tokens": 7141.0, - "Metric_response_tokens": 4906.0, - "total_cost": 0.08917624999999998, - "input_cost": 0.00892625, - "output_cost": 0.08024999999999999 + "total_tokens": 9502.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 1130.0, + "total_cost": 0.0029636, + "input_cost": 0.0025115999999999997, + "output_cost": 0.000452 }, { - "Model": "gemini-2.5-pro", + "Model": "llama-xlam-2-70b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 18.56191, + "Duration": 219.444415, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5052.0, - "provider": "Google", - "Metric_request_tokens": 3219.0, - "Metric_response_tokens": 1833.0, - "total_cost": 0.028943749999999997, - "input_cost": 0.00402375, - "output_cost": 0.024919999999999998 + "total_tokens": 9097.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 725.0, + "total_cost": 0.0028015999999999996, + "input_cost": 0.0025115999999999997, + "output_cost": 0.00029 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 18.501743, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 197.690273, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5018.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 1808.0, - "total_cost": 0.0284625, - "input_cost": 0.0040125000000000004, - "output_cost": 0.02445 + "total_tokens": 7272.0, + "provider": "OSS", + "Metric_request_tokens": 6627.0, + "Metric_response_tokens": 645.0, + "total_cost": 0.0022461, + "input_cost": 0.0019881, + "output_cost": 0.000258 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 29.826137, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 217.080805, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6374.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 3164.0, - "total_cost": 0.05558250000000001, - "input_cost": 0.0040125000000000004, - "output_cost": 0.051570000000000005 + "total_tokens": 9087.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 715.0, + "total_cost": 0.0027976, + "input_cost": 0.0025115999999999997, + "output_cost": 0.000286 }, { - "Model": "gemini-2.5-pro", + "Model": "llama-xlam-2-70b-fc-r", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 20.088304, - "Score_MermaidDiagramValid": 1.0, + "Duration": 210.926958, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5329.0, - "provider": "Google", - "Metric_request_tokens": 3212.0, - "Metric_response_tokens": 2117.0, - "total_cost": 0.034464999999999996, - "input_cost": 0.004015, - "output_cost": 0.030449999999999998 + "total_tokens": 9051.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 679.0, + "total_cost": 0.0027831999999999996, + "input_cost": 0.0025115999999999997, + "output_cost": 0.00027160000000000004 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_medium", - "test_group": "medium", - "Duration": 66.934286, - "Score_MermaidDiagramValid": 1.0, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 217.539308, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 75854.0, - "provider": "Google", - "Metric_request_tokens": 70352.0, - "Metric_response_tokens": 5502.0, - "total_cost": 0.16721, - "input_cost": 0.08793999999999999, - "output_cost": 0.07927000000000001 + "total_tokens": 9083.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 711.0, + "total_cost": 0.002796, + "input_cost": 0.0025115999999999997, + "output_cost": 0.00028440000000000003 }, { - "Model": "gemini-2.5-pro", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 31.963868, + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 210.683199, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6444.0, - "provider": "Google", - "Metric_request_tokens": 3210.0, - "Metric_response_tokens": 3234.0, - "total_cost": 0.0569825, - "input_cost": 0.0040125000000000004, - "output_cost": 0.052969999999999996 + "total_tokens": 9051.0, + "provider": "OSS", + "Metric_request_tokens": 8372.0, + "Metric_response_tokens": 679.0, + "total_cost": 0.0027831999999999996, + "input_cost": 0.0025115999999999997, + "output_cost": 0.00027160000000000004 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 216.695075, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 7355.0, + "provider": "OSS", + "Metric_request_tokens": 6621.0, + "Metric_response_tokens": 734.0, + "total_cost": 0.0022798999999999996, + "input_cost": 0.0019863, + "output_cost": 0.0002936 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 224.351893, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9128.0, + "provider": "OSS", + "Metric_request_tokens": 8378.0, + "Metric_response_tokens": 750.0, + "total_cost": 0.0028133999999999998, + "input_cost": 0.0025134, + "output_cost": 0.00030000000000000003 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 288.932326, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7687.0, + "provider": "OSS", + "Metric_request_tokens": 6627.0, + "Metric_response_tokens": 1060.0, + "total_cost": 0.0024121, + "input_cost": 0.0019881, + "output_cost": 0.000424 + }, + { + "Model": "llama-xlam-2-70b-fc-r", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 196.932026, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5577.0, + "provider": "OSS", + "Metric_request_tokens": 4919.0, + "Metric_response_tokens": 658.0, + "total_cost": 0.0017388999999999998, + "input_cost": 0.0014757, + "output_cost": 0.0002632 } ], "config": {