From 5b17b2ec39c16d498078608a616ce3bd46dfe794 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Wed, 23 Jul 2025 07:49:13 +0000 Subject: [PATCH 1/2] chore: Update pricing --- src/data/merbench_data.json | 4382 +++++++++++++++++++++++++++++++---- 1 file changed, 3919 insertions(+), 463 deletions(-) diff --git a/src/data/merbench_data.json b/src/data/merbench_data.json index bdd8dee..437c0c3 100644 --- a/src/data/merbench_data.json +++ b/src/data/merbench_data.json @@ -1,6 +1,6 @@ { "stats": { - "total_runs": 180, + "total_runs": 396, "models_evaluated": 12, "test_cases": 3, "test_groups": ["easy", "hard", "medium"], @@ -19,97 +19,97 @@ "gemini-2.5-pro-preview-05-06", "gemini-2.5-pro-preview-06-05" ], - "total_cost": 11.899785959999999, - "avg_cost_per_run": 0.066109922 + "total_cost": 15.01060421, + "avg_cost_per_run": 0.03790556618686869 }, "leaderboard": [ { "Model": "gemini-2.5-pro-preview-06-05", - "Success_Rate": 40.0, - "Avg_Duration": 46.88785905726666, - "Avg_Tokens": 8693.733333333334, - "Avg_Cost": 0.04551366666666667, - "Avg_Input_Cost": 0.0059176666666666666, - "Avg_Output_Cost": 0.039596, - "Runs": 15, + "Success_Rate": 33.333333333333336, + "Avg_Duration": 38.34483030143589, + "Avg_Tokens": 8813.564102564103, + "Avg_Cost": 0.04050108974358974, + "Avg_Input_Cost": 0.006804935897435898, + "Avg_Output_Cost": 0.03369615384615384, + "Runs": 39, "Provider": "Google" }, { "Model": "gemini-2.5-pro-preview-05-06", - "Success_Rate": 33.333333333333336, - "Avg_Duration": 77.48776330746666, - "Avg_Tokens": 46132.333333333336, - "Avg_Cost": 0.3224205833333333, - "Avg_Input_Cost": 0.01984325, - "Avg_Output_Cost": 0.3025773333333333, - "Runs": 15, + "Success_Rate": 27.272727272727273, + "Avg_Duration": 56.42929048663636, + "Avg_Tokens": 24795.242424242424, + "Avg_Cost": 0.16677734848484846, + "Avg_Input_Cost": 0.011596439393939393, + "Avg_Output_Cost": 0.1551809090909091, + "Runs": 33, "Provider": "Google" }, { "Model": "gemini-2.5-pro-preview-03-25", - "Success_Rate": 26.666666666666668, - "Avg_Duration": 100.73428291506666, - "Avg_Tokens": 37934.066666666666, - "Avg_Cost": 0.28487625, - "Avg_Input_Cost": 0.013494916666666667, - "Avg_Output_Cost": 0.2713813333333333, - "Runs": 15, + "Success_Rate": 23.076923076923077, + "Avg_Duration": 60.486014026717946, + "Avg_Tokens": 18381.05128205128, + "Avg_Cost": 0.12938753205128206, + "Avg_Input_Cost": 0.0077747115384615385, + "Avg_Output_Cost": 0.1216128205128205, + "Runs": 39, "Provider": "Google" }, { "Model": "gemini-2.5-flash", - "Success_Rate": 20.0, - "Avg_Duration": 12.848961589133333, - "Avg_Tokens": 12838.466666666667, - "Avg_Cost": 0.012343429999999999, - "Avg_Input_Cost": 0.0005051799999999999, - "Avg_Output_Cost": 0.01183825, - "Runs": 15, + "Success_Rate": 13.333333333333334, + "Avg_Duration": 10.148847224155556, + "Avg_Tokens": 6990.466666666666, + "Avg_Cost": 0.012757218888888887, + "Avg_Input_Cost": 0.0004194433333333333, + "Avg_Output_Cost": 0.012337775555555555, + "Runs": 45, "Provider": "Google" }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Success_Rate": 6.666666666666667, - "Avg_Duration": 4.4174195914666665, - "Avg_Tokens": 4198.2, - "Avg_Cost": 0.00074826, - "Avg_Input_Cost": 0.00031034000000000004, - "Avg_Output_Cost": 0.00043792, + "Avg_Duration": 78.3300724666, + "Avg_Tokens": 15556.266666666666, + "Avg_Cost": 0.05645733333333334, + "Avg_Input_Cost": 0.034498999999999995, + "Avg_Output_Cost": 0.021958333333333333, "Runs": 15, - "Provider": "Google" + "Provider": "Amazon" }, { - "Model": "gemini-2.5-flash-preview-04-17", - "Success_Rate": 6.666666666666667, - "Avg_Duration": 27.5010082518, - "Avg_Tokens": 20486.066666666666, - "Avg_Cost": 0.053945726666666666, - "Avg_Input_Cost": 0.0006465599999999999, - "Avg_Output_Cost": 0.05329916666666667, - "Runs": 15, + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Success_Rate": 4.444444444444445, + "Avg_Duration": 4.399183285022222, + "Avg_Tokens": 5233.377777777778, + "Avg_Cost": 0.0008394444444444445, + "Avg_Input_Cost": 0.00041796888888888893, + "Avg_Output_Cost": 0.0004214755555555556, + "Runs": 45, "Provider": "Google" }, { "Model": "gemini-2.5-flash-preview-05-20", - "Success_Rate": 6.666666666666667, - "Avg_Duration": 11.216488175266667, - "Avg_Tokens": 7726.4, - "Avg_Cost": 0.016411616666666667, - "Avg_Input_Cost": 0.00038195, - "Avg_Output_Cost": 0.016029666666666668, - "Runs": 15, + "Success_Rate": 4.444444444444445, + "Avg_Duration": 9.259261195111112, + "Avg_Tokens": 5119.933333333333, + "Avg_Cost": 0.007952797777777779, + "Avg_Input_Cost": 0.00035615999999999995, + "Avg_Output_Cost": 0.007596637777777778, + "Runs": 45, "Provider": "Google" }, { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Success_Rate": 6.666666666666667, - "Avg_Duration": 78.3300724666, - "Avg_Tokens": 15556.266666666666, - "Avg_Cost": 0.05645733333333334, - "Avg_Input_Cost": 0.034498999999999995, - "Avg_Output_Cost": 0.021958333333333333, - "Runs": 15, - "Provider": "Amazon" + "Model": "gemini-2.5-flash-preview-04-17", + "Success_Rate": 4.444444444444445, + "Avg_Duration": 24.15054471668889, + "Avg_Tokens": 10492.711111111112, + "Avg_Cost": 0.023275714444444442, + "Avg_Input_Cost": 0.0004890233333333333, + "Avg_Output_Cost": 0.02278669111111111, + "Runs": 45, + "Provider": "Google" }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", @@ -147,12 +147,12 @@ { "Model": "gemini-2.0-flash", "Success_Rate": 0.0, - "Avg_Duration": 7.099265544333333, - "Avg_Tokens": 1581.5333333333333, - "Avg_Cost": 0.0003323333333333334, - "Avg_Input_Cost": 0.00010009333333333333, - "Avg_Output_Cost": 0.00023223999999999999, - "Runs": 15, + "Avg_Duration": 4.5703242256222225, + "Avg_Tokens": 1354.6, + "Avg_Cost": 0.00029454666666666666, + "Avg_Input_Cost": 8.243111111111111e-5, + "Avg_Output_Cost": 0.00021211555555555555, + "Runs": 45, "Provider": "Google" } ], @@ -204,90 +204,90 @@ { "Model": "gemini-2.0-flash", "Success_Rate": 0.0, - "Duration": 7.099265544333333, - "total_tokens": 1581.5333333333333, - "total_cost": 0.0003323333333333334, - "input_cost": 0.00010009333333333333, - "output_cost": 0.00023223999999999999, - "Metric_request_tokens": 1000.9333333333333, - "Metric_response_tokens": 580.6 + "Duration": 4.5703242256222225, + "total_tokens": 1354.6, + "total_cost": 0.00029454666666666666, + "input_cost": 8.243111111111111e-5, + "output_cost": 0.00021211555555555555, + "Metric_request_tokens": 824.3111111111111, + "Metric_response_tokens": 530.2888888888889 }, { "Model": "gemini-2.5-flash", - "Success_Rate": 20.0, - "Duration": 12.848961589133333, - "total_tokens": 12838.466666666667, - "total_cost": 0.012343429999999999, - "input_cost": 0.0005051799999999999, - "output_cost": 0.01183825, - "Metric_request_tokens": 3367.866666666667, - "Metric_response_tokens": 936.1333333333333 + "Success_Rate": 13.333333333333334, + "Duration": 10.148847224155556, + "total_tokens": 6990.466666666666, + "total_cost": 0.012757218888888887, + "input_cost": 0.0004194433333333333, + "output_cost": 0.012337775555555555, + "Metric_request_tokens": 2796.288888888889, + "Metric_response_tokens": 807.5333333333333 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", - "Success_Rate": 6.666666666666667, - "Duration": 4.4174195914666665, - "total_tokens": 4198.2, - "total_cost": 0.00074826, - "input_cost": 0.00031034000000000004, - "output_cost": 0.00043792, - "Metric_request_tokens": 3103.4, - "Metric_response_tokens": 1094.8 + "Success_Rate": 4.444444444444445, + "Duration": 4.399183285022222, + "total_tokens": 5233.377777777778, + "total_cost": 0.0008394444444444445, + "input_cost": 0.00041796888888888893, + "output_cost": 0.0004214755555555556, + "Metric_request_tokens": 4179.688888888889, + "Metric_response_tokens": 1053.6888888888889 }, { "Model": "gemini-2.5-flash-preview-04-17", - "Success_Rate": 6.666666666666667, - "Duration": 27.5010082518, - "total_tokens": 20486.066666666666, - "total_cost": 0.053945726666666666, - "input_cost": 0.0006465599999999999, - "output_cost": 0.05329916666666667, - "Metric_request_tokens": 4618.285714285715, - "Metric_response_tokens": 1225.0 + "Success_Rate": 4.444444444444445, + "Duration": 24.15054471668889, + "total_tokens": 10492.711111111112, + "total_cost": 0.023275714444444442, + "input_cost": 0.0004890233333333333, + "output_cost": 0.02278669111111111, + "Metric_request_tokens": 4445.666666666667, + "Metric_response_tokens": 1188.3636363636363 }, { "Model": "gemini-2.5-flash-preview-05-20", - "Success_Rate": 6.666666666666667, - "Duration": 11.216488175266667, - "total_tokens": 7726.4, - "total_cost": 0.016411616666666667, - "input_cost": 0.00038195, - "output_cost": 0.016029666666666668, - "Metric_request_tokens": 2728.214285714286, - "Metric_response_tokens": 776.0714285714286 + "Success_Rate": 4.444444444444445, + "Duration": 9.259261195111112, + "total_tokens": 5119.933333333333, + "total_cost": 0.007952797777777779, + "input_cost": 0.00035615999999999995, + "output_cost": 0.007596637777777778, + "Metric_request_tokens": 2428.3636363636365, + "Metric_response_tokens": 709.8181818181819 }, { "Model": "gemini-2.5-pro-preview-03-25", - "Success_Rate": 26.666666666666668, - "Duration": 100.73428291506666, - "total_tokens": 37934.066666666666, - "total_cost": 0.28487625, - "input_cost": 0.013494916666666667, - "output_cost": 0.2713813333333333, - "Metric_request_tokens": 26989.833333333332, - "Metric_response_tokens": 1990.1666666666667 + "Success_Rate": 23.076923076923077, + "Duration": 60.486014026717946, + "total_tokens": 18381.05128205128, + "total_cost": 0.12938753205128206, + "input_cost": 0.0077747115384615385, + "output_cost": 0.1216128205128205, + "Metric_request_tokens": 8364.51724137931, + "Metric_response_tokens": 1380.8275862068965 }, { "Model": "gemini-2.5-pro-preview-05-06", - "Success_Rate": 33.333333333333336, - "Duration": 77.48776330746666, - "total_tokens": 46132.333333333336, - "total_cost": 0.3224205833333333, - "input_cost": 0.01984325, - "output_cost": 0.3025773333333333, - "Metric_request_tokens": 39686.5, - "Metric_response_tokens": 2584.0 + "Success_Rate": 27.272727272727273, + "Duration": 56.42929048663636, + "total_tokens": 24795.242424242424, + "total_cost": 0.16677734848484846, + "input_cost": 0.011596439393939393, + "output_cost": 0.1551809090909091, + "Metric_request_tokens": 12756.083333333334, + "Metric_response_tokens": 1607.2916666666667 }, { "Model": "gemini-2.5-pro-preview-06-05", - "Success_Rate": 40.0, - "Duration": 46.88785905726666, - "total_tokens": 8693.733333333334, - "total_cost": 0.04551366666666667, - "input_cost": 0.0059176666666666666, - "output_cost": 0.039596, - "Metric_request_tokens": 4734.133333333333, - "Metric_response_tokens": 1421.9333333333334 + "Success_Rate": 33.333333333333336, + "Duration": 38.34483030143589, + "total_tokens": 8813.564102564103, + "total_cost": 0.04050108974358974, + "input_cost": 0.006804935897435898, + "output_cost": 0.03369615384615384, + "Metric_request_tokens": 5443.948717948718, + "Metric_response_tokens": 1348.3846153846155 } ], "test_groups_data": [ @@ -429,10 +429,10 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027672, + "total_cost": 0.00027685333333333335, "input_cost": 7.36e-5, - "output_cost": 0.00020312, - "total_tokens": 1243.8 + "output_cost": 0.00020325333333333332, + "total_tokens": 1244.1333333333334 }, { "Model": "gemini-2.0-flash", @@ -440,76 +440,76 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027488, + "total_cost": 0.00027490666666666664, "input_cost": 7.36e-5, - "output_cost": 0.00020128000000000002, - "total_tokens": 1239.2 + "output_cost": 0.0002013066666666667, + "total_tokens": 1239.2666666666667 }, { "Model": "gemini-2.0-flash", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.4, - "total_cost": 0.00044540000000000004, - "input_cost": 0.00015308000000000002, - "output_cost": 0.00029232, - "total_tokens": 2261.6 + "Score_UsedBothMCPTools": 0.13333333333333333, + "total_cost": 0.00033188, + "input_cost": 0.00010009333333333333, + "output_cost": 0.0002317866666666667, + "total_tokens": 1580.4 }, { "Model": "gemini-2.5-flash", "test_group": "easy", - "Score_MermaidDiagramValid": 0.6, + "Score_MermaidDiagramValid": 0.4, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7, - "total_cost": 0.01891206, - "input_cost": 0.00043130999999999996, - "output_cost": 0.01848075, - "total_tokens": 17660.0 + "Score_UsedBothMCPTools": 0.6333333333333333, + "total_cost": 0.018718946666666667, + "input_cost": 0.00037073999999999994, + "output_cost": 0.01834820666666666, + "total_tokens": 8388.733333333334 }, { "Model": "gemini-2.5-flash", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.01202555, - "input_cost": 0.0004998, - "output_cost": 0.01152575, - "total_tokens": 12552.6 + "Score_UsedBothMCPTools": 0.6666666666666666, + "total_cost": 0.012811633333333329, + "input_cost": 0.0004289, + "output_cost": 0.012382733333333333, + "total_tokens": 7064.533333333334 }, { "Model": "gemini-2.5-flash", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7, - "total_cost": 0.0060926800000000005, - "input_cost": 0.0005844299999999999, - "output_cost": 0.005508250000000001, - "total_tokens": 8302.8 + "Score_UsedBothMCPTools": 0.6666666666666666, + "total_cost": 0.006741076666666666, + "input_cost": 0.00045869, + "output_cost": 0.006282386666666667, + "total_tokens": 5518.133333333333 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "easy", - "Score_MermaidDiagramValid": 0.2, + "Score_MermaidDiagramValid": 0.13333333333333333, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.0007869400000000001, - "input_cost": 0.00030918000000000005, - "output_cost": 0.00047776, - "total_tokens": 4286.2 + "Score_UsedBothMCPTools": 0.7, + "total_cost": 0.00107244, + "input_cost": 0.0006171600000000001, + "output_cost": 0.00045528, + "total_tokens": 7309.8 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.0007536600000000001, - "input_cost": 0.00031182000000000003, - "output_cost": 0.00044184000000000004, - "total_tokens": 4222.8 + "Score_UsedBothMCPTools": 0.7666666666666667, + "total_cost": 0.0006987066666666668, + "input_cost": 0.00031092, + "output_cost": 0.0003877866666666667, + "total_tokens": 4078.6666666666665 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", @@ -517,54 +517,54 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.00070418, - "input_cost": 0.00031002, - "output_cost": 0.00039416, - "total_tokens": 4085.6 + "total_cost": 0.0007471866666666667, + "input_cost": 0.0003258266666666667, + "output_cost": 0.00042136, + "total_tokens": 4311.666666666667 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "easy", - "Score_MermaidDiagramValid": 0.2, + "Score_MermaidDiagramValid": 0.13333333333333333, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.09745198000000001, - "input_cost": 0.0008380199999999999, - "output_cost": 0.09661396000000001, - "total_tokens": 34186.4 + "Score_UsedBothMCPTools": 0.4666666666666667, + "total_cost": 0.037818783333333335, + "input_cost": 0.00047965, + "output_cost": 0.03733913333333334, + "total_tokens": 14564.2 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.029262609999999994, - "input_cost": 0.00045363, - "output_cost": 0.02880898, - "total_tokens": 12207.2 + "Score_UsageLimitNotExceeded": 0.9333333333333333, + "Score_UsedBothMCPTools": 0.5666666666666667, + "total_cost": 0.014207763333333333, + "input_cost": 0.00037735, + "output_cost": 0.013830413333333331, + "total_tokens": 7169.4 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7, - "total_cost": 0.035122589999999995, - "input_cost": 0.00064803, - "output_cost": 0.03447456, - "total_tokens": 15064.6 + "Score_UsedBothMCPTools": 0.5666666666666667, + "total_cost": 0.017800596666666668, + "input_cost": 0.00061007, + "output_cost": 0.017190526666666667, + "total_tokens": 9744.533333333333 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "easy", - "Score_MermaidDiagramValid": 0.2, + "Score_MermaidDiagramValid": 0.13333333333333333, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.01970984, - "input_cost": 0.00029376, - "output_cost": 0.01941608, - "total_tokens": 8071.6 + "Score_UsedBothMCPTools": 0.5666666666666667, + "total_cost": 0.009252423333333334, + "input_cost": 0.00030188999999999997, + "output_cost": 0.008950533333333333, + "total_tokens": 5100.733333333334 }, { "Model": "gemini-2.5-flash-preview-05-20", @@ -572,98 +572,98 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_cost": 0.01184638, - "input_cost": 0.00026964, - "output_cost": 0.01157674, - "total_tokens": 5576.2 + "total_cost": 0.005455606666666666, + "input_cost": 0.00025677999999999996, + "output_cost": 0.005198826666666667, + "total_tokens": 3659.5333333333333 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.01767863, - "input_cost": 0.00058245, - "output_cost": 0.01709618, - "total_tokens": 9531.4 + "Score_UsedBothMCPTools": 0.7333333333333333, + "total_cost": 0.009150363333333333, + "input_cost": 0.00050981, + "output_cost": 0.008640553333333334, + "total_tokens": 6599.533333333334 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "easy", - "Score_MermaidDiagramValid": 0.8, + "Score_MermaidDiagramValid": 0.6923076923076923, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.82516775, - "input_cost": 0.03948375, - "output_cost": 0.785684, - "total_tokens": 110155.4 + "total_cost": 0.33611653846153844, + "input_cost": 0.017935, + "output_cost": 0.31818153846153846, + "total_tokens": 46166.153846153844 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.2, - "total_cost": 0.029460999999999998, - "input_cost": 0.001001, - "output_cost": 0.028459999999999996, - "total_tokens": 3646.8 + "Score_UsageLimitNotExceeded": 0.6153846153846154, + "Score_UsedBothMCPTools": 0.6153846153846154, + "total_cost": 0.030362788461538462, + "input_cost": 0.0028320192307692304, + "output_cost": 0.02753076923076923, + "total_tokens": 5018.692307692308 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsageLimitNotExceeded": 0.6923076923076923, + "Score_UsedBothMCPTools": 0.6153846153846154, + "total_cost": 0.021683269230769228, + "input_cost": 0.0025571153846153847, + "output_cost": 0.019126153846153845, + "total_tokens": 3958.3076923076924 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "easy", - "Score_MermaidDiagramValid": 0.8, + "Score_MermaidDiagramValid": 0.7272727272727273, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.530149, - "input_cost": 0.043318999999999996, - "output_cost": 0.48683, - "total_tokens": 83338.2 + "Score_UsedBothMCPTools": 0.9090909090909091, + "total_cost": 0.2612590909090909, + "input_cost": 0.022234545454545455, + "output_cost": 0.23902454545454546, + "total_tokens": 41690.09090909091 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.8, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsageLimitNotExceeded": 0.9090909090909091, + "Score_UsedBothMCPTools": 0.5454545454545454, + "total_cost": 0.01853659090909091, + "input_cost": 0.002118409090909091, + "output_cost": 0.016418181818181818, + "total_tokens": 3336.5454545454545 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "medium", - "Score_MermaidDiagramValid": 0.2, - "Score_UsageLimitNotExceeded": 0.4, - "Score_UsedBothMCPTools": 0.4, - "total_cost": 0.43711275, - "input_cost": 0.01621075, - "output_cost": 0.420902, - "total_tokens": 55058.8 + "Score_MermaidDiagramValid": 0.09090909090909091, + "Score_UsageLimitNotExceeded": 0.7272727272727273, + "Score_UsedBothMCPTools": 0.7272727272727273, + "total_cost": 0.22053636363636364, + "input_cost": 0.010436363636363636, + "output_cost": 0.21009999999999998, + "total_tokens": 29359.090909090908 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "easy", - "Score_MermaidDiagramValid": 1.0, + "Score_MermaidDiagramValid": 0.8461538461538461, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.030609249999999998, - "input_cost": 0.00399125, - "output_cost": 0.026617999999999996, - "total_tokens": 5854.8 + "total_cost": 0.03721326923076923, + "input_cost": 0.009261730769230769, + "output_cost": 0.027951538461538458, + "total_tokens": 10204.538461538461 }, { "Model": "gemini-2.5-pro-preview-06-05", @@ -671,21 +671,21 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.047156750000000004, - "input_cost": 0.006776750000000001, - "output_cost": 0.04038, - "total_tokens": 9459.4 + "total_cost": 0.04328519230769231, + "input_cost": 0.0057175, + "output_cost": 0.03756769230769231, + "total_tokens": 8330.76923076923 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "medium", - "Score_MermaidDiagramValid": 0.2, + "Score_MermaidDiagramValid": 0.15384615384615385, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.058775, - "input_cost": 0.006985, - "output_cost": 0.05179, - "total_tokens": 10767.0 + "total_cost": 0.041004807692307695, + "input_cost": 0.005435576923076923, + "output_cost": 0.03556923076923077, + "total_tokens": 7905.384615384615 } ], "failure_analysis_data": [ @@ -715,49 +715,49 @@ }, { "Model": "gemini-2.0-flash", - "Invalid Diagram": 15, - "MCP Tool Failure": 13, + "Invalid Diagram": 45, + "MCP Tool Failure": 43, "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-flash", - "Invalid Diagram": 12, - "MCP Tool Failure": 8, + "Invalid Diagram": 39, + "MCP Tool Failure": 31, "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", - "Invalid Diagram": 14, - "MCP Tool Failure": 8, + "Invalid Diagram": 43, + "MCP Tool Failure": 22, "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-flash-preview-04-17", - "Invalid Diagram": 14, - "MCP Tool Failure": 5, - "Usage Limit Exceeded": 0 + "Invalid Diagram": 43, + "MCP Tool Failure": 22, + "Usage Limit Exceeded": 1 }, { "Model": "gemini-2.5-flash-preview-05-20", - "Invalid Diagram": 14, - "MCP Tool Failure": 10, + "Invalid Diagram": 43, + "MCP Tool Failure": 35, "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-pro-preview-03-25", - "Invalid Diagram": 11, - "MCP Tool Failure": 9, - "Usage Limit Exceeded": 8 + "Invalid Diagram": 30, + "MCP Tool Failure": 10, + "Usage Limit Exceeded": 9 }, { "Model": "gemini-2.5-pro-preview-05-06", - "Invalid Diagram": 10, + "Invalid Diagram": 24, "MCP Tool Failure": 9, "Usage Limit Exceeded": 4 }, { "Model": "gemini-2.5-pro-preview-06-05", - "Invalid Diagram": 9, + "Invalid Diagram": 26, "MCP Tool Failure": 0, "Usage Limit Exceeded": 0 } @@ -899,265 +899,265 @@ "Model": "gemini-2.0-flash", "test_group": "easy", "avg_total_cost": 0.000277, - "sum_total_cost": 0.001384, - "run_count": 5, + "sum_total_cost": 0.004153, + "run_count": 15, "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.000368, + "sum_input_cost": 0.001104, "avg_output_cost": 0.000203, - "sum_output_cost": 0.001016 + "sum_output_cost": 0.003049 }, { "Model": "gemini-2.0-flash", "test_group": "hard", "avg_total_cost": 0.000275, - "sum_total_cost": 0.001374, - "run_count": 5, + "sum_total_cost": 0.004124, + "run_count": 15, "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.000368, + "sum_input_cost": 0.001104, "avg_output_cost": 0.000201, - "sum_output_cost": 0.001006 + "sum_output_cost": 0.00302 }, { "Model": "gemini-2.0-flash", "test_group": "medium", - "avg_total_cost": 0.000445, - "sum_total_cost": 0.002227, - "run_count": 5, - "avg_input_cost": 0.000153, - "sum_input_cost": 0.000765, - "avg_output_cost": 0.000292, - "sum_output_cost": 0.001462 + "avg_total_cost": 0.000332, + "sum_total_cost": 0.004978, + "run_count": 15, + "avg_input_cost": 0.0001, + "sum_input_cost": 0.001501, + "avg_output_cost": 0.000232, + "sum_output_cost": 0.003477 }, { "Model": "gemini-2.5-flash", "test_group": "easy", - "avg_total_cost": 0.018912, - "sum_total_cost": 0.09456, - "run_count": 5, - "avg_input_cost": 0.000431, - "sum_input_cost": 0.002157, - "avg_output_cost": 0.018481, - "sum_output_cost": 0.092404 + "avg_total_cost": 0.018719, + "sum_total_cost": 0.280784, + "run_count": 15, + "avg_input_cost": 0.000371, + "sum_input_cost": 0.005561, + "avg_output_cost": 0.018348, + "sum_output_cost": 0.275223 }, { "Model": "gemini-2.5-flash", "test_group": "hard", - "avg_total_cost": 0.012026, - "sum_total_cost": 0.060128, - "run_count": 5, - "avg_input_cost": 0.0005, - "sum_input_cost": 0.002499, - "avg_output_cost": 0.011526, - "sum_output_cost": 0.057629 + "avg_total_cost": 0.012812, + "sum_total_cost": 0.192174, + "run_count": 15, + "avg_input_cost": 0.000429, + "sum_input_cost": 0.006434, + "avg_output_cost": 0.012383, + "sum_output_cost": 0.185741 }, { "Model": "gemini-2.5-flash", "test_group": "medium", - "avg_total_cost": 0.006093, - "sum_total_cost": 0.030463, - "run_count": 5, - "avg_input_cost": 0.000584, - "sum_input_cost": 0.002922, - "avg_output_cost": 0.005508, - "sum_output_cost": 0.027541 + "avg_total_cost": 0.006741, + "sum_total_cost": 0.101116, + "run_count": 15, + "avg_input_cost": 0.000459, + "sum_input_cost": 0.00688, + "avg_output_cost": 0.006282, + "sum_output_cost": 0.094236 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "easy", - "avg_total_cost": 0.000787, - "sum_total_cost": 0.003935, - "run_count": 5, - "avg_input_cost": 0.000309, - "sum_input_cost": 0.001546, - "avg_output_cost": 0.000478, - "sum_output_cost": 0.002389 + "avg_total_cost": 0.001072, + "sum_total_cost": 0.016087, + "run_count": 15, + "avg_input_cost": 0.000617, + "sum_input_cost": 0.009257, + "avg_output_cost": 0.000455, + "sum_output_cost": 0.006829 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "hard", - "avg_total_cost": 0.000754, - "sum_total_cost": 0.003768, - "run_count": 5, - "avg_input_cost": 0.000312, - "sum_input_cost": 0.001559, - "avg_output_cost": 0.000442, - "sum_output_cost": 0.002209 + "avg_total_cost": 0.000699, + "sum_total_cost": 0.010481, + "run_count": 15, + "avg_input_cost": 0.000311, + "sum_input_cost": 0.004664, + "avg_output_cost": 0.000388, + "sum_output_cost": 0.005817 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "medium", - "avg_total_cost": 0.000704, - "sum_total_cost": 0.003521, - "run_count": 5, - "avg_input_cost": 0.00031, - "sum_input_cost": 0.00155, - "avg_output_cost": 0.000394, - "sum_output_cost": 0.001971 + "avg_total_cost": 0.000747, + "sum_total_cost": 0.011208, + "run_count": 15, + "avg_input_cost": 0.000326, + "sum_input_cost": 0.004887, + "avg_output_cost": 0.000421, + "sum_output_cost": 0.00632 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "easy", - "avg_total_cost": 0.097452, - "sum_total_cost": 0.48726, - "run_count": 5, - "avg_input_cost": 0.000838, - "sum_input_cost": 0.00419, - "avg_output_cost": 0.096614, - "sum_output_cost": 0.48307 + "avg_total_cost": 0.037819, + "sum_total_cost": 0.567282, + "run_count": 15, + "avg_input_cost": 0.00048, + "sum_input_cost": 0.007195, + "avg_output_cost": 0.037339, + "sum_output_cost": 0.560087 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "hard", - "avg_total_cost": 0.029263, - "sum_total_cost": 0.146313, - "run_count": 5, - "avg_input_cost": 0.000454, - "sum_input_cost": 0.002268, - "avg_output_cost": 0.028809, - "sum_output_cost": 0.144045 + "avg_total_cost": 0.014208, + "sum_total_cost": 0.213116, + "run_count": 15, + "avg_input_cost": 0.000377, + "sum_input_cost": 0.00566, + "avg_output_cost": 0.01383, + "sum_output_cost": 0.207456 }, { "Model": "gemini-2.5-flash-preview-04-17", "test_group": "medium", - "avg_total_cost": 0.035123, - "sum_total_cost": 0.175613, - "run_count": 5, - "avg_input_cost": 0.000648, - "sum_input_cost": 0.00324, - "avg_output_cost": 0.034475, - "sum_output_cost": 0.172373 + "avg_total_cost": 0.017801, + "sum_total_cost": 0.267009, + "run_count": 15, + "avg_input_cost": 0.00061, + "sum_input_cost": 0.009151, + "avg_output_cost": 0.017191, + "sum_output_cost": 0.257858 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "easy", - "avg_total_cost": 0.01971, - "sum_total_cost": 0.098549, - "run_count": 5, - "avg_input_cost": 0.000294, - "sum_input_cost": 0.001469, - "avg_output_cost": 0.019416, - "sum_output_cost": 0.09708 + "avg_total_cost": 0.009252, + "sum_total_cost": 0.138786, + "run_count": 15, + "avg_input_cost": 0.000302, + "sum_input_cost": 0.004528, + "avg_output_cost": 0.008951, + "sum_output_cost": 0.134258 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "hard", - "avg_total_cost": 0.011846, - "sum_total_cost": 0.059232, - "run_count": 5, - "avg_input_cost": 0.00027, - "sum_input_cost": 0.001348, - "avg_output_cost": 0.011577, - "sum_output_cost": 0.057884 + "avg_total_cost": 0.005456, + "sum_total_cost": 0.081834, + "run_count": 15, + "avg_input_cost": 0.000257, + "sum_input_cost": 0.003852, + "avg_output_cost": 0.005199, + "sum_output_cost": 0.077982 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "medium", - "avg_total_cost": 0.017679, - "sum_total_cost": 0.088393, - "run_count": 5, - "avg_input_cost": 0.000582, - "sum_input_cost": 0.002912, - "avg_output_cost": 0.017096, - "sum_output_cost": 0.085481 + "avg_total_cost": 0.00915, + "sum_total_cost": 0.137255, + "run_count": 15, + "avg_input_cost": 0.00051, + "sum_input_cost": 0.007647, + "avg_output_cost": 0.008641, + "sum_output_cost": 0.129608 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "easy", - "avg_total_cost": 0.825168, - "sum_total_cost": 4.125839, - "run_count": 5, - "avg_input_cost": 0.039484, - "sum_input_cost": 0.197419, - "avg_output_cost": 0.785684, - "sum_output_cost": 3.92842 + "avg_total_cost": 0.336117, + "sum_total_cost": 4.369515, + "run_count": 13, + "avg_input_cost": 0.017935, + "sum_input_cost": 0.233155, + "avg_output_cost": 0.318182, + "sum_output_cost": 4.13636 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "hard", - "avg_total_cost": 0.029461, - "sum_total_cost": 0.147305, - "run_count": 5, - "avg_input_cost": 0.001001, - "sum_input_cost": 0.005005, - "avg_output_cost": 0.02846, - "sum_output_cost": 0.1423 + "avg_total_cost": 0.030363, + "sum_total_cost": 0.394716, + "run_count": 13, + "avg_input_cost": 0.002832, + "sum_input_cost": 0.036816, + "avg_output_cost": 0.027531, + "sum_output_cost": 0.3579 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "medium", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 0.021683, + "sum_total_cost": 0.281882, + "run_count": 13, + "avg_input_cost": 0.002557, + "sum_input_cost": 0.033242, + "avg_output_cost": 0.019126, + "sum_output_cost": 0.24864 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "easy", - "avg_total_cost": 0.530149, - "sum_total_cost": 2.650745, - "run_count": 5, - "avg_input_cost": 0.043319, - "sum_input_cost": 0.216595, - "avg_output_cost": 0.48683, - "sum_output_cost": 2.43415 + "avg_total_cost": 0.261259, + "sum_total_cost": 2.87385, + "run_count": 11, + "avg_input_cost": 0.022235, + "sum_input_cost": 0.24458, + "avg_output_cost": 0.239025, + "sum_output_cost": 2.62927 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "hard", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 0.018537, + "sum_total_cost": 0.203902, + "run_count": 11, + "avg_input_cost": 0.002118, + "sum_input_cost": 0.023302, + "avg_output_cost": 0.016418, + "sum_output_cost": 0.1806 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "medium", - "avg_total_cost": 0.437113, - "sum_total_cost": 2.185564, - "run_count": 5, - "avg_input_cost": 0.016211, - "sum_input_cost": 0.081054, - "avg_output_cost": 0.420902, - "sum_output_cost": 2.10451 + "avg_total_cost": 0.220536, + "sum_total_cost": 2.4259, + "run_count": 11, + "avg_input_cost": 0.010436, + "sum_input_cost": 0.1148, + "avg_output_cost": 0.2101, + "sum_output_cost": 2.3111 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "easy", - "avg_total_cost": 0.030609, - "sum_total_cost": 0.153046, - "run_count": 5, - "avg_input_cost": 0.003991, - "sum_input_cost": 0.019956, - "avg_output_cost": 0.026618, - "sum_output_cost": 0.13309 + "avg_total_cost": 0.037213, + "sum_total_cost": 0.483772, + "run_count": 13, + "avg_input_cost": 0.009262, + "sum_input_cost": 0.120402, + "avg_output_cost": 0.027952, + "sum_output_cost": 0.36337 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "hard", - "avg_total_cost": 0.047157, - "sum_total_cost": 0.235784, - "run_count": 5, - "avg_input_cost": 0.006777, - "sum_input_cost": 0.033884, - "avg_output_cost": 0.04038, - "sum_output_cost": 0.2019 + "avg_total_cost": 0.043285, + "sum_total_cost": 0.562708, + "run_count": 13, + "avg_input_cost": 0.005718, + "sum_input_cost": 0.074328, + "avg_output_cost": 0.037568, + "sum_output_cost": 0.48838 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "medium", - "avg_total_cost": 0.058775, - "sum_total_cost": 0.293875, - "run_count": 5, - "avg_input_cost": 0.006985, - "sum_input_cost": 0.034925, - "avg_output_cost": 0.05179, - "sum_output_cost": 0.25895 + "avg_total_cost": 0.041005, + "sum_total_cost": 0.533062, + "run_count": 13, + "avg_input_cost": 0.005436, + "sum_input_cost": 0.070662, + "avg_output_cost": 0.035569, + "sum_output_cost": 0.4624 } ], "raw_data": [ @@ -2373,9 +2373,9 @@ "provider": "Google", "Metric_request_tokens": 3060.0, "Metric_response_tokens": 1164.0, - "total_cost": 0.026557750000000005, + "total_cost": 0.0701599, "input_cost": 0.00045899999999999994, - "output_cost": 0.026098750000000004 + "output_cost": 0.0697009 }, { "Model": "gemini-2.5-flash", @@ -2389,9 +2389,9 @@ "provider": "Google", "Metric_request_tokens": 5361.0, "Metric_response_tokens": 1174.0, - "total_cost": 0.00366415, + "total_cost": 0.00540755, "input_cost": 0.00080415, - "output_cost": 0.00286 + "output_cost": 0.0046034 }, { "Model": "gemini-2.5-flash", @@ -2405,9 +2405,9 @@ "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0014616, + "total_cost": 0.00204105, "input_cost": 0.00025035, - "output_cost": 0.00121125 + "output_cost": 0.0017907 }, { "Model": "gemini-2.5-flash", @@ -2421,9 +2421,9 @@ "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.00143315, + "total_cost": 0.00195505, "input_cost": 0.00025065, - "output_cost": 0.0011825 + "output_cost": 0.0017044 }, { "Model": "gemini-2.5-flash", @@ -2437,9 +2437,9 @@ "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0013590999999999998, + "total_cost": 0.00175405, "input_cost": 0.00025035, - "output_cost": 0.00110875 + "output_cost": 0.0015037 }, { "Model": "gemini-2.5-flash", @@ -2453,9 +2453,9 @@ "provider": "Google", "Metric_request_tokens": 3981.0, "Metric_response_tokens": 1184.0, - "total_cost": 0.0042859000000000005, + "total_cost": 0.00749205, "input_cost": 0.00059715, - "output_cost": 0.00368875 + "output_cost": 0.0068949 }, { "Model": "gemini-2.5-flash", @@ -2469,9 +2469,9 @@ "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.004914399999999999, + "total_cost": 0.01170255, "input_cost": 0.00025065, - "output_cost": 0.0046637499999999995 + "output_cost": 0.011451900000000001 }, { "Model": "gemini-2.5-flash", @@ -2485,9 +2485,9 @@ "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 554.0, - "total_cost": 0.00175785, + "total_cost": 0.00286475, "input_cost": 0.00025035, - "output_cost": 0.0015075000000000002 + "output_cost": 0.0026144000000000002 }, { "Model": "gemini-2.5-flash", @@ -2501,9 +2501,9 @@ "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 554.0, - "total_cost": 0.0028366, + "total_cost": 0.00588525, "input_cost": 0.00025035, - "output_cost": 0.00258625 + "output_cost": 0.0056349 }, { "Model": "gemini-2.5-flash", @@ -2517,9 +2517,9 @@ "provider": "Google", "Metric_request_tokens": 4005.0, "Metric_response_tokens": 1266.0, - "total_cost": 0.05410325, + "total_cost": 0.14673635, "input_cost": 0.0006007499999999999, - "output_cost": 0.0535025 + "output_cost": 0.14613559999999998 }, { "Model": "gemini-2.5-flash", @@ -2533,9 +2533,9 @@ "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.00122535, + "total_cost": 0.0013795500000000002, "input_cost": 0.00025035, - "output_cost": 0.000975 + "output_cost": 0.0011292000000000001 }, { "Model": "gemini-2.5-flash", @@ -2549,9 +2549,9 @@ "provider": "Google", "Metric_request_tokens": 3980.0, "Metric_response_tokens": 1185.0, - "total_cost": 0.048207, + "total_cost": 0.13046849999999996, "input_cost": 0.000597, - "output_cost": 0.04761 + "output_cost": 0.12987149999999997 }, { "Model": "gemini-2.5-flash", @@ -2565,9 +2565,9 @@ "provider": "Google", "Metric_request_tokens": 3970.0, "Metric_response_tokens": 1219.0, - "total_cost": 0.0075517499999999994, + "total_cost": 0.016537899999999998, "input_cost": 0.0005954999999999999, - "output_cost": 0.00695625 + "output_cost": 0.0159424 }, { "Model": "gemini-2.5-flash", @@ -2581,9 +2581,9 @@ "provider": "Google", "Metric_request_tokens": 9113.0, "Metric_response_tokens": 1804.0, - "total_cost": 0.02245695, + "total_cost": 0.055187349999999996, "input_cost": 0.0013669499999999998, - "output_cost": 0.02109 + "output_cost": 0.0538204 }, { "Model": "gemini-2.5-flash", @@ -2597,9 +2597,9 @@ "provider": "Google", "Metric_request_tokens": 5361.0, "Metric_response_tokens": 1174.0, - "total_cost": 0.00333665, + "total_cost": 0.004490549999999999, "input_cost": 0.00080415, - "output_cost": 0.0025325 + "output_cost": 0.0036864 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", @@ -4040,6 +4040,3462 @@ "total_cost": 0.0, "input_cost": 0.0, "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 24.207496913, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5044.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.023253749999999997, + "input_cost": 0.00388375, + "output_cost": 0.01937 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.803203804, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5199.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02480375, + "input_cost": 0.00388375, + "output_cost": 0.02092 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 34.032753508, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6600.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03881375, + "input_cost": 0.00388375, + "output_cost": 0.03493 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 24.432732425, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5326.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.02607375, + "input_cost": 0.00388375, + "output_cost": 0.02219 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 27.33412975, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5806.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.03087375, + "input_cost": 0.00388375, + "output_cost": 0.02699 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.488361727, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5925.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03206375, + "input_cost": 0.00388375, + "output_cost": 0.028180000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.454849392, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5526.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02807375, + "input_cost": 0.00388375, + "output_cost": 0.024190000000000003 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 40.878031462, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10420.0, + "provider": "Google", + "Metric_request_tokens": 6847.0, + "Metric_response_tokens": 1815.0, + "total_cost": 0.04428875, + "input_cost": 0.00855875, + "output_cost": 0.03573 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 67.384048514, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 13422.0, + "provider": "Google", + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1785.0, + "total_cost": 0.07423875, + "input_cost": 0.00856875, + "output_cost": 0.06567 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.014404973, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5406.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.026873749999999995, + "input_cost": 0.00388375, + "output_cost": 0.022989999999999997 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.744050775, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5425.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.027063750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023180000000000003 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 39.712500277, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6614.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.03895375, + "input_cost": 0.00388375, + "output_cost": 0.035070000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 41.219766849, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7059.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.04340375, + "input_cost": 0.00388375, + "output_cost": 0.03952 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.833685412, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5205.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.024863749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02098 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.57386297, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5837.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.031183749999999996, + "input_cost": 0.00388375, + "output_cost": 0.027299999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.658760503, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63389.0, + "provider": "Google", + "Metric_request_tokens": 58608.0, + "Metric_response_tokens": 2447.0, + "total_cost": 0.12107000000000001, + "input_cost": 0.07326, + "output_cost": 0.04781 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 32.435081791, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6158.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.03439375, + "input_cost": 0.00388375, + "output_cost": 0.03051 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 26.427531899, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5629.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.029103749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02522 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.541754196, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6253.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1164.0, + "total_cost": 0.03534375, + "input_cost": 0.00388375, + "output_cost": 0.03146 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.7685358, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5160.0, + "provider": "Google", + "Metric_request_tokens": 3101.0, + "Metric_response_tokens": 1172.0, + "total_cost": 0.024466250000000002, + "input_cost": 0.00387625, + "output_cost": 0.02059 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 25.723323767, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5428.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02709375, + "input_cost": 0.00388375, + "output_cost": 0.02321 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.441471015, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5382.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.026633749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02275 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.604628416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5562.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.02843375, + "input_cost": 0.00388375, + "output_cost": 0.024550000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.415529759, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11548.0, + "provider": "Google", + "Metric_request_tokens": 6858.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.055472499999999994, + "input_cost": 0.0085725, + "output_cost": 0.0469 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.205452715, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4960.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.022413750000000003, + "input_cost": 0.00388375, + "output_cost": 0.01853 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 45.254514377, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7035.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.04316375, + "input_cost": 0.00388375, + "output_cost": 0.03928 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 25.540973715, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5321.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.02602375, + "input_cost": 0.00388375, + "output_cost": 0.02214 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 29.203818188, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5909.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.031903749999999995, + "input_cost": 0.00388375, + "output_cost": 0.028019999999999996 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 41.27588133, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10620.0, + "provider": "Google", + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1805.0, + "total_cost": 0.04621875, + "input_cost": 0.00856875, + "output_cost": 0.03765 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 68.89242445, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9389.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.06670375, + "input_cost": 0.00388375, + "output_cost": 0.06282 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 34.105280135, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6279.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1280.0, + "total_cost": 0.035603749999999997, + "input_cost": 0.00388375, + "output_cost": 0.03172 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 120.860207911, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 13787.0, + "provider": "Google", + "Metric_request_tokens": 7717.0, + "Metric_response_tokens": 1789.0, + "total_cost": 0.07034625, + "input_cost": 0.00964625, + "output_cost": 0.0607 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.936865091, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5784.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03065375, + "input_cost": 0.00388375, + "output_cost": 0.026770000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.396380686, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5674.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.029562500000000002, + "input_cost": 0.0038824999999999997, + "output_cost": 0.02568 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.581064657, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5037.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.023183750000000003, + "input_cost": 0.00388375, + "output_cost": 0.0193 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.262028799, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5839.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.031203750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02732 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.601437466, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8093.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1193.0, + "total_cost": 0.05374375, + "input_cost": 0.00388375, + "output_cost": 0.04986 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.554440753, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5308.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02589375, + "input_cost": 0.00388375, + "output_cost": 0.02201 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.896141303, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.025083750000000002, + "input_cost": 0.00388375, + "output_cost": 0.0212 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 46.506492861, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10985.0, + "provider": "Google", + "Metric_request_tokens": 6854.0, + "Metric_response_tokens": 1829.0, + "total_cost": 0.0498775, + "input_cost": 0.0085675, + "output_cost": 0.04131 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 31.379013607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5869.0, + "provider": "Google", + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.03153, + "input_cost": 0.0038799999999999998, + "output_cost": 0.02765 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.397718403, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5142.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1136.0, + "total_cost": 0.02423375, + "input_cost": 0.00388375, + "output_cost": 0.02035 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.18099707, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5581.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.028623750000000003, + "input_cost": 0.00388375, + "output_cost": 0.02474 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 37.387479684, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6757.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.040383749999999996, + "input_cost": 0.00388375, + "output_cost": 0.0365 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 41.910195327, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10585.0, + "provider": "Google", + "Metric_request_tokens": 6807.0, + "Metric_response_tokens": 1723.0, + "total_cost": 0.046288750000000004, + "input_cost": 0.008508749999999999, + "output_cost": 0.03778 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 18.566825519, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4762.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02043375, + "input_cost": 0.00388375, + "output_cost": 0.01655 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 35.952256562, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6120.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.03401375, + "input_cost": 0.00388375, + "output_cost": 0.03013 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.562834473, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5167.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02448375, + "input_cost": 0.00388375, + "output_cost": 0.0206 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.116377335, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5461.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.027423750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023540000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 42.19434694, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7549.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.04830375, + "input_cost": 0.00388375, + "output_cost": 0.04442 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.915458356, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5835.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1148.0, + "total_cost": 0.031163749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02728 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.201752563, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5374.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02655375, + "input_cost": 0.00388375, + "output_cost": 0.02267 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.905880551, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5249.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.02530375, + "input_cost": 0.00388375, + "output_cost": 0.021419999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.308699595, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5184.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.024653749999999995, + "input_cost": 0.00388375, + "output_cost": 0.020769999999999997 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 39.750523403, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6263.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.035443749999999996, + "input_cost": 0.00388375, + "output_cost": 0.03156 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 65.476559976, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6971.0, + "provider": "Google", + "Metric_request_tokens": 3984.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.03485, + "input_cost": 0.00498, + "output_cost": 0.02987 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 72.910902918, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 43.602165928, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10530.0, + "provider": "Google", + "Metric_request_tokens": 6840.0, + "Metric_response_tokens": 1786.0, + "total_cost": 0.045450000000000004, + "input_cost": 0.00855, + "output_cost": 0.0369 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.823405726, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6724.0, + "provider": "Google", + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03252, + "input_cost": 0.00496, + "output_cost": 0.02756 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.694557094, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5928.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.032093750000000004, + "input_cost": 0.00388375, + "output_cost": 0.028210000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 28.020181574, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5702.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02983375, + "input_cost": 0.00388375, + "output_cost": 0.02595 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.120680251, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5541.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02822375, + "input_cost": 0.00388375, + "output_cost": 0.02434 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.401740722, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8168.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.05449375, + "input_cost": 0.00388375, + "output_cost": 0.05061 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 30.757611824, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5710.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.029913749999999996, + "input_cost": 0.00388375, + "output_cost": 0.026029999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 34.511741088, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6547.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.03828375, + "input_cost": 0.00388375, + "output_cost": 0.0344 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 33.667128837, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6142.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03423375, + "input_cost": 0.00388375, + "output_cost": 0.03035 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.16239572, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.095654126, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.573285539, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.13363762, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.075430478, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.734203872, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.207070397, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.173871208, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.445568835, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.087279343, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.14396244, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.481531585, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1242.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 506.0, + "total_cost": 0.00027600000000000004, + "input_cost": 7.36e-5, + "output_cost": 0.00020240000000000004 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.053321468, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.041527691, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.656550888, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.232291255, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.16402353, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.371460427, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.306933591, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.199269162, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.715297386, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.390987167, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1256.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 520.0, + "total_cost": 0.0002816, + "input_cost": 7.36e-5, + "output_cost": 0.000208 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.009082393, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.572387722, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.097230117, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.342514555, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.742365295, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.291870391, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.237470473, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1251.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 515.0, + "total_cost": 0.0002796, + "input_cost": 7.36e-5, + "output_cost": 0.00020600000000000002 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.437132314, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.205071529, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00200755, + "input_cost": 0.00025065, + "output_cost": 0.0017569 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.27147079, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3259.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.00420005, + "input_cost": 0.00025035, + "output_cost": 0.0039497 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.703598385, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2531.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016665500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0014162 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.890864449, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2638.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 591.0, + "total_cost": 0.00192125, + "input_cost": 0.00025065, + "output_cost": 0.0016706 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.237042556, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2595.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00189055, + "input_cost": 0.00025035, + "output_cost": 0.0016401999999999999 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.402304393, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2648.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020760500000000003, + "input_cost": 0.00025035, + "output_cost": 0.0018257000000000002 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.681153876, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2673.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00215105, + "input_cost": 0.00025065, + "output_cost": 0.0019004 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.74444077, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2683.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00219855, + "input_cost": 0.00025035, + "output_cost": 0.0019482 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 35.715282683, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 15172.0, + "provider": "Google", + "Metric_request_tokens": 8909.0, + "Metric_response_tokens": 1841.0, + "total_cost": 0.01791795, + "input_cost": 0.00133635, + "output_cost": 0.0165816 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.026625667, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7184.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.00837945, + "input_cost": 0.00059535, + "output_cost": 0.0077840999999999995 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.674318695, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2643.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00205855, + "input_cost": 0.00025035, + "output_cost": 0.0018082 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.052854086, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2773.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00251355, + "input_cost": 0.00025035, + "output_cost": 0.0022632 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.691234945, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2764.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0024695499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0022188999999999998 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 10.540987174, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7199.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00383255, + "input_cost": 0.00080415, + "output_cost": 0.0030284 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.512122765, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2665.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00213555, + "input_cost": 0.00025035, + "output_cost": 0.0018852 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.02414223, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2737.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0023750499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0021244 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 18.309828457, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7772.0, + "provider": "Google", + "Metric_request_tokens": 3983.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.01041375, + "input_cost": 0.00059745, + "output_cost": 0.0098163 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.858011472, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2586.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018590500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0016087 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.887542438, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2715.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00229805, + "input_cost": 0.00025065, + "output_cost": 0.0020474 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.607493865, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2660.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00211225, + "input_cost": 0.00025035, + "output_cost": 0.0018619 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.350370934, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020200500000000002, + "input_cost": 0.00025035, + "output_cost": 0.0017697000000000001 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 13.369507864, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6433.0, + "provider": "Google", + "Metric_request_tokens": 3970.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0057969, + "input_cost": 0.0005954999999999999, + "output_cost": 0.0052014 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.427887401, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2640.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00204805, + "input_cost": 0.00025035, + "output_cost": 0.0017977 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.886046113, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2860.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00281805, + "input_cost": 0.00025035, + "output_cost": 0.0025677 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.190617486, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2704.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00225955, + "input_cost": 0.00025065, + "output_cost": 0.0020089 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 12.313419785, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7265.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00406355, + "input_cost": 0.00080415, + "output_cost": 0.0032594 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 14.12684929, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6606.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00642025, + "input_cost": 0.00059535, + "output_cost": 0.0058249 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 11.416052215, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5051.0, + "provider": "Google", + "Metric_request_tokens": 3061.0, + "Metric_response_tokens": 1169.0, + "total_cost": 0.00403405, + "input_cost": 0.00045914999999999997, + "output_cost": 0.0035749 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.042913232, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2542.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00170505, + "input_cost": 0.00025035, + "output_cost": 0.0014547 + }, + { + "Model": "gemini-2.5-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.803645705, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2732.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0023700499999999994, + "input_cost": 0.00025035, + "output_cost": 0.0021196999999999995 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 24.280603396, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.444646242, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7586.0, + "provider": "Google", + "Metric_request_tokens": 3679.0, + "Metric_response_tokens": 722.0, + "total_cost": 0.012132549999999999, + "input_cost": 0.00055185, + "output_cost": 0.0115807 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 14.103361899, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7319.0, + "provider": "Google", + "Metric_request_tokens": 4540.0, + "Metric_response_tokens": 1218.0, + "total_cost": 0.006875300000000001, + "input_cost": 0.000681, + "output_cost": 0.006194300000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.086590308, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9499.0, + "provider": "Google", + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.016481950000000002, + "input_cost": 0.00059535, + "output_cost": 0.0158866 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.1841836, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.981969969, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7548.0, + "provider": "Google", + "Metric_request_tokens": 3630.0, + "Metric_response_tokens": 1695.0, + "total_cost": 0.009342000000000001, + "input_cost": 0.0005445, + "output_cost": 0.008797500000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 20.823357766, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9790.0, + "provider": "Google", + "Metric_request_tokens": 5066.0, + "Metric_response_tokens": 1729.0, + "total_cost": 0.0122798, + "input_cost": 0.0007599, + "output_cost": 0.0115199 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 25.462973093, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 35.722264378, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 16420.0, + "provider": "Google", + "Metric_request_tokens": 8000.0, + "Metric_response_tokens": 1807.0, + "total_cost": 0.0254297, + "input_cost": 0.0012, + "output_cost": 0.0242297 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.71420281, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1869.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 111.0, + "total_cost": 0.0035027500000000002, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0033811 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.548778955, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 18.459759081, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 35.081946937, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11019.0, + "provider": "Google", + "Metric_request_tokens": 4018.0, + "Metric_response_tokens": 1243.0, + "total_cost": 0.021501500000000003, + "input_cost": 0.0006027, + "output_cost": 0.020898800000000002 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.112757598, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12440.0, + "provider": "Google", + "Metric_request_tokens": 6832.0, + "Metric_response_tokens": 1782.0, + "total_cost": 0.015484999999999999, + "input_cost": 0.0010248, + "output_cost": 0.0144602 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.507230852, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4880.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00988225, + "input_cost": 0.00025035, + "output_cost": 0.0096319 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.268203361, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.657737725, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8725.0, + "provider": "Google", + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.013843, + "input_cost": 0.0005952, + "output_cost": 0.013247799999999999 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.856923641, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.406209617, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9355.0, + "provider": "Google", + "Metric_request_tokens": 4545.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01405995, + "input_cost": 0.0006817500000000001, + "output_cost": 0.0133782 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.67447829, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9321.0, + "provider": "Google", + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01876495, + "input_cost": 0.00046575, + "output_cost": 0.018299199999999998 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.125237979, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1709.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0017484000000000002, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0016269000000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.319621961, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.088759709, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 22.178054671, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8629.0, + "provider": "Google", + "Metric_request_tokens": 3965.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.01352575, + "input_cost": 0.0005947499999999999, + "output_cost": 0.012931 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.029780959, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 3107.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 536.0, + "total_cost": 0.006603250000000001, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0064816000000000006 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 42.923902196, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 30481.0, + "provider": "Google", + "Metric_request_tokens": 21012.0, + "Metric_response_tokens": 3073.0, + "total_cost": 0.0273816, + "input_cost": 0.0031517999999999997, + "output_cost": 0.0242298 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 39.222518221, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 9.631045956, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 2892.0, + "provider": "Google", + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 625.0, + "total_cost": 0.00559265, + "input_cost": 0.00012164999999999999, + "output_cost": 0.005471 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 6.647730144, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 2292.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0037889, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0036674 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 35.71455716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.476345308, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2659.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00210205, + "input_cost": 0.00025065, + "output_cost": 0.0018514 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 6.346005219, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2718.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00232105, + "input_cost": 0.00025035, + "output_cost": 0.0020707 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.836977527, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2539.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00169455, + "input_cost": 0.00025035, + "output_cost": 0.0014441999999999999 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.672693551, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2929.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00304705, + "input_cost": 0.00025065, + "output_cost": 0.0027964 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 14.662619685, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6896.0, + "provider": "Google", + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.00736605, + "input_cost": 0.00059715, + "output_cost": 0.0067689 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.188614707, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2611.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.0019320499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016817 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 6.5603659, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2756.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00244155, + "input_cost": 0.00025065, + "output_cost": 0.0021909 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 6.128366075, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2798.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00259525, + "input_cost": 0.00025035, + "output_cost": 0.0023449 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.944774864, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2557.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00175755, + "input_cost": 0.00025035, + "output_cost": 0.0015072 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.76290555, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2696.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0022315499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0019809 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 11.161597903, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7403.0, + "provider": "Google", + "Metric_request_tokens": 5349.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.00460415, + "input_cost": 0.00080235, + "output_cost": 0.0038018 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.529419098, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2839.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0027445499999999997, + "input_cost": 0.00025035, + "output_cost": 0.0024942 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 8.540685767, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3173.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0039010499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0036504 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.922996591, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2729.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0023595499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0021092 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 6.741784844, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2900.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00295805, + "input_cost": 0.00025035, + "output_cost": 0.0027077 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.055616236, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2809.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0026270499999999997, + "input_cost": 0.00025065, + "output_cost": 0.0023764 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.365465445, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9022.0, + "provider": "Google", + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.01021305, + "input_cost": 0.00080415, + "output_cost": 0.0094089 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.080146417, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2513.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00160355, + "input_cost": 0.00025035, + "output_cost": 0.0013532 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 25.419348211, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10935.0, + "provider": "Google", + "Metric_request_tokens": 5358.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.016854800000000003, + "input_cost": 0.0008037, + "output_cost": 0.016051100000000002 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.279733312, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2443.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00135855, + "input_cost": 0.00025035, + "output_cost": 0.0011082 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.584990518, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2690.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00221725, + "input_cost": 0.00025035, + "output_cost": 0.0019669 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.12928071, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2925.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00303305, + "input_cost": 0.00025065, + "output_cost": 0.0027824 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.894062173, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11489.0, + "provider": "Google", + "Metric_request_tokens": 6861.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.01198975, + "input_cost": 0.00102915, + "output_cost": 0.010960600000000001 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 9.010575503, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3355.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00455055, + "input_cost": 0.00025035, + "output_cost": 0.0043002000000000005 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.778800106, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2693.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0022210499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0019703999999999998 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 8.167258931, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3303.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 550.0, + "total_cost": 0.004374349999999999, + "input_cost": 0.00025035, + "output_cost": 0.004123999999999999 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.731809965, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2458.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00141105, + "input_cost": 0.00025035, + "output_cost": 0.0011607 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.421789254, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2578.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 568.0, + "total_cost": 0.0017779500000000001, + "input_cost": 0.00025065, + "output_cost": 0.0015273 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.939170129, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2535.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016805499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0014302 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.085231652, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2550.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0017330499999999999, + "input_cost": 0.00025035, + "output_cost": 0.0014827 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.517046246, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4330.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0008002, + "input_cost": 0.0003106, + "output_cost": 0.0004896000000000001 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.366856197, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4300.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.0007876000000000001, + "input_cost": 0.0003108, + "output_cost": 0.0004768 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.691253093, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3771.0, + "provider": "Google", + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 666.0, + "total_cost": 0.0005769, + "input_cost": 0.0003105, + "output_cost": 0.0002664 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.948628294, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 50365.0, + "provider": "Google", + "Metric_request_tokens": 49186.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0053902, + "input_cost": 0.0049186, + "output_cost": 0.00047159999999999997 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.397124725, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4273.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.0007825, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004736 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.246158175, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4269.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1177.0, + "total_cost": 0.0007800000000000001, + "input_cost": 0.00030920000000000003, + "output_cost": 0.00047080000000000006 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.461461037, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4313.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0007985000000000002, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004896000000000001 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.826108098, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3829.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 693.0, + "total_cost": 0.0005908, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0002772 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.812971395, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3771.0, + "provider": "Google", + "Metric_request_tokens": 3103.0, + "Metric_response_tokens": 668.0, + "total_cost": 0.0005775, + "input_cost": 0.0003103, + "output_cost": 0.0002672 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.463204247, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4292.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007850000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00047440000000000004 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.422216874, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6650.0, + "provider": "Google", + "Metric_request_tokens": 5423.0, + "Metric_response_tokens": 1227.0, + "total_cost": 0.0010331, + "input_cost": 0.0005423, + "output_cost": 0.0004908 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.450171262, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4287.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007875, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047840000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.562724232, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4284.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.407190932, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4378.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1242.0, + "total_cost": 0.0008104000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004968 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.620992741, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3766.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 675.0, + "total_cost": 0.0005791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00027 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.472499546, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4297.0, + "provider": "Google", + "Metric_request_tokens": 3090.0, + "Metric_response_tokens": 1207.0, + "total_cost": 0.0007918000000000001, + "input_cost": 0.00030900000000000003, + "output_cost": 0.0004828 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.473251097, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4322.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007970000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00048640000000000006 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.574995535, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4266.0, + "provider": "Google", + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.0007791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047000000000000004 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.094999263, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3858.0, + "provider": "Google", + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 721.0, + "total_cost": 0.0006021, + "input_cost": 0.00031370000000000004, + "output_cost": 0.0002884 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.658386722, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3778.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 689.0, + "total_cost": 0.0005845000000000001, + "input_cost": 0.00030890000000000003, + "output_cost": 0.00027560000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 4.893656632, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4357.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1221.0, + "total_cost": 0.0008020000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004884 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.236565242, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3883.0, + "provider": "Google", + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.0006121000000000001, + "input_cost": 0.00031370000000000004, + "output_cost": 0.00029840000000000004 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.485359369, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4324.0, + "provider": "Google", + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007972000000000001, + "input_cost": 0.0003108, + "output_cost": 0.00048640000000000006 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.734365673, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3791.0, + "provider": "Google", + "Metric_request_tokens": 3110.0, + "Metric_response_tokens": 681.0, + "total_cost": 0.0005834, + "input_cost": 0.000311, + "output_cost": 0.0002724 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.500398738, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4304.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007952, + "input_cost": 0.0003088, + "output_cost": 0.00048640000000000006 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.124787172, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3850.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 761.0, + "total_cost": 0.0006133, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0003044 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.98193398, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3903.0, + "provider": "Google", + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 767.0, + "total_cost": 0.0006204000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00030680000000000003 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 4.48101111, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4290.0, + "provider": "Google", + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007896, + "input_cost": 0.0003088, + "output_cost": 0.0004808 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.916593879, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4543.0, + "provider": "Google", + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1454.0, + "total_cost": 0.0008905, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0005816 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.879042448, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3885.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 793.0, + "total_cost": 0.0006264, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0003172 } ], "config": { From 3bf2d919844f3a6180fad49a7d29d305e50d9441 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Wed, 23 Jul 2025 07:51:54 +0000 Subject: [PATCH 2/2] chore: Add more model runs, inc. Nova premier and Flash-lite --- src/data/merbench_data.json | 10225 +++++++++++++++++++++++++--------- 1 file changed, 7584 insertions(+), 2641 deletions(-) diff --git a/src/data/merbench_data.json b/src/data/merbench_data.json index 437c0c3..2867727 100644 --- a/src/data/merbench_data.json +++ b/src/data/merbench_data.json @@ -1,7 +1,7 @@ { "stats": { - "total_runs": 396, - "models_evaluated": 12, + "total_runs": 699, + "models_evaluated": 13, "test_cases": 3, "test_groups": ["easy", "hard", "medium"], "providers": ["Amazon", "Google"], @@ -12,6 +12,7 @@ "bedrock:us.amazon.nova-pro-v1:0", "gemini-2.0-flash", "gemini-2.5-flash", + "gemini-2.5-flash-lite", "gemini-2.5-flash-lite-preview-06-17", "gemini-2.5-flash-preview-04-17", "gemini-2.5-flash-preview-05-20", @@ -19,41 +20,41 @@ "gemini-2.5-pro-preview-05-06", "gemini-2.5-pro-preview-06-05" ], - "total_cost": 15.01060421, - "avg_cost_per_run": 0.03790556618686869 + "total_cost": 17.814881175, + "avg_cost_per_run": 0.02548623916309013 }, "leaderboard": [ { "Model": "gemini-2.5-pro-preview-06-05", - "Success_Rate": 33.333333333333336, - "Avg_Duration": 38.34483030143589, - "Avg_Tokens": 8813.564102564103, - "Avg_Cost": 0.04050108974358974, - "Avg_Input_Cost": 0.006804935897435898, - "Avg_Output_Cost": 0.03369615384615384, - "Runs": 39, + "Success_Rate": 29.41176470588235, + "Avg_Duration": 36.84238953870588, + "Avg_Tokens": 8111.882352941177, + "Avg_Cost": 0.03829563725490196, + "Avg_Input_Cost": 0.0061175980392156855, + "Avg_Output_Cost": 0.03217803921568627, + "Runs": 51, "Provider": "Google" }, { "Model": "gemini-2.5-pro-preview-05-06", - "Success_Rate": 27.272727272727273, - "Avg_Duration": 56.42929048663636, - "Avg_Tokens": 24795.242424242424, - "Avg_Cost": 0.16677734848484846, - "Avg_Input_Cost": 0.011596439393939393, - "Avg_Output_Cost": 0.1551809090909091, - "Runs": 33, + "Success_Rate": 26.666666666666668, + "Avg_Duration": 49.84911264002222, + "Avg_Tokens": 19753.911111111112, + "Avg_Cost": 0.13076105555555556, + "Avg_Input_Cost": 0.009539722222222222, + "Avg_Output_Cost": 0.12122133333333333, + "Runs": 45, "Provider": "Google" }, { "Model": "gemini-2.5-pro-preview-03-25", - "Success_Rate": 23.076923076923077, - "Avg_Duration": 60.486014026717946, - "Avg_Tokens": 18381.05128205128, - "Avg_Cost": 0.12938753205128206, - "Avg_Input_Cost": 0.0077747115384615385, - "Avg_Output_Cost": 0.1216128205128205, - "Runs": 39, + "Success_Rate": 22.916666666666668, + "Avg_Duration": 57.17489324589584, + "Avg_Tokens": 16393.3125, + "Avg_Cost": 0.113261328125, + "Avg_Input_Cost": 0.007238828125, + "Avg_Output_Cost": 0.1060225, + "Runs": 48, "Provider": "Google" }, { @@ -67,37 +68,26 @@ "Runs": 45, "Provider": "Google" }, - { - "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Success_Rate": 6.666666666666667, - "Avg_Duration": 78.3300724666, - "Avg_Tokens": 15556.266666666666, - "Avg_Cost": 0.05645733333333334, - "Avg_Input_Cost": 0.034498999999999995, - "Avg_Output_Cost": 0.021958333333333333, - "Runs": 15, - "Provider": "Amazon" - }, { "Model": "gemini-2.5-flash-lite-preview-06-17", - "Success_Rate": 4.444444444444445, - "Avg_Duration": 4.399183285022222, - "Avg_Tokens": 5233.377777777778, - "Avg_Cost": 0.0008394444444444445, - "Avg_Input_Cost": 0.00041796888888888893, - "Avg_Output_Cost": 0.0004214755555555556, - "Runs": 45, + "Success_Rate": 5.0, + "Avg_Duration": 4.403742361633333, + "Avg_Tokens": 4974.583333333333, + "Avg_Cost": 0.0008166483333333334, + "Avg_Input_Cost": 0.00039106166666666675, + "Avg_Output_Cost": 0.0004255866666666667, + "Runs": 60, "Provider": "Google" }, { "Model": "gemini-2.5-flash-preview-05-20", - "Success_Rate": 4.444444444444445, - "Avg_Duration": 9.259261195111112, - "Avg_Tokens": 5119.933333333333, - "Avg_Cost": 0.007952797777777779, - "Avg_Input_Cost": 0.00035615999999999995, - "Avg_Output_Cost": 0.007596637777777778, - "Runs": 45, + "Success_Rate": 5.0, + "Avg_Duration": 9.74856794015, + "Avg_Tokens": 5771.55, + "Avg_Cost": 0.010067502499999999, + "Avg_Input_Cost": 0.0003626075, + "Avg_Output_Cost": 0.009704895, + "Runs": 60, "Provider": "Google" }, { @@ -111,106 +101,128 @@ "Runs": 45, "Provider": "Google" }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Success_Rate": 3.3333333333333335, + "Avg_Duration": 63.18500474063333, + "Avg_Tokens": 9528.966666666667, + "Avg_Cost": 0.03564125, + "Avg_Input_Cost": 0.020867708333333335, + "Avg_Output_Cost": 0.014773541666666666, + "Runs": 60, + "Provider": "Amazon" + }, + { + "Model": "gemini-2.5-flash-lite", + "Success_Rate": 2.2222222222222223, + "Avg_Duration": 5.566101188488889, + "Avg_Tokens": 5586.155555555555, + "Avg_Cost": 0.0008821755555555556, + "Avg_Input_Cost": 0.00045076222222222224, + "Avg_Output_Cost": 0.00043141333333333333, + "Runs": 45, + "Provider": "Google" + }, + { + "Model": "gemini-2.0-flash", + "Success_Rate": 0.0, + "Avg_Duration": 4.211500790416666, + "Avg_Tokens": 1325.6666666666667, + "Avg_Cost": 0.0002895966666666667, + "Avg_Input_Cost": 8.022333333333333e-5, + "Avg_Output_Cost": 0.00020937333333333332, + "Runs": 60, + "Provider": "Google" + }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", "Success_Rate": 0.0, - "Avg_Duration": 50.704131002266664, - "Avg_Tokens": 0.0, - "Avg_Cost": 0.0, - "Avg_Input_Cost": 0.0, - "Avg_Output_Cost": 0.0, - "Runs": 15, + "Avg_Duration": 49.53060242716666, + "Avg_Tokens": 678.15, + "Avg_Cost": 0.0008442, + "Avg_Input_Cost": 0.00044196000000000003, + "Avg_Output_Cost": 0.00040224, + "Runs": 60, "Provider": "Amazon" }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "Success_Rate": 0.0, - "Avg_Duration": 17.3943256876, - "Avg_Tokens": 1744.2, - "Avg_Cost": 9.965200000000001e-5, - "Avg_Input_Cost": 4.8178666666666673e-5, - "Avg_Output_Cost": 5.147333333333334e-5, - "Runs": 15, + "Avg_Duration": 18.831003638516666, + "Avg_Tokens": 1783.85, + "Avg_Cost": 9.043825000000001e-5, + "Avg_Input_Cost": 5.310025000000001e-5, + "Avg_Output_Cost": 3.733800000000001e-5, + "Runs": 60, "Provider": "Amazon" }, { "Model": "bedrock:us.amazon.nova-lite-v1:0", "Success_Rate": 0.0, - "Avg_Duration": 25.544886336066664, - "Avg_Tokens": 1926.6666666666667, - "Avg_Cost": 0.000170212, - "Avg_Input_Cost": 9.7396e-5, - "Avg_Output_Cost": 7.2816e-5, - "Runs": 15, + "Avg_Duration": 24.5429114128, + "Avg_Tokens": 2799.3166666666666, + "Avg_Cost": 0.000247573, + "Avg_Input_Cost": 0.000141421, + "Avg_Output_Cost": 0.000106152, + "Runs": 60, "Provider": "Amazon" - }, - { - "Model": "gemini-2.0-flash", - "Success_Rate": 0.0, - "Avg_Duration": 4.5703242256222225, - "Avg_Tokens": 1354.6, - "Avg_Cost": 0.00029454666666666666, - "Avg_Input_Cost": 8.243111111111111e-5, - "Avg_Output_Cost": 0.00021211555555555555, - "Runs": 45, - "Provider": "Google" } ], "pareto_data": [ { "Model": "bedrock:us.amazon.nova-lite-v1:0", "Success_Rate": 0.0, - "Duration": 25.544886336066664, - "total_tokens": 1926.6666666666667, - "total_cost": 0.000170212, - "input_cost": 9.7396e-5, - "output_cost": 7.2816e-5, - "Metric_request_tokens": 4869.8, - "Metric_response_tokens": 910.2 + "Duration": 24.5429114128, + "total_tokens": 2799.3166666666666, + "total_cost": 0.000247573, + "input_cost": 0.000141421, + "output_cost": 0.000106152, + "Metric_request_tokens": 5237.814814814815, + "Metric_response_tokens": 982.8888888888889 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "Success_Rate": 0.0, - "Duration": 17.3943256876, - "total_tokens": 1744.2, - "total_cost": 9.965200000000001e-5, - "input_cost": 4.8178666666666673e-5, - "output_cost": 5.147333333333334e-5, - "Metric_request_tokens": 5162.0, - "Metric_response_tokens": 1378.75 + "Duration": 18.831003638516666, + "total_tokens": 1783.85, + "total_cost": 9.043825000000001e-5, + "input_cost": 5.310025000000001e-5, + "output_cost": 3.733800000000001e-5, + "Metric_request_tokens": 7002.2307692307695, + "Metric_response_tokens": 1230.923076923077 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Success_Rate": 6.666666666666667, - "Duration": 78.3300724666, - "total_tokens": 15556.266666666666, - "total_cost": 0.05645733333333334, - "input_cost": 0.034498999999999995, - "output_cost": 0.021958333333333333, - "Metric_request_tokens": 18817.636363636364, - "Metric_response_tokens": 2395.4545454545455 + "Success_Rate": 3.3333333333333335, + "Duration": 63.18500474063333, + "total_tokens": 9528.966666666667, + "total_cost": 0.03564125, + "input_cost": 0.020867708333333335, + "output_cost": 0.014773541666666666, + "Metric_request_tokens": 17269.827586206895, + "Metric_response_tokens": 2445.2758620689656 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", "Success_Rate": 0.0, - "Duration": 50.704131002266664, - "total_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0 + "Duration": 49.53060242716666, + "total_tokens": 678.15, + "total_cost": 0.0008442, + "input_cost": 0.00044196000000000003, + "output_cost": 0.00040224, + "Metric_request_tokens": 8286.75, + "Metric_response_tokens": 1885.5 }, { "Model": "gemini-2.0-flash", "Success_Rate": 0.0, - "Duration": 4.5703242256222225, - "total_tokens": 1354.6, - "total_cost": 0.00029454666666666666, - "input_cost": 8.243111111111111e-5, - "output_cost": 0.00021211555555555555, - "Metric_request_tokens": 824.3111111111111, - "Metric_response_tokens": 530.2888888888889 + "Duration": 4.211500790416666, + "total_tokens": 1325.6666666666667, + "total_cost": 0.0002895966666666667, + "input_cost": 8.022333333333333e-5, + "output_cost": 0.00020937333333333332, + "Metric_request_tokens": 802.2333333333333, + "Metric_response_tokens": 523.4333333333333 }, { "Model": "gemini-2.5-flash", @@ -223,16 +235,27 @@ "Metric_request_tokens": 2796.288888888889, "Metric_response_tokens": 807.5333333333333 }, + { + "Model": "gemini-2.5-flash-lite", + "Success_Rate": 2.2222222222222223, + "Duration": 5.566101188488889, + "total_tokens": 5586.155555555555, + "total_cost": 0.0008821755555555556, + "input_cost": 0.00045076222222222224, + "output_cost": 0.00043141333333333333, + "Metric_request_tokens": 4507.622222222222, + "Metric_response_tokens": 1078.5333333333333 + }, { "Model": "gemini-2.5-flash-lite-preview-06-17", - "Success_Rate": 4.444444444444445, - "Duration": 4.399183285022222, - "total_tokens": 5233.377777777778, - "total_cost": 0.0008394444444444445, - "input_cost": 0.00041796888888888893, - "output_cost": 0.0004214755555555556, - "Metric_request_tokens": 4179.688888888889, - "Metric_response_tokens": 1053.6888888888889 + "Success_Rate": 5.0, + "Duration": 4.403742361633333, + "total_tokens": 4974.583333333333, + "total_cost": 0.0008166483333333334, + "input_cost": 0.00039106166666666675, + "output_cost": 0.0004255866666666667, + "Metric_request_tokens": 3910.616666666667, + "Metric_response_tokens": 1063.9666666666667 }, { "Model": "gemini-2.5-flash-preview-04-17", @@ -247,47 +270,47 @@ }, { "Model": "gemini-2.5-flash-preview-05-20", - "Success_Rate": 4.444444444444445, - "Duration": 9.259261195111112, - "total_tokens": 5119.933333333333, - "total_cost": 0.007952797777777779, - "input_cost": 0.00035615999999999995, - "output_cost": 0.007596637777777778, - "Metric_request_tokens": 2428.3636363636365, - "Metric_response_tokens": 709.8181818181819 + "Success_Rate": 5.0, + "Duration": 9.74856794015, + "total_tokens": 5771.55, + "total_cost": 0.010067502499999999, + "input_cost": 0.0003626075, + "output_cost": 0.009704895, + "Metric_request_tokens": 2500.7413793103447, + "Metric_response_tokens": 725.8103448275862 }, { "Model": "gemini-2.5-pro-preview-03-25", - "Success_Rate": 23.076923076923077, - "Duration": 60.486014026717946, - "total_tokens": 18381.05128205128, - "total_cost": 0.12938753205128206, - "input_cost": 0.0077747115384615385, - "output_cost": 0.1216128205128205, - "Metric_request_tokens": 8364.51724137931, - "Metric_response_tokens": 1380.8275862068965 + "Success_Rate": 22.916666666666668, + "Duration": 57.17489324589584, + "total_tokens": 16393.3125, + "total_cost": 0.113261328125, + "input_cost": 0.007238828125, + "output_cost": 0.1060225, + "Metric_request_tokens": 7315.026315789473, + "Metric_response_tokens": 1362.9736842105262 }, { "Model": "gemini-2.5-pro-preview-05-06", - "Success_Rate": 27.272727272727273, - "Duration": 56.42929048663636, - "total_tokens": 24795.242424242424, - "total_cost": 0.16677734848484846, - "input_cost": 0.011596439393939393, - "output_cost": 0.1551809090909091, - "Metric_request_tokens": 12756.083333333334, - "Metric_response_tokens": 1607.2916666666667 + "Success_Rate": 26.666666666666668, + "Duration": 49.84911264002222, + "total_tokens": 19753.911111111112, + "total_cost": 0.13076105555555556, + "input_cost": 0.009539722222222222, + "output_cost": 0.12122133333333333, + "Metric_request_tokens": 9539.722222222223, + "Metric_response_tokens": 1457.138888888889 }, { "Model": "gemini-2.5-pro-preview-06-05", - "Success_Rate": 33.333333333333336, - "Duration": 38.34483030143589, - "total_tokens": 8813.564102564103, - "total_cost": 0.04050108974358974, - "input_cost": 0.006804935897435898, - "output_cost": 0.03369615384615384, - "Metric_request_tokens": 5443.948717948718, - "Metric_response_tokens": 1348.3846153846155 + "Success_Rate": 29.41176470588235, + "Duration": 36.84238953870588, + "total_tokens": 8111.882352941177, + "total_cost": 0.03829563725490196, + "input_cost": 0.0061175980392156855, + "output_cost": 0.03217803921568627, + "Metric_request_tokens": 4894.078431372549, + "Metric_response_tokens": 1306.6470588235295 } ], "test_groups_data": [ @@ -295,111 +318,111 @@ "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "easy", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.2, - "total_cost": 0.00011403599999999998, - "input_cost": 4.8515999999999994e-5, - "output_cost": 6.551999999999998e-5, - "total_tokens": 1081.6 + "Score_UsageLimitNotExceeded": 0.4, + "Score_UsedBothMCPTools": 0.275, + "total_cost": 0.000238407, + "input_cost": 0.00013509900000000002, + "output_cost": 0.00010330800000000001, + "total_tokens": 2682.1 }, { "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsageLimitNotExceeded": 0.3, + "Score_UsedBothMCPTools": 0.225, + "total_cost": 0.00015518999999999998, + "input_cost": 8.3562e-5, + "output_cost": 7.162799999999999e-5, + "total_tokens": 1691.15 }, { "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.8, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.0003966, - "input_cost": 0.00024367200000000003, - "output_cost": 0.000152928, - "total_tokens": 4698.4 + "Score_UsageLimitNotExceeded": 0.65, + "Score_UsedBothMCPTools": 0.5, + "total_cost": 0.000349122, + "input_cost": 0.000205602, + "output_cost": 0.00014352, + "total_tokens": 4024.7 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "easy", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.4, - "Score_UsedBothMCPTools": 0.3, - "total_cost": 0.00013008800000000002, - "input_cost": 6.9692e-5, - "output_cost": 6.039600000000001e-5, - "total_tokens": 2422.6 + "Score_UsedBothMCPTools": 0.35, + "total_cost": 0.00014982100000000001, + "input_cost": 9.1595e-5, + "output_cost": 5.8226e-5, + "total_tokens": 3032.9 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsageLimitNotExceeded": 0.15, + "Score_UsedBothMCPTools": 0.05, + "total_cost": 3.7059750000000004e-5, + "input_cost": 3.0283750000000003e-5, + "output_cost": 6.776e-6, + "total_tokens": 913.65 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.4, - "Score_UsedBothMCPTools": 0.4, - "total_cost": 0.00016886800000000002, - "input_cost": 7.4844e-5, - "output_cost": 9.4024e-5, - "total_tokens": 2810.0 + "Score_UsageLimitNotExceeded": 0.2, + "Score_UsedBothMCPTools": 0.2, + "total_cost": 8.443400000000001e-5, + "input_cost": 3.7422e-5, + "output_cost": 4.7012e-5, + "total_tokens": 1405.0 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "easy", - "Score_MermaidDiagramValid": 0.2, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.1004015, - "input_cost": 0.0715065, - "output_cost": 0.028894999999999997, - "total_tokens": 30914.2 + "Score_MermaidDiagramValid": 0.1, + "Score_UsageLimitNotExceeded": 0.9, + "Score_UsedBothMCPTools": 0.625, + "total_cost": 0.057880125000000004, + "input_cost": 0.0392845, + "output_cost": 0.018595624999999998, + "total_tokens": 17201.45 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.6, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.0410155, - "input_cost": 0.019733, - "output_cost": 0.0212825, - "total_tokens": 9595.8 + "Score_UsageLimitNotExceeded": 0.65, + "Score_UsedBothMCPTools": 0.35, + "total_cost": 0.023277375, + "input_cost": 0.011096125, + "output_cost": 0.01218125, + "total_tokens": 5412.95 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.6, - "Score_UsedBothMCPTools": 0.6, - "total_cost": 0.027955, - "input_cost": 0.012257500000000001, - "output_cost": 0.015697499999999996, - "total_tokens": 6158.8 + "Score_UsageLimitNotExceeded": 0.75, + "Score_UsedBothMCPTools": 0.45, + "total_cost": 0.02576625, + "input_cost": 0.0122225, + "output_cost": 0.013543749999999998, + "total_tokens": 5972.5 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", "test_group": "easy", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.2, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.00161976, + "input_cost": 0.00086424, + "output_cost": 0.00075552, + "total_tokens": 1316.4 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", @@ -416,12 +439,12 @@ "Model": "bedrock:us.amazon.nova-pro-v1:0", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0, - "total_tokens": 0.0 + "Score_UsageLimitNotExceeded": 0.1, + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.0009128400000000002, + "input_cost": 0.00046164000000000003, + "output_cost": 0.0004512, + "total_tokens": 718.05 }, { "Model": "gemini-2.0-flash", @@ -429,10 +452,10 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027685333333333335, + "total_cost": 0.00027638, "input_cost": 7.36e-5, - "output_cost": 0.00020325333333333332, - "total_tokens": 1244.1333333333334 + "output_cost": 0.00020277999999999998, + "total_tokens": 1242.95 }, { "Model": "gemini-2.0-flash", @@ -440,21 +463,21 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_cost": 0.00027490666666666664, + "total_cost": 0.00027488, "input_cost": 7.36e-5, - "output_cost": 0.0002013066666666667, - "total_tokens": 1239.2666666666667 + "output_cost": 0.00020128000000000002, + "total_tokens": 1239.2 }, { "Model": "gemini-2.0-flash", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.13333333333333333, - "total_cost": 0.00033188, - "input_cost": 0.00010009333333333333, - "output_cost": 0.0002317866666666667, - "total_tokens": 1580.4 + "Score_UsedBothMCPTools": 0.1, + "total_cost": 0.00031753, + "input_cost": 9.347e-5, + "output_cost": 0.00022406000000000002, + "total_tokens": 1494.85 }, { "Model": "gemini-2.5-flash", @@ -489,27 +512,60 @@ "output_cost": 0.006282386666666667, "total_tokens": 5518.133333333333 }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.06666666666666667, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.7333333333333333, + "total_cost": 0.00104232, + "input_cost": 0.0006312533333333334, + "output_cost": 0.0004110666666666667, + "total_tokens": 7340.2 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "hard", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.7333333333333333, + "total_cost": 0.0007507866666666667, + "input_cost": 0.0003091333333333334, + "output_cost": 0.00044165333333333334, + "total_tokens": 4195.466666666666 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "medium", + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.6333333333333333, + "total_cost": 0.0008534200000000001, + "input_cost": 0.0004119, + "output_cost": 0.00044152000000000004, + "total_tokens": 5222.8 + }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "easy", - "Score_MermaidDiagramValid": 0.13333333333333333, + "Score_MermaidDiagramValid": 0.15, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7, - "total_cost": 0.00107244, - "input_cost": 0.0006171600000000001, - "output_cost": 0.00045528, - "total_tokens": 7309.8 + "Score_UsedBothMCPTools": 0.675, + "total_cost": 0.0010010650000000002, + "input_cost": 0.0005401650000000001, + "output_cost": 0.00046090000000000004, + "total_tokens": 6553.9 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7666666666666667, - "total_cost": 0.0006987066666666668, - "input_cost": 0.00031092, - "output_cost": 0.0003877866666666667, - "total_tokens": 4078.6666666666665 + "Score_UsedBothMCPTools": 0.775, + "total_cost": 0.000712445, + "input_cost": 0.00031114500000000003, + "output_cost": 0.0004013, + "total_tokens": 4114.7 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", @@ -517,10 +573,10 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.8, - "total_cost": 0.0007471866666666667, - "input_cost": 0.0003258266666666667, - "output_cost": 0.00042136, - "total_tokens": 4311.666666666667 + "total_cost": 0.000736435, + "input_cost": 0.000321875, + "output_cost": 0.00041456, + "total_tokens": 4255.15 }, { "Model": "gemini-2.5-flash-preview-04-17", @@ -558,13 +614,13 @@ { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "easy", - "Score_MermaidDiagramValid": 0.13333333333333333, + "Score_MermaidDiagramValid": 0.15, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5666666666666667, - "total_cost": 0.009252423333333334, - "input_cost": 0.00030188999999999997, - "output_cost": 0.008950533333333333, - "total_tokens": 5100.733333333334 + "Score_UsedBothMCPTools": 0.575, + "total_cost": 0.0118667775, + "input_cost": 0.0002998575, + "output_cost": 0.01156692, + "total_tokens": 5843.45 }, { "Model": "gemini-2.5-flash-preview-05-20", @@ -572,98 +628,98 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_cost": 0.005455606666666666, - "input_cost": 0.00025677999999999996, - "output_cost": 0.005198826666666667, - "total_tokens": 3659.5333333333333 + "total_cost": 0.0070533, + "input_cost": 0.00025999499999999995, + "output_cost": 0.0067933049999999995, + "total_tokens": 4138.7 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.7333333333333333, - "total_cost": 0.009150363333333333, - "input_cost": 0.00050981, - "output_cost": 0.008640553333333334, - "total_tokens": 6599.533333333334 + "Score_UsedBothMCPTools": 0.75, + "total_cost": 0.01128243, + "input_cost": 0.00052797, + "output_cost": 0.01075446, + "total_tokens": 7332.5 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "easy", - "Score_MermaidDiagramValid": 0.6923076923076923, + "Score_MermaidDiagramValid": 0.6875, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.33611653846153844, - "input_cost": 0.017935, - "output_cost": 0.31818153846153846, - "total_tokens": 46166.153846153844 + "total_cost": 0.280189765625, + "input_cost": 0.015300390625, + "output_cost": 0.264889375, + "total_tokens": 38729.25 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.6153846153846154, - "Score_UsedBothMCPTools": 0.6153846153846154, - "total_cost": 0.030362788461538462, - "input_cost": 0.0028320192307692304, - "output_cost": 0.02753076923076923, - "total_tokens": 5018.692307692308 + "Score_UsageLimitNotExceeded": 0.6875, + "Score_UsedBothMCPTools": 0.6875, + "total_cost": 0.035003281250000004, + "input_cost": 0.00331890625, + "output_cost": 0.031684375, + "total_tokens": 5823.5625 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "medium", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.6923076923076923, - "Score_UsedBothMCPTools": 0.6153846153846154, - "total_cost": 0.021683269230769228, - "input_cost": 0.0025571153846153847, - "output_cost": 0.019126153846153845, - "total_tokens": 3958.3076923076924 + "Score_UsageLimitNotExceeded": 0.75, + "Score_UsedBothMCPTools": 0.6875, + "total_cost": 0.0245909375, + "input_cost": 0.0030971875, + "output_cost": 0.02149375, + "total_tokens": 4627.125 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "easy", - "Score_MermaidDiagramValid": 0.7272727272727273, + "Score_MermaidDiagramValid": 0.7333333333333333, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.9090909090909091, - "total_cost": 0.2612590909090909, - "input_cost": 0.022234545454545455, - "output_cost": 0.23902454545454546, - "total_tokens": 41690.09090909091 + "Score_UsedBothMCPTools": 0.9333333333333333, + "total_cost": 0.200351, + "input_cost": 0.017341, + "output_cost": 0.18300999999999998, + "total_tokens": 32173.8 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "hard", "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.9090909090909091, - "Score_UsedBothMCPTools": 0.5454545454545454, - "total_cost": 0.01853659090909091, - "input_cost": 0.002118409090909091, - "output_cost": 0.016418181818181818, - "total_tokens": 3336.5454545454545 + "Score_UsageLimitNotExceeded": 0.9333333333333333, + "Score_UsedBothMCPTools": 0.6666666666666666, + "total_cost": 0.020663166666666666, + "input_cost": 0.0025891666666666663, + "output_cost": 0.018074, + "total_tokens": 3878.733333333333 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "medium", - "Score_MermaidDiagramValid": 0.09090909090909091, - "Score_UsageLimitNotExceeded": 0.7272727272727273, - "Score_UsedBothMCPTools": 0.7272727272727273, - "total_cost": 0.22053636363636364, - "input_cost": 0.010436363636363636, - "output_cost": 0.21009999999999998, - "total_tokens": 29359.090909090908 + "Score_MermaidDiagramValid": 0.06666666666666667, + "Score_UsageLimitNotExceeded": 0.8, + "Score_UsedBothMCPTools": 0.8, + "total_cost": 0.171269, + "input_cost": 0.008689, + "output_cost": 0.16258, + "total_tokens": 23209.2 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "easy", - "Score_MermaidDiagramValid": 0.8461538461538461, + "Score_MermaidDiagramValid": 0.7058823529411765, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.03721326923076923, - "input_cost": 0.009261730769230769, - "output_cost": 0.027951538461538458, - "total_tokens": 10204.538461538461 + "total_cost": 0.0345539705882353, + "input_cost": 0.007996323529411764, + "output_cost": 0.02655764705882353, + "total_tokens": 9052.823529411764 }, { "Model": "gemini-2.5-pro-preview-06-05", @@ -671,52 +727,52 @@ "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.04328519230769231, - "input_cost": 0.0057175, - "output_cost": 0.03756769230769231, - "total_tokens": 8330.76923076923 + "total_cost": 0.041507205882352946, + "input_cost": 0.005286029411764706, + "output_cost": 0.036221176470588236, + "total_tokens": 7850.941176470588 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "medium", - "Score_MermaidDiagramValid": 0.15384615384615385, + "Score_MermaidDiagramValid": 0.17647058823529413, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_cost": 0.041004807692307695, - "input_cost": 0.005435576923076923, - "output_cost": 0.03556923076923077, - "total_tokens": 7905.384615384615 + "total_cost": 0.03882573529411765, + "input_cost": 0.005070441176470588, + "output_cost": 0.03375529411764706, + "total_tokens": 7431.882352941177 } ], "failure_analysis_data": [ { "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Invalid Diagram": 15, - "MCP Tool Failure": 12, - "Usage Limit Exceeded": 10 + "Invalid Diagram": 60, + "MCP Tool Failure": 47, + "Usage Limit Exceeded": 33 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", - "Invalid Diagram": 15, - "MCP Tool Failure": 12, - "Usage Limit Exceeded": 10 + "Invalid Diagram": 60, + "MCP Tool Failure": 49, + "Usage Limit Exceeded": 45 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", - "Invalid Diagram": 14, - "MCP Tool Failure": 4, - "Usage Limit Exceeded": 4 + "Invalid Diagram": 58, + "MCP Tool Failure": 32, + "Usage Limit Exceeded": 14 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", - "Invalid Diagram": 15, - "MCP Tool Failure": 15, - "Usage Limit Exceeded": 14 + "Invalid Diagram": 60, + "MCP Tool Failure": 56, + "Usage Limit Exceeded": 54 }, { "Model": "gemini-2.0-flash", - "Invalid Diagram": 45, - "MCP Tool Failure": 43, + "Invalid Diagram": 60, + "MCP Tool Failure": 58, "Usage Limit Exceeded": 0 }, { @@ -725,10 +781,16 @@ "MCP Tool Failure": 31, "Usage Limit Exceeded": 0 }, + { + "Model": "gemini-2.5-flash-lite", + "Invalid Diagram": 44, + "MCP Tool Failure": 27, + "Usage Limit Exceeded": 0 + }, { "Model": "gemini-2.5-flash-lite-preview-06-17", - "Invalid Diagram": 43, - "MCP Tool Failure": 22, + "Invalid Diagram": 57, + "MCP Tool Failure": 30, "Usage Limit Exceeded": 0 }, { @@ -739,25 +801,25 @@ }, { "Model": "gemini-2.5-flash-preview-05-20", - "Invalid Diagram": 43, - "MCP Tool Failure": 35, + "Invalid Diagram": 57, + "MCP Tool Failure": 45, "Usage Limit Exceeded": 0 }, { "Model": "gemini-2.5-pro-preview-03-25", - "Invalid Diagram": 30, + "Invalid Diagram": 37, "MCP Tool Failure": 10, "Usage Limit Exceeded": 9 }, { "Model": "gemini-2.5-pro-preview-05-06", - "Invalid Diagram": 24, + "Invalid Diagram": 33, "MCP Tool Failure": 9, "Usage Limit Exceeded": 4 }, { "Model": "gemini-2.5-pro-preview-06-05", - "Invalid Diagram": 26, + "Invalid Diagram": 36, "MCP Tool Failure": 0, "Usage Limit Exceeded": 0 } @@ -766,119 +828,119 @@ { "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "easy", - "avg_total_cost": 0.000114, - "sum_total_cost": 0.00057, - "run_count": 5, - "avg_input_cost": 4.9e-5, - "sum_input_cost": 0.000243, - "avg_output_cost": 6.6e-5, - "sum_output_cost": 0.000328 + "avg_total_cost": 0.000238, + "sum_total_cost": 0.004768, + "run_count": 20, + "avg_input_cost": 0.000135, + "sum_input_cost": 0.002702, + "avg_output_cost": 0.000103, + "sum_output_cost": 0.002066 }, { "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "hard", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 0.000155, + "sum_total_cost": 0.003104, + "run_count": 20, + "avg_input_cost": 8.4e-5, + "sum_input_cost": 0.001671, + "avg_output_cost": 7.2e-5, + "sum_output_cost": 0.001433 }, { "Model": "bedrock:us.amazon.nova-lite-v1:0", "test_group": "medium", - "avg_total_cost": 0.000397, - "sum_total_cost": 0.001983, - "run_count": 5, - "avg_input_cost": 0.000244, - "sum_input_cost": 0.001218, - "avg_output_cost": 0.000153, - "sum_output_cost": 0.000765 + "avg_total_cost": 0.000349, + "sum_total_cost": 0.006982, + "run_count": 20, + "avg_input_cost": 0.000206, + "sum_input_cost": 0.004112, + "avg_output_cost": 0.000144, + "sum_output_cost": 0.00287 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "easy", - "avg_total_cost": 0.00013, - "sum_total_cost": 0.00065, - "run_count": 5, - "avg_input_cost": 7e-5, - "sum_input_cost": 0.000348, - "avg_output_cost": 6e-5, - "sum_output_cost": 0.000302 + "avg_total_cost": 0.00015, + "sum_total_cost": 0.002996, + "run_count": 20, + "avg_input_cost": 9.2e-5, + "sum_input_cost": 0.001832, + "avg_output_cost": 5.8e-5, + "sum_output_cost": 0.001165 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "hard", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 3.7e-5, + "sum_total_cost": 0.000741, + "run_count": 20, + "avg_input_cost": 3e-5, + "sum_input_cost": 0.000606, + "avg_output_cost": 7e-6, + "sum_output_cost": 0.000136 }, { "Model": "bedrock:us.amazon.nova-micro-v1:0", "test_group": "medium", - "avg_total_cost": 0.000169, - "sum_total_cost": 0.000844, - "run_count": 5, - "avg_input_cost": 7.5e-5, - "sum_input_cost": 0.000374, - "avg_output_cost": 9.4e-5, - "sum_output_cost": 0.00047 + "avg_total_cost": 8.4e-5, + "sum_total_cost": 0.001689, + "run_count": 20, + "avg_input_cost": 3.7e-5, + "sum_input_cost": 0.000748, + "avg_output_cost": 4.7e-5, + "sum_output_cost": 0.00094 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "easy", - "avg_total_cost": 0.100402, - "sum_total_cost": 0.502008, - "run_count": 5, - "avg_input_cost": 0.071506, - "sum_input_cost": 0.357533, - "avg_output_cost": 0.028895, - "sum_output_cost": 0.144475 + "avg_total_cost": 0.05788, + "sum_total_cost": 1.157602, + "run_count": 20, + "avg_input_cost": 0.039284, + "sum_input_cost": 0.78569, + "avg_output_cost": 0.018596, + "sum_output_cost": 0.371912 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "hard", - "avg_total_cost": 0.041016, - "sum_total_cost": 0.205078, - "run_count": 5, - "avg_input_cost": 0.019733, - "sum_input_cost": 0.098665, - "avg_output_cost": 0.021282, - "sum_output_cost": 0.106412 + "avg_total_cost": 0.023277, + "sum_total_cost": 0.465548, + "run_count": 20, + "avg_input_cost": 0.011096, + "sum_input_cost": 0.221922, + "avg_output_cost": 0.012181, + "sum_output_cost": 0.243625 }, { "Model": "bedrock:us.amazon.nova-premier-v1:0", "test_group": "medium", - "avg_total_cost": 0.027955, - "sum_total_cost": 0.139775, - "run_count": 5, - "avg_input_cost": 0.012258, - "sum_input_cost": 0.061288, - "avg_output_cost": 0.015697, - "sum_output_cost": 0.078487 + "avg_total_cost": 0.025766, + "sum_total_cost": 0.515325, + "run_count": 20, + "avg_input_cost": 0.012222, + "sum_input_cost": 0.24445, + "avg_output_cost": 0.013544, + "sum_output_cost": 0.270875 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", "test_group": "easy", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 0.00162, + "sum_total_cost": 0.032395, + "run_count": 20, + "avg_input_cost": 0.000864, + "sum_input_cost": 0.017285, + "avg_output_cost": 0.000756, + "sum_output_cost": 0.01511 }, { "Model": "bedrock:us.amazon.nova-pro-v1:0", "test_group": "hard", "avg_total_cost": 0.0, "sum_total_cost": 0.0, - "run_count": 5, + "run_count": 20, "avg_input_cost": 0.0, "sum_input_cost": 0.0, "avg_output_cost": 0.0, @@ -887,46 +949,46 @@ { "Model": "bedrock:us.amazon.nova-pro-v1:0", "test_group": "medium", - "avg_total_cost": 0.0, - "sum_total_cost": 0.0, - "run_count": 5, - "avg_input_cost": 0.0, - "sum_input_cost": 0.0, - "avg_output_cost": 0.0, - "sum_output_cost": 0.0 + "avg_total_cost": 0.000913, + "sum_total_cost": 0.018257, + "run_count": 20, + "avg_input_cost": 0.000462, + "sum_input_cost": 0.009233, + "avg_output_cost": 0.000451, + "sum_output_cost": 0.009024 }, { "Model": "gemini-2.0-flash", "test_group": "easy", - "avg_total_cost": 0.000277, - "sum_total_cost": 0.004153, - "run_count": 15, + "avg_total_cost": 0.000276, + "sum_total_cost": 0.005528, + "run_count": 20, "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.001104, + "sum_input_cost": 0.001472, "avg_output_cost": 0.000203, - "sum_output_cost": 0.003049 + "sum_output_cost": 0.004056 }, { "Model": "gemini-2.0-flash", "test_group": "hard", "avg_total_cost": 0.000275, - "sum_total_cost": 0.004124, - "run_count": 15, + "sum_total_cost": 0.005498, + "run_count": 20, "avg_input_cost": 7.4e-5, - "sum_input_cost": 0.001104, + "sum_input_cost": 0.001472, "avg_output_cost": 0.000201, - "sum_output_cost": 0.00302 + "sum_output_cost": 0.004026 }, { "Model": "gemini-2.0-flash", "test_group": "medium", - "avg_total_cost": 0.000332, - "sum_total_cost": 0.004978, - "run_count": 15, - "avg_input_cost": 0.0001, - "sum_input_cost": 0.001501, - "avg_output_cost": 0.000232, - "sum_output_cost": 0.003477 + "avg_total_cost": 0.000318, + "sum_total_cost": 0.006351, + "run_count": 20, + "avg_input_cost": 9.3e-5, + "sum_input_cost": 0.001869, + "avg_output_cost": 0.000224, + "sum_output_cost": 0.004481 }, { "Model": "gemini-2.5-flash", @@ -962,37 +1024,70 @@ "sum_output_cost": 0.094236 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "test_group": "easy", - "avg_total_cost": 0.001072, - "sum_total_cost": 0.016087, + "avg_total_cost": 0.001042, + "sum_total_cost": 0.015635, "run_count": 15, - "avg_input_cost": 0.000617, - "sum_input_cost": 0.009257, - "avg_output_cost": 0.000455, - "sum_output_cost": 0.006829 + "avg_input_cost": 0.000631, + "sum_input_cost": 0.009469, + "avg_output_cost": 0.000411, + "sum_output_cost": 0.006166 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "test_group": "hard", - "avg_total_cost": 0.000699, - "sum_total_cost": 0.010481, + "avg_total_cost": 0.000751, + "sum_total_cost": 0.011262, + "run_count": 15, + "avg_input_cost": 0.000309, + "sum_input_cost": 0.004637, + "avg_output_cost": 0.000442, + "sum_output_cost": 0.006625 + }, + { + "Model": "gemini-2.5-flash-lite", + "test_group": "medium", + "avg_total_cost": 0.000853, + "sum_total_cost": 0.012801, "run_count": 15, + "avg_input_cost": 0.000412, + "sum_input_cost": 0.006178, + "avg_output_cost": 0.000442, + "sum_output_cost": 0.006623 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "test_group": "easy", + "avg_total_cost": 0.001001, + "sum_total_cost": 0.020021, + "run_count": 20, + "avg_input_cost": 0.00054, + "sum_input_cost": 0.010803, + "avg_output_cost": 0.000461, + "sum_output_cost": 0.009218 + }, + { + "Model": "gemini-2.5-flash-lite-preview-06-17", + "test_group": "hard", + "avg_total_cost": 0.000712, + "sum_total_cost": 0.014249, + "run_count": 20, "avg_input_cost": 0.000311, - "sum_input_cost": 0.004664, - "avg_output_cost": 0.000388, - "sum_output_cost": 0.005817 + "sum_input_cost": 0.006223, + "avg_output_cost": 0.000401, + "sum_output_cost": 0.008026 }, { "Model": "gemini-2.5-flash-lite-preview-06-17", "test_group": "medium", - "avg_total_cost": 0.000747, - "sum_total_cost": 0.011208, - "run_count": 15, - "avg_input_cost": 0.000326, - "sum_input_cost": 0.004887, - "avg_output_cost": 0.000421, - "sum_output_cost": 0.00632 + "avg_total_cost": 0.000736, + "sum_total_cost": 0.014729, + "run_count": 20, + "avg_input_cost": 0.000322, + "sum_input_cost": 0.006438, + "avg_output_cost": 0.000415, + "sum_output_cost": 0.008291 }, { "Model": "gemini-2.5-flash-preview-04-17", @@ -1030,134 +1125,134 @@ { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "easy", - "avg_total_cost": 0.009252, - "sum_total_cost": 0.138786, - "run_count": 15, - "avg_input_cost": 0.000302, - "sum_input_cost": 0.004528, - "avg_output_cost": 0.008951, - "sum_output_cost": 0.134258 + "avg_total_cost": 0.011867, + "sum_total_cost": 0.237336, + "run_count": 20, + "avg_input_cost": 0.0003, + "sum_input_cost": 0.005997, + "avg_output_cost": 0.011567, + "sum_output_cost": 0.231338 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "hard", - "avg_total_cost": 0.005456, - "sum_total_cost": 0.081834, - "run_count": 15, - "avg_input_cost": 0.000257, - "sum_input_cost": 0.003852, - "avg_output_cost": 0.005199, - "sum_output_cost": 0.077982 + "avg_total_cost": 0.007053, + "sum_total_cost": 0.141066, + "run_count": 20, + "avg_input_cost": 0.00026, + "sum_input_cost": 0.0052, + "avg_output_cost": 0.006793, + "sum_output_cost": 0.135866 }, { "Model": "gemini-2.5-flash-preview-05-20", "test_group": "medium", - "avg_total_cost": 0.00915, - "sum_total_cost": 0.137255, - "run_count": 15, - "avg_input_cost": 0.00051, - "sum_input_cost": 0.007647, - "avg_output_cost": 0.008641, - "sum_output_cost": 0.129608 + "avg_total_cost": 0.011282, + "sum_total_cost": 0.225649, + "run_count": 20, + "avg_input_cost": 0.000528, + "sum_input_cost": 0.010559, + "avg_output_cost": 0.010754, + "sum_output_cost": 0.215089 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "easy", - "avg_total_cost": 0.336117, - "sum_total_cost": 4.369515, - "run_count": 13, - "avg_input_cost": 0.017935, - "sum_input_cost": 0.233155, - "avg_output_cost": 0.318182, - "sum_output_cost": 4.13636 + "avg_total_cost": 0.28019, + "sum_total_cost": 4.483036, + "run_count": 16, + "avg_input_cost": 0.0153, + "sum_input_cost": 0.244806, + "avg_output_cost": 0.264889, + "sum_output_cost": 4.23823 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "hard", - "avg_total_cost": 0.030363, - "sum_total_cost": 0.394716, - "run_count": 13, - "avg_input_cost": 0.002832, - "sum_input_cost": 0.036816, - "avg_output_cost": 0.027531, - "sum_output_cost": 0.3579 + "avg_total_cost": 0.035003, + "sum_total_cost": 0.560053, + "run_count": 16, + "avg_input_cost": 0.003319, + "sum_input_cost": 0.053102, + "avg_output_cost": 0.031684, + "sum_output_cost": 0.50695 }, { "Model": "gemini-2.5-pro-preview-03-25", "test_group": "medium", - "avg_total_cost": 0.021683, - "sum_total_cost": 0.281882, - "run_count": 13, - "avg_input_cost": 0.002557, - "sum_input_cost": 0.033242, - "avg_output_cost": 0.019126, - "sum_output_cost": 0.24864 + "avg_total_cost": 0.024591, + "sum_total_cost": 0.393455, + "run_count": 16, + "avg_input_cost": 0.003097, + "sum_input_cost": 0.049555, + "avg_output_cost": 0.021494, + "sum_output_cost": 0.3439 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "easy", - "avg_total_cost": 0.261259, - "sum_total_cost": 2.87385, - "run_count": 11, - "avg_input_cost": 0.022235, - "sum_input_cost": 0.24458, - "avg_output_cost": 0.239025, - "sum_output_cost": 2.62927 + "avg_total_cost": 0.200351, + "sum_total_cost": 3.005265, + "run_count": 15, + "avg_input_cost": 0.017341, + "sum_input_cost": 0.260115, + "avg_output_cost": 0.18301, + "sum_output_cost": 2.74515 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "hard", - "avg_total_cost": 0.018537, - "sum_total_cost": 0.203902, - "run_count": 11, - "avg_input_cost": 0.002118, - "sum_input_cost": 0.023302, - "avg_output_cost": 0.016418, - "sum_output_cost": 0.1806 + "avg_total_cost": 0.020663, + "sum_total_cost": 0.309948, + "run_count": 15, + "avg_input_cost": 0.002589, + "sum_input_cost": 0.038838, + "avg_output_cost": 0.018074, + "sum_output_cost": 0.27111 }, { "Model": "gemini-2.5-pro-preview-05-06", "test_group": "medium", - "avg_total_cost": 0.220536, - "sum_total_cost": 2.4259, - "run_count": 11, - "avg_input_cost": 0.010436, - "sum_input_cost": 0.1148, - "avg_output_cost": 0.2101, - "sum_output_cost": 2.3111 + "avg_total_cost": 0.171269, + "sum_total_cost": 2.569035, + "run_count": 15, + "avg_input_cost": 0.008689, + "sum_input_cost": 0.130335, + "avg_output_cost": 0.16258, + "sum_output_cost": 2.4387 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "easy", - "avg_total_cost": 0.037213, - "sum_total_cost": 0.483772, - "run_count": 13, - "avg_input_cost": 0.009262, - "sum_input_cost": 0.120402, - "avg_output_cost": 0.027952, - "sum_output_cost": 0.36337 + "avg_total_cost": 0.034554, + "sum_total_cost": 0.587418, + "run_count": 17, + "avg_input_cost": 0.007996, + "sum_input_cost": 0.135938, + "avg_output_cost": 0.026558, + "sum_output_cost": 0.45148 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "hard", - "avg_total_cost": 0.043285, - "sum_total_cost": 0.562708, - "run_count": 13, - "avg_input_cost": 0.005718, - "sum_input_cost": 0.074328, - "avg_output_cost": 0.037568, - "sum_output_cost": 0.48838 + "avg_total_cost": 0.041507, + "sum_total_cost": 0.705622, + "run_count": 17, + "avg_input_cost": 0.005286, + "sum_input_cost": 0.089862, + "avg_output_cost": 0.036221, + "sum_output_cost": 0.61576 }, { "Model": "gemini-2.5-pro-preview-06-05", "test_group": "medium", - "avg_total_cost": 0.041005, - "sum_total_cost": 0.533062, - "run_count": 13, - "avg_input_cost": 0.005436, - "sum_input_cost": 0.070662, - "avg_output_cost": 0.035569, - "sum_output_cost": 0.4624 + "avg_total_cost": 0.038826, + "sum_total_cost": 0.660038, + "run_count": 17, + "avg_input_cost": 0.00507, + "sum_input_cost": 0.086198, + "avg_output_cost": 0.033755, + "sum_output_cost": 0.57384 } ], "raw_data": [ @@ -2122,103 +2217,343 @@ "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 60.637950634, + "Duration": 2.905133416, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 76294.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 15093.0, - "Metric_response_tokens": 2470.0, - "total_cost": 0.20930445, - "input_cost": 0.00226395, - "output_cost": 0.20704050000000002 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 31.479987348, + "Duration": 3.155831434, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 40567.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, "provider": "Google", - "Metric_request_tokens": 8696.0, - "Metric_response_tokens": 1852.0, - "total_cost": 0.10748210000000001, - "input_cost": 0.0013044, - "output_cost": 0.10617770000000001 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.009089436, + "Duration": 3.289127811, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1730.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0018219, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0017004 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.092670181, - "Score_MermaidDiagramValid": 1.0, + "Duration": 2.969590916, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 72731.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 7623.0, - "Metric_response_tokens": 1833.0, - "total_cost": 0.22370575, - "input_cost": 0.0011434499999999998, - "output_cost": 0.22256230000000002 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.495790935, + "Duration": 3.007718604, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4759.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00946455, - "input_cost": 0.00025035, - "output_cost": 0.0092142 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.96490713, + "Duration": 3.411889755, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5661.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3007.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.0063383499999999995, - "input_cost": 0.00045105, - "output_cost": 0.0058873 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.978789606, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.024806005, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.377116743, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.999520941, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.034895574, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.436441263, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 2.963080948, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.033675592, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.437838664, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.637950634, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 76294.0, + "provider": "Google", + "Metric_request_tokens": 15093.0, + "Metric_response_tokens": 2470.0, + "total_cost": 0.20930445, + "input_cost": 0.00226395, + "output_cost": 0.20704050000000002 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 31.479987348, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 40567.0, + "provider": "Google", + "Metric_request_tokens": 8696.0, + "Metric_response_tokens": 1852.0, + "total_cost": 0.10748210000000001, + "input_cost": 0.0013044, + "output_cost": 0.10617770000000001 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.009089436, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1730.0, + "provider": "Google", + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0018219, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0017004 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.092670181, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 72731.0, + "provider": "Google", + "Metric_request_tokens": 7623.0, + "Metric_response_tokens": 1833.0, + "total_cost": 0.22370575, + "input_cost": 0.0011434499999999998, + "output_cost": 0.22256230000000002 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.495790935, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4759.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00946455, + "input_cost": 0.00025035, + "output_cost": 0.0092142 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.96490713, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5661.0, + "provider": "Google", + "Metric_request_tokens": 3007.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.0063383499999999995, + "input_cost": 0.00045105, + "output_cost": 0.0058873 + }, + { + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", "Duration": 54.049785353, @@ -3322,42 +3657,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 70.761430334, + "Duration": 141.168154989, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 68346.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 64861.0, + "Metric_response_tokens": 3485.0, + "total_cost": 0.205715, + "input_cost": 0.1621525, + "output_cost": 0.0435625 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 50.064655716, + "Duration": 66.419366072, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9777.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 7990.0, + "Metric_response_tokens": 1787.0, + "total_cost": 0.0423125, + "input_cost": 0.019975, + "output_cost": 0.0223375 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 54.286773631, + "Duration": 61.750421529, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3370,74 +3705,122 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 52.848939949, + "Duration": 60.774519728, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6479.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 4710.0, + "Metric_response_tokens": 1769.0, + "total_cost": 0.0338875, + "input_cost": 0.011774999999999999, + "output_cost": 0.0221125 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.4068963, + "Duration": 61.257517708, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 14585.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 11771.0, + "Metric_response_tokens": 2814.0, + "total_cost": 0.06460250000000001, + "input_cost": 0.029427500000000002, + "output_cost": 0.035175 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.579682246, + "Duration": 168.374540939, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22046.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 18606.0, + "Metric_response_tokens": 3440.0, + "total_cost": 0.089515, + "input_cost": 0.046515, + "output_cost": 0.043 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 50.490538016, + "Duration": 81.523331201, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63619.0, + "provider": "Amazon", + "Metric_request_tokens": 60858.0, + "Metric_response_tokens": 2761.0, + "total_cost": 0.1866575, + "input_cost": 0.152145, + "output_cost": 0.0345125 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 47.967601991, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6432.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 4754.0, + "Metric_response_tokens": 1678.0, + "total_cost": 0.03286, + "input_cost": 0.011885000000000001, + "output_cost": 0.020975 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 72.870020591, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19765.0, + "provider": "Amazon", + "Metric_request_tokens": 16317.0, + "Metric_response_tokens": 3448.0, + "total_cost": 0.08389250000000001, + "input_cost": 0.0407925, + "output_cost": 0.0431 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.794925723, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10125.0, + "provider": "Amazon", + "Metric_request_tokens": 8113.0, + "Metric_response_tokens": 2012.0, + "total_cost": 0.0454325, + "input_cost": 0.020282500000000002, + "output_cost": 0.02515 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 49.381506192, + "Duration": 102.690218942, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3450,42 +3833,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.014168316, + "Duration": 27.695561409, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6168.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 4543.0, + "Metric_response_tokens": 1625.0, + "total_cost": 0.031670000000000004, + "input_cost": 0.0113575, + "output_cost": 0.0203125 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.619240014, + "Duration": 22.951197833, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6002.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 4471.0, + "Metric_response_tokens": 1531.0, + "total_cost": 0.030315, + "input_cost": 0.0111775, + "output_cost": 0.0191375 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 46.101775188, + "Duration": 89.757139435, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3498,10 +3881,10 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.682643889, + "Duration": 116.956568909, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3514,42 +3897,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.294165416, + "Duration": 61.788091838, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8566.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 6763.0, + "Metric_response_tokens": 1803.0, + "total_cost": 0.039444999999999994, + "input_cost": 0.0169075, + "output_cost": 0.0225375 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 52.706066252, + "Duration": 139.579026705, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 28485.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 24611.0, + "Metric_response_tokens": 3874.0, + "total_cost": 0.1099525, + "input_cost": 0.0615275, + "output_cost": 0.048424999999999996 }, { - "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 53.323483575, + "Duration": 102.15980072, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3562,42 +3945,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.586487666, + "Duration": 116.141871727, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 19949.0, "provider": "Amazon", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 16646.0, + "Metric_response_tokens": 3303.0, + "total_cost": 0.08290249999999999, + "input_cost": 0.041615, + "output_cost": 0.0412875 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.551409036, + "Duration": 59.31011804, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6590.0, "provider": "Amazon", - "Metric_request_tokens": 4492.0, - "Metric_response_tokens": 746.0, - "total_cost": 0.00044856000000000003, - "input_cost": 0.00026952000000000004, - "output_cost": 0.00017904 + "Metric_request_tokens": 4868.0, + "Metric_response_tokens": 1722.0, + "total_cost": 0.033695, + "input_cost": 0.01217, + "output_cost": 0.021525 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.607065725, + "Duration": 153.57670155, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3610,26 +3993,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.907358597, + "Duration": 52.652731978, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5408.0, + "total_tokens": 6372.0, "provider": "Amazon", - "Metric_request_tokens": 4043.0, - "Metric_response_tokens": 1365.0, - "total_cost": 0.0005701799999999999, - "input_cost": 0.00024257999999999997, - "output_cost": 0.00032759999999999994 + "Metric_request_tokens": 4841.0, + "Metric_response_tokens": 1531.0, + "total_cost": 0.03124, + "input_cost": 0.012102499999999999, + "output_cost": 0.0191375 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.254325135, + "Duration": 124.215302042, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 22787.0, + "provider": "Amazon", + "Metric_request_tokens": 19271.0, + "Metric_response_tokens": 3516.0, + "total_cost": 0.0921275, + "input_cost": 0.0481775, + "output_cost": 0.04395 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 152.610399933, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3642,10 +4041,10 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", - "Case": "fix_invalid_diagram_hard", - "test_group": "hard", - "Duration": 34.303524713, + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 122.520721486, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3658,10 +4057,42 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 130.053796609, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 85.873660202, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12301.0, + "provider": "Amazon", + "Metric_request_tokens": 9837.0, + "Metric_response_tokens": 2464.0, + "total_cost": 0.0553925, + "input_cost": 0.0245925, + "output_cost": 0.0308 + }, + { + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 38.439967155, + "Duration": 122.249576234, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -3674,28 +4105,28 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.821898825, + "Duration": 1.075055246, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9263.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 7867.0, - "Metric_response_tokens": 1396.0, - "total_cost": 0.0008070600000000001, - "input_cost": 0.00047202, - "output_cost": 0.00033504 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.995154891, + "Duration": 1.21642044, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3706,12 +4137,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 69.995588077, + "Duration": 1.060936443, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3722,28 +4153,28 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.927372481, + "Duration": 1.037051732, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5372.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 4550.0, - "Metric_response_tokens": 822.0, - "total_cost": 0.00047028, - "input_cost": 0.000273, - "output_cost": 0.00019728 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.900333046, + "Duration": 1.056416859, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3754,12 +4185,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 28.391986261, + "Duration": 1.045000565, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3770,28 +4201,28 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.633113428, + "Duration": 1.174768992, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3619.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 3397.0, - "Metric_response_tokens": 222.0, - "total_cost": 0.00025709999999999996, - "input_cost": 0.00020381999999999998, - "output_cost": 5.328e-5 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.857710005, + "Duration": 1.06139308, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3802,44 +4233,44 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.235511866, + "Duration": 1.096773894, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4178.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 3433.0, - "Metric_response_tokens": 745.0, - "total_cost": 0.00022445500000000003, - "input_cost": 0.000120155, - "output_cost": 0.00010430000000000001 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.032302433, + "Duration": 1.065730453, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4888.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 3535.0, - "Metric_response_tokens": 1353.0, - "total_cost": 0.000313145, - "input_cost": 0.000123725, - "output_cost": 0.00018942 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 16.138580876, + "Duration": 1.034951182, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3850,28 +4281,28 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.894268114, + "Duration": 1.170030852, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7935.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 6523.0, - "Metric_response_tokens": 1412.0, - "total_cost": 0.00042598500000000004, - "input_cost": 0.00022830500000000002, - "output_cost": 0.00019768000000000002 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 15.982702523, + "Duration": 1.082285844, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3882,12 +4313,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 22.058090836, + "Duration": 1.03252349, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3898,12 +4329,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 17.691408682, + "Duration": 1.038807845, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3914,12 +4345,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.664497105, + "Duration": 1.059020363, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3930,12 +4361,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-premier-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.96936611, + "Duration": 1.159144096, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3946,12 +4377,12 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.827149441, + "Duration": 70.761430334, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, + "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3962,28 +4393,28 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 13.347354317, + "Duration": 50.064655716, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9162.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Amazon", - "Metric_request_tokens": 7157.0, - "Metric_response_tokens": 2005.0, - "total_cost": 0.000531195, - "input_cost": 0.00025049500000000005, - "output_cost": 0.0002807 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.111033105, + "Duration": 54.286773631, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, "total_tokens": 0.0, "provider": "Amazon", @@ -3994,10 +4425,10 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.272915809, + "Duration": 52.848939949, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -4010,10 +4441,10 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.779003058, + "Duration": 52.4068963, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -4026,10 +4457,10 @@ "output_cost": 0.0 }, { - "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Model": "bedrock:us.amazon.nova-pro-v1:0", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 19.910701039, + "Duration": 50.579682246, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, @@ -4042,3460 +4473,7972 @@ "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.490538016, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.381506192, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.014168316, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.619240014, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 46.101775188, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.682643889, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.294165416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.706066252, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.323483575, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 70.761430334, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.064655716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.286773631, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.848939949, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.4068963, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.579682246, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.490538016, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.381506192, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.014168316, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.619240014, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 46.101775188, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.682643889, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 19.294165416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 52.706066252, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.323483575, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 60.490865526, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.38614935, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7391.0, + "provider": "Amazon", + "Metric_request_tokens": 6014.0, + "Metric_response_tokens": 1377.0, + "total_cost": 0.009217600000000001, + "input_cost": 0.004811200000000001, + "output_cost": 0.0044064 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 19.587720037, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 35.779726991, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.898700324, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.120025907, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.342244267, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 51.294958536, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.891190365, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 47.666868754, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18401.0, + "provider": "Amazon", + "Metric_request_tokens": 15769.0, + "Metric_response_tokens": 2632.0, + "total_cost": 0.0210376, + "input_cost": 0.0126152, + "output_cost": 0.0084224 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 63.490425569, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 63.600602322, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 50.937322607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 21.340557151, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.150935132, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.966453918, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 50.698133211, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 54.007505821, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 65.783296981, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 63.681787497, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 64.71506337, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 52.909417244, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.945646339, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6970.0, + "provider": "Amazon", + "Metric_request_tokens": 5527.0, + "Metric_response_tokens": 1443.0, + "total_cost": 0.0090392, + "input_cost": 0.0044216, + "output_cost": 0.0046176 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 52.262290885, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 34.392486572, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7927.0, + "provider": "Amazon", + "Metric_request_tokens": 5837.0, + "Metric_response_tokens": 2090.0, + "total_cost": 0.011357599999999999, + "input_cost": 0.0046696, + "output_cost": 0.0066879999999999995 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 51.782173036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.199029621, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 56.303173032, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 45.050576092, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-pro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 53.036889105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.586487666, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.551409036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5238.0, + "provider": "Amazon", + "Metric_request_tokens": 4492.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.00044856000000000003, + "input_cost": 0.00026952000000000004, + "output_cost": 0.00017904 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.607065725, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 13.907358597, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5408.0, + "provider": "Amazon", + "Metric_request_tokens": 4043.0, + "Metric_response_tokens": 1365.0, + "total_cost": 0.0005701799999999999, + "input_cost": 0.00024257999999999997, + "output_cost": 0.00032759999999999994 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.254325135, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 34.303524713, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 38.439967155, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 14.821898825, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9263.0, + "provider": "Amazon", + "Metric_request_tokens": 7867.0, + "Metric_response_tokens": 1396.0, + "total_cost": 0.0008070600000000001, + "input_cost": 0.00047202, + "output_cost": 0.00033504 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 18.995154891, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 69.995588077, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.927372481, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5372.0, + "provider": "Amazon", + "Metric_request_tokens": 4550.0, + "Metric_response_tokens": 822.0, + "total_cost": 0.00047028, + "input_cost": 0.000273, + "output_cost": 0.00019728 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.900333046, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 28.391986261, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.633113428, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3619.0, + "provider": "Amazon", + "Metric_request_tokens": 3397.0, + "Metric_response_tokens": 222.0, + "total_cost": 0.00025709999999999996, + "input_cost": 0.00020381999999999998, + "output_cost": 5.328e-5 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.857710005, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.586487666, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.551409036, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5238.0, + "provider": "Amazon", + "Metric_request_tokens": 4492.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.00044856000000000003, + "input_cost": 0.00026952000000000004, + "output_cost": 0.00017904 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.607065725, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 13.907358597, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5408.0, + "provider": "Amazon", + "Metric_request_tokens": 4043.0, + "Metric_response_tokens": 1365.0, + "total_cost": 0.0005701799999999999, + "input_cost": 0.00024257999999999997, + "output_cost": 0.00032759999999999994 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.254325135, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 34.303524713, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 38.439967155, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 14.821898825, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9263.0, + "provider": "Amazon", + "Metric_request_tokens": 7867.0, + "Metric_response_tokens": 1396.0, + "total_cost": 0.0008070600000000001, + "input_cost": 0.00047202, + "output_cost": 0.00033504 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 18.995154891, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 69.995588077, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 9.927372481, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5372.0, + "provider": "Amazon", + "Metric_request_tokens": 4550.0, + "Metric_response_tokens": 822.0, + "total_cost": 0.00047028, + "input_cost": 0.000273, + "output_cost": 0.00019728 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.900333046, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 28.391986261, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 5.633113428, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3619.0, + "provider": "Amazon", + "Metric_request_tokens": 3397.0, + "Metric_response_tokens": 222.0, + "total_cost": 0.00025709999999999996, + "input_cost": 0.00020381999999999998, + "output_cost": 5.328e-5 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.857710005, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.13120079, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4093.0, + "provider": "Amazon", + "Metric_request_tokens": 3436.0, + "Metric_response_tokens": 657.0, + "total_cost": 0.00036384, + "input_cost": 0.00020616, + "output_cost": 0.00015768 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 10.202050303, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5241.0, + "provider": "Amazon", + "Metric_request_tokens": 4466.0, + "Metric_response_tokens": 775.0, + "total_cost": 0.00045396, + "input_cost": 0.00026796000000000003, + "output_cost": 0.000186 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 12.523508567, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4560.0, + "provider": "Amazon", + "Metric_request_tokens": 3455.0, + "Metric_response_tokens": 1105.0, + "total_cost": 0.0004725, + "input_cost": 0.0002073, + "output_cost": 0.0002652 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.301909151, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.07952375, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9166.0, + "provider": "Amazon", + "Metric_request_tokens": 7880.0, + "Metric_response_tokens": 1286.0, + "total_cost": 0.00078144, + "input_cost": 0.0004728, + "output_cost": 0.00030864 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 20.832885038, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10426.0, + "provider": "Amazon", + "Metric_request_tokens": 8538.0, + "Metric_response_tokens": 1888.0, + "total_cost": 0.0009653999999999999, + "input_cost": 0.00051228, + "output_cost": 0.00045311999999999994 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.103307419, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5207.0, + "provider": "Amazon", + "Metric_request_tokens": 4458.0, + "Metric_response_tokens": 749.0, + "total_cost": 0.00044724, + "input_cost": 0.00026748, + "output_cost": 0.00017976 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.416841941, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7739.0, + "provider": "Amazon", + "Metric_request_tokens": 6387.0, + "Metric_response_tokens": 1352.0, + "total_cost": 0.0007076999999999999, + "input_cost": 0.00038322, + "output_cost": 0.00032448 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.051268291, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 23.906853847, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 19298.0, + "provider": "Amazon", + "Metric_request_tokens": 17687.0, + "Metric_response_tokens": 1611.0, + "total_cost": 0.00144786, + "input_cost": 0.00106122, + "output_cost": 0.00038664 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 14.752534117, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 6073.0, + "provider": "Amazon", + "Metric_request_tokens": 4698.0, + "Metric_response_tokens": 1375.0, + "total_cost": 0.00061188, + "input_cost": 0.00028188, + "output_cost": 0.00032999999999999994 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 58.670944361, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 9.086971754, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4078.0, + "provider": "Amazon", + "Metric_request_tokens": 3427.0, + "Metric_response_tokens": 651.0, + "total_cost": 0.00036186, + "input_cost": 0.00020562, + "output_cost": 0.00015623999999999998 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 35.618876054, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 20.22773949, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 9.717342214, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4188.0, + "provider": "Amazon", + "Metric_request_tokens": 3424.0, + "Metric_response_tokens": 764.0, + "total_cost": 0.0003888, + "input_cost": 0.00020543999999999998, + "output_cost": 0.00018336 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 36.005522453, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 37.475340611, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 15.113064255, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5962.0, + "provider": "Amazon", + "Metric_request_tokens": 4515.0, + "Metric_response_tokens": 1447.0, + "total_cost": 0.00061818, + "input_cost": 0.00027089999999999997, + "output_cost": 0.00034728 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 17.63608912, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 10.620195375, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5359.0, + "provider": "Amazon", + "Metric_request_tokens": 4536.0, + "Metric_response_tokens": 823.0, + "total_cost": 0.00046968, + "input_cost": 0.00027215999999999997, + "output_cost": 0.00019752 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 57.336604821, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.836315522, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.83794607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3553.0, + "provider": "Amazon", + "Metric_request_tokens": 3358.0, + "Metric_response_tokens": 195.0, + "total_cost": 0.00024828, + "input_cost": 0.00020147999999999998, + "output_cost": 4.68e-5 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.75772521, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 10.1295691, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5291.0, + "provider": "Amazon", + "Metric_request_tokens": 4491.0, + "Metric_response_tokens": 800.0, + "total_cost": 0.00046146, + "input_cost": 0.00026946, + "output_cost": 0.000192 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 9.798962699, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4239.0, + "provider": "Amazon", + "Metric_request_tokens": 3490.0, + "Metric_response_tokens": 749.0, + "total_cost": 0.00038916, + "input_cost": 0.0002094, + "output_cost": 0.00017976 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 35.330206914, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 57.658972199, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 13.06782325, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5686.0, + "provider": "Amazon", + "Metric_request_tokens": 4477.0, + "Metric_response_tokens": 1209.0, + "total_cost": 0.00055878, + "input_cost": 0.00026862, + "output_cost": 0.00029016 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.235511866, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4178.0, + "provider": "Amazon", + "Metric_request_tokens": 3433.0, + "Metric_response_tokens": 745.0, + "total_cost": 0.00022445500000000003, + "input_cost": 0.000120155, + "output_cost": 0.00010430000000000001 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 10.032302433, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4888.0, + "provider": "Amazon", + "Metric_request_tokens": 3535.0, + "Metric_response_tokens": 1353.0, + "total_cost": 0.000313145, + "input_cost": 0.000123725, + "output_cost": 0.00018942 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 16.138580876, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.894268114, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7935.0, + "provider": "Amazon", + "Metric_request_tokens": 6523.0, + "Metric_response_tokens": 1412.0, + "total_cost": 0.00042598500000000004, + "input_cost": 0.00022830500000000002, + "output_cost": 0.00019768000000000002 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.982702523, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 22.058090836, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 17.691408682, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.664497105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.96936611, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 25.827149441, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 13.347354317, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9162.0, + "provider": "Amazon", + "Metric_request_tokens": 7157.0, + "Metric_response_tokens": 2005.0, + "total_cost": 0.000531195, + "input_cost": 0.00025049500000000005, + "output_cost": 0.0002807 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.111033105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.272915809, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.779003058, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 19.910701039, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.235511866, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4178.0, + "provider": "Amazon", + "Metric_request_tokens": 3433.0, + "Metric_response_tokens": 745.0, + "total_cost": 0.00022445500000000003, + "input_cost": 0.000120155, + "output_cost": 0.00010430000000000001 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 10.032302433, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4888.0, + "provider": "Amazon", + "Metric_request_tokens": 3535.0, + "Metric_response_tokens": 1353.0, + "total_cost": 0.000313145, + "input_cost": 0.000123725, + "output_cost": 0.00018942 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 16.138580876, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.894268114, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7935.0, + "provider": "Amazon", + "Metric_request_tokens": 6523.0, + "Metric_response_tokens": 1412.0, + "total_cost": 0.00042598500000000004, + "input_cost": 0.00022830500000000002, + "output_cost": 0.00019768000000000002 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.982702523, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 22.058090836, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 17.691408682, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.664497105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.96936611, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 25.827149441, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 13.347354317, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9162.0, + "provider": "Amazon", + "Metric_request_tokens": 7157.0, + "Metric_response_tokens": 2005.0, + "total_cost": 0.000531195, + "input_cost": 0.00025049500000000005, + "output_cost": 0.0002807 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.111033105, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.272915809, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.779003058, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 19.910701039, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 10.709128752, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7655.0, + "provider": "Amazon", + "Metric_request_tokens": 6340.0, + "Metric_response_tokens": 1315.0, + "total_cost": 0.000406, + "input_cost": 0.00022190000000000003, + "output_cost": 0.0001841 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.1569874, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 16.292243277, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 25.459357714, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.24639963, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.127796199, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 7.515629417, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4196.0, + "provider": "Amazon", + "Metric_request_tokens": 3478.0, + "Metric_response_tokens": 718.0, + "total_cost": 0.00022225, + "input_cost": 0.00012173000000000002, + "output_cost": 0.00010052 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 22.272816724, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.155517751, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 21.478668781, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 16.68120615, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.854751274, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.11028082, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 13.019602175, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 18.122924172, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 13.539048311, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18474.0, + "provider": "Amazon", + "Metric_request_tokens": 17520.0, + "Metric_response_tokens": 954.0, + "total_cost": 0.0007467600000000001, + "input_cost": 0.0006132, + "output_cost": 0.00013356000000000002 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 25.096765367, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 15.169118995, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 9.241399938, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6107.0, + "provider": "Amazon", + "Metric_request_tokens": 5090.0, + "Metric_response_tokens": 1017.0, + "total_cost": 0.00032053, + "input_cost": 0.00017815000000000002, + "output_cost": 0.00014238 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.228884007, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.533683609, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 20.056999566, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 15.667786223, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 25.437752797, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 21.22790309, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 27.491055, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 17.986839409, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 29.143947476, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 27.419551466, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Amazon", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "bedrock:us.amazon.nova-micro-v1:0", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 13.586402193, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 18273.0, + "provider": "Amazon", + "Metric_request_tokens": 17305.0, + "Metric_response_tokens": 968.0, + "total_cost": 0.000741195, + "input_cost": 0.0006056750000000001, + "output_cost": 0.00013552 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 24.207496913, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5044.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.023253749999999997, + "input_cost": 0.00388375, + "output_cost": 0.01937 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.803203804, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5199.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02480375, + "input_cost": 0.00388375, + "output_cost": 0.02092 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 34.032753508, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6600.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03881375, + "input_cost": 0.00388375, + "output_cost": 0.03493 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 24.432732425, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5326.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.02607375, + "input_cost": 0.00388375, + "output_cost": 0.02219 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 27.33412975, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5806.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.03087375, + "input_cost": 0.00388375, + "output_cost": 0.02699 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 32.488361727, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5925.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03206375, + "input_cost": 0.00388375, + "output_cost": 0.028180000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.454849392, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5526.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02807375, + "input_cost": 0.00388375, + "output_cost": 0.024190000000000003 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 40.878031462, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10420.0, + "provider": "Google", + "Metric_request_tokens": 6847.0, + "Metric_response_tokens": 1815.0, + "total_cost": 0.04428875, + "input_cost": 0.00855875, + "output_cost": 0.03573 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 67.384048514, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 13422.0, + "provider": "Google", + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1785.0, + "total_cost": 0.07423875, + "input_cost": 0.00856875, + "output_cost": 0.06567 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.014404973, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5406.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.026873749999999995, + "input_cost": 0.00388375, + "output_cost": 0.022989999999999997 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.744050775, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5425.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.027063750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023180000000000003 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 39.712500277, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6614.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.03895375, + "input_cost": 0.00388375, + "output_cost": 0.035070000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 41.219766849, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7059.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.04340375, + "input_cost": 0.00388375, + "output_cost": 0.03952 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.833685412, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5205.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.024863749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02098 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.57386297, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5837.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.031183749999999996, + "input_cost": 0.00388375, + "output_cost": 0.027299999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.658760503, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 63389.0, + "provider": "Google", + "Metric_request_tokens": 58608.0, + "Metric_response_tokens": 2447.0, + "total_cost": 0.12107000000000001, + "input_cost": 0.07326, + "output_cost": 0.04781 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 32.435081791, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6158.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.03439375, + "input_cost": 0.00388375, + "output_cost": 0.03051 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 26.427531899, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5629.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.029103749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02522 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.541754196, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6253.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1164.0, + "total_cost": 0.03534375, + "input_cost": 0.00388375, + "output_cost": 0.03146 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 24.7685358, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5160.0, + "provider": "Google", + "Metric_request_tokens": 3101.0, + "Metric_response_tokens": 1172.0, + "total_cost": 0.024466250000000002, + "input_cost": 0.00387625, + "output_cost": 0.02059 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 25.723323767, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5428.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02709375, + "input_cost": 0.00388375, + "output_cost": 0.02321 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.441471015, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5382.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.026633749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02275 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.604628416, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5562.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.02843375, + "input_cost": 0.00388375, + "output_cost": 0.024550000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 50.415529759, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11548.0, + "provider": "Google", + "Metric_request_tokens": 6858.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.055472499999999994, + "input_cost": 0.0085725, + "output_cost": 0.0469 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.679703767, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5023.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02304375, + "input_cost": 0.00388375, + "output_cost": 0.01916 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 25.725383571, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5543.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.028243749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02436 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 36.866564448, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1133.0, + "total_cost": 0.03508375, + "input_cost": 0.00388375, + "output_cost": 0.0312 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.45911738, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5824.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.031053749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02717 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 37.005075571, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6464.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.03745375, + "input_cost": 0.00388375, + "output_cost": 0.03357 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 40.203596393, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6719.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.04000375, + "input_cost": 0.00388375, + "output_cost": 0.03612 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 32.600817879, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5693.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.02974375, + "input_cost": 0.00388375, + "output_cost": 0.02586 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.96250988, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5849.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1190.0, + "total_cost": 0.03130375, + "input_cost": 0.00388375, + "output_cost": 0.02742 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 40.243743396, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6388.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.03669375, + "input_cost": 0.00388375, + "output_cost": 0.03281 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 18.620513207, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4699.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.019803750000000002, + "input_cost": 0.00388375, + "output_cost": 0.01592 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 31.869262663, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5716.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.02997375, + "input_cost": 0.00388375, + "output_cost": 0.026090000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-06-05", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 31.277196563, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5832.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.03113375, + "input_cost": 0.00388375, + "output_cost": 0.02725 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 22.205452715, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4960.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.022413750000000003, + "input_cost": 0.00388375, + "output_cost": 0.01853 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 45.254514377, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7035.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.04316375, + "input_cost": 0.00388375, + "output_cost": 0.03928 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 25.540973715, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5321.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.02602375, + "input_cost": 0.00388375, + "output_cost": 0.02214 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 29.203818188, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5909.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.031903749999999995, + "input_cost": 0.00388375, + "output_cost": 0.028019999999999996 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 41.27588133, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10620.0, + "provider": "Google", + "Metric_request_tokens": 6855.0, + "Metric_response_tokens": 1805.0, + "total_cost": 0.04621875, + "input_cost": 0.00856875, + "output_cost": 0.03765 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 68.89242445, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9389.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.06670375, + "input_cost": 0.00388375, + "output_cost": 0.06282 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 34.105280135, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6279.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1280.0, + "total_cost": 0.035603749999999997, + "input_cost": 0.00388375, + "output_cost": 0.03172 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 120.860207911, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 13787.0, + "provider": "Google", + "Metric_request_tokens": 7717.0, + "Metric_response_tokens": 1789.0, + "total_cost": 0.07034625, + "input_cost": 0.00964625, + "output_cost": 0.0607 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.936865091, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5784.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03065375, + "input_cost": 0.00388375, + "output_cost": 0.026770000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 27.396380686, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5674.0, + "provider": "Google", + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.029562500000000002, + "input_cost": 0.0038824999999999997, + "output_cost": 0.02568 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.581064657, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5037.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.023183750000000003, + "input_cost": 0.00388375, + "output_cost": 0.0193 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 29.262028799, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5839.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1150.0, + "total_cost": 0.031203750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02732 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.601437466, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8093.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1193.0, + "total_cost": 0.05374375, + "input_cost": 0.00388375, + "output_cost": 0.04986 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.554440753, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5308.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02589375, + "input_cost": 0.00388375, + "output_cost": 0.02201 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.896141303, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.025083750000000002, + "input_cost": 0.00388375, + "output_cost": 0.0212 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 46.506492861, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10985.0, + "provider": "Google", + "Metric_request_tokens": 6854.0, + "Metric_response_tokens": 1829.0, + "total_cost": 0.0498775, + "input_cost": 0.0085675, + "output_cost": 0.04131 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 31.379013607, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5869.0, + "provider": "Google", + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.03153, + "input_cost": 0.0038799999999999998, + "output_cost": 0.02765 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.397718403, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5142.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1136.0, + "total_cost": 0.02423375, + "input_cost": 0.00388375, + "output_cost": 0.02035 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 31.349962177, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6014.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.03295375, + "input_cost": 0.00388375, + "output_cost": 0.02907 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 39.356080363, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6782.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1166.0, + "total_cost": 0.040633749999999996, + "input_cost": 0.00388375, + "output_cost": 0.03675 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 26.725778378, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5165.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1195.0, + "total_cost": 0.02446375, + "input_cost": 0.00388375, + "output_cost": 0.02058 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 41.986268087, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6768.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1183.0, + "total_cost": 0.04049375, + "input_cost": 0.00388375, + "output_cost": 0.036610000000000004 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 29.098656784, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5771.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.030523750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02664 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.371314172, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5229.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.02510375, + "input_cost": 0.00388375, + "output_cost": 0.021220000000000003 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 23.463013826, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4976.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.022573749999999997, + "input_cost": 0.00388375, + "output_cost": 0.01869 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 47.889786736, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7356.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.04637375, + "input_cost": 0.00388375, + "output_cost": 0.04249 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 27.124752992, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5475.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.027563749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02368 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 35.179937664, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6258.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 977.0, + "total_cost": 0.03539375, + "input_cost": 0.00388375, + "output_cost": 0.03151 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 25.536239605, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5279.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.02560375, + "input_cost": 0.00388375, + "output_cost": 0.02172 + }, + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 28.961691958, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5610.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1148.0, + "total_cost": 0.028913750000000002, + "input_cost": 0.00388375, + "output_cost": 0.02503 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.18099707, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5581.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.028623750000000003, + "input_cost": 0.00388375, + "output_cost": 0.02474 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 37.387479684, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6757.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.040383749999999996, + "input_cost": 0.00388375, + "output_cost": 0.0365 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 41.910195327, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10585.0, + "provider": "Google", + "Metric_request_tokens": 6807.0, + "Metric_response_tokens": 1723.0, + "total_cost": 0.046288750000000004, + "input_cost": 0.008508749999999999, + "output_cost": 0.03778 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 18.566825519, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4762.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02043375, + "input_cost": 0.00388375, + "output_cost": 0.01655 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 35.952256562, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6120.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.03401375, + "input_cost": 0.00388375, + "output_cost": 0.03013 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 23.562834473, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5167.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02448375, + "input_cost": 0.00388375, + "output_cost": 0.0206 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.116377335, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5461.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.027423750000000004, + "input_cost": 0.00388375, + "output_cost": 0.023540000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 42.19434694, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7549.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.04830375, + "input_cost": 0.00388375, + "output_cost": 0.04442 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.915458356, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5835.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1148.0, + "total_cost": 0.031163749999999997, + "input_cost": 0.00388375, + "output_cost": 0.02728 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 26.201752563, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5374.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.02655375, + "input_cost": 0.00388375, + "output_cost": 0.02267 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 23.905880551, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5249.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1156.0, + "total_cost": 0.02530375, + "input_cost": 0.00388375, + "output_cost": 0.021419999999999998 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 24.308699595, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5184.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1153.0, + "total_cost": 0.024653749999999995, + "input_cost": 0.00388375, + "output_cost": 0.020769999999999997 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 39.750523403, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6263.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.035443749999999996, + "input_cost": 0.00388375, + "output_cost": 0.03156 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 65.476559976, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6971.0, + "provider": "Google", + "Metric_request_tokens": 3984.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.03485, + "input_cost": 0.00498, + "output_cost": 0.02987 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 72.910902918, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 0.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, + "provider": "Google", + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 43.602165928, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 10530.0, + "provider": "Google", + "Metric_request_tokens": 6840.0, + "Metric_response_tokens": 1786.0, + "total_cost": 0.045450000000000004, + "input_cost": 0.00855, + "output_cost": 0.0369 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 33.823405726, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6724.0, + "provider": "Google", + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1157.0, + "total_cost": 0.03252, + "input_cost": 0.00496, + "output_cost": 0.02756 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.694557094, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5928.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1154.0, + "total_cost": 0.032093750000000004, + "input_cost": 0.00388375, + "output_cost": 0.028210000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 28.020181574, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5702.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.02983375, + "input_cost": 0.00388375, + "output_cost": 0.02595 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.120680251, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5541.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.02822375, + "input_cost": 0.00388375, + "output_cost": 0.02434 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 51.401740722, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8168.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.05449375, + "input_cost": 0.00388375, + "output_cost": 0.05061 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.207496913, + "Duration": 30.757611824, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5044.0, + "total_tokens": 5710.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1158.0, - "total_cost": 0.023253749999999997, + "Metric_response_tokens": 1179.0, + "total_cost": 0.029913749999999996, "input_cost": 0.00388375, - "output_cost": 0.01937 + "output_cost": 0.026029999999999998 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.803203804, + "Duration": 34.511741088, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5199.0, + "total_tokens": 6547.0, "provider": "Google", "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02480375, + "Metric_response_tokens": 1176.0, + "total_cost": 0.03828375, "input_cost": 0.00388375, - "output_cost": 0.02092 + "output_cost": 0.0344 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 34.032753508, + "Duration": 33.667128837, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6600.0, + "total_tokens": 6142.0, "provider": "Google", "Metric_request_tokens": 3107.0, "Metric_response_tokens": 1160.0, - "total_cost": 0.03881375, + "total_cost": 0.03423375, "input_cost": 0.00388375, - "output_cost": 0.03493 + "output_cost": 0.03035 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.5-pro-preview-03-25", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.432732425, + "Duration": 26.603688339, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5326.0, + "total_tokens": 5227.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.025083750000000002, + "input_cost": 0.00388375, + "output_cost": 0.0212 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 49.97060826, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11419.0, + "provider": "Google", + "Metric_request_tokens": 6836.0, + "Metric_response_tokens": 1793.0, + "total_cost": 0.05437500000000001, + "input_cost": 0.008545, + "output_cost": 0.04583 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 97.172731042, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 15969.0, + "provider": "Google", + "Metric_request_tokens": 6815.0, + "Metric_response_tokens": 1761.0, + "total_cost": 0.10005875000000002, + "input_cost": 0.00851875, + "output_cost": 0.09154000000000001 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 53.025055925, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8261.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.055423749999999994, + "input_cost": 0.00388375, + "output_cost": 0.051539999999999996 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 32.071007326, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5793.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.03074375, + "input_cost": 0.00388375, + "output_cost": 0.026860000000000002 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 30.771411983, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5778.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1136.0, + "total_cost": 0.030593750000000003, + "input_cost": 0.00388375, + "output_cost": 0.02671 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 33.989499385, + "Score_MermaidDiagramValid": 1.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6020.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1160.0, + "total_cost": 0.033013749999999994, + "input_cost": 0.00388375, + "output_cost": 0.029129999999999996 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 26.15421085, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 5364.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.026453749999999998, + "input_cost": 0.00388375, + "output_cost": 0.02257 + }, + { + "Model": "gemini-2.5-pro-preview-03-25", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 35.682115651, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6187.0, + "provider": "Google", + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.03468375, + "input_cost": 0.00388375, + "output_cost": 0.030799999999999998 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.16239572, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.095654126, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.573285539, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.13363762, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 3.075430478, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 3.734203872, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, + "provider": "Google", + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 + }, + { + "Model": "gemini-2.0-flash", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 3.207070397, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.02607375, - "input_cost": 0.00388375, - "output_cost": 0.02219 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 27.33412975, + "Duration": 3.173871208, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5806.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1201.0, - "total_cost": 0.03087375, - "input_cost": 0.00388375, - "output_cost": 0.02699 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 32.488361727, + "Duration": 3.445568835, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5925.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.03206375, - "input_cost": 0.00388375, - "output_cost": 0.028180000000000004 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.454849392, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.087279343, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5526.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.02807375, - "input_cost": 0.00388375, - "output_cost": 0.024190000000000003 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 40.878031462, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.14396244, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10420.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, "provider": "Google", - "Metric_request_tokens": 6847.0, - "Metric_response_tokens": 1815.0, - "total_cost": 0.04428875, - "input_cost": 0.00855875, - "output_cost": 0.03573 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 67.384048514, + "Duration": 3.481531585, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 13422.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1242.0, "provider": "Google", - "Metric_request_tokens": 6855.0, - "Metric_response_tokens": 1785.0, - "total_cost": 0.07423875, - "input_cost": 0.00856875, - "output_cost": 0.06567 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 506.0, + "total_cost": 0.00027600000000000004, + "input_cost": 7.36e-5, + "output_cost": 0.00020240000000000004 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.014404973, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.053321468, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5406.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.026873749999999995, - "input_cost": 0.00388375, - "output_cost": 0.022989999999999997 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.744050775, + "Duration": 3.041527691, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5425.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.027063750000000004, - "input_cost": 0.00388375, - "output_cost": 0.023180000000000003 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.712500277, + "Duration": 3.656550888, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6614.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.03895375, - "input_cost": 0.00388375, - "output_cost": 0.035070000000000004 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 41.219766849, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.232291255, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7059.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.04340375, - "input_cost": 0.00388375, - "output_cost": 0.03952 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.833685412, + "Duration": 3.16402353, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5205.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.024863749999999997, - "input_cost": 0.00388375, - "output_cost": 0.02098 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.57386297, + "Duration": 3.371460427, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5837.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.031183749999999996, - "input_cost": 0.00388375, - "output_cost": 0.027299999999999998 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.658760503, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.306933591, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 63389.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1261.0, "provider": "Google", - "Metric_request_tokens": 58608.0, - "Metric_response_tokens": 2447.0, - "total_cost": 0.12107000000000001, - "input_cost": 0.07326, - "output_cost": 0.04781 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 525.0, + "total_cost": 0.0002836, + "input_cost": 7.36e-5, + "output_cost": 0.00021 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 32.435081791, + "Duration": 3.199269162, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6158.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1150.0, - "total_cost": 0.03439375, - "input_cost": 0.00388375, - "output_cost": 0.03051 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 26.427531899, + "Duration": 3.715297386, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5629.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.029103749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02522 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 33.541754196, + "Duration": 3.390987167, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6253.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1256.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1164.0, - "total_cost": 0.03534375, - "input_cost": 0.00388375, - "output_cost": 0.03146 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 520.0, + "total_cost": 0.0002816, + "input_cost": 7.36e-5, + "output_cost": 0.000208 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.7685358, + "Duration": 3.009082393, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5160.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1238.0, "provider": "Google", - "Metric_request_tokens": 3101.0, - "Metric_response_tokens": 1172.0, - "total_cost": 0.024466250000000002, - "input_cost": 0.00387625, - "output_cost": 0.02059 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 502.0, + "total_cost": 0.0002744, + "input_cost": 7.36e-5, + "output_cost": 0.0002008 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 25.723323767, + "Duration": 3.572387722, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5428.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02709375, - "input_cost": 0.00388375, - "output_cost": 0.02321 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.441471015, - "Score_MermaidDiagramValid": 1.0, + "Duration": 3.097230117, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5382.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.026633749999999998, - "input_cost": 0.00388375, - "output_cost": 0.02275 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.604628416, + "Duration": 3.342514555, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5562.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.02843375, - "input_cost": 0.00388375, - "output_cost": 0.024550000000000002 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-06-05", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 50.415529759, + "Duration": 3.742365295, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11548.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 6858.0, - "Metric_response_tokens": 1806.0, - "total_cost": 0.055472499999999994, - "input_cost": 0.0085725, - "output_cost": 0.0469 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.205452715, + "Duration": 3.291870391, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4960.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1234.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1187.0, - "total_cost": 0.022413750000000003, - "input_cost": 0.00388375, - "output_cost": 0.01853 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 498.0, + "total_cost": 0.0002728, + "input_cost": 7.36e-5, + "output_cost": 0.0001992 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 45.254514377, + "Duration": 3.237470473, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7035.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1251.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1153.0, - "total_cost": 0.04316375, - "input_cost": 0.00388375, - "output_cost": 0.03928 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 515.0, + "total_cost": 0.0002796, + "input_cost": 7.36e-5, + "output_cost": 0.00020600000000000002 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.0-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 25.540973715, + "Duration": 3.437132314, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5321.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 1239.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.02602375, - "input_cost": 0.00388375, - "output_cost": 0.02214 + "Metric_request_tokens": 736.0, + "Metric_response_tokens": 503.0, + "total_cost": 0.0002748, + "input_cost": 7.36e-5, + "output_cost": 0.0002012 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 29.203818188, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.205071529, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5909.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.031903749999999995, - "input_cost": 0.00388375, - "output_cost": 0.028019999999999996 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00200755, + "input_cost": 0.00025065, + "output_cost": 0.0017569 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 41.27588133, + "Duration": 9.27147079, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10620.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3259.0, "provider": "Google", - "Metric_request_tokens": 6855.0, - "Metric_response_tokens": 1805.0, - "total_cost": 0.04621875, - "input_cost": 0.00856875, - "output_cost": 0.03765 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.00420005, + "input_cost": 0.00025035, + "output_cost": 0.0039497 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 68.89242445, + "Duration": 4.703598385, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9389.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2531.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.06670375, - "input_cost": 0.00388375, - "output_cost": 0.06282 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016665500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0014162 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 34.105280135, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.890864449, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6279.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2638.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1280.0, - "total_cost": 0.035603749999999997, - "input_cost": 0.00388375, - "output_cost": 0.03172 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 591.0, + "total_cost": 0.00192125, + "input_cost": 0.00025065, + "output_cost": 0.0016706 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 120.860207911, + "Duration": 5.237042556, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 13787.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2595.0, "provider": "Google", - "Metric_request_tokens": 7717.0, - "Metric_response_tokens": 1789.0, - "total_cost": 0.07034625, - "input_cost": 0.00964625, - "output_cost": 0.0607 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00189055, + "input_cost": 0.00025035, + "output_cost": 0.0016401999999999999 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.936865091, + "Duration": 5.402304393, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5784.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2648.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1157.0, - "total_cost": 0.03065375, - "input_cost": 0.00388375, - "output_cost": 0.026770000000000002 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020760500000000003, + "input_cost": 0.00025035, + "output_cost": 0.0018257000000000002 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 27.396380686, + "Duration": 5.681153876, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5674.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2673.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.029562500000000002, - "input_cost": 0.0038824999999999997, - "output_cost": 0.02568 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00215105, + "input_cost": 0.00025065, + "output_cost": 0.0019004 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.581064657, + "Duration": 5.74444077, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5037.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2683.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.023183750000000003, - "input_cost": 0.00388375, - "output_cost": 0.0193 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00219855, + "input_cost": 0.00025035, + "output_cost": 0.0019482 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 29.262028799, + "Duration": 35.715282683, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5839.0, + "total_tokens": 15172.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1150.0, - "total_cost": 0.031203750000000002, - "input_cost": 0.00388375, - "output_cost": 0.02732 + "Metric_request_tokens": 8909.0, + "Metric_response_tokens": 1841.0, + "total_cost": 0.01791795, + "input_cost": 0.00133635, + "output_cost": 0.0165816 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 53.601437466, + "Duration": 19.026625667, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8093.0, + "total_tokens": 7184.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1193.0, - "total_cost": 0.05374375, - "input_cost": 0.00388375, - "output_cost": 0.04986 + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.00837945, + "input_cost": 0.00059535, + "output_cost": 0.0077840999999999995 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.554440753, + "Duration": 5.674318695, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5308.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2643.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02589375, - "input_cost": 0.00388375, - "output_cost": 0.02201 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00205855, + "input_cost": 0.00025035, + "output_cost": 0.0018082 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.896141303, + "Duration": 6.052854086, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5227.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2773.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.025083750000000002, - "input_cost": 0.00388375, - "output_cost": 0.0212 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00251355, + "input_cost": 0.00025035, + "output_cost": 0.0022632 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 46.506492861, - "Score_MermaidDiagramValid": 1.0, + "Duration": 6.691234945, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10985.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2764.0, "provider": "Google", - "Metric_request_tokens": 6854.0, - "Metric_response_tokens": 1829.0, - "total_cost": 0.0498775, - "input_cost": 0.0085675, - "output_cost": 0.04131 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0024695499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0022188999999999998 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 31.379013607, + "Duration": 10.540987174, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5869.0, + "total_tokens": 7199.0, "provider": "Google", - "Metric_request_tokens": 3104.0, - "Metric_response_tokens": 1195.0, - "total_cost": 0.03153, - "input_cost": 0.0038799999999999998, - "output_cost": 0.02765 + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00383255, + "input_cost": 0.00080415, + "output_cost": 0.0030284 }, { - "Model": "gemini-2.5-pro-preview-05-06", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.397718403, + "Duration": 5.512122765, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5142.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2665.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1136.0, - "total_cost": 0.02423375, - "input_cost": 0.00388375, - "output_cost": 0.02035 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00213555, + "input_cost": 0.00025035, + "output_cost": 0.0018852 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.18099707, - "Score_MermaidDiagramValid": 1.0, + "Duration": 6.02414223, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5581.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2737.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.028623750000000003, - "input_cost": 0.00388375, - "output_cost": 0.02474 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.0023750499999999996, + "input_cost": 0.00025065, + "output_cost": 0.0021244 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 37.387479684, + "Duration": 18.309828457, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6757.0, + "total_tokens": 7772.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.040383749999999996, - "input_cost": 0.00388375, - "output_cost": 0.0365 + "Metric_request_tokens": 3983.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.01041375, + "input_cost": 0.00059745, + "output_cost": 0.0098163 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 41.910195327, + "Duration": 5.858011472, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10585.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2586.0, "provider": "Google", - "Metric_request_tokens": 6807.0, - "Metric_response_tokens": 1723.0, - "total_cost": 0.046288750000000004, - "input_cost": 0.008508749999999999, - "output_cost": 0.03778 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018590500000000001, + "input_cost": 0.00025035, + "output_cost": 0.0016087 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 18.566825519, + "Duration": 5.887542438, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4762.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2715.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.02043375, - "input_cost": 0.00388375, - "output_cost": 0.01655 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00229805, + "input_cost": 0.00025065, + "output_cost": 0.0020474 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 35.952256562, + "Duration": 5.607493865, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6120.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2660.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.03401375, - "input_cost": 0.00388375, - "output_cost": 0.03013 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00211225, + "input_cost": 0.00025035, + "output_cost": 0.0018619 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 23.562834473, + "Duration": 5.350370934, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5167.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2632.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02448375, - "input_cost": 0.00388375, - "output_cost": 0.0206 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0020200500000000002, + "input_cost": 0.00025035, + "output_cost": 0.0017697000000000001 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.116377335, + "Duration": 13.369507864, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5461.0, + "total_tokens": 6433.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1175.0, - "total_cost": 0.027423750000000004, - "input_cost": 0.00388375, - "output_cost": 0.023540000000000002 + "Metric_request_tokens": 3970.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0057969, + "input_cost": 0.0005954999999999999, + "output_cost": 0.0052014 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.19434694, + "Duration": 5.427887401, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7549.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2640.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.04830375, - "input_cost": 0.00388375, - "output_cost": 0.04442 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00204805, + "input_cost": 0.00025035, + "output_cost": 0.0017977 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.915458356, + "Duration": 6.886046113, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5835.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2860.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1148.0, - "total_cost": 0.031163749999999997, - "input_cost": 0.00388375, - "output_cost": 0.02728 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00281805, + "input_cost": 0.00025035, + "output_cost": 0.0025677 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.201752563, + "Duration": 6.190617486, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5374.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2704.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.02655375, - "input_cost": 0.00388375, - "output_cost": 0.02267 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00225955, + "input_cost": 0.00025065, + "output_cost": 0.0020089 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.905880551, + "Duration": 12.313419785, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5249.0, + "total_tokens": 7265.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1156.0, - "total_cost": 0.02530375, - "input_cost": 0.00388375, - "output_cost": 0.021419999999999998 + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00406355, + "input_cost": 0.00080415, + "output_cost": 0.0032594 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 24.308699595, + "Duration": 14.12684929, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5184.0, + "total_tokens": 6606.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1153.0, - "total_cost": 0.024653749999999995, - "input_cost": 0.00388375, - "output_cost": 0.020769999999999997 + "Metric_request_tokens": 3969.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.00642025, + "input_cost": 0.00059535, + "output_cost": 0.0058249 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 39.750523403, + "Duration": 11.416052215, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6263.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 5051.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.035443749999999996, - "input_cost": 0.00388375, - "output_cost": 0.03156 + "Metric_request_tokens": 3061.0, + "Metric_response_tokens": 1169.0, + "total_cost": 0.00403405, + "input_cost": 0.00045914999999999997, + "output_cost": 0.0035749 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 65.476559976, + "Duration": 5.042913232, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6971.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2542.0, "provider": "Google", - "Metric_request_tokens": 3984.0, - "Metric_response_tokens": 1170.0, - "total_cost": 0.03485, - "input_cost": 0.00498, - "output_cost": 0.02987 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00170505, + "input_cost": 0.00025035, + "output_cost": 0.0014547 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 72.910902918, + "Duration": 5.803645705, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2732.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0023700499999999994, + "input_cost": 0.00025035, + "output_cost": 0.0021196999999999995 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 43.602165928, + "Duration": 24.280603396, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10530.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 6840.0, - "Metric_response_tokens": 1786.0, - "total_cost": 0.045450000000000004, - "input_cost": 0.00855, - "output_cost": 0.0369 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 33.823405726, + "Duration": 21.444646242, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6724.0, + "total_tokens": 7586.0, "provider": "Google", - "Metric_request_tokens": 3968.0, - "Metric_response_tokens": 1157.0, - "total_cost": 0.03252, - "input_cost": 0.00496, - "output_cost": 0.02756 + "Metric_request_tokens": 3679.0, + "Metric_response_tokens": 722.0, + "total_cost": 0.012132549999999999, + "input_cost": 0.00055185, + "output_cost": 0.0115807 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 30.694557094, + "Duration": 14.103361899, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5928.0, + "total_tokens": 7319.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1154.0, - "total_cost": 0.032093750000000004, - "input_cost": 0.00388375, - "output_cost": 0.028210000000000002 + "Metric_request_tokens": 4540.0, + "Metric_response_tokens": 1218.0, + "total_cost": 0.006875300000000001, + "input_cost": 0.000681, + "output_cost": 0.006194300000000001 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 28.020181574, + "Duration": 26.086590308, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5702.0, + "total_tokens": 9499.0, "provider": "Google", - "Metric_request_tokens": 3107.0, + "Metric_request_tokens": 3969.0, "Metric_response_tokens": 1196.0, - "total_cost": 0.02983375, - "input_cost": 0.00388375, - "output_cost": 0.02595 + "total_cost": 0.016481950000000002, + "input_cost": 0.00059535, + "output_cost": 0.0158866 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.120680251, + "Duration": 24.1841836, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5541.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.02822375, - "input_cost": 0.00388375, - "output_cost": 0.02434 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 51.401740722, + "Duration": 17.981969969, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8168.0, + "total_tokens": 7548.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1171.0, - "total_cost": 0.05449375, - "input_cost": 0.00388375, - "output_cost": 0.05061 + "Metric_request_tokens": 3630.0, + "Metric_response_tokens": 1695.0, + "total_cost": 0.009342000000000001, + "input_cost": 0.0005445, + "output_cost": 0.008797500000000001 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 30.757611824, - "Score_MermaidDiagramValid": 1.0, + "Duration": 20.823357766, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 5710.0, + "total_tokens": 9790.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.029913749999999996, - "input_cost": 0.00388375, - "output_cost": 0.026029999999999998 + "Metric_request_tokens": 5066.0, + "Metric_response_tokens": 1729.0, + "total_cost": 0.0122798, + "input_cost": 0.0007599, + "output_cost": 0.0115199 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 34.511741088, + "Duration": 25.462973093, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6547.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1176.0, - "total_cost": 0.03828375, - "input_cost": 0.00388375, - "output_cost": 0.0344 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-pro-preview-03-25", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 33.667128837, + "Duration": 35.722264378, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6142.0, + "total_tokens": 16420.0, "provider": "Google", - "Metric_request_tokens": 3107.0, - "Metric_response_tokens": 1160.0, - "total_cost": 0.03423375, - "input_cost": 0.00388375, - "output_cost": 0.03035 + "Metric_request_tokens": 8000.0, + "Metric_response_tokens": 1807.0, + "total_cost": 0.0254297, + "input_cost": 0.0012, + "output_cost": 0.0242297 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.16239572, + "Duration": 5.71420281, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "total_tokens": 1869.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 111.0, + "total_cost": 0.0035027500000000002, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0033811 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.095654126, + "Duration": 22.548778955, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.573285539, + "Duration": 18.459759081, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.13363762, + "Duration": 35.081946937, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 11019.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4018.0, + "Metric_response_tokens": 1243.0, + "total_cost": 0.021501500000000003, + "input_cost": 0.0006027, + "output_cost": 0.020898800000000002 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.075430478, + "Duration": 26.112757598, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 12440.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 6832.0, + "Metric_response_tokens": 1782.0, + "total_cost": 0.015484999999999999, + "input_cost": 0.0010248, + "output_cost": 0.0144602 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.734203872, + "Duration": 15.507230852, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4880.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00988225, + "input_cost": 0.00025035, + "output_cost": 0.0096319 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.207070397, + "Duration": 22.268203361, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.173871208, + "Duration": 29.657737725, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8725.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3968.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.013843, + "input_cost": 0.0005952, + "output_cost": 0.013247799999999999 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.445568835, + "Duration": 17.856923641, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.087279343, + "Duration": 22.406209617, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9355.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 4545.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01405995, + "input_cost": 0.0006817500000000001, + "output_cost": 0.0133782 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.14396244, + "Duration": 29.67447829, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9321.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.01876495, + "input_cost": 0.00046575, + "output_cost": 0.018299199999999998 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.481531585, + "Duration": 4.125237979, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1242.0, + "total_tokens": 1709.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 506.0, - "total_cost": 0.00027600000000000004, - "input_cost": 7.36e-5, - "output_cost": 0.00020240000000000004 + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0017484000000000002, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0016269000000000001 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.053321468, + "Duration": 32.319621961, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.041527691, + "Duration": 16.088759709, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.656550888, + "Duration": 22.178054671, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8629.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3965.0, + "Metric_response_tokens": 1170.0, + "total_cost": 0.01352575, + "input_cost": 0.0005947499999999999, + "output_cost": 0.012931 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.232291255, + "Duration": 10.029780959, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "total_tokens": 3107.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 536.0, + "total_cost": 0.006603250000000001, + "input_cost": 0.00012164999999999999, + "output_cost": 0.0064816000000000006 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.16402353, + "Duration": 42.923902196, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 30481.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 21012.0, + "Metric_response_tokens": 3073.0, + "total_cost": 0.0273816, + "input_cost": 0.0031517999999999997, + "output_cost": 0.0242298 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.371460427, + "Duration": 39.222518221, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 1.0, + "Score_UsageLimitNotExceeded": 0.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.306933591, + "Duration": 9.631045956, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1261.0, + "total_tokens": 2892.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 525.0, - "total_cost": 0.0002836, - "input_cost": 7.36e-5, - "output_cost": 0.00021 + "Metric_request_tokens": 811.0, + "Metric_response_tokens": 625.0, + "total_cost": 0.00559265, + "input_cost": 0.00012164999999999999, + "output_cost": 0.005471 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.199269162, + "Duration": 6.647730144, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 2292.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 810.0, + "Metric_response_tokens": 524.0, + "total_cost": 0.0037889, + "input_cost": 0.00012149999999999999, + "output_cost": 0.0036674 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-04-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.715297386, + "Duration": 35.71455716, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.390987167, + "Duration": 5.476345308, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1256.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2659.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 520.0, - "total_cost": 0.0002816, - "input_cost": 7.36e-5, - "output_cost": 0.000208 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00210205, + "input_cost": 0.00025065, + "output_cost": 0.0018514 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.009082393, + "Duration": 6.346005219, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1238.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2718.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 502.0, - "total_cost": 0.0002744, - "input_cost": 7.36e-5, - "output_cost": 0.0002008 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00232105, + "input_cost": 0.00025035, + "output_cost": 0.0020707 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.572387722, + "Duration": 4.836977527, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2539.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00169455, + "input_cost": 0.00025035, + "output_cost": 0.0014441999999999999 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.097230117, + "Duration": 6.672693551, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2929.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00304705, + "input_cost": 0.00025065, + "output_cost": 0.0027964 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.342514555, + "Duration": 14.662619685, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 6896.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.00736605, + "input_cost": 0.00059715, + "output_cost": 0.0067689 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.742365295, + "Duration": 5.188614707, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2611.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 557.0, + "total_cost": 0.0019320499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016817 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 3.291870391, + "Duration": 6.5603659, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1234.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2756.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 498.0, - "total_cost": 0.0002728, - "input_cost": 7.36e-5, - "output_cost": 0.0001992 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00244155, + "input_cost": 0.00025065, + "output_cost": 0.0021909 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.237470473, + "Duration": 6.128366075, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1251.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2798.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 515.0, - "total_cost": 0.0002796, - "input_cost": 7.36e-5, - "output_cost": 0.00020600000000000002 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00259525, + "input_cost": 0.00025035, + "output_cost": 0.0023449 }, { - "Model": "gemini-2.0-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.437132314, + "Duration": 4.944774864, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1239.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2557.0, "provider": "Google", - "Metric_request_tokens": 736.0, - "Metric_response_tokens": 503.0, - "total_cost": 0.0002748, - "input_cost": 7.36e-5, - "output_cost": 0.0002012 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00175755, + "input_cost": 0.00025035, + "output_cost": 0.0015072 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.205071529, + "Duration": 5.76290555, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2632.0, + "total_tokens": 2696.0, "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.00200755, + "total_cost": 0.0022315499999999997, "input_cost": 0.00025065, - "output_cost": 0.0017569 + "output_cost": 0.0019809 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 9.27147079, + "Duration": 11.161597903, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3259.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 7403.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 557.0, - "total_cost": 0.00420005, - "input_cost": 0.00025035, - "output_cost": 0.0039497 + "Metric_request_tokens": 5349.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.00460415, + "input_cost": 0.00080235, + "output_cost": 0.0038018 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.703598385, + "Duration": 6.529419098, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2531.0, + "total_tokens": 2839.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0016665500000000001, + "total_cost": 0.0027445499999999997, "input_cost": 0.00025035, - "output_cost": 0.0014162 + "output_cost": 0.0024942 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.890864449, + "Duration": 8.540685767, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2638.0, + "total_tokens": 3173.0, "provider": "Google", "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 591.0, - "total_cost": 0.00192125, + "Metric_response_tokens": 554.0, + "total_cost": 0.0039010499999999997, "input_cost": 0.00025065, - "output_cost": 0.0016706 + "output_cost": 0.0036504 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.237042556, + "Duration": 5.922996591, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2595.0, + "total_tokens": 2729.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.00189055, + "total_cost": 0.0023595499999999998, "input_cost": 0.00025035, - "output_cost": 0.0016401999999999999 + "output_cost": 0.0021092 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.402304393, + "Duration": 6.741784844, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2648.0, + "total_tokens": 2900.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0020760500000000003, + "total_cost": 0.00295805, "input_cost": 0.00025035, - "output_cost": 0.0018257000000000002 + "output_cost": 0.0027077 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.681153876, + "Duration": 7.055616236, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2673.0, + "total_tokens": 2809.0, "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.00215105, + "total_cost": 0.0026270499999999997, "input_cost": 0.00025065, - "output_cost": 0.0019004 + "output_cost": 0.0023764 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.74444077, + "Duration": 21.365465445, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2683.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 9022.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00219855, - "input_cost": 0.00025035, - "output_cost": 0.0019482 + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.01021305, + "input_cost": 0.00080415, + "output_cost": 0.0094089 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.715282683, + "Duration": 5.080146417, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 15172.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2513.0, "provider": "Google", - "Metric_request_tokens": 8909.0, - "Metric_response_tokens": 1841.0, - "total_cost": 0.01791795, - "input_cost": 0.00133635, - "output_cost": 0.0165816 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00160355, + "input_cost": 0.00025035, + "output_cost": 0.0013532 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 19.026625667, + "Duration": 25.419348211, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7184.0, + "total_tokens": 10935.0, "provider": "Google", - "Metric_request_tokens": 3969.0, + "Metric_request_tokens": 5358.0, "Metric_response_tokens": 1196.0, - "total_cost": 0.00837945, - "input_cost": 0.00059535, - "output_cost": 0.0077840999999999995 + "total_cost": 0.016854800000000003, + "input_cost": 0.0008037, + "output_cost": 0.016051100000000002 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.674318695, + "Duration": 4.279733312, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2643.0, + "total_tokens": 2443.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.00205855, + "total_cost": 0.00135855, "input_cost": 0.00025035, - "output_cost": 0.0018082 + "output_cost": 0.0011082 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.052854086, + "Duration": 5.584990518, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2773.0, + "total_tokens": 2690.0, "provider": "Google", "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00251355, + "Metric_response_tokens": 554.0, + "total_cost": 0.00221725, "input_cost": 0.00025035, - "output_cost": 0.0022632 + "output_cost": 0.0019669 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.691234945, + "Duration": 7.12928071, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2764.0, + "total_tokens": 2925.0, "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.0024695499999999996, + "total_cost": 0.00303305, "input_cost": 0.00025065, - "output_cost": 0.0022188999999999998 + "output_cost": 0.0027824 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 10.540987174, + "Duration": 23.894062173, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7199.0, + "total_tokens": 11489.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00383255, - "input_cost": 0.00080415, - "output_cost": 0.0030284 + "Metric_request_tokens": 6861.0, + "Metric_response_tokens": 1806.0, + "total_cost": 0.01198975, + "input_cost": 0.00102915, + "output_cost": 0.010960600000000001 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.512122765, + "Duration": 9.010575503, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2665.0, + "total_tokens": 3355.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.00213555, + "total_cost": 0.00455055, "input_cost": 0.00025035, - "output_cost": 0.0018852 + "output_cost": 0.0043002000000000005 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.02414223, + "Duration": 5.778800106, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2737.0, + "total_tokens": 2693.0, "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.0023750499999999996, + "total_cost": 0.0022210499999999996, "input_cost": 0.00025065, - "output_cost": 0.0021244 + "output_cost": 0.0019703999999999998 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 18.309828457, + "Duration": 8.167258931, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7772.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3303.0, "provider": "Google", - "Metric_request_tokens": 3983.0, - "Metric_response_tokens": 1188.0, - "total_cost": 0.01041375, - "input_cost": 0.00059745, - "output_cost": 0.0098163 + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 550.0, + "total_cost": 0.004374349999999999, + "input_cost": 0.00025035, + "output_cost": 0.004123999999999999 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.858011472, + "Duration": 4.731809965, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2586.0, + "total_tokens": 2458.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0018590500000000001, + "total_cost": 0.00141105, "input_cost": 0.00025035, - "output_cost": 0.0016087 + "output_cost": 0.0011607 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.887542438, + "Duration": 5.421789254, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2715.0, + "total_tokens": 2578.0, "provider": "Google", "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00229805, + "Metric_response_tokens": 568.0, + "total_cost": 0.0017779500000000001, "input_cost": 0.00025065, - "output_cost": 0.0020474 + "output_cost": 0.0015273 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.607493865, + "Duration": 4.939170129, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2660.0, + "total_tokens": 2535.0, "provider": "Google", "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00211225, + "Metric_response_tokens": 552.0, + "total_cost": 0.0016805499999999998, "input_cost": 0.00025035, - "output_cost": 0.0018619 + "output_cost": 0.0014302 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.350370934, + "Duration": 5.085231652, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2632.0, + "total_tokens": 2550.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.0020200500000000002, + "total_cost": 0.0017330499999999999, "input_cost": 0.00025035, - "output_cost": 0.0017697000000000001 + "output_cost": 0.0014827 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 13.369507864, + "Duration": 12.230727772, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6433.0, + "total_tokens": 11014.0, "provider": "Google", - "Metric_request_tokens": 3970.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.0057969, - "input_cost": 0.0005954999999999999, - "output_cost": 0.0052014 + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1198.0, + "total_cost": 0.024663, + "input_cost": 0.0004662, + "output_cost": 0.0241968 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.427887401, + "Duration": 8.306374733, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2640.0, + "total_tokens": 2757.0, "provider": "Google", "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00204805, + "Metric_response_tokens": 554.0, + "total_cost": 0.00245175, "input_cost": 0.00025035, - "output_cost": 0.0017977 + "output_cost": 0.0022014 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.886046113, + "Duration": 6.595447784, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2860.0, + "total_tokens": 2698.0, "provider": "Google", "Metric_request_tokens": 1669.0, "Metric_response_tokens": 552.0, - "total_cost": 0.00281805, + "total_cost": 0.0022510499999999997, "input_cost": 0.00025035, - "output_cost": 0.0025677 + "output_cost": 0.0020007 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.190617486, + "Duration": 6.077756987, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2704.0, + "total_tokens": 8290.0, "provider": "Google", "Metric_request_tokens": 1671.0, "Metric_response_tokens": 554.0, - "total_cost": 0.00225955, + "total_cost": 0.02181055, "input_cost": 0.00025065, - "output_cost": 0.0020089 + "output_cost": 0.0215599 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 12.313419785, + "Duration": 13.08787764, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7265.0, + "total_tokens": 12993.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00406355, - "input_cost": 0.00080415, - "output_cost": 0.0032594 + "Metric_request_tokens": 5355.0, + "Metric_response_tokens": 1171.0, + "total_cost": 0.024140349999999998, + "input_cost": 0.00080325, + "output_cost": 0.0233371 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 14.12684929, + "Duration": 18.203576056, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6606.0, + "Score_UsedBothMCPTools": 0.0, + "total_tokens": 0.0, "provider": "Google", - "Metric_request_tokens": 3969.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.00642025, - "input_cost": 0.00059535, - "output_cost": 0.0058249 + "Metric_request_tokens": 0.0, + "Metric_response_tokens": 0.0, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 11.416052215, - "Score_MermaidDiagramValid": 1.0, + "Duration": 7.877005808, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 5051.0, + "total_tokens": 2829.0, "provider": "Google", - "Metric_request_tokens": 3061.0, - "Metric_response_tokens": 1169.0, - "total_cost": 0.00403405, - "input_cost": 0.00045914999999999997, - "output_cost": 0.0035749 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00269705, + "input_cost": 0.00025065, + "output_cost": 0.0024464 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 6.691518112, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2766.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.00248905, + "input_cost": 0.00025035, + "output_cost": 0.0022387 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.46408462, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2591.0, + "provider": "Google", + "Metric_request_tokens": 1669.0, + "Metric_response_tokens": 552.0, + "total_cost": 0.0018765499999999998, + "input_cost": 0.00025035, + "output_cost": 0.0016262 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 11.559526636, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 15503.0, + "provider": "Google", + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.047056049999999995, + "input_cost": 0.00025065, + "output_cost": 0.0468054 + }, + { + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.042913232, + "Duration": 18.417513707, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2542.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 8232.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00170505, - "input_cost": 0.00025035, - "output_cost": 0.0014547 + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1174.0, + "total_cost": 0.0074480499999999995, + "input_cost": 0.00080415, + "output_cost": 0.0066438999999999995 }, { - "Model": "gemini-2.5-flash", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.803645705, + "Duration": 9.485176875, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2732.0, + "total_tokens": 2981.0, "provider": "Google", "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0023700499999999994, + "Metric_response_tokens": 554.0, + "total_cost": 0.00323575, "input_cost": 0.00025035, - "output_cost": 0.0021196999999999995 + "output_cost": 0.0029854 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 24.280603396, + "Duration": 6.271124626, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 2722.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 1671.0, + "Metric_response_tokens": 554.0, + "total_cost": 0.00232255, + "input_cost": 0.00025065, + "output_cost": 0.0020719000000000002 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.444646242, + "Duration": 18.342354393, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7586.0, + "total_tokens": 20909.0, "provider": "Google", - "Metric_request_tokens": 3679.0, - "Metric_response_tokens": 722.0, - "total_cost": 0.012132549999999999, - "input_cost": 0.00055185, - "output_cost": 0.0115807 + "Metric_request_tokens": 5361.0, + "Metric_response_tokens": 1158.0, + "total_cost": 0.051863950000000006, + "input_cost": 0.00080415, + "output_cost": 0.0510598 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-preview-05-20", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 14.103361899, + "Duration": 19.63725688, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7319.0, + "total_tokens": 19611.0, "provider": "Google", - "Metric_request_tokens": 4540.0, - "Metric_response_tokens": 1218.0, - "total_cost": 0.006875300000000001, - "input_cost": 0.000681, - "output_cost": 0.006194300000000001 + "Metric_request_tokens": 3981.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.05186855, + "input_cost": 0.00059715, + "output_cost": 0.0512714 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 26.086590308, - "Score_MermaidDiagramValid": 1.0, + "Duration": 4.517046246, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9499.0, + "total_tokens": 4330.0, "provider": "Google", - "Metric_request_tokens": 3969.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.016481950000000002, - "input_cost": 0.00059535, - "output_cost": 0.0158866 + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0008002, + "input_cost": 0.0003106, + "output_cost": 0.0004896000000000001 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 24.1841836, + "Duration": 4.366856197, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4300.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1192.0, + "total_cost": 0.0007876000000000001, + "input_cost": 0.0003108, + "output_cost": 0.0004768 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.981969969, + "Duration": 3.691253093, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7548.0, + "total_tokens": 3771.0, "provider": "Google", - "Metric_request_tokens": 3630.0, - "Metric_response_tokens": 1695.0, - "total_cost": 0.009342000000000001, - "input_cost": 0.0005445, - "output_cost": 0.008797500000000001 + "Metric_request_tokens": 3105.0, + "Metric_response_tokens": 666.0, + "total_cost": 0.0005769, + "input_cost": 0.0003105, + "output_cost": 0.0002664 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 20.823357766, - "Score_MermaidDiagramValid": 0.0, + "Duration": 4.948628294, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9790.0, + "total_tokens": 50365.0, "provider": "Google", - "Metric_request_tokens": 5066.0, - "Metric_response_tokens": 1729.0, - "total_cost": 0.0122798, - "input_cost": 0.0007599, - "output_cost": 0.0115199 + "Metric_request_tokens": 49186.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0053902, + "input_cost": 0.0049186, + "output_cost": 0.00047159999999999997 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 25.462973093, + "Duration": 4.397124725, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4273.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1184.0, + "total_cost": 0.0007825, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004736 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.722264378, + "Duration": 4.246158175, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 16420.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4269.0, "provider": "Google", - "Metric_request_tokens": 8000.0, - "Metric_response_tokens": 1807.0, - "total_cost": 0.0254297, - "input_cost": 0.0012, - "output_cost": 0.0242297 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1177.0, + "total_cost": 0.0007800000000000001, + "input_cost": 0.00030920000000000003, + "output_cost": 0.00047080000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.71420281, + "Duration": 4.461461037, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1869.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4313.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 111.0, - "total_cost": 0.0035027500000000002, - "input_cost": 0.00012164999999999999, - "output_cost": 0.0033811 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1224.0, + "total_cost": 0.0007985000000000002, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004896000000000001 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 22.548778955, + "Duration": 3.826108098, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3829.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 693.0, + "total_cost": 0.0005908, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0002772 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 18.459759081, + "Duration": 3.812971395, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3771.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3103.0, + "Metric_response_tokens": 668.0, + "total_cost": 0.0005775, + "input_cost": 0.0003103, + "output_cost": 0.0002672 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 35.081946937, + "Duration": 4.463204247, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11019.0, + "total_tokens": 4292.0, "provider": "Google", - "Metric_request_tokens": 4018.0, - "Metric_response_tokens": 1243.0, - "total_cost": 0.021501500000000003, - "input_cost": 0.0006027, - "output_cost": 0.020898800000000002 + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007850000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00047440000000000004 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 26.112757598, + "Duration": 4.422216874, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 12440.0, + "total_tokens": 6650.0, "provider": "Google", - "Metric_request_tokens": 6832.0, - "Metric_response_tokens": 1782.0, - "total_cost": 0.015484999999999999, - "input_cost": 0.0010248, - "output_cost": 0.0144602 + "Metric_request_tokens": 5423.0, + "Metric_response_tokens": 1227.0, + "total_cost": 0.0010331, + "input_cost": 0.0005423, + "output_cost": 0.0004908 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 15.507230852, + "Duration": 4.450171262, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4880.0, + "total_tokens": 4287.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00988225, - "input_cost": 0.00025035, - "output_cost": 0.0096319 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007875, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.268203361, + "Duration": 4.562724232, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4284.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.657737725, + "Duration": 4.407190932, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8725.0, + "total_tokens": 4378.0, "provider": "Google", - "Metric_request_tokens": 3968.0, - "Metric_response_tokens": 1173.0, - "total_cost": 0.013843, - "input_cost": 0.0005952, - "output_cost": 0.013247799999999999 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1242.0, + "total_cost": 0.0008104000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004968 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 17.856923641, + "Duration": 3.620992741, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3766.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 675.0, + "total_cost": 0.0005791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00027 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 22.406209617, + "Duration": 5.472499546, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9355.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4297.0, "provider": "Google", - "Metric_request_tokens": 4545.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.01405995, - "input_cost": 0.0006817500000000001, - "output_cost": 0.0133782 + "Metric_request_tokens": 3090.0, + "Metric_response_tokens": 1207.0, + "total_cost": 0.0007918000000000001, + "input_cost": 0.00030900000000000003, + "output_cost": 0.0004828 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 29.67447829, + "Duration": 4.473251097, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9321.0, + "total_tokens": 4322.0, "provider": "Google", - "Metric_request_tokens": 3105.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.01876495, - "input_cost": 0.00046575, - "output_cost": 0.018299199999999998 + "Metric_request_tokens": 3106.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007970000000000001, + "input_cost": 0.0003106, + "output_cost": 0.00048640000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.125237979, + "Duration": 4.574995535, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 1709.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4266.0, "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0017484000000000002, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0016269000000000001 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1175.0, + "total_cost": 0.0007791, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047000000000000004 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 32.319621961, + "Duration": 4.094999263, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3858.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 721.0, + "total_cost": 0.0006021, + "input_cost": 0.00031370000000000004, + "output_cost": 0.0002884 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 16.088759709, + "Duration": 4.658386722, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3778.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 689.0, + "total_cost": 0.0005845000000000001, + "input_cost": 0.00030890000000000003, + "output_cost": 0.00027560000000000003 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 22.178054671, + "Duration": 4.893656632, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 8629.0, + "total_tokens": 4357.0, "provider": "Google", - "Metric_request_tokens": 3965.0, - "Metric_response_tokens": 1170.0, - "total_cost": 0.01352575, - "input_cost": 0.0005947499999999999, - "output_cost": 0.012931 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1221.0, + "total_cost": 0.0008020000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0004884 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 10.029780959, + "Duration": 5.236565242, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 3107.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3883.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 536.0, - "total_cost": 0.006603250000000001, - "input_cost": 0.00012164999999999999, - "output_cost": 0.0064816000000000006 + "Metric_request_tokens": 3137.0, + "Metric_response_tokens": 746.0, + "total_cost": 0.0006121000000000001, + "input_cost": 0.00031370000000000004, + "output_cost": 0.00029840000000000004 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 42.923902196, + "Duration": 4.485359369, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 30481.0, + "total_tokens": 4324.0, "provider": "Google", - "Metric_request_tokens": 21012.0, - "Metric_response_tokens": 3073.0, - "total_cost": 0.0273816, - "input_cost": 0.0031517999999999997, - "output_cost": 0.0242298 + "Metric_request_tokens": 3108.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007972000000000001, + "input_cost": 0.0003108, + "output_cost": 0.00048640000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 39.222518221, + "Duration": 3.734365673, "Score_MermaidDiagramValid": 0.0, - "Score_UsageLimitNotExceeded": 0.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3791.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3110.0, + "Metric_response_tokens": 681.0, + "total_cost": 0.0005834, + "input_cost": 0.000311, + "output_cost": 0.0002724 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 9.631045956, + "Duration": 4.500398738, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 2892.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4304.0, "provider": "Google", - "Metric_request_tokens": 811.0, - "Metric_response_tokens": 625.0, - "total_cost": 0.00559265, - "input_cost": 0.00012164999999999999, - "output_cost": 0.005471 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1216.0, + "total_cost": 0.0007952, + "input_cost": 0.0003088, + "output_cost": 0.00048640000000000006 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.647730144, + "Duration": 4.124787172, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 2292.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3850.0, "provider": "Google", - "Metric_request_tokens": 810.0, - "Metric_response_tokens": 524.0, - "total_cost": 0.0037889, - "input_cost": 0.00012149999999999999, - "output_cost": 0.0036674 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 761.0, + "total_cost": 0.0006133, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0003044 }, { - "Model": "gemini-2.5-flash-preview-04-17", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 35.71455716, + "Duration": 3.98193398, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.0, - "total_tokens": 0.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3903.0, "provider": "Google", - "Metric_request_tokens": 0.0, - "Metric_response_tokens": 0.0, - "total_cost": 0.0, - "input_cost": 0.0, - "output_cost": 0.0 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 767.0, + "total_cost": 0.0006204000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00030680000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.476345308, + "Duration": 4.48101111, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2659.0, + "total_tokens": 4290.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00210205, - "input_cost": 0.00025065, - "output_cost": 0.0018514 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007896, + "input_cost": 0.0003088, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.346005219, + "Duration": 4.916593879, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2718.0, + "total_tokens": 4543.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00232105, - "input_cost": 0.00025035, - "output_cost": 0.0020707 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1454.0, + "total_cost": 0.0008905, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0005816 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.836977527, + "Duration": 3.879042448, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2539.0, + "total_tokens": 3885.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00169455, - "input_cost": 0.00025035, - "output_cost": 0.0014441999999999999 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 793.0, + "total_cost": 0.0006264, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0003172 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.672693551, + "Duration": 4.444876941, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2929.0, + "total_tokens": 4284.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00304705, - "input_cost": 0.00025065, - "output_cost": 0.0027964 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1196.0, + "total_cost": 0.0007872, + "input_cost": 0.0003088, + "output_cost": 0.00047840000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 14.662619685, + "Duration": 4.627413202, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6896.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4291.0, "provider": "Google", - "Metric_request_tokens": 3981.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.00736605, - "input_cost": 0.00059715, - "output_cost": 0.0067689 + "Metric_request_tokens": 3089.0, + "Metric_response_tokens": 1202.0, + "total_cost": 0.0007897, + "input_cost": 0.00030890000000000003, + "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.188614707, + "Duration": 3.704063431, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2611.0, + "total_tokens": 3774.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 557.0, - "total_cost": 0.0019320499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0016817 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 683.0, + "total_cost": 0.0005823, + "input_cost": 0.00030910000000000003, + "output_cost": 0.0002732 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 6.5603659, + "Duration": 4.447734786, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2756.0, + "total_tokens": 4282.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00244155, - "input_cost": 0.00025065, - "output_cost": 0.0021909 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1191.0, + "total_cost": 0.0007855000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047640000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 6.128366075, + "Duration": 4.576908765, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2798.0, + "total_tokens": 4271.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00259525, - "input_cost": 0.00025035, - "output_cost": 0.0023449 + "Metric_request_tokens": 3091.0, + "Metric_response_tokens": 1180.0, + "total_cost": 0.0007811000000000001, + "input_cost": 0.00030910000000000003, + "output_cost": 0.00047200000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.944774864, + "Duration": 5.032189281, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2557.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4612.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00175755, - "input_cost": 0.00025035, - "output_cost": 0.0015072 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1476.0, + "total_cost": 0.0009040000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005904 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.76290555, + "Duration": 4.472401128, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2696.0, + "total_tokens": 4274.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0022315499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0019809 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007832000000000001, + "input_cost": 0.0003088, + "output_cost": 0.00047440000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 11.161597903, + "Duration": 3.732016304, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 7403.0, + "total_tokens": 3778.0, "provider": "Google", - "Metric_request_tokens": 5349.0, - "Metric_response_tokens": 1168.0, - "total_cost": 0.00460415, - "input_cost": 0.00080235, - "output_cost": 0.0038018 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 671.0, + "total_cost": 0.0005791, + "input_cost": 0.0003107, + "output_cost": 0.0002684 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.529419098, + "Duration": 4.756348604, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2839.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4571.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0027445499999999997, - "input_cost": 0.00025035, - "output_cost": 0.0024942 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 1435.0, + "total_cost": 0.0008876000000000001, + "input_cost": 0.00031360000000000003, + "output_cost": 0.0005740000000000001 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 8.540685767, - "Score_MermaidDiagramValid": 0.0, + "Duration": 4.71480991, + "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3173.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4318.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0039010499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0036504 + "Metric_request_tokens": 3104.0, + "Metric_response_tokens": 1214.0, + "total_cost": 0.000796, + "input_cost": 0.0003104, + "output_cost": 0.00048560000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 5.922996591, + "Duration": 3.797644523, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2729.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3776.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0023595499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0021092 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 669.0, + "total_cost": 0.0005783, + "input_cost": 0.0003107, + "output_cost": 0.0002676 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 6.741784844, + "Duration": 4.755809993, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2900.0, + "total_tokens": 4296.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00295805, - "input_cost": 0.00025035, - "output_cost": 0.0027077 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1204.0, + "total_cost": 0.0007908, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0004816 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.055616236, + "Duration": 4.602118065, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2809.0, + "total_tokens": 4273.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0026270499999999997, - "input_cost": 0.00025065, - "output_cost": 0.0023764 + "Metric_request_tokens": 3088.0, + "Metric_response_tokens": 1185.0, + "total_cost": 0.0007828, + "input_cost": 0.0003088, + "output_cost": 0.00047400000000000003 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 21.365465445, + "Duration": 4.466044834, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 9022.0, + "total_tokens": 4312.0, "provider": "Google", - "Metric_request_tokens": 5361.0, - "Metric_response_tokens": 1174.0, - "total_cost": 0.01021305, - "input_cost": 0.00080415, - "output_cost": 0.0094089 + "Metric_request_tokens": 3107.0, + "Metric_response_tokens": 1205.0, + "total_cost": 0.0007927, + "input_cost": 0.0003107, + "output_cost": 0.000482 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite-preview-06-17", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.080146417, + "Duration": 4.130914105, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2513.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3861.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00160355, - "input_cost": 0.00025035, - "output_cost": 0.0013532 + "Metric_request_tokens": 3136.0, + "Metric_response_tokens": 725.0, + "total_cost": 0.0006036, + "input_cost": 0.00031360000000000003, + "output_cost": 0.00029 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 25.419348211, + "Duration": 19.530914159, "Score_MermaidDiagramValid": 1.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 10935.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4267.0, "provider": "Google", - "Metric_request_tokens": 5358.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.016854800000000003, - "input_cost": 0.0008037, - "output_cost": 0.016051100000000002 + "Metric_request_tokens": 3077.0, + "Metric_response_tokens": 1190.0, + "total_cost": 0.0007837, + "input_cost": 0.0003077, + "output_cost": 0.0004760000000000001 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.279733312, + "Duration": 5.086535765, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2443.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4261.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00135855, - "input_cost": 0.00025035, - "output_cost": 0.0011082 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1167.0, + "total_cost": 0.0007762000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.00046680000000000007 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.584990518, + "Duration": 5.168974023, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2690.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4317.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00221725, - "input_cost": 0.00025035, - "output_cost": 0.0019669 + "Metric_request_tokens": 3128.0, + "Metric_response_tokens": 1189.0, + "total_cost": 0.0007884000000000001, + "input_cost": 0.0003128, + "output_cost": 0.0004756 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 7.12928071, + "Duration": 4.200968076, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2925.0, + "total_tokens": 3763.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.00303305, - "input_cost": 0.00025065, - "output_cost": 0.0027824 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 687.0, + "total_cost": 0.0005824000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.0002748 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 23.894062173, + "Duration": 4.04668023, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 11489.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3772.0, "provider": "Google", - "Metric_request_tokens": 6861.0, - "Metric_response_tokens": 1806.0, - "total_cost": 0.01198975, - "input_cost": 0.00102915, - "output_cost": 0.010960600000000001 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 694.0, + "total_cost": 0.0005854, + "input_cost": 0.0003078, + "output_cost": 0.00027759999999999997 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 9.010575503, + "Duration": 5.559231565, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3355.0, + "total_tokens": 4278.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00455055, - "input_cost": 0.00025035, - "output_cost": 0.0043002000000000005 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1200.0, + "total_cost": 0.0007878, + "input_cost": 0.0003078, + "output_cost": 0.00047999999999999996 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.778800106, + "Duration": 5.227105654, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2693.0, + "total_tokens": 4262.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 554.0, - "total_cost": 0.0022210499999999996, - "input_cost": 0.00025065, - "output_cost": 0.0019703999999999998 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1186.0, + "total_cost": 0.0007820000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.00047440000000000004 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 8.167258931, + "Duration": 5.281790633, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3303.0, + "total_tokens": 4259.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 550.0, - "total_cost": 0.004374349999999999, - "input_cost": 0.00025035, - "output_cost": 0.004123999999999999 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1181.0, + "total_cost": 0.0007802, + "input_cost": 0.0003078, + "output_cost": 0.0004724 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.731809965, + "Duration": 4.292229758, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2458.0, + "total_tokens": 3794.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.00141105, - "input_cost": 0.00025035, - "output_cost": 0.0011607 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 714.0, + "total_cost": 0.0005936, + "input_cost": 0.000308, + "output_cost": 0.0002856 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.421789254, + "Duration": 4.415614026, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2578.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3801.0, "provider": "Google", - "Metric_request_tokens": 1671.0, - "Metric_response_tokens": 568.0, - "total_cost": 0.0017779500000000001, - "input_cost": 0.00025065, - "output_cost": 0.0015273 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 707.0, + "total_cost": 0.0005922, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0002828 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.939170129, + "Duration": 4.140784542, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2535.0, + "total_tokens": 3750.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0016805499999999998, - "input_cost": 0.00025035, - "output_cost": 0.0014302 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 672.0, + "total_cost": 0.0005766, + "input_cost": 0.0003078, + "output_cost": 0.0002688 }, { - "Model": "gemini-2.5-flash-preview-05-20", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 5.085231652, + "Duration": 4.949271176, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 2550.0, + "total_tokens": 4281.0, "provider": "Google", - "Metric_request_tokens": 1669.0, - "Metric_response_tokens": 552.0, - "total_cost": 0.0017330499999999999, - "input_cost": 0.00025035, - "output_cost": 0.0014827 + "Metric_request_tokens": 3081.0, + "Metric_response_tokens": 1200.0, + "total_cost": 0.0007880999999999999, + "input_cost": 0.0003081, + "output_cost": 0.00047999999999999996 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.517046246, + "Duration": 4.131322822, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4330.0, + "total_tokens": 3759.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1224.0, - "total_cost": 0.0008002, - "input_cost": 0.0003106, - "output_cost": 0.0004896000000000001 + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 667.0, + "total_cost": 0.000576, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0002668 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.366856197, + "Duration": 15.856126981, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4300.0, + "total_tokens": 21095.0, "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1192.0, - "total_cost": 0.0007876000000000001, - "input_cost": 0.0003108, - "output_cost": 0.0004768 + "Metric_request_tokens": 18585.0, + "Metric_response_tokens": 2510.0, + "total_cost": 0.0028625000000000005, + "input_cost": 0.0018585000000000001, + "output_cost": 0.0010040000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.691253093, + "Duration": 5.093578255, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3771.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4305.0, "provider": "Google", - "Metric_request_tokens": 3105.0, - "Metric_response_tokens": 666.0, - "total_cost": 0.0005769, - "input_cost": 0.0003105, - "output_cost": 0.0002664 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 1225.0, + "total_cost": 0.000798, + "input_cost": 0.000308, + "output_cost": 0.00049 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.948628294, - "Score_MermaidDiagramValid": 1.0, + "Duration": 5.10008024, + "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 50365.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4271.0, "provider": "Google", - "Metric_request_tokens": 49186.0, - "Metric_response_tokens": 1179.0, - "total_cost": 0.0053902, - "input_cost": 0.0049186, - "output_cost": 0.00047159999999999997 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1193.0, + "total_cost": 0.0007850000000000001, + "input_cost": 0.0003078, + "output_cost": 0.00047720000000000005 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.397124725, + "Duration": 4.996482216, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4273.0, + "total_tokens": 4254.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1184.0, - "total_cost": 0.0007825, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004736 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1176.0, + "total_cost": 0.0007781999999999999, + "input_cost": 0.0003078, + "output_cost": 0.0004704 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.246158175, + "Duration": 4.381453313, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4269.0, - "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 1177.0, - "total_cost": 0.0007800000000000001, - "input_cost": 0.00030920000000000003, - "output_cost": 0.00047080000000000006 + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3873.0, + "provider": "Google", + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 793.0, + "total_cost": 0.0006252, + "input_cost": 0.000308, + "output_cost": 0.0003172 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.461461037, + "Duration": 4.576133277, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4313.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 49852.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1224.0, - "total_cost": 0.0007985000000000002, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0004896000000000001 + "Metric_request_tokens": 49187.0, + "Metric_response_tokens": 665.0, + "total_cost": 0.0051847, + "input_cost": 0.004918700000000001, + "output_cost": 0.000266 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 3.826108098, + "Duration": 4.075267329, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3829.0, + "total_tokens": 3813.0, "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 693.0, - "total_cost": 0.0005908, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0002772 + "Metric_request_tokens": 3123.0, + "Metric_response_tokens": 690.0, + "total_cost": 0.0005882999999999999, + "input_cost": 0.0003123, + "output_cost": 0.000276 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.812971395, + "Duration": 4.984536161, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3771.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4251.0, "provider": "Google", - "Metric_request_tokens": 3103.0, - "Metric_response_tokens": 668.0, - "total_cost": 0.0005775, - "input_cost": 0.0003103, - "output_cost": 0.0002672 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1173.0, + "total_cost": 0.000777, + "input_cost": 0.0003078, + "output_cost": 0.0004692 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.463204247, + "Duration": 5.060489677, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4292.0, + "total_tokens": 4378.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1186.0, - "total_cost": 0.0007850000000000001, - "input_cost": 0.0003106, - "output_cost": 0.00047440000000000004 + "Metric_request_tokens": 3125.0, + "Metric_response_tokens": 1253.0, + "total_cost": 0.0008137000000000001, + "input_cost": 0.00031250000000000006, + "output_cost": 0.0005012 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.422216874, + "Duration": 4.046059014, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 6650.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3742.0, "provider": "Google", - "Metric_request_tokens": 5423.0, - "Metric_response_tokens": 1227.0, - "total_cost": 0.0010331, - "input_cost": 0.0005423, - "output_cost": 0.0004908 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 664.0, + "total_cost": 0.0005734, + "input_cost": 0.0003078, + "output_cost": 0.0002656 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.450171262, + "Duration": 4.931260288, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4287.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4296.0, "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007875, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047840000000000003 + "Metric_request_tokens": 3095.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.0007899000000000001, + "input_cost": 0.00030950000000000004, + "output_cost": 0.0004804 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.562724232, + "Duration": 5.324805262, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4284.0, + "total_tokens": 4264.0, "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1196.0, - "total_cost": 0.0007872, - "input_cost": 0.0003088, - "output_cost": 0.00047840000000000003 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1188.0, + "total_cost": 0.0007828000000000002, + "input_cost": 0.00030760000000000005, + "output_cost": 0.00047520000000000006 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.407190932, + "Duration": 5.532431761, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4378.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4482.0, "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1242.0, - "total_cost": 0.0008104000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0004968 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1404.0, + "total_cost": 0.0008694000000000001, + "input_cost": 0.0003078, + "output_cost": 0.0005616000000000001 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.620992741, + "Duration": 4.878045252, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3766.0, + "total_tokens": 4315.0, "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 675.0, - "total_cost": 0.0005791, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00027 + "Metric_request_tokens": 3080.0, + "Metric_response_tokens": 1235.0, + "total_cost": 0.000802, + "input_cost": 0.000308, + "output_cost": 0.000494 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.472499546, + "Duration": 5.37411601, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4297.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4277.0, "provider": "Google", - "Metric_request_tokens": 3090.0, - "Metric_response_tokens": 1207.0, - "total_cost": 0.0007918000000000001, - "input_cost": 0.00030900000000000003, - "output_cost": 0.0004828 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1183.0, + "total_cost": 0.0007826, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004732 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.473251097, + "Duration": 4.198357553, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4322.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3773.0, "provider": "Google", - "Metric_request_tokens": 3106.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007970000000000001, - "input_cost": 0.0003106, - "output_cost": 0.00048640000000000006 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 695.0, + "total_cost": 0.0005858, + "input_cost": 0.0003078, + "output_cost": 0.000278 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.574995535, + "Duration": 5.122080385, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4266.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4309.0, "provider": "Google", - "Metric_request_tokens": 3091.0, - "Metric_response_tokens": 1175.0, - "total_cost": 0.0007791, - "input_cost": 0.00030910000000000003, - "output_cost": 0.00047000000000000004 + "Metric_request_tokens": 3097.0, + "Metric_response_tokens": 1212.0, + "total_cost": 0.0007945000000000001, + "input_cost": 0.0003097, + "output_cost": 0.0004848 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.094999263, + "Duration": 5.670149233, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3858.0, + "total_tokens": 6618.0, "provider": "Google", - "Metric_request_tokens": 3137.0, - "Metric_response_tokens": 721.0, - "total_cost": 0.0006021, - "input_cost": 0.00031370000000000004, - "output_cost": 0.0002884 + "Metric_request_tokens": 5390.0, + "Metric_response_tokens": 1228.0, + "total_cost": 0.0010302, + "input_cost": 0.000539, + "output_cost": 0.0004912 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.658386722, + "Duration": 5.358885211, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3778.0, + "total_tokens": 4265.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 689.0, - "total_cost": 0.0005845000000000001, - "input_cost": 0.00030890000000000003, - "output_cost": 0.00027560000000000003 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1187.0, + "total_cost": 0.0007826, + "input_cost": 0.0003078, + "output_cost": 0.0004748 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 4.893656632, + "Duration": 5.529175968, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4357.0, + "total_tokens": 4276.0, "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 1221.0, - "total_cost": 0.0008020000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.0004884 + "Metric_request_tokens": 3097.0, + "Metric_response_tokens": 1179.0, + "total_cost": 0.0007813, + "input_cost": 0.0003097, + "output_cost": 0.00047159999999999997 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 5.236565242, + "Duration": 5.318584704, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3883.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4259.0, "provider": "Google", - "Metric_request_tokens": 3137.0, - "Metric_response_tokens": 746.0, - "total_cost": 0.0006121000000000001, - "input_cost": 0.00031370000000000004, - "output_cost": 0.00029840000000000004 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1181.0, + "total_cost": 0.0007802, + "input_cost": 0.0003078, + "output_cost": 0.0004724 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.485359369, + "Duration": 6.129762473, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 1.0, - "total_tokens": 4324.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 4518.0, "provider": "Google", - "Metric_request_tokens": 3108.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007972000000000001, - "input_cost": 0.0003108, - "output_cost": 0.00048640000000000006 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1440.0, + "total_cost": 0.0008838, + "input_cost": 0.0003078, + "output_cost": 0.000576 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.734365673, + "Duration": 5.357050738, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3791.0, + "total_tokens": 4260.0, "provider": "Google", - "Metric_request_tokens": 3110.0, - "Metric_response_tokens": 681.0, - "total_cost": 0.0005834, - "input_cost": 0.000311, - "output_cost": 0.0002724 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1166.0, + "total_cost": 0.0007758000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004664 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.500398738, + "Duration": 5.448035659, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4304.0, + "total_tokens": 4277.0, "provider": "Google", - "Metric_request_tokens": 3088.0, - "Metric_response_tokens": 1216.0, - "total_cost": 0.0007952, - "input_cost": 0.0003088, - "output_cost": 0.00048640000000000006 + "Metric_request_tokens": 3076.0, + "Metric_response_tokens": 1201.0, + "total_cost": 0.0007880000000000001, + "input_cost": 0.00030760000000000005, + "output_cost": 0.0004804 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.124787172, + "Duration": 5.454605744, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3850.0, + "total_tokens": 4260.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 761.0, - "total_cost": 0.0006133, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0003044 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1182.0, + "total_cost": 0.0007806, + "input_cost": 0.0003078, + "output_cost": 0.00047280000000000005 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.98193398, + "Duration": 4.712239535, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 1.0, - "total_tokens": 3903.0, + "total_tokens": 3832.0, "provider": "Google", - "Metric_request_tokens": 3136.0, - "Metric_response_tokens": 767.0, - "total_cost": 0.0006204000000000001, - "input_cost": 0.00031360000000000003, - "output_cost": 0.00030680000000000003 + "Metric_request_tokens": 3127.0, + "Metric_response_tokens": 705.0, + "total_cost": 0.0005947000000000001, + "input_cost": 0.0003127, + "output_cost": 0.000282 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_easy", "test_group": "easy", - "Duration": 4.48101111, + "Duration": 5.044155716, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4299.0, + "provider": "Google", + "Metric_request_tokens": 3092.0, + "Metric_response_tokens": 1207.0, + "total_cost": 0.0007920000000000001, + "input_cost": 0.00030920000000000003, + "output_cost": 0.0004828 + }, + { + "Model": "gemini-2.5-flash-lite", + "Case": "fix_invalid_diagram_medium", + "test_group": "medium", + "Duration": 4.750485612, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 3843.0, + "provider": "Google", + "Metric_request_tokens": 3125.0, + "Metric_response_tokens": 718.0, + "total_cost": 0.0005997000000000001, + "input_cost": 0.00031250000000000006, + "output_cost": 0.0002872 + }, + { + "Model": "gemini-2.5-flash-lite", + "Case": "fix_invalid_diagram_hard", + "test_group": "hard", + "Duration": 5.14357062, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4290.0, + "total_tokens": 4283.0, "provider": "Google", - "Metric_request_tokens": 3088.0, + "Metric_request_tokens": 3081.0, "Metric_response_tokens": 1202.0, - "total_cost": 0.0007896, - "input_cost": 0.0003088, + "total_cost": 0.0007888999999999999, + "input_cost": 0.0003081, "output_cost": 0.0004808 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", + "Case": "fix_invalid_diagram_easy", + "test_group": "easy", + "Duration": 5.678468457, + "Score_MermaidDiagramValid": 0.0, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 0.5, + "total_tokens": 3756.0, + "provider": "Google", + "Metric_request_tokens": 3077.0, + "Metric_response_tokens": 679.0, + "total_cost": 0.0005793, + "input_cost": 0.0003077, + "output_cost": 0.00027160000000000004 + }, + { + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_medium", "test_group": "medium", - "Duration": 4.916593879, + "Duration": 5.395852675, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, "Score_UsedBothMCPTools": 0.5, - "total_tokens": 4543.0, + "total_tokens": 4255.0, "provider": "Google", - "Metric_request_tokens": 3089.0, - "Metric_response_tokens": 1454.0, - "total_cost": 0.0008905, - "input_cost": 0.00030890000000000003, - "output_cost": 0.0005816 + "Metric_request_tokens": 3078.0, + "Metric_response_tokens": 1177.0, + "total_cost": 0.0007786000000000001, + "input_cost": 0.0003078, + "output_cost": 0.00047080000000000006 }, { - "Model": "gemini-2.5-flash-lite-preview-06-17", + "Model": "gemini-2.5-flash-lite", "Case": "fix_invalid_diagram_hard", "test_group": "hard", - "Duration": 3.879042448, + "Duration": 5.920805734, "Score_MermaidDiagramValid": 0.0, "Score_UsageLimitNotExceeded": 1.0, - "Score_UsedBothMCPTools": 0.5, - "total_tokens": 3885.0, + "Score_UsedBothMCPTools": 1.0, + "total_tokens": 4262.0, "provider": "Google", - "Metric_request_tokens": 3092.0, - "Metric_response_tokens": 793.0, - "total_cost": 0.0006264, - "input_cost": 0.00030920000000000003, - "output_cost": 0.0003172 + "Metric_request_tokens": 3094.0, + "Metric_response_tokens": 1168.0, + "total_cost": 0.0007766000000000001, + "input_cost": 0.00030940000000000004, + "output_cost": 0.0004672 } ], "config": {