In [27]:
from pathlib import Path

import pandas as pd

from source.utils.model import TestResult
from source.utils.repo import LOCAL_SYMPY_REPO_PATH, SYMPY_REPO_URL, get_sympy_dataset, git_clone_repo
from source.utils.tester import Tester, process_issue
from source.utils.utils import get_credentials, get_logger

In [2]:
# Notebook-wide logger from the project helper; the processing loop uses it for progress messages.
logger = get_logger(__name__)

In [40]:
# Uncomment the following line to clone the sympy repository
# git_clone_repo(SYMPY_REPO_URL, LOCAL_SYMPY_REPO_PATH)

### Create test logic instance

In [4]:
# Credentials come from the project helper rather than being written into the notebook.
# NOTE(review): presumably these are the model-provider API keys — confirm in `get_credentials`.
credentials = get_credentials()
# The tester is later passed to `process_issue` together with each dataset row.
tester = Tester(credentials=credentials)

### Create SWE Bench dataset

In [5]:
# Issues to evaluate. Rows are selected positionally with `.iloc` further down, so this is
# presumably a pandas DataFrame — TODO confirm against `get_sympy_dataset`.
df = get_sympy_dataset()

### Process issues with the tester

In [None]:
issues = df.iloc[:10,:]
test_cases = []
for i in range(10):
    logger.info(f"Processing issue {i}")
    issue = issues.iloc[i,:]
    test_case = await process_issue(issue, tester)
    test_cases.append(test_case)

In [None]:
# Build one comparison record per processed issue, pairing each test case with
# the dataset row at the same position in `issues`.
comparison_records = [
    test_case.create_comparison_record(issues.iloc[position])
    for position, test_case in enumerate(test_cases)
]
test_result_df = pd.DataFrame(comparison_records)

# Persist the results. Create the target directory first: `to_csv` raises
# OSError when the parent directory does not exist, which would otherwise
# happen only after the processing above has already been paid for.
result_path = Path("./data/test_result.csv")
result_path.parent.mkdir(parents=True, exist_ok=True)
test_result_df.to_csv(result_path, index=False)

### Example solutions: OpenAI, DeepSeek, and the reference (correct) patch

In [33]:
# Print the first issue's solution columns, one labelled block per column.
solution_columns = TestResult.get_keys_for_solutions()
solution_example = test_result_df[solution_columns].iloc[0]
for patch_name, patch_text in solution_example.items():
    print(f"{patch_name}:\n\n{patch_text}\n\n\n")

openai_patch:

diff --git a/sympy/printing/ccode.py b/sympy/printing/ccode.py
index abcdef1..1234567 100644
--- a/sympy/printing/ccode.py
+++ b/sympy/printing/ccode.py
@@ -30,6 +30,7 @@ known_functions = {
     "cos": "cos",
     "tan": "tan",
     "asin": "asin",
     "acos": "acos",
     "atan": "atan",
+    "sinc": "(sin(%s)/(%s))"
     ...
 }

 class CCodePrinter(CodePrinter):
     ...
 



deepseek_patch:

diff --git a/sympy/printing/ccode.py b/sympy/printing/ccode.py
index 6b0c9f7..f7f8d1e 100644
--- a/sympy/printing/ccode.py
+++ b/sympy/printing/ccode.py
@@ -65,6 +65,7 @@
     "sinh": "sinh",
     "cosh": "cosh",
     "tanh": "tanh",
+    "sinc": [(lambda x: not x.is_zero, "sin(x)/x"), (lambda x: x.is_zero, "1")],
     "asinh": "asinh",
     "acosh": "acosh",
     "atanh": "atanh",




correct_patch:

diff --git a/sympy/printing/ccode.py b/sympy/printing/ccode.py
--- a/sympy/printing/ccode.py
+++ b/sympy/printing/ccode.py
@@ -231,6 +231,20 @@ def _print_Symbol(self, expr):
     

### Correctness test
`openai_deepseek_correct` means that the OpenAI GPT-4o-mini model tested the solution produced by DeepSeek V3, i.e. column names follow the pattern `<tester>_<solver>_correct`.

In [28]:
# Correctness flag for each issue, one column per tester/solver pairing.
correctness_columns = TestResult.get_keys_for_correctness()
test_result_df[correctness_columns]

Unnamed: 0,openai_openai_correct,openai_deepseek_correct,deepseek_openai_correct,deepseek_deepseek_correct
0,False,True,False,False
1,False,True,False,False
2,False,True,False,False
3,True,True,False,True
4,False,True,False,False
5,False,False,False,False
6,True,False,True,False
7,False,False,False,False
8,False,True,False,True
9,True,True,False,False


### Count number of correct solutions

In [29]:
# Column-wise totals of the correctness flags (each True counts as 1).
correctness_flags = test_result_df[TestResult.get_keys_for_correctness()]
correctness_flags.sum()

openai_openai_correct        3
openai_deepseek_correct      7
deepseek_openai_correct      1
deepseek_deepseek_correct    2
dtype: int64

### Solution scores

In [34]:
# Score for each issue, one column per tester/solver pairing.
score_columns = TestResult.get_keys_for_test_scores()
test_result_df[score_columns]

Unnamed: 0,openai_openai_score,openai_deepseek_score,deepseek_openai_score,deepseek_deepseek_score
0,3,4,2,3
1,2,5,2,3
2,2,4,1,3
3,5,4,2,4
4,2,4,1,3
5,2,1,2,1
6,5,1,5,1
7,1,2,1,1
8,2,5,1,4
9,4,5,1,3


### Average scores

In [35]:
# Mean score per tester/solver pairing across the processed issues.
solution_scores = test_result_df[TestResult.get_keys_for_test_scores()]
solution_scores.mean()

openai_openai_score        2.8
openai_deepseek_score      3.5
deepseek_openai_score      1.8
deepseek_deepseek_score    2.6
dtype: float64

### Number of steps to come up with a solution

In [36]:
# Number of steps each model took per issue.
step_columns = TestResult.get_keys_for_steps()
test_result_df[step_columns]

Unnamed: 0,openai_num_steps,deepseek_num_steps
0,3,3
1,3,3
2,3,3
3,1,2
4,3,3
5,3,3
6,2,0
7,3,3
8,3,3
9,3,3


### Average number of steps

In [37]:
# Mean number of steps per model across the processed issues.
solution_steps = test_result_df[TestResult.get_keys_for_steps()]
solution_steps.mean()

openai_num_steps      2.7
deepseek_num_steps    2.6
dtype: float64

### Number of used tokens for retriever and solver steps

In [38]:
# Input/output token counts per issue for the retriever and solver steps of each model.
token_columns = TestResult.get_keys_for_used_tokens()
test_result_df[token_columns]

Unnamed: 0,openai_input_tokens_retriever,deepseek_input_tokens_retriever,openai_output_tokens_retriever,deepseek_output_tokens_retriever,openai_input_tokens_solver,deepseek_input_tokens_solver,openai_output_tokens_solver,deepseek_output_tokens_solver
0,15517,17131,176,144,7351,7624,580,870
1,17120,18962,719,228,9764,5053,611,1435
2,18261,19772,614,150,11619,4535,382,194
3,15852,17612,297,90,929,3931,305,553
4,22353,24761,736,657,16099,10027,361,432
5,23938,26667,497,464,15982,12672,574,568
6,16905,0,183,0,4765,0,494,0
7,15396,17072,222,98,8785,4558,396,306
8,18158,20293,604,266,10790,10682,396,1298
9,15675,17380,486,99,7890,3936,648,388


### Average number of used tokens

In [39]:
# Mean token usage per column across the processed issues.
token_usage = test_result_df[TestResult.get_keys_for_used_tokens()]
token_usage.mean()

openai_input_tokens_retriever       17917.5
deepseek_input_tokens_retriever     17965.0
openai_output_tokens_retriever        453.4
deepseek_output_tokens_retriever      219.6
openai_input_tokens_solver           9397.4
deepseek_input_tokens_solver         6301.8
openai_output_tokens_solver           474.7
deepseek_output_tokens_solver         604.4
dtype: float64