1414from cuml .linear_model import LogisticRegression
1515import cupy as cp
1616
17+
1718def update_model_info (model_info ):
1819 for model , info in model_info .items ():
1920 if "https://huggingface.co/" in info ["link" ]:
@@ -56,8 +57,8 @@ def get_results(tids):
5657 for model , info in model_info .items ():
5758 model = model .replace ("/" , "--" )
5859 hf_model = ""
59- files = glob (f"results/{ model } --bigcodebench-*.json" )
60- assert files , f"No files found for results/{ model } --bigcodebench-*.json"
60+ files = glob (f"results/{ model } --bigcodebench-*_eval_results .json" )
61+ assert files , f"No files found for results/{ model } --bigcodebench-*_eval_results .json"
6162 for file in files :
6263 try :
6364 _ , suffix = os .path .basename (file ).split ("--bigcodebench-hard-" )
@@ -86,7 +87,7 @@ def get_results(tids):
8687 raise ValueError ("Unknown task" )
8788
8889 mode = ""
89- if "-sanitized-calibrate " in file :
90+ if "calibrated " in file :
9091 mode = "-cal"
9192
9293 results [info ["name" ]][f"pass@1" ][f"{ task } { mode } " ] = round (mean (status )* 100 ,1 )
@@ -141,17 +142,17 @@ def split_gen():
141142 if "calibrated" in file :
142143 if info ["prompted" ]:
143144 if suffix .startswith ("complete" ):
144- with open (f"sanitized_calibrated_samples/complete/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
145+ with open (f"sanitized_calibrated_samples/complete/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
145146 f .writelines (data )
146147 else :
147- with open (f"sanitized_calibrated_samples/instruct/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
148+ with open (f"sanitized_calibrated_samples/instruct/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
148149 f .writelines (data )
149150 else :
150151 if suffix .startswith ("complete" ):
151- with open (f"sanitized_samples/complete/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
152+ with open (f"sanitized_samples/complete/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
152153 f .writelines (data )
153154 else :
154- with open (f"sanitized_samples/instruct/{ model } --bigcodebench-{ suffix } " , "w" ) as f :
155+ with open (f"sanitized_samples/instruct/{ model } --bigcodebench* -{ suffix } " , "w" ) as f :
155156 f .writelines (data )
156157
157158
@@ -168,7 +169,7 @@ def read_task_perf(tids, task="complete"):
168169 try :
169170 try :
170171 if info ["prompted" ]:
171- files = glob (f"results/{ model } --bigcodebench-{ task } *-0-1-sanitized- calibrated_eval_results.json" )
172+ files = glob (f"results/{ model } --bigcodebench-{ task } *-0-1-sanitized* calibrated_eval_results.json" )
172173 if files :
173174 file = files [0 ]
174175 else :
@@ -177,7 +178,7 @@ def read_task_perf(tids, task="complete"):
177178 file = glob (f"results/{ model } --bigcodebench-{ task } *-0-1-sanitized_eval_results.json" )[0 ]
178179 except :
179180 if info ["prompted" ]:# and not info["direct_complete"]:
180- files = glob (f"results/{ model } --bigcodebench-{ task } *-0-1-sanitized- calibrated_hard_eval_results.json" )
181+ files = glob (f"results/{ model } --bigcodebench-{ task } *-0-1-sanitized* calibrated_hard_eval_results.json" )
181182 if files :
182183 file = files [0 ]
183184 else :
@@ -187,7 +188,7 @@ def read_task_perf(tids, task="complete"):
187188 except :
188189 try :
189190 if info ["prompted" ]:# and not info["direct_complete"]:
190- files = glob (f"results/{ model } --bigcodebench-hard-{ task } *-0-1-sanitized- calibrated_hard_eval_results.json" )
191+ files = glob (f"results/{ model } --bigcodebench-hard-{ task } *-0-1-sanitized* calibrated_hard_eval_results.json" )
191192 if files :
192193 file = files [0 ]
193194 else :
@@ -196,7 +197,7 @@ def read_task_perf(tids, task="complete"):
196197 file = glob (f"results/{ model } --bigcodebench-hard-{ task } *-0-1-sanitized_hard_eval_results.json" )[0 ]
197198 except :
198199 if info ["prompted" ]:
199- files = glob (f"results/{ model } --bigcodebench-hard-{ task } *-0-1-sanitized- calibrated_eval_results.json" )
200+ files = glob (f"results/{ model } --bigcodebench-hard-{ task } *-0-1-sanitized* calibrated_eval_results.json" )
200201 if files :
201202 file = files [0 ]
202203 else :
@@ -394,7 +395,7 @@ def get_perf_df(data_dict):
394395
395396
396397if __name__ == "__main__" :
397-
398+ split_gen ()
398399 bcb_orig = load_dataset ("bigcode/bigcodebench" , split = "v0.1.1" )
399400 bcb_hard = load_dataset ("bigcode/bigcodebench-hard" , split = "v0.1.1" )
400401 bcb_config = {
@@ -408,7 +409,7 @@ def get_perf_df(data_dict):
408409 instruct_data , instruct_files = read_task_perf (bcb ["task_id" ], "instruct" )
409410 complete_df = get_perf_df (complete_data )
410411 instruct_df = get_perf_df (instruct_data )
411-
412+
412413 push_ds (DatasetDict ({"complete" : Dataset .from_pandas (complete_df ), "instruct" : Dataset .from_pandas (instruct_df )}), f"bigcode/bigcodebench{ suffix } -perf" )
413414
414415 with open ("task2domain.json" , "r" ) as f :
0 commit comments