Commit 3ac359c

update the result computation
1 parent 0532959 commit 3ac359c

File tree

1 file changed (+14 -13 lines)

1 file changed

+14
-13
lines changed

analysis/get_results.py

Lines changed: 14 additions & 13 deletions
@@ -14,6 +14,7 @@
 from cuml.linear_model import LogisticRegression
 import cupy as cp
 
+
 def update_model_info(model_info):
     for model, info in model_info.items():
         if "https://huggingface.co/" in info["link"]:
@@ -56,8 +57,8 @@ def get_results(tids):
     for model, info in model_info.items():
         model = model.replace("/", "--")
         hf_model = ""
-        files = glob(f"results/{model}--bigcodebench-*.json")
-        assert files, f"No files found for results/{model}--bigcodebench-*.json"
+        files = glob(f"results/{model}--bigcodebench-*_eval_results.json")
+        assert files, f"No files found for results/{model}--bigcodebench-*_eval_results.json"
         for file in files:
             try:
                 _, suffix = os.path.basename(file).split("--bigcodebench-hard-")
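
The tightened glob keeps get_results from picking up the raw sample files that sit next to the _eval_results.json outputs in results/. A minimal sketch of the matching difference, using fnmatch (which follows the same pattern rules as glob) and hypothetical file names:

    # Sketch with hypothetical file names: the stricter suffix keeps only
    # evaluation outputs; the old "*.json" pattern also matched raw samples.
    from fnmatch import fnmatch

    names = [
        "results/m--bigcodebench-complete--vllm-0-1-sanitized.json",
        "results/m--bigcodebench-complete--vllm-0-1-sanitized_eval_results.json",
    ]
    print([n for n in names if fnmatch(n, "results/m--bigcodebench-*.json")])
    # -> both names
    print([n for n in names if fnmatch(n, "results/m--bigcodebench-*_eval_results.json")])
    # -> only the second
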
@@ -86,7 +87,7 @@ def get_results(tids):
                 raise ValueError("Unknown task")
 
             mode = ""
-            if "-sanitized-calibrate" in file:
+            if "calibrated" in file:
                 mode = "-cal"
 
             results[info["name"]][f"pass@1"][f"{task}{mode}"] = round(mean(status)*100,1)
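
The old check looked for the literal "-sanitized-calibrate" substring, so file names using an underscore separator never received the "-cal" label. The bare "calibrated" test covers both spellings; a quick illustration with hypothetical names:

    # Sketch: the looser substring test labels either separator style as calibrated.
    for file in (
        "m--bigcodebench-complete-0-1-sanitized-calibrated_eval_results.json",
        "m--bigcodebench-complete-0-1-sanitized_calibrated_eval_results.json",
        "m--bigcodebench-complete-0-1-sanitized_eval_results.json",
    ):
        mode = "-cal" if "calibrated" in file else ""
        print(file, "->", mode or "(no calibration)")
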
@@ -141,17 +142,17 @@ def split_gen():
         if "calibrated" in file:
             if info["prompted"]:
                 if suffix.startswith("complete"):
-                    with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
+                    with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
                         f.writelines(data)
                 else:
-                    with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
+                    with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
                         f.writelines(data)
         else:
             if suffix.startswith("complete"):
-                with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
+                with open(f"sanitized_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
                     f.writelines(data)
             else:
-                with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
+                with open(f"sanitized_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
                     f.writelines(data)
 
 
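One caveat worth noting: unlike glob, open does not expand wildcards, so the "*" added to these write paths becomes a literal character in the written file name (on POSIX; Windows rejects "*" in file names). A small demonstration with a hypothetical name:

    # Sketch: open() writes the "*" literally; only glob() expands it.
    import os, tempfile

    with tempfile.TemporaryDirectory() as d:
        path = os.path.join(d, "m--bigcodebench*-complete.jsonl")  # hypothetical name
        with open(path, "w") as f:
            f.write("{}\n")
        print(os.listdir(d))  # ['m--bigcodebench*-complete.jsonl']
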

@@ -168,7 +169,7 @@ def read_task_perf(tids, task="complete"):
         try:
             try:
                 if info["prompted"]:
-                    files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                    files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_eval_results.json")
                     if files:
                         file = files[0]
                     else:
@@ -177,7 +178,7 @@ def read_task_perf(tids, task="complete"):
                 file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
             except:
                 if info["prompted"]:# and not info["direct_complete"]:
-                    files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                    files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_hard_eval_results.json")
                     if files:
                         file = files[0]
                     else:
@@ -187,7 +188,7 @@ def read_task_perf(tids, task="complete"):
         except:
             try:
                 if info["prompted"]:# and not info["direct_complete"]:
-                    files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                    files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_hard_eval_results.json")
                     if files:
                         file = files[0]
                     else:
@@ -196,7 +197,7 @@ def read_task_perf(tids, task="complete"):
                 file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0]
             except:
                 if info["prompted"]:
-                    files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                    files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_eval_results.json")
                     if files:
                         file = files[0]
                     else:
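
The four hunks above make the same substitution in each branch of the nested try/except fallback: "sanitized-calibrated" becomes "sanitized*calibrated", so either separator between the two tokens matches. Roughly, the calibrated lookup order now reads as follows (a sketch of the equivalent first-match search, not the exact code; the uncalibrated fallbacks are omitted):

    # Hedged sketch of the calibrated-file search order in read_task_perf,
    # rewritten as a flat first-match loop (error handling omitted).
    from glob import glob

    def find_calibrated_eval_file(model, task):
        patterns = [
            f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_eval_results.json",
            f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_hard_eval_results.json",
            f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_hard_eval_results.json",
            f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_eval_results.json",
        ]
        for pattern in patterns:
            files = glob(pattern)
            if files:
                return files[0]
        return None
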
@@ -394,7 +395,7 @@ def get_perf_df(data_dict):
 
 
 if __name__ == "__main__":
-
+    split_gen()
     bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
@@ -408,7 +409,7 @@ def get_perf_df(data_dict):
     instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
     complete_df = get_perf_df(complete_data)
     instruct_df = get_perf_df(instruct_data)
-
+
     push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf")
 
     with open("task2domain.json", "r") as f:
