Skip to content

Commit 0ecd667

Browse files
committed
update the results analysis script
1 parent 05b7f1f commit 0ecd667

File tree

1 file changed

+16
-22
lines changed

1 file changed

+16
-22
lines changed

analysis/get_results.py

Lines changed: 16 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -118,40 +118,34 @@ def check_valid(results):
118118

119119

120120
def split_gen():
121-
shutil.rmtree("sanitized_samples", ignore_errors=True)
122121
shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True)
123-
os.makedirs("sanitized_samples/complete", exist_ok=True)
124-
os.makedirs("sanitized_samples/instruct", exist_ok=True)
125-
os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True)
126-
os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True)
122+
os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True)
123+
os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True)
124+
os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True)
125+
os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True)
126+
127127
for model, info in model_info.items():
128128
model = model.replace("/", "--")
129129
files = glob(f"results/{model}--bigcodebench-*.jsonl")
130130
if info["link"].startswith("https://huggingface.co/"):
131131
model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
132132

133133
for file in files:
134+
if "-sanitized" not in file or "calibrated" not in file:
135+
continue
136+
134137
_, suffix = os.path.basename(file).split("--bigcodebench-")
135138
with open(file, "r") as f:
136139
data = f.readlines()
137140

138-
if "-sanitized" in file:
139-
if "calibrated" in file:
140-
if info["prompted"]:
141-
if suffix.startswith("complete"):
142-
with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
143-
f.writelines(data)
144-
else:
145-
with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
146-
f.writelines(data)
141+
split_type = "hard" if "-hard-" in file else "full"
142+
if info["prompted"]:
143+
if suffix.startswith("complete") or suffix.startswith("hard-complete"):
144+
with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f:
145+
f.writelines(data)
147146
else:
148-
if suffix.startswith("complete"):
149-
with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
150-
f.writelines(data)
151-
else:
152-
with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
153-
f.writelines(data)
154-
147+
with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f:
148+
f.writelines(data)
155149

156150
def read_task_perf(tids, task="complete"):
157151
model_results = dict()
@@ -302,7 +296,7 @@ def get_perf_df(data_dict):
302296

303297

304298
if __name__ == "__main__":
305-
# split_gen()
299+
split_gen()
306300
bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
307301
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
308302
bcb_config = {

0 commit comments

Comments (0)