In [12]:
import pandas as pd
from matplotlib import pyplot as plt
import ipywidgets as widgets
import numpy as np

In [26]:
# Read the oracle evaluation CSV into `df` and report how many rows were loaded.
df = pd.read_csv('results/IMP_oracle_eval.csv')
print(df.shape[0])

7950
7950


In [21]:
# Build one summary row per oracle configuration:
#   accuracy = mean of `pred_correct`
#   time     = mean of `time_taken`
#   value    = accuracy / time
summary = []
oracles = ['BinaryOracle', 'MutationOracle', 'Mutation1Oracle', 'SoloOracle', 'RelativeOracle', 'DiffOracle', 'ExampleOracle']
# Full judge model name (as stored in the `judge_name` column) -> short label used in row names.
judges = {'Meta-Llama-3-8B-Instruct-q8_0': '8B', 'Meta-Llama-3.1-70B-Instruct-q8_0': '70B', 'Meta-Llama-3.1-70B-Instruct-WQE-0.1-q8_0': 'WQE'}
indices = []

# Oracles evaluated once per judge model; only rows with explain == False are used.
for oracle in oracles:
    for judge, judge_label in judges.items():
        df_oracle = df[(df['oracle_class'] == oracle) & (df['explain'] == False) & (df['judge_name'] == judge)]
        avg = df_oracle['pred_correct'].mean()
        time = df_oracle['time_taken'].mean()
        summary.append([avg, time, avg/time])
        print(oracle, judge_label, len(df_oracle))
        indices.append(f"{oracle}_{judge_label}")

# These oracles are aggregated over all rows regardless of `judge_name`
# (presumably they do not depend on a judge model -- confirm against the eval script).
oracles2 = ['ArmoRMOracle', 'InternLMOracle']
for oracle in oracles2:
    df_oracle = df[(df['oracle_class'] == oracle) & (df['explain'] == False)]
    avg = df_oracle['pred_correct'].mean()
    time = df_oracle['time_taken'].mean()
    summary.append([avg, time, avg/time])
    # No judge label here: this subset is not filtered by judge, and reusing the
    # `judge` variable left over from the loop above would print a misleading tag.
    print(oracle, len(df_oracle))
    indices.append(f"{oracle}")


oracle_scoring = pd.DataFrame(data=summary,index=indices, columns=['accuracy', 'time', 'value'])
oracle_scoring

BinaryOracle 8B 242
BinaryOracle 70B 242
BinaryOracle WQE 242
MutationOracle 8B 242
MutationOracle 70B 242
MutationOracle WQE 242
Mutation1Oracle 8B 242
Mutation1Oracle 70B 242
Mutation1Oracle WQE 242
SoloOracle 8B 242
SoloOracle 70B 242
SoloOracle WQE 242
RelativeOracle 8B 242
RelativeOracle 70B 242
RelativeOracle WQE 242
DiffOracle 8B 204
DiffOracle 70B 204
DiffOracle WQE 204
ExampleOracle 8B 242
ExampleOracle 70B 242
ExampleOracle WQE 242
ArmoRMOracle WQE 246
InternLMOracle WQE 246


Unnamed: 0,accuracy,time,value
BinaryOracle_8B,0.557851,0.203859,2.736459
BinaryOracle_70B,0.619835,1.311118,0.472753
BinaryOracle_WQE,0.553719,1.300546,0.425759
MutationOracle_8B,0.433884,0.429671,1.009805
MutationOracle_70B,0.475207,2.839511,0.167355
MutationOracle_WQE,0.342975,2.827737,0.12129
Mutation1Oracle_8B,0.520661,0.212236,2.453221
Mutation1Oracle_70B,0.68595,1.387215,0.49448
Mutation1Oracle_WQE,0.495868,1.376518,0.360233
SoloOracle_8B,0.446281,0.39407,1.132491


In [16]:
def indexer(value):
    """Return the table position of a ``ResponseQuality`` label string.

    ``A_BETTER`` maps to 0, ``B_BETTER`` to 1 and ``TIE`` to 2. Leading and
    trailing whitespace in ``value`` is ignored; any other string raises
    ``KeyError``.
    """
    ordered_labels = ('ResponseQuality.A_BETTER', 'ResponseQuality.B_BETTER', 'ResponseQuality.TIE')
    position_of = {name: position for position, name in enumerate(ordered_labels)}
    key = value.strip()
    return position_of[key]

def create_table(df, label=None, pred=None, oracle=None, explain=None, judge=None):
    """Count (original prediction, follow-up prediction) pairs in a slice of ``df``.

    Each filter argument is applied only when it is given:

    - ``oracle``: keep rows whose ``oracle_class`` equals this value.
    - ``explain``: keep rows whose ``explain`` equals this value.
    - ``judge``: keep rows whose ``judge_name`` equals this value.
    - ``label``: ``"A"``, ``"B"`` or ``"TIE"``; keep rows whose ``original_label``
      is the matching ``ResponseQuality`` string. Other values apply no filter.
    - ``pred``: ``0``, ``.5`` or ``1``; keep rows whose ``pred_correct`` equals it.
      Other values apply no filter.

    Returns a 3x3 DataFrame of counts. Rows are indexed by ``original_pred`` and
    columns by ``followup_pred``, both in the order A, B, TIE.

    Note: for SoloOracle, BinaryOracle, Mutation1Oracle and DiffOracle the
    follow-up prediction is not read; every such row is counted in column 'A',
    so only the row totals are meaningful for those oracles.
    """
    if oracle is not None:
        df = df[df['oracle_class'] == oracle]
    if explain is not None:
        df = df[df['explain'] == explain]
    if judge is not None:
        df = df[df['judge_name'] == judge]

    # Short label name -> value stored in the `original_label` column.
    label_values = {
        'A': 'ResponseQuality.A_BETTER',
        'B': 'ResponseQuality.B_BETTER',
        'TIE': 'ResponseQuality.TIE',
    }
    if label in label_values:
        df = df[df['original_label'] == label_values[label]]

    if pred in (0, .5, 1):
        df = df[df['pred_correct'] == pred]

    # Oracles for which `followup_pred` is ignored (column index stays 0).
    single_pred_oracles = {"SoloOracle", "BinaryOracle", "Mutation1Oracle", "DiffOracle"}

    counts = [[0, 0, 0] for _ in range(3)]
    for row in df.itertuples(index=False):
        i = indexer(row.original_pred)
        # `followup_pred` is only touched when it is actually needed.
        j = 0 if row.oracle_class in single_pred_oracles else indexer(row.followup_pred)
        counts[i][j] += 1
    return pd.DataFrame(data=counts, index=['A', 'B', 'TIE'], columns=['A', 'B', 'TIE'])
	

# Prediction table for DiffOracle with the 70B judge, explain == False rows only.
create_table(
    df,
    oracle='DiffOracle',
    explain=False,
    judge="Meta-Llama-3.1-70B-Instruct-q8_0",
)

Unnamed: 0,A,B,TIE
A,111,0,0
B,93,0,0
TIE,0,0,0


In [17]:
def create_table2(df, label=None, pred=None, oracle=None, explain=False, judge=None):
    """Count rows of ``df`` by original label and ``pred_correct`` score.

    Each filter argument is applied only when it is not ``None`` / recognised:

    - ``oracle``: keep rows whose ``oracle_class`` equals this value.
    - ``explain``: keep rows whose ``explain`` equals this value. Defaults to
      ``False``; pass ``None`` to disable the filter.
    - ``judge``: keep rows whose ``judge_name`` equals this value.
    - ``label``: ``"A"``, ``"B"`` or ``"TIE"``; keep rows whose ``original_label``
      is the matching ``ResponseQuality`` string. Other values apply no filter.
    - ``pred``: ``0``, ``.5`` or ``1``; keep rows whose ``pred_correct`` equals it.
      Other values apply no filter.

    Returns a 3x3 DataFrame of counts with rows 'A', 'B', 'TIE' (from
    ``original_label``) and columns '0', '.5', '1' (from ``pred_correct``).
    Any ``original_label`` that is neither A_BETTER nor B_BETTER is counted in
    the 'TIE' row.
    """
    if oracle is not None:
        df = df[df['oracle_class'] == oracle]
    if explain is not None:
        df = df[df['explain'] == explain]
    if judge is not None:
        df = df[df['judge_name'] == judge]

    # Short label name -> value stored in the `original_label` column.
    label_values = {
        'A': 'ResponseQuality.A_BETTER',
        'B': 'ResponseQuality.B_BETTER',
        'TIE': 'ResponseQuality.TIE',
    }
    if label in label_values:
        df = df[df['original_label'] == label_values[label]]

    if pred in (0, .5, 1):
        df = df[df['pred_correct'] == pred]

    # Row position for the two explicit labels; everything else falls into row 2.
    row_of = {'ResponseQuality.A_BETTER': 0, 'ResponseQuality.B_BETTER': 1}

    counts = [[0, 0, 0] for _ in range(3)]
    for gold, score in zip(df['original_label'], df['pred_correct']):
        i = row_of.get(gold, 2)
        # pred_correct of 0 / .5 / 1 maps to column 0 / 1 / 2.
        j = int(score * 2)
        counts[i][j] += 1
    return pd.DataFrame(data=counts, index=['A', 'B', 'TIE'], columns=['0', '.5', '1'])
	
	
# Label-vs-score table for DiffOracle with the 70B judge, explain == False rows only.
create_table2(
    df,
    oracle='DiffOracle',
    explain=False,
    judge="Meta-Llama-3.1-70B-Instruct-q8_0",
)

Unnamed: 0,0,.5,1
A,25,0,85
B,26,0,68
TIE,0,0,0
