In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import ipywidgets as widgets
import numpy as np
from enum import Enum

In [2]:
def df_init(only_relative=False, exclude_solo=False,
            csv_path="results/oracle_eval.csv",
            judge_name='Meta-Llama-3-70B-Instruct-q8_0.gguf'):
    """Load the oracle evaluation results and return a winner-balanced subset.

    Rows are kept only when ``oracle_type`` is "guidance", ``explain`` is
    False and ``judge_name`` equals ``judge_name``. The remaining rows are
    split by ground-truth winner (``winner_model_a`` / ``winner_model_b`` /
    ``winner_tie`` equal to 1) and every group is truncated to the size of
    the smallest one, so the three outcomes are equally represented.

    Parameters
    ----------
    only_relative : bool
        Keep only rows whose ``oracle_class`` is "RelativeOracle".
    exclude_solo : bool
        Drop rows whose ``oracle_class`` is "SoloOracle".
    csv_path : str
        Location of the evaluation CSV (defaults to the original path).
    judge_name : str
        Judge model whose rows are kept (defaults to the original judge).

    Returns
    -------
    pandas.DataFrame
        A-winner rows, then B-winner rows, then tie rows, with the index
        labels of the CSV rows preserved.
    """
    df = pd.read_csv(csv_path)
    # `== False` (rather than `~`) is kept on purpose: it also behaves
    # sensibly if the column was parsed as object dtype.
    df = df[(df['oracle_type'] == "guidance") & (df['explain'] == False) & (df['judge_name'] == judge_name)]
    if only_relative:
        df = df[df['oracle_class'] == "RelativeOracle"]
    if exclude_solo:
        df = df[df['oracle_class'] != "SoloOracle"]

    # One group per ground-truth outcome, in the order A, B, tie.
    groups = [df[df[column] == 1] for column in ('winner_model_a', 'winner_model_b', 'winner_tie')]
    min_len = min(len(group) for group in groups)

    # Truncation keeps the first `min_len` rows of each group (file order).
    return pd.concat([group.iloc[:min_len] for group in groups], axis=0)


In [3]:
# Baseline scoring: for every oracle class, the mean of pred_correct, the mean
# of time_taken, and their ratio (accuracy per unit of time).
df = df_init()
indices = ['RankOracle', 'SoloOracle', 'RelativeOracle', 'JointOracle']
summary = []
for oracle_name in indices:
    oracle_rows = df.loc[df['oracle_class'] == oracle_name]
    mean_accuracy = oracle_rows['pred_correct'].mean()
    mean_time = oracle_rows['time_taken'].mean()
    summary.append([mean_accuracy, mean_time, mean_accuracy / mean_time])

oracle_scoring = pd.DataFrame(summary, index=indices, columns=['accuracy', 'time', 'value'])
oracle_scoring

Unnamed: 0,accuracy,time,value
RankOracle,0.422819,4.606258,0.091792
SoloOracle,0.375839,2.069095,0.181644
RelativeOracle,0.451613,2.828642,0.159657
JointOracle,0.449664,3.942683,0.11405


In [4]:
def remove_partial(df_oracle):
    """Count half-correct predictions as wrong, in place.

    The ``pred_correct`` column is cast to int, which truncates 0.5 to 0,
    and the truncated values are written back into ``df_oracle``. Nothing
    is returned; the caller's frame is modified.
    """
    truncated = df_oracle['pred_correct'].astype(int)
    df_oracle.loc[:, 'pred_correct'] = truncated

# Per-oracle summary with half credit truncated to 0 before averaging.
indices = ['RankOracle', 'SoloOracle', 'RelativeOracle', 'JointOracle']
summary = []
for oracle_name in indices:
    # Work on a copy so the truncation does not touch the shared `df`.
    oracle_rows = df.loc[df['oracle_class'] == oracle_name].copy()
    remove_partial(oracle_rows)
    mean_accuracy = oracle_rows['pred_correct'].mean()
    mean_time = oracle_rows['time_taken'].mean()
    summary.append([mean_accuracy, mean_time, mean_accuracy / mean_time])

oracle_scoring_no_half = pd.DataFrame(
    summary,
    index=indices,
    columns=['accuracy_no_half', 'time_no_half', 'value_no_half'],
)
oracle_scoring_no_half

Unnamed: 0,accuracy_no_half,time_no_half,value_no_half
RankOracle,0.288591,4.606258,0.062652
SoloOracle,0.375839,2.069095,0.181644
RelativeOracle,0.333333,2.828642,0.117842
JointOracle,0.268456,3.942683,0.06809


In [5]:
def indexer(value):
    """Return the table position of a ResponseQuality string.

    A_BETTER -> 0, B_BETTER -> 1, TIE -> 2. Leading and trailing whitespace
    in ``value`` is ignored. Any other string raises ``KeyError``.
    """
    ordered_names = ('ResponseQuality.A_BETTER', 'ResponseQuality.B_BETTER', 'ResponseQuality.TIE')
    positions = {name: position for position, name in enumerate(ordered_names)}
    key = value.strip()
    return positions[key]

# Alternative pred_correct grading.
#
# Original grading (row = original_pred, col = followup_pred); a cell shows
# the verdict the pair was resolved to, "." means no verdict. Half credit was
# given if either original_pred or followup_pred matched the label.
#     A B T
# A | . A .
# B | B . .
# T | . . T
#
# New grading (row = original_pred, col = followup_pred); every pair resolves
# to exactly one verdict and there is no partial credit.
#     A B T
# A | A A T
# B | B T T
# T | B A T
def regrade(df):
    """Overwrite ``pred_correct`` in place using the new grading table above.

    For every row that is not a SoloOracle row, ``pred_correct`` becomes 1
    when the (original_pred, followup_pred) pair resolves to the row's
    ``original_label`` and 0 otherwise. SoloOracle rows, and rows whose
    label is not one of the three ResponseQuality strings, are left as they
    are. Returns None.
    """
    a_better = "ResponseQuality.A_BETTER"
    b_better = "ResponseQuality.B_BETTER"
    tie = "ResponseQuality.TIE"

    for idx, record in df.iterrows():
        if record["oracle_class"] == "SoloOracle":
            continue

        label = record["original_label"]
        if label not in (a_better, b_better, tie):
            continue

        first = record["original_pred"]
        second = record["followup_pred"]
        if label == a_better:
            hit = (first == a_better and second != tie) or (first == tie and second == b_better)
        elif label == b_better:
            hit = second == a_better and first != a_better
        else:
            hit = second == tie or (second == b_better and first == b_better)

        df.loc[idx, "pred_correct"] = 1 if hit else 0

# Resolution table implemented by `scorer` in regrader
# (row = original_pred, col = followup_pred):
#     A B T
# A | T A A
# B | B B T
# T | T A T
# NOTE(review): this comment used to list row B as "B T T" and row T as
# "B A T", which disagreed with the matrix in the code at (B, B) and (T, A).
# The table now documents what the code does; the code itself is unchanged so
# previously computed results stay reproducible. Confirm which version was
# intended.
def regrader(df):
    """Overwrite ``pred_correct`` in place using the resolution table above.

    For every row that is not a SoloOracle row, the (original_pred,
    followup_pred) pair is resolved to a single verdict via ``scorer`` and
    ``pred_correct`` is set to 1 if that verdict equals ``original_label``,
    else 0. SoloOracle rows are left untouched. Returns None.

    The two predictions go through ``indexer`` (which strips whitespace and
    raises ``KeyError`` on unknown strings); the label is compared as-is.
    """
    a_better = "ResponseQuality.A_BETTER"
    b_better = "ResponseQuality.B_BETTER"
    tie = "ResponseQuality.TIE"

    # scorer[original_pred position][followup_pred position] -> resolved verdict
    scorer = [
        [tie, a_better, a_better],
        [b_better, b_better, tie],
        [tie, a_better, tie],
    ]

    for index, row in df.iterrows():
        if row["oracle_class"] == "SoloOracle":
            continue

        verdict = scorer[indexer(row["original_pred"])][indexer(row["followup_pred"])]
        df.loc[index, "pred_correct"] = int(verdict == row["original_label"])
	

# Per-oracle summary after re-scoring pred_correct with regrader.
df = df_init()
indices = ['RankOracle', 'SoloOracle', 'RelativeOracle', 'JointOracle']
summary = []
for oracle_name in indices:
    # regrader mutates its argument, so hand it a copy of the oracle's rows.
    oracle_rows = df.loc[df['oracle_class'] == oracle_name].copy()
    regrader(oracle_rows)
    mean_accuracy = oracle_rows['pred_correct'].mean()
    mean_time = oracle_rows['time_taken'].mean()
    summary.append([mean_accuracy, mean_time, mean_accuracy / mean_time])

oracle_scoring_regrade = pd.DataFrame(
    summary,
    index=indices,
    columns=['accuracy_regrade', 'time_regrade', 'value_regrade'],
)
oracle_scoring_regrade

Unnamed: 0,accuracy_regrade,time_regrade,value_regrade
RankOracle,0.402685,4.606258,0.087421
SoloOracle,0.375839,2.069095,0.181644
RelativeOracle,0.473118,2.828642,0.16726
JointOracle,0.422819,3.942683,0.107241


In [6]:
# Side-by-side comparison of the three grading schemes: the accuracy columns
# first, then the (scheme-independent) mean time, then the value columns.
accuracy_columns = [
    oracle_scoring["accuracy"],
    oracle_scoring_no_half["accuracy_no_half"],
    oracle_scoring_regrade["accuracy_regrade"],
]
value_columns = [
    oracle_scoring["value"],
    oracle_scoring_no_half["value_no_half"],
    oracle_scoring_regrade["value_regrade"],
]
grading_comparison = pd.concat(accuracy_columns + [oracle_scoring["time"]] + value_columns, axis=1)
grading_comparison

Unnamed: 0,accuracy,accuracy_no_half,accuracy_regrade,time,value,value_no_half,value_regrade
RankOracle,0.422819,0.288591,0.402685,4.606258,0.091792,0.062652,0.087421
SoloOracle,0.375839,0.375839,0.375839,2.069095,0.181644,0.181644,0.181644
RelativeOracle,0.451613,0.333333,0.473118,2.828642,0.159657,0.117842,0.16726
JointOracle,0.449664,0.268456,0.422819,3.942683,0.11405,0.06809,0.107241


In [7]:


def create_table(df, label=None, pred=None):
    """Count (original_pred, followup_pred) pairs in a 3x3 table.

    Parameters
    ----------
    df : pandas.DataFrame
        Evaluation rows.
    label : {"A", "B", "TIE"} or other
        When one of the three strings, keep only rows whose matching winner
        column equals 1. Any other value applies no label filter.
    pred : {1, .5, 0} or other
        When one of the three numbers, keep only rows whose ``pred_correct``
        equals it. Any other value applies no filter.

    Returns
    -------
    pandas.DataFrame
        Rows are the original prediction, columns the followup prediction,
        both labelled A / B / TIE. SoloOracle rows are not counted.
    """
    winner_columns = {"A": 'winner_model_a', "B": 'winner_model_b', "TIE": 'winner_tie'}
    if label in ("A", "B", "TIE"):
        df = df[df[winner_columns[label]] == 1]

    if pred in (1, .5, 0):
        df = df[df['pred_correct'] == pred]

    counts = [[0] * 3 for _ in range(3)]
    paired_rows = df[df["oracle_class"] != "SoloOracle"]
    for original, followup in zip(paired_rows["original_pred"], paired_rows["followup_pred"]):
        counts[indexer(original)][indexer(followup)] += 1

    axis_labels = ['A', 'B', 'TIE']
    return pd.DataFrame(counts, index=axis_labels, columns=list(axis_labels))
	

In [8]:
# Dropdowns that drive the interactive table. For `labeler` and `pred`, the
# None option means "do not filter"; `norm` selects how counts are scaled.
labeler = widgets.Dropdown(description='Label:', value=None, options=['A', 'B', 'TIE', None])

pred = widgets.Dropdown(description='Pred_correct:', value=None, options=[1, .5, 0, None])

norm = widgets.Dropdown(description='Normalize nums:', value='None', options=['None', 'Max', "Sum"])

def plot():
    """Render the original-vs-followup prediction table for the current widget state.

    Reloads the data without SoloOracle rows, truncates half credit, builds
    the count table filtered by the `labeler` and `pred` dropdowns, optionally
    scales it according to `norm`, and draws it as a matplotlib table.
    """
    df = df_init(exclude_solo=True)
    # NOTE(review): half credit is truncated before filtering, so selecting
    # .5 in the `pred` dropdown presumably never matches any row — confirm
    # whether that is intended.
    remove_partial(df)
    table = create_table(df, labeler.value, pred.value)

    mode = norm.value
    if mode in ("Sum", "Max"):
        # "Sum" scales by the grand total, "Max" by the largest cell.
        denominator = table.sum().sum() if mode == "Sum" else table.max().max()
        table = table / denominator

    fig, ax = plt.subplots()

    # Only the table should be visible, so hide both axes and the frame.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_visible(False)
    ax.set_frame_on(False)

    ax.table(
        cellText=table.values,
        colLabels=table.columns,
        rowLabels=table.index,
        cellLoc='center',
        loc='top',
    )

    # Leave room on the left for the row labels.
    plt.subplots_adjust(left=0.2, top=1)
    plt.show()

# Redraw the table whenever any dropdown changes. The lambda ignores the
# values interact passes in; plot() reads the widgets' current values itself.
widgets.interact(lambda x, y, z: plot(), x=labeler, y=pred, z=norm)

# Table layout:
# ROWS: original_pred
# COLS: followup_pred



interactive(children=(Dropdown(description='Label:', options=('A', 'B', 'TIE', None), value=None), Dropdown(de…

<function __main__.<lambda>(x, y, z)>

In [9]:
def create_table2(df, label=None, pred=None):
    """Cross-tabulate the ground-truth winner against the pred_correct score.

    Rows are the winner (A when ``winner_model_a`` is 1, else B when
    ``winner_model_b`` is 1, else TIE); columns are the score bucket
    ('0', '.5', '1'), obtained as ``int(pred_correct * 2)``.

    ``label`` and ``pred`` are accepted but not used.
    """
    counts = [[0, 0, 0] for _ in range(3)]
    for _, record in df.iterrows():
        if record['winner_model_a'] == 1:
            winner_position = 0
        elif record['winner_model_b'] == 1:
            winner_position = 1
        else:
            winner_position = 2
        score_position = int(record["pred_correct"] * 2)
        counts[winner_position][score_position] += 1
    return pd.DataFrame(counts, index=['A', 'B', 'TIE'], columns=['0', '.5', '1'])
	


In [10]:
# Reload a fresh winner-balanced frame and tabulate pred_correct (0 / .5 / 1)
# against the ground-truth winner under the original grading.
df = df_init()
create_table2(df)

Unnamed: 0,0,.5,1
A,58,64,58
B,72,35,73
TIE,124,17,39


In [11]:
# Truncate half credit to 0 and tabulate again.
# NOTE: remove_partial modifies the shared `df` in place, so every later cell
# that reuses `df` sees the truncated scores.
remove_partial(df)
create_table2(df)

Unnamed: 0,0,.5,1
A,122,0,58
B,107,0,73
TIE,141,0,39


In [12]:
# Re-score with regrader (in place) and tabulate again.
# NOTE: when the cells run in order, `df` has already been through
# remove_partial here; regrader skips SoloOracle rows, so those rows keep
# their truncated scores.
regrader(df)
create_table2(df)

Unnamed: 0,0,.5,1
A,115,0,65
B,107,0,73
TIE,95,0,85


In [13]:
# Evaluate the current is_quality_preserved rule.
# Rows = original verdict, columns = followup verdict;
# Y = counted as quality preserved, N = not preserved.
#     A B T
# A | N N N
# B | Y N Y
# T | Y N Y
# SoloOracle rows are judged on the original verdict alone.
# Reload the data so in-place edits made to `df` by earlier cells do not carry over.
df = df_init()

def labelQAcheck1(row):
    """Apply the current is_quality_preserved rule to a row's ground-truth labels.

    SoloOracle rows: True when ``original_label`` is B_BETTER or TIE.
    Other rows: additionally require ``followup_label`` to be A_BETTER or TIE.
    """
    original_ok = ("ResponseQuality.B_BETTER", "ResponseQuality.TIE")
    followup_ok = ("ResponseQuality.A_BETTER", "ResponseQuality.TIE")

    if row["oracle_class"] == "SoloOracle":
        return row["original_label"] in original_ok
    if row["original_label"] not in original_ok:
        return False
    return row["followup_label"] in followup_ok

def predQAcheck1(row):
    """Apply the current is_quality_preserved rule to a row's predictions.

    SoloOracle rows: True when ``original_pred`` is B_BETTER or TIE.
    Other rows: additionally require ``followup_pred`` to be A_BETTER or TIE.
    """
    original_ok = ("ResponseQuality.B_BETTER", "ResponseQuality.TIE")
    followup_ok = ("ResponseQuality.A_BETTER", "ResponseQuality.TIE")

    if row["oracle_class"] == "SoloOracle":
        return row["original_pred"] in original_ok
    if row["original_pred"] not in original_ok:
        return False
    return row["followup_pred"] in followup_ok

# Apply the rule to the labels and to the predictions, then mark the rows
# where the two outcomes agree.
label_flags_1 = df.apply(labelQAcheck1, axis=1)
pred_flags_1 = df.apply(predQAcheck1, axis=1)
df["is_quality_preserved_label_1"] = label_flags_1
df["is_quality_preserved_pred_1"] = pred_flags_1
df["is_quality_preserved_correct_1"] = label_flags_1 == pred_flags_1


# Agreement rate between label-based and prediction-based outcomes, broken
# down by oracle class (rows) and by ground-truth label (columns). The last
# row covers all oracle classes together; the last column all labels.
indices = ['RankOracle', 'SoloOracle', 'RelativeOracle', 'JointOracle']
cols = ["ResponseQuality.A_BETTER", "ResponseQuality.B_BETTER", "ResponseQuality.TIE"]
agreement_column = "is_quality_preserved_correct_1"

# One subset per oracle class, followed by the whole frame for "Summary".
subsets = [df[df['oracle_class'] == oracle_name] for oracle_name in indices] + [df]

summary = [
    [subset.loc[subset['original_label'] == col_name, agreement_column].mean() for col_name in cols]
    + [subset[agreement_column].mean()]
    for subset in subsets
]

pd.DataFrame(summary, index=indices + ["Summary"], columns=['A', 'B', 'Tie', "Total"])


Unnamed: 0,A,B,Tie,Total
RankOracle,0.888889,0.481481,0.36,0.563758
SoloOracle,0.244444,0.888889,0.86,0.684564
RelativeOracle,0.711111,0.611111,0.566667,0.645161
JointOracle,0.866667,0.611111,0.48,0.644295
Summary,0.677778,0.655556,0.566667,0.633333


In [14]:
from functools import partial
# Evaluate the proposed ("hopeful") is_quality_preserved rule.
# Rows = original verdict, columns = followup verdict;
# Y = counted as quality preserved, N = not preserved.
#     A B T
# A | N N N
# B | Y Y Y
# T | Y N Y
# SoloOracle rows are judged on the original verdict alone.
# Reload the data so in-place edits made to `df` by earlier cells do not carry over.
df = df_init()

def QAchecker(row, original, followup):
    """Apply the proposed is_quality_preserved rule to one row.

    ``original`` and ``followup`` name the columns holding the original and
    followup verdicts (labels or predictions).

    SoloOracle rows: True when the original verdict is B_BETTER or TIE.
    Other rows: looked up in the table indexed by
    (original verdict, followup verdict) via ``indexer``.
    """
    if row["oracle_class"] == "SoloOracle":
        return row[original] in ["ResponseQuality.B_BETTER", "ResponseQuality.TIE"]

    # preserved[original position][followup position]; positions are A, B, TIE.
    preserved = (
        (False, False, False),
        (True, True, True),
        (True, False, True),
    )
    original_position = indexer(row[original])
    followup_position = indexer(row[followup])
    return preserved[original_position][followup_position]

# Apply the rule to the labels and to the predictions, then mark the rows
# where the two outcomes agree.
label_flags_2 = df.apply(lambda row: QAchecker(row, original="original_label", followup="followup_label"), axis=1)
pred_flags_2 = df.apply(lambda row: QAchecker(row, original="original_pred", followup="followup_pred"), axis=1)
df["is_quality_preserved_label_2"] = label_flags_2
df["is_quality_preserved_pred_2"] = pred_flags_2
df["is_quality_preserved_correct_2"] = label_flags_2 == pred_flags_2

# Agreement rate between label-based and prediction-based outcomes, broken
# down by oracle class (rows) and by ground-truth label (columns). The last
# row covers all oracle classes together; the last column all labels.
indices = ['RankOracle', 'SoloOracle', 'RelativeOracle', 'JointOracle']
cols = ["ResponseQuality.A_BETTER", "ResponseQuality.B_BETTER", "ResponseQuality.TIE"]
agreement_column = "is_quality_preserved_correct_2"

# One subset per oracle class, followed by the whole frame for "Summary".
subsets = [df[df['oracle_class'] == oracle_name] for oracle_name in indices] + [df]

summary = [
    [subset.loc[subset['original_label'] == col_name, agreement_column].mean() for col_name in cols]
    + [subset[agreement_column].mean()]
    for subset in subsets
]

pd.DataFrame(summary, index=indices + ["Summary"], columns=['A', 'B', 'Tie', "Total"])


Unnamed: 0,A,B,Tie,Total
RankOracle,0.844444,0.481481,0.42,0.57047
SoloOracle,0.244444,0.888889,0.86,0.684564
RelativeOracle,0.688889,0.611111,0.6,0.645161
JointOracle,0.844444,0.611111,0.5,0.644295
Summary,0.655556,0.655556,0.594444,0.635185
