Reading the dataset and extract the answer columns

In [59]:
import pandas as pd
from utils import extract_float, clean_string

output = "./output/chat-gpt4.csv"

df = pd.read_csv(output)
df = df[["Given Answer", "Model Answer", "Processed Answer"]]

df.tail(10)


Unnamed: 0,Given Answer,Model Answer,Processed Answer
1301,-28,-28,-28.0
1302,34,$34 million,34.0
1303,51%,50.8%,50.8
1304,38.9%,38.95%,38.95
1305,379,$367 million,367.0
1306,177.7,$177 million,177.0
1307,8%,8%,8.0
1308,1687,1687,1687.0
1309,-9.3,$9.3 billion,9.3
1310,-89%,-88.57%,-88.57


In [60]:

from utils import standardize_and_compare


def transform(x):
    try:
        # return float(extract_float(clean_string(str(x))))
        y = clean_string(str(x))
        # print(y)
        return float(extract_float(y))
    except Exception as e:
        # print(e)
        return -1

# df["A#round"] = df["Model Answer"].apply(lambda x: round(transform(x), 2))
df["A#Exact"] = df.apply(lambda x: (str(x["Given Answer"]) == str(x["Processed Answer"]) ) , axis=1)
df["A#Processed"] = df.apply(lambda x: standardize_and_compare(transform(x["Given Answer"]) , transform(x["Processed Answer"]) ) , axis=1)
df["A#round2"] = df.apply(lambda x: (round(transform(x["Given Answer"]),1) == round(transform(x["Processed Answer"]),1)), axis=1)
df["A#round1"] = df.apply(lambda x: (round(transform(x["Given Answer"]),1) == round(transform(x["Processed Answer"]),1)), axis=1)
df["A#round0"] = df.apply(lambda x: (round(transform(x["Given Answer"]),0) == round(transform(x["Processed Answer"]),0)), axis=1)
df["A#range5"] = df.apply(lambda x: abs(transform(x["Given Answer"]) - transform(x["Processed Answer"])) < 0.5, axis=1)
df["A#range4"] = df.apply(lambda x: abs(transform(x["Given Answer"]) - transform(x["Processed Answer"])) < 0.4, axis=1)
df["A#range3"] = df.apply(lambda x: abs(transform(x["Given Answer"]) - transform(x["Processed Answer"])) < 0.3, axis=1)
df["A#range2"] = df.apply(lambda x: abs(transform(x["Given Answer"]) - transform(x["Processed Answer"])) < 0.2, axis=1)
df["A#range1"] = df.apply(lambda x: abs(transform(x["Given Answer"]) - transform(x["Processed Answer"])) < 0.1, axis=1)

df.tail(10)

Unnamed: 0,Given Answer,Model Answer,Processed Answer,A#Exact,A#Processed,A#round2,A#round1,A#round0,A#range5,A#range4,A#range3,A#range2,A#range1
1301,-28,-28,-28.0,False,True,True,True,True,True,True,True,True,True
1302,34,$34 million,34.0,False,True,True,True,True,True,True,True,True,True
1303,51%,50.8%,50.8,False,False,False,False,True,True,True,True,False,False
1304,38.9%,38.95%,38.95,False,False,False,False,True,True,True,True,True,True
1305,379,$367 million,367.0,False,False,False,False,False,False,False,False,False,False
1306,177.7,$177 million,177.0,False,False,False,False,False,False,False,False,False,False
1307,8%,8%,8.0,False,True,True,True,True,True,True,True,True,True
1308,1687,1687,1687.0,False,True,True,True,True,True,True,True,True,True
1309,-9.3,$9.3 billion,9.3,False,False,False,False,False,False,False,False,False,False
1310,-89%,-88.57%,-88.57,False,False,False,False,True,True,False,False,False,False


In [61]:

length = len(df)
answers = ["A#Exact", "A#Processed", "A#round2", "A#round1", "A#round0", "A#range5", "A#range4", "A#range3", "A#range2", "A#range1"]

columns = ["Answer", "Total", "Count", "%"]

rows = []
for answer in answers:
    true_rows = df[df[answer] == True]
    num_true_rows = len(true_rows)
    rows.append({"Answer": answer, "Total": length, "Count": num_true_rows, "%": 100*num_true_rows/length})

df_analysis = pd.DataFrame(rows)

df_analysis.sort_values(by="%", ascending=False)

Unnamed: 0,Answer,Total,Count,%
5,A#range5,1311,825,62.929062
4,A#round0,1311,807,61.556064
6,A#range4,1311,794,60.564455
7,A#range3,1311,743,56.674294
8,A#range2,1311,703,53.623188
9,A#range1,1311,650,49.580473
2,A#round2,1311,578,44.088482
3,A#round1,1311,578,44.088482
1,A#Processed,1311,326,24.866514
0,A#Exact,1311,69,5.263158


Now we calculate the F1 of each Answer type

In [62]:
# True Positives (TP): 
# Answers that the model got exactly right or answers that are different from the given answers but are still correct upon verification.
TP = df_analysis[df_analysis["Answer"] == "A#Exact"]["Count"].values[0]
print("TP",TP)

# False Positives (FP): Answers that the model got wrong. This would be the total answers given minus the true positives.
FP = length - TP
print("FP",FP)

# False Negatives (FN): This would normally be the correct answers that the model failed to identify. 
FN = df_analysis.max()["Count"] - TP
print("FN",FN)

Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1 = 2 * (Precision * Recall) / (Precision + Recall)

print("Precision", Precision)
print("Recall", Recall)
print("F1", F1)

for answer in answers:
    df_analysis["TP"] = df_analysis["Count"]
    df_analysis["FP"] = length - df_analysis["TP"]
    df_analysis["FN"] = df_analysis.max()["Count"] - df_analysis["TP"] # always the max
    df_analysis["Precision"] = df_analysis["TP"] / (df_analysis["TP"] + df_analysis["FP"])
    df_analysis["Recall"] = df_analysis["TP"] / (df_analysis["TP"] + df_analysis["FN"])
    df_analysis["F1"] = 2 * (df_analysis["Precision"] * df_analysis["Recall"]) / (df_analysis["Precision"] + df_analysis["Recall"])
    
df_analysis.sort_values(by="F1", ascending=False)

TP 69
FP 1242
FN 756
Precision 0.05263157894736842
Recall 0.08363636363636363
F1 0.06460674157303371


Unnamed: 0,Answer,Total,Count,%,TP,FP,FN,Precision,Recall,F1
5,A#range5,1311,825,62.929062,825,486,0,0.629291,1.0,0.772472
4,A#round0,1311,807,61.556064,807,504,18,0.615561,0.978182,0.755618
6,A#range4,1311,794,60.564455,794,517,31,0.605645,0.962424,0.743446
7,A#range3,1311,743,56.674294,743,568,82,0.566743,0.900606,0.695693
8,A#range2,1311,703,53.623188,703,608,122,0.536232,0.852121,0.65824
9,A#range1,1311,650,49.580473,650,661,175,0.495805,0.787879,0.608614
2,A#round2,1311,578,44.088482,578,733,247,0.440885,0.700606,0.541199
3,A#round1,1311,578,44.088482,578,733,247,0.440885,0.700606,0.541199
1,A#Processed,1311,326,24.866514,326,985,499,0.248665,0.395152,0.305243
0,A#Exact,1311,69,5.263158,69,1242,756,0.052632,0.083636,0.064607
