In [91]:
import pandas as pd

In [92]:
# Read the evaluation CSV into a DataFrame; the path is relative to the
# notebook's working directory.
results_path = "../outputs/self_generated_eval.csv"
results = pd.read_csv(results_path)

In [93]:
# Flag each row as correct when its prediction matches its answer, and store
# the flag in a new `correct` column of the DataFrame.
# Both sides are normalized before comparing: leading/trailing whitespace is
# stripped, text is lowercased, and every space character is removed.
# NOTE(review): rows with a missing answer or prediction presumably stay NaN
# through the `.str` chain and so never compare equal — confirm that counting
# them as incorrect is intended.
def _normalize(column):
    """Return the string column stripped, lowercased, and with spaces removed."""
    return column.str.strip().str.lower().str.replace(" ", "")


results["correct"] = _normalize(results["answer"]) == _normalize(results["predictions"])

# Analyze the results

## Overall Accuracy

In [94]:
# Tally the evaluation: total rows, rows flagged correct, the remainder as
# incorrect, and the fraction correct.
total_predictions = len(results)
num_correct = results["correct"].sum()
num_incorrect = total_predictions - num_correct
accuracy = num_correct / total_predictions

# Emit one line per figure; accuracy is shown as a percentage with two decimals.
print(
    f"Total number of predictions: {total_predictions}",
    f"Number of correct predictions: {num_correct}",
    f"Number of incorrect predictions: {num_incorrect}",
    f"Overall Accuracy: {accuracy:.2%}",
    sep="\n",
)


Total number of predictions: 960
Number of correct predictions: 284
Number of incorrect predictions: 676
Overall Accuracy: 29.58%


## Accuracy by the direction of the task

In [95]:
# Accuracy per task direction: the mean of the boolean `correct` flag over the
# rows whose `direction` equals the given value, shown with two decimals.
for direction in ("row", "column"):
    direction_mask = results["direction"] == direction
    direction_accuracy = results.loc[direction_mask, "correct"].mean()
    print(f"Accuracy for {direction} direction: {direction_accuracy:.2%}")

Accuracy for row direction: 41.67%
Accuracy for column direction: 17.50%


## Accuracy by the type of the task

In [96]:
# Accuracy per task type: the mean of the boolean `correct` flag over the rows
# whose `task` equals the given value, shown with two decimals.
for task_type in ("arithmetic", "list_items"):
    task_mask = results["task"] == task_type
    task_accuracy = results.loc[task_mask, "correct"].mean()
    print(f"Accuracy for {task_type} task: {task_accuracy:.2%}")


Accuracy for arithmetic task: 38.96%
Accuracy for list_items task: 20.21%


## Accuracy by the size of the table

In [97]:
# Accuracy per table size: the mean of the boolean `correct` flag over the rows
# whose `size` equals the given value, labelled as an NxN table.
for table_size in (4, 6, 8, 10, 12):
    size_mask = results["size"] == table_size
    size_accuracy = results.loc[size_mask, "correct"].mean()
    print(f"Accuracy for {table_size}x{table_size} table: {size_accuracy:.2%}")

Accuracy for 4x4 table: 48.44%
Accuracy for 6x6 table: 32.81%
Accuracy for 8x8 table: 32.29%
Accuracy for 10x10 table: 16.67%
Accuracy for 12x12 table: 17.71%


## Accuracy by the task and direction

In [98]:
# Accuracy for every (task, direction) pair, shown with three decimals.
for task in ("arithmetic", "list_items"):
    # The task filter does not depend on the inner loop, so build it once.
    task_mask = results["task"] == task
    for direction in ("row", "column"):
        pair_mask = task_mask & (results["direction"] == direction)
        pair_accuracy = results.loc[pair_mask, "correct"].mean()
        print(f"Accuracy for {task} {direction}: {pair_accuracy:.3%}")


Accuracy for arithmetic row: 44.167%
Accuracy for arithmetic column: 33.750%
Accuracy for list_items row: 39.167%
Accuracy for list_items column: 1.250%


In [99]:
# Re-print the previously computed overall accuracy with three decimals, the
# same precision as the task-by-direction breakdown.
overall_accuracy_text = f"{accuracy:.3%}"
print(f"Overall Accuracy: {overall_accuracy_text}")

Overall Accuracy: 29.583%
