In [1]:
%load_ext autoreload
%autoreload 2

from src.experiment import get_fallacy_df, RESPONSE_ERROR
from src.analysis import get_sanity_check, add_identification_scores
import pandas as pd

# Human Benchmark for Fallacy Identification

## Create Dataset

In [2]:
# Path of the CSV for the human benchmark; the same path is reused by the
# export at the end of this section.
filename = 'data/fallacies_e1_human_empty.csv'
# NOTE(review): the log output shows a new dataframe being created here;
# presumably get_fallacy_df loads the file instead when it already exists,
# which would matter on a re-run after the export below — confirm.
df_fallacies = get_fallacy_df(filename)


[2024-11-06 12:54:14] Created new fallacy identification dataframe.


In [3]:
# Shuffle all rows; the fixed seed keeps the resulting order reproducible
SHUFFLE_SEED = 42
df_fallacies = df_fallacies.sample(frac=1, random_state=SHUFFLE_SEED)

In [4]:
# Give the reasoning steps of every (fallacy, label) combination a running
# number starting at 1. Answering any number of complete sets then still yields
# a balanced dataset: all fallacy types, with 50% correct and 50% incorrect
# reasoning steps.
steps_by_fallacy_and_label = df_fallacies.groupby(['fallacy', 'label'], observed=True)
df_fallacies['set_number'] = steps_by_fallacy_and_label.cumcount() + 1

In [5]:
# Verify that each set contains the same number of 1 and 0 labels
label_counts = df_fallacies.groupby(['set_number'], observed=True)['label'].value_counts()

# Enforce the check instead of relying on reading the table: with one column
# per label (a label absent from a set counts as 0), every set must show a
# single distinct count across its labels.
counts_by_label = label_counts.unstack(fill_value=0)
assert counts_by_label.nunique(axis=1).eq(1).all(), 'Sets are not balanced between labels'

# Last expression: still display the per-set label counts
label_counts

set_number  label
1           1        232
            0        232
2           1        232
            0        232
3           1        232
            0        232
4           1        232
            0        232
5           1        232
            0        232
6           1        232
            0        232
7           1        232
            0        232
8           1        232
            0        232
9           1        232
            0        232
10          1        232
            0        232
Name: count, dtype: int64

In [6]:
# Verify that each set contains all fallacy types
fallacies_per_set = df_fallacies.groupby(['set_number'], observed=True)['fallacy'].nunique()

# Enforce the check instead of relying on reading the table: every set must
# contain as many distinct fallacy types as the dataset as a whole.
total_fallacy_types = df_fallacies['fallacy'].nunique()
assert fallacies_per_set.eq(total_fallacy_types).all(), 'Some sets do not contain all fallacy types'

# Last expression: still display the per-set number of fallacy types
fallacies_per_set

set_number
1     232
2     232
3     232
4     232
5     232
6     232
7     232
8     232
9     232
10    232
Name: fallacy, dtype: int64

In [7]:
# Thanks to the random order, each set covers a wide variety of entities:
# count the distinct entities per set
steps_by_set = df_fallacies.groupby(['set_number'], observed=True)
steps_by_set['entity'].nunique()

set_number
1     436
2     450
3     446
4     433
5     440
6     435
7     442
8     440
9     439
10    442
Name: entity, dtype: int64

In [8]:
# Blind the test by keeping only the columns we need
blinded_columns = ['step', 'category', 'subcategory', 'set_number']
df_fallacies = df_fallacies[blinded_columns]

In [9]:
# Write the row index out under the header 'index'; it is the key used to join
# the responses back to the original dataset
df_fallacies.to_csv(path_or_buf=filename, index_label='index')

## Scoring and Sanity Check

In [10]:
# Responses of the models for experiment 1
df_fallacies_e1 = get_fallacy_df('data/fallacies_e1.csv')

# o1-preview was aborted due to high cost, so its response column is removed
df_fallacies_e1 = df_fallacies_e1.drop(columns='o1_preview_response')

# Human responses, indexed by the 'index' column of the CSV; missing values
# are turned into empty strings
df_fallancies_e1_human = pd.read_csv('data/fallacies_e1_human.csv', index_col='index').fillna('')

[2024-11-06 13:01:59] Loaded existing fallacy dataframe from data/fallacies_e1.csv.


In [11]:
# Attach the human columns via the shared index. Columns that already exist in
# the experiment dataframe are removed from the human frame before joining.
drop_columns = ['step', 'category', 'subcategory', 'set_number']
human_only_columns = df_fallancies_e1_human.drop(columns=drop_columns)
df_fallacies_e1 = df_fallacies_e1.join(human_only_columns)

In [12]:
# Add the identification scores for df_fallacies_e1. The return value is not
# used, so the helper presumably modifies the dataframe in place — confirm.
# NOTE(review): punish_missing=False presumably means missing responses are not
# scored as wrong — verify against src.analysis.
add_identification_scores(df_fallacies_e1, punish_missing=False)

In [13]:
# Check completeness of responses, predictions and scores.
# Empty strings and RESPONSE_ERROR markers are masked to missing values so that
# info() counts them as null. mask()/isin() is used instead of
# replace([...], None) because the handling of an explicit None replacement
# value differs between pandas versions (older releases pad-filled instead of
# nulling, which would make this check pass even with missing responses).
is_missing = df_fallacies_e1.isin(['', RESPONSE_ERROR])
df_fallacies_e1.mask(is_missing).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4640 entries, 0 to 4639
Data columns (total 52 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   step                          4640 non-null   object  
 1   entity                        4640 non-null   object  
 2   fallacy                       4640 non-null   category
 3   label                         4640 non-null   category
 4   category                      4640 non-null   category
 5   subcategory                   4640 non-null   category
 6   gpt_4o_response               4640 non-null   object  
 7   gpt_4_response                4640 non-null   object  
 8   gpt_4o_mini_response          4640 non-null   object  
 9   claude_3_5_sonnet_response    4640 non-null   object  
 10  claude_3_opus_response        4640 non-null   object  
 11  claude_3_haiku_response       4640 non-null   object  
 12  gemini_1_5_pro_response       4640 non-null   ob

In [14]:
# Display the per-model sanity check table (its columns in the output are the
# mean response length, missing responses and invalid predictions)
get_sanity_check(df_fallacies_e1)

Unnamed: 0,response_length_mean,missing_responses,invalid_predictions
gpt_4o,3.3,0,0
gpt_4,2.5,0,0
gpt_4o_mini,3.4,0,0
claude_3_5_sonnet,2.4,0,0
claude_3_opus,3.2,0,0
claude_3_haiku,3.3,0,0
gemini_1_5_pro,2.5,0,0
gemini_1_5_flash,2.4,0,0
gemini_1_5_flash_8b,2.4,0,0
llama_3_1_70b,2.7,0,1
