In [27]:
%load_ext autoreload
%autoreload 2

from src.experiment import get_fallacy_df, RESPONSE_ERROR
from src.analysis import get_sanity_check, add_identification_scores
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Human Benchmark for Fallacy Identification

## Create Dataset

In [33]:
# Path of the blank answer sheet for the human benchmark. The same path is
# reused by the to_csv cell further down, so the blinded, shuffled frame ends
# up in this file.
filename = 'data/fallacies_e1_human_empty.csv'
# In the recorded run this call logged that a new fallacy identification
# dataframe was created (rather than an existing one being loaded).
df_fallacies = get_fallacy_df(filename)


[2024-11-03 18:06:45] Created new fallacy identification dataframe.


In [34]:
# Give every reasoning step a running number inside its (fallacy, label) group.
# Answering only sets 1..k then still yields a dataset that is balanced across
# fallacy types and labels.
group_keys = ['fallacy', 'label']
position_in_group = df_fallacies.groupby(group_keys, observed=True).cumcount()
df_fallacies['set_number'] = position_in_group + 1  # cumcount is 0-based

In [35]:
# Shuffle all rows; the fixed seed keeps the shuffled order reproducible.
shuffle_seed = 42
df_fallacies = df_fallacies.sample(frac=1, random_state=shuffle_seed)

In [36]:
# Blind the test: keep nothing but the text to judge and the set number.
blinded_columns = ['step', 'set_number']
df_fallacies = df_fallacies[blinded_columns]

In [37]:
# Write the original row index into an 'index' column so the answers can be
# joined back to the original dataset later.
df_fallacies.to_csv(
    path_or_buf=filename,
    index_label='index',
    index=True,
)

## Scoring and Sanity Check

In [20]:
# Model responses for experiment 1. The o1-preview run was aborted due to high
# cost, so its response column is dropped immediately after loading.
df_fallacies_e1 = (
    get_fallacy_df('data/fallacies_e1.csv')
    .drop(columns='o1_preview_response')
)

# Human answers, keyed by the exported 'index' column; blank cells are read as
# NaN and turned into empty strings. The variable name is kept as spelled
# because the join cell below refers to it.
df_fallancies_human = (
    pd.read_csv('data/fallacies_e1_human.csv')
    .set_index('index')
    .fillna('')
)

[2024-11-03 17:40:07] Loaded existing fallacy dataframe from data/fallacies_e1.csv.


In [21]:
# Collect the human columns whose name ends in '_response' and attach them to
# the experiment frame, aligned on the row index.
human_columns = df_fallancies_human.columns
response_cols = [name for name in human_columns if name.endswith('_response')]
human_responses = df_fallancies_human[response_cols]
df_fallacies_e1 = df_fallacies_e1.join(human_responses)

In [25]:
# Score the responses held in df_fallacies_e1. The displayed result carries a
# `<name>_pred` and a `<name>_score` column per responder (e.g. o1_mini_pred,
# adrian_score).
# NOTE(review): punish_missing=False presumably means missing answers are not
# counted as wrong — confirm in src.analysis.
add_identification_scores(df_fallacies_e1, punish_missing=False)

Unnamed: 0,step,entity,fallacy,label,category,subcategory,gpt_4o_response,gpt_4_response,gpt_4o_mini_response,claude_3_5_sonnet_response,...,llama_3_1_8b_pred,llama_3_1_8b_score,mistral_large_2_pred,mistral_large_2_score,mistral_small_2_pred,mistral_small_2_score,o1_mini_pred,o1_mini_score,adrian_pred,adrian_score
0,Since John asked Maria if she used the last of...,tepas,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,1,1,1,1,1,1,1,1,,
1,Since Alice asked if Bob knew what an 'ossia' ...,ossia,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,1,1,1,1,1,1,1,1,,
2,Since Alice claims that the Hausdorff contents...,hausdorff contents,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,1,1,1,1,1,1,1,1,,
3,"Since Tom, a seasoned tugboater, said that ice...",tugboaters,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,1,1,1,1,1,1,1,1,,
4,Since John accuses Mary of being terrified of ...,beewolf,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,1,1,1,1,1,1,1,1,,


In [26]:
# Check completeness of responses, predictions and scores.
# Empty strings and the RESPONSE_ERROR value are converted to missing values so
# that .info() reports them as null in its non-null counts.
# pd.NA is passed as the replacement instead of None: the meaning of an
# explicit value=None in DataFrame.replace differs between pandas versions
# (releases before 1.4 ignored it and pad-filled the matched cells instead,
# which would hide missing responses here).
df_fallacies_e1.replace(['', RESPONSE_ERROR], pd.NA).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4640 entries, 0 to 4639
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   step                          4640 non-null   object  
 1   entity                        4640 non-null   object  
 2   fallacy                       4640 non-null   category
 3   label                         4640 non-null   category
 4   category                      4640 non-null   category
 5   subcategory                   4640 non-null   category
 6   gpt_4o_response               4640 non-null   object  
 7   gpt_4_response                4640 non-null   object  
 8   gpt_4o_mini_response          4640 non-null   object  
 9   claude_3_5_sonnet_response    4640 non-null   object  
 10  claude_3_opus_response        4640 non-null   object  
 11  claude_3_haiku_response       4640 non-null   object  
 12  gemini_1_5_pro_response       4640 non-null   ob

In [24]:
# Summarise response quality per responder. The resulting table has one row
# per responder with the columns response_length_mean, missing_responses and
# invalid_predictions.
get_sanity_check(df_fallacies_e1)

Unnamed: 0,response_length_mean,missing_responses,invalid_predictions
gpt_4o,3.3,0,0
gpt_4,2.5,0,0
gpt_4o_mini,3.4,0,0
claude_3_5_sonnet,2.4,0,0
claude_3_opus,3.2,0,0
claude_3_haiku,3.3,0,0
gemini_1_5_pro,2.5,0,0
gemini_1_5_flash,2.4,0,0
gemini_1_5_flash_8b,2.4,0,0
llama_3_1_70b,2.7,0,1
