In [5]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
from src.llms import get_llms, init_langchain, LLM
from src.tuning import TuningSet
from src.experiment import (
    get_fallacy_df, save_fallacy_df, run_experiment,
    get_identification_zero_shot_prompt_template,
    get_identification_few_shot_prompt_template,
    get_identification_cot_prompt_template,
    get_classification_prompt_template
)

# Load environment variables from .env file
load_dotenv()

# Project-specific LangChain setup from src.llms; presumably has to run before
# get_llms() is used below — TODO confirm against src/llms.py.
init_langchain()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Fallacy Experiments

## Fallacy Identification

### Experiment 1: Fallacy Identification with zero-shot Prompt

In [9]:
# CSV holding the experiment 1 reasoning steps together with the per-model
# response columns (see the preview below).
e1_filename = 'data/fallacies_e1.csv'
# The logged output shows this loads the existing CSV; what happens when the
# file is missing is not visible from this notebook.
df_fallacies_e1 = get_fallacy_df(e1_filename)
# Preview the first rows.
df_fallacies_e1.head()

[2024-11-02 16:34:46] Loaded existing fallacy dataframe from data/fallacies_e1.csv.


Unnamed: 0,step,entity,fallacy,label,category,subcategory,gpt_4o_response,gpt_4_response,gpt_4o_mini_response,claude_3_5_sonnet_response,...,claude_3_haiku_response,gemini_1_5_pro_response,gemini_1_5_flash_response,gemini_1_5_flash_8b_response,llama_3_1_70b_response,llama_3_1_8b_response,mistral_large_2_response,mistral_small_2_response,o1_preview_response,o1_mini_response
0,Since John asked Maria if she used the last of...,tepas,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,No.,No,No,No,No,No,No.,No.,No,No
1,Since Alice asked if Bob knew what an 'ossia' ...,ossia,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,No.,No,No,No,No,No,No.,No.,No.,No
2,Since Alice claims that the Hausdorff contents...,hausdorff contents,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,No.,No,No,No,No,No,No.,No.,No.,No
3,"Since Tom, a seasoned tugboater, said that ice...",tugboaters,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,No.,Yes,No,No,No,No.,No.,No.,No.,No
4,Since John accuses Mary of being terrified of ...,beewolf,Argument from Silence,1,informal,insufficiency,No.,No,No.,No,...,No.,No,No,No,No,No,No.,No.,No.,No


In [10]:
# Zero-shot identification prompt; printed so the exact wording is recorded
# next to the results. The printed text ends with a "[step]" placeholder.
prompt_template_e1 = get_identification_zero_shot_prompt_template()
print(prompt_template_e1)

Is the following reasoning step correct? You can only answer "Yes" or "No".
[step]


In [11]:
# Model evaluated in this run: OpenAI o1-mini.
llms = get_llms([LLM.O1_MINI])

# Send the zero-shot prompt to the selected model. Per the logged output,
# run_experiment reports progress every 100 responses and logs (rather than
# raises) LLM invocation errors. sleep_seconds presumably throttles the
# requests — TODO confirm.
run_experiment(
    df_fallacies_e1,
    e1_filename,
    prompt_template_e1,
    llms,
    sleep_seconds=0.2,
)

# Persist the dataframe to the experiment 1 CSV path.
save_fallacy_df(df_fallacies_e1, e1_filename)

[2024-11-02 16:38:58] Processed 100 responses for LLM o1_mini (index=199).
[2024-11-02 16:43:03] Processed 200 responses for LLM o1_mini (index=299).
[2024-11-02 16:47:36] Processed 300 responses for LLM o1_mini (index=399).
[2024-11-02 16:47:48] Error invoking LLM o1_mini: Error code: 400 - {'error': {'message': 'Invalid prompt: your prompt was flagged as potentially violating our usage policy. Please try again with a different prompt: https://platform.openai.com/docs/guides/reasoning/advice-on-prompting', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_prompt'}}
[2024-11-02 16:48:13] Error invoking LLM o1_mini: Error code: 400 - {'error': {'message': 'Invalid prompt: your prompt was flagged as potentially violating our usage policy. Please try again with a different prompt: https://platform.openai.com/docs/guides/reasoning/advice-on-prompting', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_prompt'}}
[2024-11-02 16:51:33] Processed 400 responses for

OpenAI o1-preview cost:
- 60 zero-shot prompt responses cost \$1.64
- Estimated cost for 4640 prompts is \$126.83

OpenAI o1-mini cost:
- 100 zero-shot prompt responses cost \$0.38
- Estimated cost for 4640 prompts is \$17.63
- Actual cost for 4640 prompts was \$15.89

### Experiment 2: Fallacy Identification with few-shot Prompt

In [18]:
# CSV holding the experiment 2 (few-shot) reasoning steps and per-model
# response columns.
e2_filename = 'data/fallacies_e2.csv'
# The logged output shows this loads the existing CSV.
df_fallacies_e2 = get_fallacy_df(e2_filename)
# Preview the first rows.
df_fallacies_e2.head()

[2024-10-31 14:14:12] Loaded existing fallacy dataframe from data/fallacies_e2.csv.


Unnamed: 0,step,entity,fallacy,label,category,subcategory,gpt_4o_response,claude_3_5_sonnet_response,gemini_1_5_pro_response,gpt_4o_mini_response,claude_3_haiku_response,gemini_1_5_flash_response,gemini_1_5_flash_8b_response,llama_3_1_70b_response,llama_3_1_8b_response,mistral_large_2_response,mistral_small_2_response
0,Since John asked Maria if she used the last of...,tepas,Argument from Silence,1,informal,insufficiency,No.,No.,No.,No.,No.,No,No,No.,No.,No.,No.
1,Since Alice asked if Bob knew what an 'ossia' ...,ossia,Argument from Silence,1,informal,insufficiency,No.,No.,No.,No.,No.,No,No,No.,No.,No.,No.
2,Since Alice claims that the Hausdorff contents...,hausdorff contents,Argument from Silence,1,informal,insufficiency,No.,No.,No.,No.,No.,No,No,No.,No.,No.,No.
3,"Since Tom, a seasoned tugboater, said that ice...",tugboaters,Argument from Silence,1,informal,insufficiency,No.,No.,Yes.,No.,No.,No,No,Yes,No.,Yes. No. No. Yes. No.,No.
4,Since John accuses Mary of being terrified of ...,beewolf,Argument from Silence,1,informal,insufficiency,No.,No.,No.,No.,No.,No,No,No.,No.,No.,No.


In [19]:
# Few-shot identification prompt; the printed text shows four worked
# Yes/No examples followed by the "[step]" placeholder.
prompt_template_e2 = get_identification_few_shot_prompt_template()
print(prompt_template_e2)

Is the following reasoning step correct? You can only answer "Yes" or "No".
Since if it's raining then the streets are wet and it's raining now, therefore, the streets are wet.
Yes.
Since I found a shell on the beach and this shell was beautifully shaped and colored, therefore, all shells are beautifully shaped and colored.
No.
Since I am at home or I am in the city and I am at home, therefore, I am not in the city.
No.
Since heavy snowfall often leads to traffic jams and traffic jams cause delays, therefore, heavy snowfall can lead to delays.
Yes.
[step]


In [20]:
# Models evaluated in this run: Llama 3.1 70B and Llama 3.1 8B.
llms = get_llms([LLM.LLAMA_3_1_70B, LLM.LLAMA_3_1_8B])

# Send the few-shot prompt to the selected models.
# sleep_seconds presumably throttles the requests — TODO confirm.
run_experiment(
    df_fallacies_e2,
    e2_filename,
    prompt_template_e2,
    llms,
    sleep_seconds=0.2,
)

# Persist the dataframe to the experiment 2 CSV path.
save_fallacy_df(df_fallacies_e2, e2_filename)


### Experiment 3: Fallacy Identification with chain-of-thought Prompt

In [2]:
# CSV holding the experiment 3 (chain-of-thought) reasoning steps and
# per-model response columns.
e3_filename = 'data/fallacies_e3.csv'
# The logged output shows this loads the existing CSV.
df_fallacies_e3 = get_fallacy_df(e3_filename)
# Preview the first rows.
df_fallacies_e3.head()

[2024-11-01 08:22:52] Loaded existing fallacy dataframe from data/fallacies_e3.csv.


Unnamed: 0,step,entity,fallacy,label,category,subcategory,gpt_4o_response,claude_3_5_sonnet_response,gemini_1_5_pro_response,gpt_4o_mini_response,claude_3_haiku_response,gemini_1_5_flash_response,gemini_1_5_flash_8b_response,llama_3_1_70b_response,llama_3_1_8b_response,mistral_large_2_response,mistral_small_2_response
0,Since John asked Maria if she used the last of...,tepas,Argument from Silence,1,informal,insufficiency,No. The reasoning step is not necessarily corr...,Let's think through this step-by-step: 1. Joh...,No. Silence is not necessarily an admission o...,No.,Let's analyze this step-by-step: 1. John aske...,No. Silence doesn't necessarily mean guilt or...,No. Silence doesn't necessarily mean agreemen...,To evaluate the correctness of the reasoning s...,No. This reasoning step is incorrect because ...,"No, the reasoning step is not necessarily corr...","No, the reasoning step is not correct. Maria's..."
1,Since Alice asked if Bob knew what an 'ossia' ...,ossia,Argument from Silence,1,informal,insufficiency,No. The reasoning step is not correct. Bob's s...,Let's think through this step by step: 1. Ali...,No. Silence could indicate a lack of knowledg...,No.,Let's analyze this step-by-step: 1. Alice ask...,No. Silence doesn't imply knowledge. Bob's s...,No. Silence does not equate to knowledge. Bo...,"No, the reasoning step is not correct. The c...",No. This reasoning is incorrect because remai...,No. Bob's silence does not necessarily indicat...,"No, the reasoning step is not correct. Bob's s..."
2,Since Alice claims that the Hausdorff contents...,hausdorff contents,Argument from Silence,1,informal,insufficiency,No. The reasoning step is not correct. The con...,Let's think through this step-by-step: 1. Ali...,No. Bob's silence doesn't confirm Alice's cla...,No.,Let's analyze this step-by-step: 1. Alice cla...,No. Bob's silence doesn't provide evidence su...,"No. Alice's claim, even if true, and Bob's si...",To evaluate the correctness of the reasoning s...,No. This reasoning is incorrect because it as...,"No, the reasoning step is not correct. Bob's s...","No, the reasoning step is not correct. Just be..."
3,"Since Tom, a seasoned tugboater, said that ice...",tugboaters,Argument from Silence,1,informal,insufficiency,No. The reasoning step is not necessarily corr...,Let's think through this step-by-step: 1. The...,"No. While Tom's experience is valuable, the a...",No.,"Let's analyze this step-by-step: 1. Tom, a se...",No. The reasoning commits an appeal to author...,No. Just because one person says something an...,To evaluate the correctness of the reasoning s...,No. The reasoning step is incorrect for sever...,"No, the reasoning step is not necessarily corr...","No, the reasoning step is not correct. Here's ..."
4,Since John accuses Mary of being terrified of ...,beewolf,Argument from Silence,1,informal,insufficiency,No. The reasoning step is not correct. The fac...,Let's think through this step-by-step: 1. Joh...,No. Silence does not imply guilt or fear. Ma...,No.,Let's analyze this step-by-step: 1. John accu...,No. Silence does not equal guilt or fear. Ma...,No. Silence in the face of an accusation does...,"No, the reasoning step is not correct. Here'...",No. This reasoning step is incorrect because ...,No. Mary's silence does not necessarily mean s...,"No, the reasoning step is not correct. Just be..."


In [3]:
# Chain-of-thought identification prompt; unlike the zero-shot variant, the
# printed text asks the model to think step by step before answering.
prompt_template_e3 = get_identification_cot_prompt_template()
print(prompt_template_e3)

Is the following reasoning step correct?
Let's think step by step and then answer "Yes" or "No".
[step]


In [4]:
# Models evaluated in this run: Llama 3.1 70B and Llama 3.1 8B.
llms = get_llms([LLM.LLAMA_3_1_70B, LLM.LLAMA_3_1_8B])

# Send the chain-of-thought prompt to the selected models.
# sleep_seconds presumably throttles the requests — TODO confirm.
run_experiment(
    df_fallacies_e3,
    e3_filename,
    prompt_template_e3,
    llms,
    sleep_seconds=0.2,
)

# Persist the dataframe to the experiment 3 CSV path.
save_fallacy_df(df_fallacies_e3, e3_filename)


## Fallacy Classification

### Experiment 4: Fallacy Classification with zero-shot Prompt

In [9]:
# CSV holding the experiment 4 (classification) reasoning steps and
# per-model response columns.
e4_filename = 'data/fallacies_e4.csv'
# only_incorrect=True presumably restricts the frame to fallacious steps,
# matching the classification prompt's "incorrect reasoning step" wording —
# TODO confirm against get_fallacy_df.
df_fallacies_e4 = get_fallacy_df(e4_filename, only_incorrect=True)

# Preview the first rows.
df_fallacies_e4.head()

[2024-11-01 08:38:53] Loaded existing fallacy dataframe from data/fallacies_e4.csv.


Unnamed: 0,step,entity,fallacy,label,category,subcategory,gpt_4o_response,gpt_4_response,gpt_4o_mini_response,claude_3_5_sonnet_response,claude_3_haiku_response,gemini_1_5_pro_response,gemini_1_5_flash_8b_response,o1_preview_response,mistral_large_2_response,mistral_small_2_response,llama_3_1_70b_response
0,Since John asked Maria if she used the last of...,tepas,Argument from Silence,1,informal,insufficiency,Argument from Silence,Argument from Silence,Affirming the Consequent,Argument from Silence,Denying the Antecedent,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence
1,Since Alice asked if Bob knew what an 'ossia' ...,ossia,Argument from Silence,1,informal,insufficiency,Argument from Silence,Argument from Silence,Affirming the Consequent,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,(164) Argument from Silence,Argument from Silence
2,Since Alice claims that the Hausdorff contents...,hausdorff contents,Argument from Silence,1,informal,insufficiency,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Appeal to Silence,Argument from Silence
3,"Since Tom, a seasoned tugboater, said that ice...",tugboaters,Argument from Silence,1,informal,insufficiency,Argument from Silence,(79) Appeal to Authority,Appeal to Authority,Appeal to Authority,Appeal to Authority,Appeal to Authority,Appeal to Authority,Argument from Silence,Argument from Silence,Appeal to Authority,Argument from Silence
4,Since John accuses Mary of being terrified of ...,beewolf,Argument from Silence,1,informal,insufficiency,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence,Argument from Silence


In [10]:
# Classification prompt; the printed text lists the numbered fallacy types
# the model must choose from.
prompt_template_e4 = get_classification_prompt_template()
print(prompt_template_e4)

You are a logical fallacy classifier. Given an incorrect reasoning step, your task is to identify its type of fallacy.
Answer by choosing one of these fallacies:
(1) Affirming the Consequent
(2) Denying the Antecedent
(3) Negating Antecedent and Consequent
(4) Commutation of Conditionals
(5) Affirming a Disjunct
(6) Denying a Conjunct
(7) Fallacy of the Undistributed Middle
(8) Exclusive Premises
(9) Fallacy of Four Terms
(10) Illicit Substitution of Identicals
(11) Illicit Minor
(12) Illicit Major
(13) Negative Conclusion from Affirmative Premises
(14) Affirmative Conclusion from a Negative Premise
(15) False Conversion
(16) Unwarranted Contrast
(17) Quantifier Shift Fallacy
(18) Existential Fallacy
(19) Fallacy of Every and All
(20) Illicit Contraposition
(21) Gamblers Fallacy
(22) Hot Hand Fallacy
(23) Conjunction Fallacy
(24) Disjunction Fallacy
(25) Argument of the Beard
(26) Appeal to Extremes
(27) Type Token Fallacy
(28) Use Mention Error
(29) Reification
(30) Fake Precision
(31

In [11]:
# Models evaluated in this run: Llama 3.1 70B and Llama 3.1 8B.
llms = get_llms([LLM.LLAMA_3_1_70B, LLM.LLAMA_3_1_8B])

# Send the classification prompt to the selected models. Per the logged
# output, run_experiment reports progress every 100 responses.
# sleep_seconds presumably throttles the requests — TODO confirm.
run_experiment(
    df_fallacies_e4,
    e4_filename,
    prompt_template_e4,
    llms,
    sleep_seconds=0.1,
)

# Persist the dataframe to the experiment 4 CSV path.
save_fallacy_df(df_fallacies_e4, e4_filename)

[2024-11-01 08:41:10] Processed 100 responses for LLM llama_3_1_70b (index=338).
[2024-11-01 08:43:39] Processed 200 responses for LLM llama_3_1_70b (index=438).
[2024-11-01 08:46:31] Processed 300 responses for LLM llama_3_1_70b (index=538).
[2024-11-01 08:48:54] Processed 400 responses for LLM llama_3_1_70b (index=638).
[2024-11-01 08:51:39] Processed 500 responses for LLM llama_3_1_70b (index=738).
[2024-11-01 08:54:39] Processed 600 responses for LLM llama_3_1_70b (index=838).
[2024-11-01 08:57:17] Processed 700 responses for LLM llama_3_1_70b (index=938).
[2024-11-01 08:59:49] Processed 800 responses for LLM llama_3_1_70b (index=1038).
[2024-11-01 09:02:41] Processed 900 responses for LLM llama_3_1_70b (index=1138).
[2024-11-01 09:05:25] Processed 1000 responses for LLM llama_3_1_70b (index=1238).
[2024-11-01 09:08:19] Processed 1100 responses for LLM llama_3_1_70b (index=1338).
[2024-11-01 09:11:05] Processed 1200 responses for LLM llama_3_1_70b (index=1438).
[2024-11-01 09:13:50

### Experiment 5: Fallacy Classification with Fine-Tuning

In [3]:
# CSV holding the experiment 5 (fine-tuning) data.
e5_filename = 'data/fallacies_e5.csv'
# only_incorrect=True presumably restricts the frame to fallacious steps —
# TODO confirm against get_fallacy_df.
df_fallacies_e5 = get_fallacy_df(e5_filename, only_incorrect=True)

# Select only test set. The explicit .copy() makes the subset an independent
# frame, so if run_experiment writes response columns into it, pandas does not
# emit a SettingWithCopyWarning about assigning to a slice.
# NOTE(review): this subset is later passed to save_fallacy_df() with the same
# e5_filename — confirm that saving does not drop the non-test rows from the CSV.
df_fallacies_e5 = df_fallacies_e5[df_fallacies_e5['tuning'] == TuningSet.TEST.value].copy()

[2024-10-28 14:41:15] Loaded existing fallacy dataframe from data/fallacies_e5.csv.


In [4]:
# Experiment 5 uses the same classification prompt builder as experiment 4.
prompt_template_e5 = get_classification_prompt_template()

# Model evaluated in this run: the fine-tuned GPT-4o-mini.
llms = get_llms([LLM.GPT_4O_MINI_TUNED])

# Send the classification prompt to the fine-tuned model. Per the logged
# output, run_experiment reports progress every 100 responses.
# sleep_seconds presumably throttles the requests — TODO confirm.
run_experiment(
    df_fallacies_e5,
    e5_filename,
    prompt_template_e5,
    llms,
    sleep_seconds=0.5,
)

# Persist the dataframe to the experiment 5 CSV path.
save_fallacy_df(df_fallacies_e5, e5_filename)

[2024-10-28 14:52:57] Processed 100 responses for LLM gpt_4o_mini_tuned_v1 (index=199).
[2024-10-28 14:54:09] Processed 200 responses for LLM gpt_4o_mini_tuned_v1 (index=399).
[2024-10-28 14:55:21] Processed 300 responses for LLM gpt_4o_mini_tuned_v1 (index=599).
[2024-10-28 14:56:34] Processed 400 responses for LLM gpt_4o_mini_tuned_v1 (index=799).
[2024-10-28 14:57:48] Processed 500 responses for LLM gpt_4o_mini_tuned_v1 (index=999).
[2024-10-28 14:58:59] Processed 600 responses for LLM gpt_4o_mini_tuned_v1 (index=1199).
[2024-10-28 15:00:17] Processed 700 responses for LLM gpt_4o_mini_tuned_v1 (index=1399).
[2024-10-28 15:01:31] Processed 800 responses for LLM gpt_4o_mini_tuned_v1 (index=1599).
[2024-10-28 15:02:52] Processed 900 responses for LLM gpt_4o_mini_tuned_v1 (index=1799).
[2024-10-28 15:04:13] Processed 1000 responses for LLM gpt_4o_mini_tuned_v1 (index=1999).
[2024-10-28 15:05:30] Processed 1100 responses for LLM gpt_4o_mini_tuned_v1 (index=2199).
