# Evaluating classification and rule articulation

In [1]:
import json
from tqdm import tqdm
import openai
import numpy as np
import pandas as pd 
import re

from ArticulationExperiment import ArticulationExperiment
with open("openaik.txt", "r") as file:
    openai.api_key = file.readline().strip()

In [2]:
# Define prompts
SYSTEM_PROMPT = ""
CONTEXT_PROMPT ='Your task is to label an intput sentence as "True" or "False" according to an unknown decision rule. The decision rule can be deduced from the following examples: \n'
CLASSIFICATION_PROMPT = '\nPlease classify the following input with a single word "True" or "False":\n'
MULTIPLE_CLASSIFICATIONS_PROMPT = '\nPlease classify each of the following inputs with a single word "True" or "False":\n'
ARTICULATION_PROMPT = "\nPlease articulate the rule for correctly classifying the inputs. Use short and clear language."
COT_PROMPT = "Please show for every input the decisive keywords for labeling true or false according to your stated rule."
RULE_COMPARISON_PROMPT = 'A language model has predicted a decision rule from labeled data. I provide you with both the predicted rule and the original rule used to label the data. Please judge in a single sentence how well the prediction and matches the original. Then, please give a final score on a linear scale between 0 (no match) and 1 (perfect match). Here are the rules to compare:'

## Evaluating single datasets

In [3]:
# Choose rule
rule_single = "colors"
dataset_dir_single = "./data/n100/"

with open(dataset_dir_single + f"{rule_single}.json", "r") as file:
    original_rule_formulation_single = json.load(file)['rule_formulation']
    print('Original Rule:')
    print(original_rule_formulation_single)

Original Rule:
{'true': 'The sentence mentions a color.', 'false': 'The sentence does not mention any color.'}


In [4]:
# String management and prompting has been set up to an external task
exp_single = ArticulationExperiment(
    model = "gpt-3.5-turbo-1106",
    dataset_dir = dataset_dir_single,

    n_exp = 1,
    n_fewshot_examples = 5,
    n_tasks=5,

    system_prompt = SYSTEM_PROMPT,
    context_prompt = CONTEXT_PROMPT,
    classification_prompt = CLASSIFICATION_PROMPT,
    articulation_prompt = ARTICULATION_PROMPT,
    multiple_classifications_prompt = MULTIPLE_CLASSIFICATIONS_PROMPT,
)

In [5]:
# Build a prompt from dataset samples
fewshot_examples_single, task_inputs_single, task_labels_single = exp_single.get_examples_and_tasks(rule=rule_single)
experiment_prompt_single = CONTEXT_PROMPT + fewshot_examples_single + MULTIPLE_CLASSIFICATIONS_PROMPT + task_inputs_single
print(experiment_prompt_single)

Your task is to label an intput sentence as "True" or "False" according to an unknown decision rule. The decision rule can be deduced from the following examples: 
Input: The old barn stood out with its faded coat of rustic red paint. Label: True
Input: The artist used a soothing shade of blue for the background of her painting. Label: True
Input: The old barn stood weathered and worn, its paint now a faded shade of yellow. Label: True
Input: I need to buy some new shoes for work. Label: False
Input: I need to buy some new shoes for the summer. Label: False
Input: The firefighter wore a vibrant red helmet as she rushed into the burning building. Label: True
Input: The kitten chased the butterfly around the garden. Label: False
Input: The baby's infectious giggle filled the room with joy. Label: False
Input: She ran her fingers through her hair, feeling the soft strands between her fingertips. Label: False
Input: The old barn was a weathered, rusty brown. Label: True

Please classify ea

In [6]:
# Get predictions via OpenAI API
out = openai.ChatCompletion.create(
    model='gpt-3.5-turbo-1106',
    messages=[
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": experiment_prompt_single}
    ]
)

predictions_single = out.choices[0].message.content
print(predictions_single)

Based on the provided examples, it seems that the decision rule is labeling sentences as "True" if they describe colors or visual imagery related to the environment, and "False" if they do not. Applying this rule, the classification for each input would be:

(0) True
(1) True
(2) True
(3) True
(4) True
(5) False
(6) True
(7) False
(8) True
(9) False


In [7]:
# Evaluate the accuracy over a range of experiments
print("Experiment variables:")
print(f'Number of few-shot examples per prompt: {2 * exp_single.n_fewshot_examples} (true and false examples equally weighted)')
print(f'Number of sentences to label per prompt: {2 * exp_single.n_tasks} (true and false examples equally weighted)')
print(f'Total number of prompts evaluated: {2 * exp_single.n_exp}\n')

accuracy_single = exp_single.classify(rule=rule_single, verbose=False)

print(f'\nRule title: {rule_single}')
print(f'Rule formulation: "{original_rule_formulation_single}"')
print(f'Accuracy: {accuracy_single}')

Experiment variables:
Number of few-shot examples per prompt: 10 (true and false examples equally weighted)
Number of sentences to label per prompt: 10 (true and false examples equally weighted)
Total number of prompts evaluated: 2



  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:04<00:00,  4.28s/it]


Rule title: colors
Rule formulation: "{'true': 'The sentence mentions a color.', 'false': 'The sentence does not mention any color.'}"
Accuracy: [0.7]





## Measuring classification accuracy 

In [8]:
dataset_dir_sweep = "./data/T1_selection/"

with open(dataset_dir_sweep + f"rules.json", "r") as file:
    rules = json.load(file)
    print('We selected datasets for these rules:')
    for rule in rules.keys():
        print(f'- {rule}')

We selected datasets for these rules:
- lowercase
- german
- dates_before_2000
- colors
- positive_sentiment
- political_left
- capitalistic
- female_subject
- years
- happy_sad
- angry_calm
- active_passive
- positive_future_outcome
- present


In [11]:
# String management and prompting has been set up to an external task
exp_sweep = ArticulationExperiment(
    model = "gpt-3.5-turbo-1106",
    dataset_dir = dataset_dir_sweep,

    n_exp = 10,
    n_fewshot_examples = 35,
    n_tasks=15,

    system_prompt = SYSTEM_PROMPT,
    context_prompt = CONTEXT_PROMPT,
    classification_prompt = CLASSIFICATION_PROMPT,
    articulation_prompt = ARTICULATION_PROMPT,
    multiple_classifications_prompt = MULTIPLE_CLASSIFICATIONS_PROMPT,
)

In [12]:
rule_titles = []
accuracies = []
for i, rule in enumerate(rules.keys()):
    print(f"{i}. rule: {rule}")
    acc = exp_sweep.classify(rule)
    accuracies.append(acc)
    rule_titles.append(rule)

df = pd.DataFrame(accuracies, columns=rule_titles, index=[f"acc_exp{i}" for i in range(len(accuracies[0]))])
df

0. rule: lowercase


100%|██████████| 10/10 [02:51<00:00, 17.16s/it]


Evaluation of prompt #9 failed: predictions can't be parsed automatically
1. rule: german


 30%|███       | 3/10 [00:49<01:52, 16.03s/it]

Evaluation of prompt #2 failed: predictions can't be parsed automatically


 90%|█████████ | 9/10 [02:14<00:12, 12.44s/it]

Evaluation of prompt #8 failed: predictions can't be parsed automatically


100%|██████████| 10/10 [02:20<00:00, 14.09s/it]


Evaluation of prompt #9 failed: predictions can't be parsed automatically
2. rule: dates_before_2000


100%|██████████| 10/10 [03:08<00:00, 18.87s/it]


3. rule: colors


100%|██████████| 10/10 [02:33<00:00, 15.35s/it]


4. rule: positive_sentiment


100%|██████████| 10/10 [02:32<00:00, 15.21s/it]


5. rule: political_left


100%|██████████| 10/10 [02:26<00:00, 14.64s/it]


6. rule: capitalistic


100%|██████████| 10/10 [02:10<00:00, 13.02s/it]


7. rule: female_subject


100%|██████████| 10/10 [03:28<00:00, 20.87s/it]


8. rule: years


100%|██████████| 10/10 [02:56<00:00, 17.63s/it]


9. rule: happy_sad


100%|██████████| 10/10 [02:30<00:00, 15.07s/it]


10. rule: angry_calm


100%|██████████| 10/10 [03:11<00:00, 19.10s/it]


11. rule: active_passive


100%|██████████| 10/10 [02:40<00:00, 16.02s/it]


12. rule: positive_future_outcome


100%|██████████| 10/10 [03:04<00:00, 18.42s/it]


13. rule: present


100%|██████████| 10/10 [02:45<00:00, 16.51s/it]


Unnamed: 0,acc_exp0,acc_exp1,acc_exp2,acc_exp3,acc_exp4,acc_exp5,acc_exp6,acc_exp7,acc_exp8,acc_exp9
lowercase,0.6,0.666667,0.533333,0.466667,0.633333,0.466667,0.566667,0.4,0.566667,0.0
german,0.4,0.433333,0.0,0.4,0.466667,0.566667,0.233333,0.466667,0.0,0.0
dates_before_2000,0.966667,0.966667,0.766667,0.9,0.9,0.833333,0.766667,0.966667,0.933333,0.8
colors,0.733333,0.766667,0.8,0.9,0.8,0.9,0.9,0.833333,0.733333,0.933333
positive_sentiment,0.933333,0.9,0.833333,0.766667,0.833333,0.7,0.866667,0.933333,0.833333,0.933333
political_left,0.966667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
capitalistic,0.966667,1.0,0.966667,1.0,0.833333,1.0,0.933333,0.966667,1.0,1.0
female_subject,0.466667,0.633333,0.533333,0.566667,0.6,0.6,0.8,0.666667,0.6,0.7
years,0.433333,0.366667,0.5,0.466667,0.533333,0.633333,0.566667,0.466667,0.433333,0.5
happy_sad,0.933333,0.9,0.966667,0.966667,0.966667,0.933333,1.0,0.933333,0.866667,0.933333


In [22]:
rule_titles

['lowercase',
 'german',
 'lowercase',
 'german',
 'dates_before_2000',
 'colors',
 'positive_sentiment',
 'political_left',
 'capitalistic',
 'female_subject',
 'years']

In [25]:
list(zip(rule_titles, accuracies))

[('lowercase', array([0.65, 0.55, 0.  , 0.6 , 0.5 ])),
 ('german', array([0.  , 0.  , 0.  , 0.45, 0.45])),
 ('lowercase', array([0.5 , 0.35, 0.55, 0.75, 0.7 ])),
 ('german', array([0.4, 0. , 0.5, 0. , 0. ])),
 ('dates_before_2000', array([0.85, 0.85, 0.75, 0.95, 0.9 ])),
 ('colors', array([0.5 , 0.95, 0.8 , 0.95, 0.9 ])),
 ('positive_sentiment', array([0.9 , 0.85, 0.8 , 0.85, 0.9 ])),
 ('political_left', array([1.  , 0.95, 1.  , 1.  , 1.  ])),
 ('capitalistic', array([1.  , 0.85, 0.95, 0.95, 1.  ])),
 ('female_subject', array([0.75, 0.6 , 0.6 , 0.4 , 0.5 ])),
 ('years', array([0.45, 0.35, 0.4 , 0.5 , 0.65]))]

In [None]:
#[('lowercase', array([0.65, 0.55, 0.  , 0.6 , 0.5 ])),
#  ('german', array([0.  , 0.  , 0.  , 0.45, 0.45])),
#  ('lowercase', array([0.5 , 0.35, 0.55, 0.75, 0.7 ])),
#  ('german', array([0.4, 0. , 0.5, 0. , 0. ])),
#  ('dates_before_2000', array([0.85, 0.85, 0.75, 0.95, 0.9 ])),
#  ('colors', array([0.5 , 0.95, 0.8 , 0.95, 0.9 ])),
#  ('positive_sentiment', array([0.9 , 0.85, 0.8 , 0.85, 0.9 ])),
#  ('political_left', array([1.  , 0.95, 1.  , 1.  , 1.  ])),
#  ('capitalistic', array([1.  , 0.85, 0.95, 0.95, 1.  ])),
#  ('female_subject', array([0.75, 0.6 , 0.6 , 0.4 , 0.5 ])),
#  ('years', array([0.45, 0.35, 0.4 , 0.5 , 0.65]))]

Unnamed: 0,acc_exp0,acc_exp1,acc_exp2,acc_exp3,acc_exp4
lowercase,0.65,0.55,0.0,0.6,0.5
german,0.0,0.0,0.0,0.45,0.45
lowercase,0.5,0.35,0.55,0.75,0.7
german,0.4,0.0,0.5,0.0,0.0
dates_before_2000,0.85,0.85,0.75,0.95,0.9
colors,0.5,0.95,0.8,0.95,0.9
positive_sentiment,0.9,0.85,0.8,0.85,0.9
political_left,1.0,0.95,1.0,1.0,1.0
capitalistic,1.0,0.85,0.95,0.95,1.0
female_subject,0.75,0.6,0.6,0.4,0.5


## Testing rule articulation

### Do rule articulation based on few-shot prompts

In [28]:
fewshot_examples, _, _ = exp.get_fewshot_examples(rule='capitalistic')
print(fewshot_examples)

rule_articulation = exp.generate_rule_articulation(fewshot_examples)
rule_articulation

Input: Economic freedom spurs entrepreneurship and empowers individuals to pursue their dreams. Label: True
Input: Hard work and innovation should be rewarded with financial success in a free economy. Label: True
Input: Free enterprise empowers individuals to start businesses and create jobs, fostering economic opportunity and growth. Label: True
Input: Private ownership and investment in businesses incentivize efficiency and productivity. Label: True
Input: Individuals should have the opportunity to invest their savings and build wealth for themselves and their families. Label: True
Input: Consumer choice drives market efficiency and product quality. Label: True
Input: Competition in the marketplace encourages businesses to continuously improve and innovate. Label: True
Input: The free market fosters innovation and competition, leading to economic growth and prosperity. Label: True
Input: Market competition spurs efficiency and drives down prices. Label: True
Input: Entrepreneurship e

'The decision rule for classifying the inputs is that any statement promoting the benefits of a free market economy, entrepreneurship, innovation, competition, and individual empowerment should be labeled as "True," while any statement critical of wealth accumulation, corporate influence, inequality, and prioritizing human needs over financial gain should be labeled as "False."'

### Does the articulated rule match the original rule?

In [30]:
scores = np.zeros(exp.n_exp)
for i in range(exp.n_exp):
    test_rule = 'capitalistic'

    test_rule_formulation = rules[test_rule]["true"]

    fewshot_examples, _, _ = exp.get_fewshot_examples(rule=test_rule)
    rule_articulation = exp.generate_rule_articulation(fewshot_examples)


    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": RULE_COMPARISON_PROMPT + f"Original rule: {test_rule_formulation}\n" + f"Predicted rule: {rule_articulation}"},
        ]

    # OpenAI Text Generation
    out = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
    )
    judge_text = out.choices[0].message.content
    print(f"{test_rule=}")
    print(f"{test_rule_formulation=}")
    print(f"{rule_articulation=}")
    print(f"{judge_text=}")

    try:
        score = re.findall("\d+\.\d+", judge_text)[0]
        scores[i] = score
        print(f"{score=}")
    except:
        pass
    

scores.mean()

test_rule='capitalistic'
test_rule_formulation="The sentence is in favor of capitalistic ideals and does not contain the words 'capitalist' or 'communist' or 'socialist'."
rule_articulation='The rule for classifying the inputs as "True" or "False" is:\n- Inputs that emphasize the positive impact of private enterprise, entrepreneurship, innovation, competition, and market-driven economy are labeled as "True."\n- Inputs that criticize the pursuit of wealth, consumerism, corporate profit prioritization over social welfare, and inequalities in access to essential resources are labeled as "False."'
judge_text='The predicted rule closely matches the original rule, with a high level of alignment in capturing the essence of favoring capitalistic ideals and identifying criteria for classifying inputs as "True" or "False." I would rate the match as 0.9 out of 1.'
score='0.9'
test_rule='capitalistic'
test_rule_formulation="The sentence is in favor of capitalistic ideals and does not contain the w

0.812

#### Explain the label of fewshot examples based on a self-articulated rule

In [21]:
fewshot_examples = exp.get_fewshot_examples(rule='capitalistic')
rule_articulation = exp.generate_rule_articulation()

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": CONTEXT_PROMPT + fewshot_examples + ARTICULATION_PROMPT},
    {"role": "assistant", "content": rule_articulation},
    {"role": "user", "content": COT_PROMPT}
    ]

# OpenAI Text Generation
out = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
)
rule_example_mapping = out.choices[0].message.content
print(f"{RULE=}")
# print(f"{messages=}")
print(rule_example_mapping)

fewshot_examples = None

NameError: name 'get_fewshot_examples' is not defined

### Does articulating increase classification accuracy?

In [None]:
labels = []
preds = []

for i in tqdm(range(N_EXP)):
    fewshot_examples, task_input, label = get_examples_prompt()
    labels.append(label)

    rule_articulation = articulation_by_model()

    messages = [
    {"role": "system", "content": STSTEM_PROMPT},
    {"role": "user", "content": CONTEXT_PROMPT + fewshot_examples + ARTICULATION_PROMPT},
    {"role": "assistant", "content": rule_articulation},
    {"role": "user", "content": CLASSIFICATION_PROMPT + task_input}
    ]
    # OpenAI Text Generation
    out = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    prediction = out.choices[0].message.content
    prediction.strip(' ,./123567890;:"')
    preds.append(prediction)


    print(CONTEXT_PROMPT + fewshot_examples + ARTICULATION_PROMPT)
    print(f"\n{rule_articulation}\n")
    print(CLASSIFICATION_PROMPT + task_input)
    print(f'{prediction=}')
    print(f'{label=}\n\n')

print(f'{labels=}')
print(f'{preds=}')

# Check if preds are in ["True", "False"]
for p in preds:
    if p not in ["True", "False"]:
        raise ValueError(f"{p} cannot be converted into binary prediction.")

labels = np.array([1 if l == "True" else 0 for l in labels ])
preds = np.array([1 if l == "True" else 0 for l in preds ])
acc = np.sum(labels == preds) / N_EXP

print(f"Accuracy = {acc}")
    

In [46]:
rule_gen_prompt = 'I am doing a benchmark for sentence classification with gpt-3.5-turbo. Each sentence is labeled as true or false given a specific rule. I am looking for rules where gpt-3.5-turbo is good at classification but bad ad articulating the rule.'

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": rule_gen_prompt},
    ]
# OpenAI Text Generation
out = openai.ChatCompletion.create(
    model="gpt-4",
    messages=messages,
)
ans = out.choices[0].message.content
print(ans)

KeyboardInterrupt: 