In [7]:
import os

import dataset_preprocessing as dps
from GeneralLLM import ChatGPT

## Dataset Preprocessing
### Get PertEval-formatted Data

In [2]:
# Folder passed to preprocess_mmlu as the location of the raw MMLU CSV files.
original_path = './eval_data/mmlu_original/'
# Folder passed to preprocess_mmlu as the save location for the processed data.
save_path = './eval_data/mmlu_processed/'

# Transform original data to PertEval-formatted data.
# preprocess_mmlu returns three values; only the third one (test_data) is used
# in this notebook.
# NOTE(review): the two discarded values are presumably other dataset splits --
# confirm against dataset_preprocessing.preprocess_mmlu.
_, _, test_data = dps.preprocess_mmlu(
    folder_path = original_path,
    save_path = save_path)

./eval_data/mmlu_original/college_mathematics_test.csv processed
./eval_data/mmlu_original/professional_medicine_test.csv processed
./eval_data/mmlu_original/professional_psychology_test.csv processed
./eval_data/mmlu_original/high_school_world_history_test.csv processed


In [13]:
# Save the first 10 test records as a small JSONL sample; the perturbation demo
# below uses this same path as its input file.
dps.write_to_jsonl(test_data[:10],'./eval_data/mmlu_processed/test_head10.jsonl')

In [18]:
# Print the first record to show what a PertEval-formatted question looks like.
print(test_data[0])

{'subject': 'college_mathematics_test', 'id': 0, 'question': 'Let k be the number of real solutions of the equation e^x + x - 2 = 0 in the interval [0, 1], and let n be the number of real solutions that are not in [0, 1]. Which of the following is true?', 'options': ['k = 0 and n = 1', 'k = 1 and n = 0', 'k = n = 1', 'k > 1'], 'option_ids': ['A', 'B', 'C', 'D'], 'question_first': True, 'correct': [False, True, False, False], 'text_type': 'choice'}


### Get Perturbed Data

In [4]:
import KPPerturbation as kpp

In [6]:
# Show the class documentation, including the expected paraphrase_config keys.
help(kpp.ParaphrasingPerturbation)

Help on class ParaphrasingPerturbation in module KPPerturbation:

class ParaphrasingPerturbation(KPPerturbation)
 |  ParaphrasingPerturbation(paraphrase_config: dict, rewriter: GeneralLLM.ChatGPT)
 |  
 |  Method resolution order:
 |      ParaphrasingPerturbation
 |      KPPerturbation
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, paraphrase_config: dict, rewriter: GeneralLLM.ChatGPT)
 |      Args:
 |          paraphrase_config:dict, the configuration of paraphrasing.
 |              <key:value> = {"n_candidates":int,
 |                             "similarity_score":float}
 |              "n_candidates" denotes the number of generated candidates for each
 |              question sentence. "similarity_score" denotes the expected similarity
 |              score of the paraphrasing result, between 0 and 1. The larger, the similar.
 |          rewriter:ChatGPT (gpt-4-turbo is recommended), the rewriter that paraphrases questions.
 |  
 |  

In [11]:
# Provide your own OpenAI API key through the OPENAI_API_KEY environment
# variable, or replace the rewriter with another LLM.
# P.S. You can implement your own LLM class based on
# GeneralLLM.LargeLanguageModel.

# Combine three perturbations into one mixed perturbation: question-type
# change, Caesar perturbation, and LLM-based paraphrasing.
perturbation = kpp.MixedPerturbation(
    [kpp.ChangeTypePerturbation(),
     kpp.CaesarPerturbation(),
     kpp.ParaphrasingPerturbation(
         # 3 paraphrase candidates per question, expected similarity score 0.8
         # (between 0 and 1; larger means closer to the original question).
         paraphrase_config={'n_candidates':3,'similarity_score':0.8},
         # The key is read from the environment so that no secret is ever
         # stored in the notebook; it falls back to '' when the variable is unset.
         rewriter=ChatGPT(model='gpt-4-turbo',
                          api_key=os.environ.get('OPENAI_API_KEY', '')))])

# The path of PertEval-formatted data
file_name = "./eval_data/mmlu_processed/test_head10.jsonl"
# The path of perturbed data
target_name = "./eval_data/mmlu_processed/test_head10_mixed1.jsonl"

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


In [15]:
# Perturb original data with knowledge-invariant perturbation.
# file_name is the PertEval-formatted input and target_name is the path of the
# perturbed data; the value returned by perturb_dataset is kept in
# perturbed_data for the preview cells further down.
# This cell sends requests to the OpenAI chat-completions API (see the httpx
# log lines in its output), so it needs a valid API key.
perturbed_data = dps.perturb_dataset(ptb = perturbation,
                                     file_name=file_name,
                                     target_name=target_name)

  0%|                                                    | 0/10 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 10%|████▍                                       | 1/10 [00:05<00:48,  5.38s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 20%|████████▊                                   | 2/10 [00:12<00:50,  6.33s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 30%|█████████████▏                              | 3/10 [00:16<00:37,  5.40s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 40%|█████████████████▌                          | 4/10 [00:18<00:22,  3.82s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 50%|██████████████████████                      | 5/10 [00:22<00:19,  3.93s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 60%|██████████████████████████▍                 | 6/10 [00:25<00:14,  3.65s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 70%|██████████████████████████████▊             | 7/10 [00:30<00:12,  4.23s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 80%|███████████████████████████████████▏        | 8/10 [00:34<00:08,  4.13s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 90%|███████████████████████████████████████▌    | 9/10 [00:39<00:04,  4.44s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 10/10 [00:40<00:00,  4.07s/it]


In [19]:
# Show the class documentation. The class is referenced directly: wrapping a
# constant string in eval() adds nothing and hides the name from linters.
help(kpp.ChangeTypePerturbation)

Help on class ChangeTypePerturbation in module KPPerturbation:

class ChangeTypePerturbation(KPPerturbation)
 |  Method resolution order:
 |      ChangeTypePerturbation
 |      KPPerturbation
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __str__(self)
 |      Return str(self).
 |  
 |  perturb(self, mcq: KPPerturbation.MultipleChoiceQuestion) -> KPPerturbation.MultipleChoiceQuestion
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmethods__ = frozenset()
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from KPPerturbation:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [16]:
# Preview original data: load the first PertEval-formatted record into a
# MultipleChoiceQuestion and print the prompt it produces.
original_question = kpp.MultipleChoiceQuestion() 
original_question.load_dict(test_data[0])
print(original_question.get_prompt())

Please select the correct option(s) from the following options given the question:
Question: Let k be the number of real solutions of the equation e^x + x - 2 = 0 in the interval [0, 1], and let n be the number of real solutions that are not in [0, 1]. Which of the following is true?
Options:
A k = 0 and n = 1
B k = 1 and n = 0
C k = n = 1
D k > 1
Your output must strictly follow this format:
{"answer": <the list of selected options, e.g., ["A", "B", "C", "D"]>}
Your output:


In [17]:
# Preview perturbed data: load the first perturbed record into a
# MultipleChoiceQuestion and print the prompt it produces, for comparison with
# the original prompt.
perturbed_question = kpp.MultipleChoiceQuestion() 
perturbed_question.load_dict(perturbed_data[0])
print(perturbed_question.get_prompt())

Please judge whether each of the options is correct or incorrect given the question:
Question: Define k as the count of real solutions to the equation e^x + x - 2 = 0 within the interval [0, 1], and let n represent the count of real solutions outside the interval [0, 1]. Which of the following statements is correct?
Options:
U k = 0 and n = 1
V k = 1 and n = 0
W k = n = 1
X k > 1
Your output must strictly follow this format:
{"U": <"True" or "False">, "V": <"True" or "False">, "W": <"True" or "False">, "X": <"True" or "False">}
Your output:


## Obtain LLMs' Responses to Test Data

In [20]:
from functools import partial
from run_benchmark import test_dataset, parallel_test_dataset

In [21]:
# As an example, select high_school_world_history_test as the test object.
test_subjects = ['high_school_world_history_test']
# Read the OpenAI key from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook; falls back to '' when the variable is unset.
API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [15]:
# Test on original data.
# NOTE(review): test.jsonl is presumably written by the preprocessing step
# above -- confirm against dataset_preprocessing.preprocess_mmlu.
test_data_path = "./eval_data/mmlu_processed/test.jsonl"
# The run's log file is named from this prefix plus a timestamp (see the
# "Log path" line in the output).
# NOTE(review): the prefix spells the model "gpt-3.5.turbo" while the model
# selected below is "gpt-3.5-turbo"; the log paths used in the transition
# analysis section depend on this exact spelling.
log_path_prefix = "./log/mmlu_test_gpt-3.5.turbo"

# Query the model on the selected subjects using 8 threads.
# NOTE(review): start_id/end_id = None presumably means "no sub-range" --
# confirm against run_benchmark.parallel_test_dataset.
parallel_test_dataset(
    file_path = test_data_path,
    log_path_prefix = log_path_prefix,
    simple_question_path = None,
    subjects = test_subjects,
    model_class = partial(ChatGPT, api_key = API_KEY),
    model_selection = 'gpt-3.5-turbo',
    temperature = 0.2,
    thread_func = test_dataset,
    n_thread = 8,
    start_id = None,
    end_id = None
)

INFO:root:Log path: ./log/mmlu_test_gpt-3.5.turbo_2024-06-04 16:59:19.007533.jsonl


# of samples = 237


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [14]:
# Test on perturbed data.
# NOTE(review): this file is not produced by the cells above (they write
# test_head10_mixed1.jsonl); it has to be generated separately before this
# cell can run.
perturbed_data_path = "./eval_data/mmlu_processed/test_ChangeType_OptionCaesar.jsonl"
# The run's log file is named from this prefix plus a timestamp (see the
# "Log path" line in the output).
perturbed_log_path_prefix = "./log/mmlu_test_gpt-3.5.turbo_ChangeType_OptionCaesar"

# Same model, temperature and thread count as the run on the original data,
# so the two logs are directly comparable.
parallel_test_dataset(
    file_path = perturbed_data_path,
    log_path_prefix = perturbed_log_path_prefix,
    simple_question_path = None,
    subjects = test_subjects,
    model_class = partial(ChatGPT, api_key = API_KEY),
    model_selection = 'gpt-3.5-turbo',
    temperature = 0.2,
    thread_func = test_dataset,
    n_thread = 8,
    start_id = None,
    end_id = None
)

INFO:root:Log path: ./log/mmlu_test_gpt-3.5.turbo_ChangeType_OptionCaesar_2024-06-04 17:12:41.788116.jsonl


# of samples = 237


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

## Transition Analysis

In [5]:
import transition_analysis as tas

In [6]:
# Logs of the two benchmark runs above: log_path1 is the run on the original
# data, log_path2 the run on the perturbed data.
# NOTE: the file names embed the timestamps of those specific runs; after
# re-running the benchmark cells, replace them with the new "Log path" values
# printed in their outputs.
log_path1 = "./log/mmlu_test_gpt-3.5.turbo_2024-06-04 16:59:19.007533.jsonl"
log_path2 = "./log/mmlu_test_gpt-3.5.turbo_ChangeType_OptionCaesar_2024-06-04 17:12:41.788116.jsonl"

# Overall Performance Stability & Correct Response Consistency
tas.transition_analysis(log_path1, log_path2, subjects = test_subjects)

MCQ Benchmark Performance:
score_1 = 0.6624 (157/237)
score_2 = 0.5781 (137/237)
performance_drop_rate = -0.0844
Wilcoxon hypothesis test = WilcoxonResult(statistic=954.0, pvalue=0.002772833657622028)

recall_c = 0.7707
acc@consist = 0.5105
MCQ Benchmark Performance:
score_1 = 0.6624 (157/237)
score_2 = 0.5781 (137/237)
performance_drop_rate = -0.0844
Wilcoxon hypothesis test = WilcoxonResult(statistic=954.0, pvalue=0.002772833657622028)

recall_c = 0.7707
acc@consist = 0.5105
macro_acc (before) = 0.6624472573839663
macro_acc (after) = 0.5780590717299579
macro_pdr = -0.08438818565400841
macro_rop = 0.7707006369426752
macro_acc@consist = 0.510548523206751


In [9]:
# Response Pattern Analysis
# Report the response-pattern breakdown for the original-data run first, then
# for the perturbed-data run; the empty print() separates the two reports.
tas.response_pattern_analysis(log_path1)
print()
tas.response_pattern_analysis(log_path2)

% of correct responses: 0.6624
% of invalid responses: 0.0042
% of too many options: 0.2363
% of incorrect single option: 0.0802
% of incorrect multiple options: 0.0169

% of correct responses: 0.5781
% of invalid responses: 0.0633
% of too many options: 0.2616
% of incorrect single option: 0.0928
% of incorrect multiple options: 0.0042
