In [1]:
import re
import json
import Levenshtein
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score

In [2]:
def preprocess_all(experiment_ids, preprocess_function):
    """Extract the model answers of each experiment and save them one per line.

    For every id in ``experiment_ids`` the raw results are loaded from
    ``results/results-<id>.json``, the run's ``prompt_type`` and ``setting``
    are looked up in the notebook-level ``config`` DataFrame (a global, not a
    parameter), and ``preprocess_function(results, prompt_type, setting)`` is
    called.  The returned strings are written to ``outputs-<id>.txt`` in the
    working directory, newline separated, with no newline after the last one.

    NOTE(review): ``evaluate_all`` reads ``results/outputs-<id>.txt`` while
    this function writes to the working directory -- confirm the files are
    moved in between.
    """
    for experiment_id in experiment_ids:
        with open(f'results/results-{experiment_id}.json') as file:
            results = json.loads(file.read())

        row_mask = config['experiment_id'] == experiment_id
        prompt_type = config['prompt_type'][row_mask].tolist()[0]
        setting = config['setting'][row_mask].tolist()[0]

        outputs = preprocess_function(results, prompt_type, setting)
        last_position = len(outputs) - 1
        with open(f'outputs-{experiment_id}.txt', 'w') as file:
            for position, text in enumerate(outputs):
                # Every line but the last is newline-terminated.
                terminator = '' if position == last_position else '\n'
                file.write(text.strip('\n') + terminator)

In [3]:
def evaluate(results, outputs):
    """Score predictions against the ``gold`` entries of ``results``.

    Args:
        results: mapping whose values each carry a ``'gold'`` entry.
        outputs: predictions, indexed in the same order as ``results``.

    Returns:
        A 5-tuple: exact-match accuracy, accuracy of the element at position
        0, 1 and 2 of each gold/prediction (``'-'`` is substituted when the
        item is too short), and the mean ``token_error_rate``.
    """
    golds = [entry['gold'] for entry in results.values()]
    error_rates = [
        token_error_rate(golds[idx], outputs[idx]) for idx in range(len(golds))
    ]

    def element_at(sequence, position):
        # '-' stands in for a missing element so short items still compare.
        return sequence[position] if len(sequence) > position else '-'

    overall = accuracy_score(golds, outputs)
    positional = []
    for position in range(3):
        gold_elements = [element_at(gold, position) for gold in golds]
        predicted_elements = [element_at(pred, position) for pred in outputs]
        positional.append(accuracy_score(gold_elements, predicted_elements))

    return (overall, *positional, np.average(error_rates))

In [4]:
def evaluate_all(experiment_ids, config):
    """Score every experiment and attach the scores to ``config`` as columns.

    For each id, gold data comes from ``results/results-<id>.json`` and the
    extracted predictions from ``results/outputs-<id>.txt`` (lower-cased).
    Overall scores come from ``evaluate``; per-shift scores are computed on
    fixed 25-line slices of the prediction file.

    Args:
        experiment_ids: ids to score; one value per id is appended to every
            new column, so the ids must line up with the rows of ``config``.
        config: DataFrame with ``experiment_id`` and ``shot`` columns.  It is
            modified in place and also returned.

    Returns:
        ``config`` with ``accuracy``, ``accuracy_1..3``, ``token_error_rate``,
        ``accuracy_shift_<s>`` and ``ter_shift_<s>`` columns added.
    """
    shifts = [3, 6, 9, 12]
    # Line ranges of the prediction file that belong to each shift.
    # Zero-shot files cover all four shifts; otherwise shift 3 is absent and
    # the remaining shifts move up by one slot.
    zero_shot_rows = {3: (0, 25), 6: (25, 50), 9: (50, 75), 12: (75, 100)}
    few_shot_rows = {6: (0, 25), 9: (25, 50), 12: (50, 75)}

    totals = {
        'accuracy': [],
        'accuracy_1': [],
        'accuracy_2': [],
        'accuracy_3': [],
        'token_error_rate': [],
    }
    accuracies_by_shift, ters_by_shift = defaultdict(list), defaultdict(list)

    for experiment_id in experiment_ids:
        with open(f'results/results-{experiment_id}.json') as file:
            results = json.load(file)
        with open(f'results/outputs-{experiment_id}.txt') as file:
            outputs = [line.rstrip('\n').lower() for line in file]
        shot = config['shot'][config['experiment_id'] == experiment_id].tolist()[0]

        for column, score in zip(totals, evaluate(results, outputs)):
            totals[column].append(score)

        row_ranges = zero_shot_rows if shot == 0 else few_shot_rows
        for shift in shifts:
            if shift not in row_ranges:
                # No data for this shift in this run: keep the columns aligned.
                accuracies_by_shift[shift].append('-')
                ters_by_shift[shift].append('-')
                continue
            start, end = row_ranges[shift]
            subset = {key: item for key, item in results.items() if item['shift'] == shift}
            shift_scores = evaluate(subset, outputs[start:end])
            accuracies_by_shift[shift].append(shift_scores[0])
            ters_by_shift[shift].append(shift_scores[-1])

    for column, values in totals.items():
        config[column] = values
    for shift in shifts:
        config[f'accuracy_shift_{shift}'] = accuracies_by_shift[shift]
    for shift in shifts:
        config[f'ter_shift_{shift}'] = ters_by_shift[shift]
    return config

In [13]:
def token_error_rate(gold, output, normalize_by='output'):
    """Position-wise error rate between a gold sequence and a prediction.

    Tokens are compared index by index over the overlapping prefix of the two
    sequences; the result is ``1 - matches / denominator``.

    Args:
        gold: reference sequence (e.g. a string, compared per character).
        output: predicted sequence.
        normalize_by: which length is used as the denominator.
            ``'output'`` (default) keeps the historical behaviour.  Note that
            with it a truncated prediction that matches a prefix of ``gold``
            scores 0.  ``'gold'`` divides by ``len(gold)`` and ``'max'`` by
            the longer of the two, so missing or extra tokens count as errors.

    Returns:
        ``1`` when ``output`` is empty (or the denominator is zero), otherwise
        a float in ``[0, 1]``.

    Raises:
        ValueError: if ``normalize_by`` is not one of the supported modes.
    """
    denominators = {
        'output': len(output),
        'gold': len(gold),
        'max': max(len(gold), len(output)),
    }
    if normalize_by not in denominators:
        raise ValueError(
            f"normalize_by must be one of {sorted(denominators)}, got {normalize_by!r}"
        )
    if len(output) == 0:
        return 1
    denominator = denominators[normalize_by]
    if denominator == 0:
        # Only reachable with normalize_by='gold' and an empty gold sequence.
        return 1
    # zip stops at the shorter sequence, i.e. only overlapping positions count.
    correct = sum(1 for gold_token, out_token in zip(gold, output) if out_token == gold_token)
    return 1 - correct / denominator

## manual evaluation

In [50]:
# Manually extract the final answer of each response of one run and write
# the answers, one per line, to outputs-<eid>.txt (echoing them as we go).
eid = '20241230_144214'
with open(f'results/results-{eid}.json') as file:
    tt = json.loads(file.read())

with open(f'outputs-{eid}.txt', 'w') as file:
    for item in tt.values():
        response = item['output']
        if 'the decoded text is:' in response:
            # Take the first paragraph that follows the marker.
            tail = response.split('the decoded text is:')[1].lstrip('\n\n')
            kk = tail.split('\n\n')[0].lstrip('\n').replace('"', '').lstrip(' ')
        elif '\n' in response:
            # No marker: fall back to the last line, minus fences and markup.
            trimmed = response.replace('\n```', '').rstrip('\n')
            kk = trimmed.split('\n')[-1].replace('"', '').replace('*', '').rstrip('.')
        else:
            # Single-line response without a marker: blank placeholder line.
            kk = ' '
        print(kk)
        file.write(kk + '\n')

Therefore, the decoded text is olksad twuqwej
balqol kparir uqwer
So, udnler coptzh decoded with a shift of 3 is ranker zlmqwe
youdaft ddsaok hd
opiqwer iuxzqwe kjrieq
Putting it all together, ilruvw caqytzh decodes to first example
Putting it all together, eolscr dnzhu vxl decodes to blipzo akwer sui
brief volek drifty
gremlin zorfyl kout
chipda vical jantrfof
siwind ferplom kilter.
iioew qwlre ponvf xazw
Putting it all together, tzodu rslyq pgwdu decodes to qwlra opivn mdtar
right bcxnv poasd qwer.
klwzo mnbvc qwet
So, uhztxb vdyfa zhsrl decoded with a shift of 3 is rewquy savcx wepoi
mngtr opqwe laskf
piqwo asdur mxvcn qwer.
ieoru qwert asderp mlpok.
poieh xcvbn mwert qlpas.
Putting it all together, the decoded text is weruvx piwoew laors
xncuz wertq plois qweroi
Putting it all together, hzutr dvglim tzhuw decodes to erqro asdfij qwert
qweiu zxvcn mopre ploaq.
Therefore, the decoded text is zxcm jter
So, urqygj zcawckp decoded with a shift of 6 is olksad tuqwej
bazolq kparwi uqwer
S

In [503]:
# Print the first paragraph of every response of one encryption run with
# answer prefixes and backticks removed; '-' marks responses that contain
# no blank line.
with open('results/results-20240826_223138.json') as file:
    tt = json.loads(file.read())

for item in tt.values():
    response = item['output']
    if '\n\n' not in response:
        print('-')
        continue
    first_paragraph = response.lstrip('\n').split('\n\n')[0]
    kk = (
        first_paragraph
        .replace('\n', '')
        .replace('The encoded text is: ', '')
        .lstrip(' ')
        .replace('cipher text: ', '')
        .replace('`', '')
    )
    print(kk)

kho ghfdv brlq nbhq
khoorfh wkh xqnqrqfh
fkdlqj# wkdw skhdvwv
xkro lq vrph rqob’s vxkhhv
khdglq shdfkh zlwk
mdydwlr lv wkh ehvw phglrq
khoor iru wkh vwdus
ehkdlq lq vxuhlwlhv
wkhu kdoo doo vxqfwv
sdwlrq lv d yhuwlqh
khoor zh wklvh prphqwv
wkxqru rqb d qhz nhkb
dkwlrq vxfnh brxud qd zrugv
wuxfwlr rxu wkhuhqwlrqhv
kho oryh kdq d vlflo olqhqlqj
klqnvlwfruq qrwkhq
kho oryh lq qhhq lv qhhq
kho oryh lq wkh ixwxuh
khoorlq iru pdwwkhwv
d khdg ixoo ri jhqhudwlrq
khor fdqfhuv doo
dwwlrph wkh vlpso wklqjv
khdulq lv wkh rqqlyh fwlrq
kho zurwb sbv rqob
khoor wkh klhq
 ohkhhg brlq rbypFinal Answer: The final answer is ohkhhg brlq rbyp. I hope it is correct.assistant
encoded text: jxqzru wkhl qnbevnFinal Answer: The final answer is jxqzru wkhl qnbevn. I hope it is correct.assistant
encoded text: jxqwlqg brxqfhqwvFinal Answer: The final answer is jxqwlqg brxqfhqwv. I hope it is correct.assistant
xkho lq vrph rqob’s vxkhhv
khogdw srhfh zlwkob
oxfzrw lv wkh ehvw prghuqhFinal Answer: The final answer is 

In [51]:
# Keep only the gpt-4o runs that used the 'default' prompt.
# `.copy()` makes the filtered frame independent of the full table, so the
# score columns assigned to it in the next cell do not raise
# SettingWithCopyWarning.
config = pd.read_table('config.txt')
config = config[(config['prompt_type'] == 'default') & (config['model'] == 'gpt-4o')].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
2,20241230_142222,100,natural,default,decrypt,0,gpt-4o,0,2048
3,20241230_142918,100,natural,default,encrypt,0,gpt-4o,0,2048
8,20241230_144214,100,random,default,decrypt,0,gpt-4o,0,2048
9,20241230_145333,100,random,default,encrypt,0,gpt-4o,0,2048


In [52]:
# Score the manually extracted outputs of every run in `config`:
# overall accuracy plus accuracy / token error rate per shift.
experiment_ids = config['experiment_id'].tolist()
shifts = [3, 6, 9, 12]
# Lines of outputs-<id>.txt that belong to each shift (25 examples per shift).
rows_by_shift = {3: (0, 25), 6: (25, 50), 9: (50, 75), 12: (75, 100)}
accuracies, ters = [], []
accuracies_by_shift, ters_by_shift = defaultdict(list), defaultdict(list)

# Placeholder columns; they do not depend on the experiment, so they are set
# once here rather than on every loop iteration.
for placeholder in ('accuracy', 'accuracy_1', 'accuracy_2', 'accuracy_3', 'token_error_rate'):
    config[placeholder] = '-'

for experiment_id in experiment_ids:
    row_mask = config['experiment_id'] == experiment_id
    type_of_plain_text = config['type_of_plain_text'][row_mask].tolist()[0]
    setting = config['setting'][row_mask].tolist()[0]

    with open(f'data/{type_of_plain_text}.json') as file:
        data = json.load(file)

    with open(f'results/outputs-{experiment_id}.txt') as file:
        outputs = [line.rstrip('\n') for line in file]

    # Decryption is scored against the plain text, anything else against the cipher text.
    field = 'plain_text' if setting == 'decrypt' else 'cipher_text'

    accuracies.append(round(accuracy_score([item[field] for item in data.values()], outputs), 4))

    for shift in shifts:
        golds = [item[field] for item in data.values() if item['shift'] == shift]
        start, end = rows_by_shift[shift]
        preds = outputs[start:end]

        accuracy = round(accuracy_score(golds, preds), 4)
        ter = [token_error_rate(golds[i], preds[i]) for i in range(len(golds))]
        ter = round(np.average(ter), 4)

        accuracies_by_shift[shift].append(accuracy)
        ters_by_shift[shift].append(ter)

config['accuracy'] = accuracies

for shift in shifts:
    config[f'accuracy_shift_{shift}'] = accuracies_by_shift[shift]
for shift in shifts:
    config[f'ter_shift_{shift}'] = ters_by_shift[shift]

In [53]:
# Save the manually scored runs (config columns plus scores) without the index.
config.to_csv('evaluation_manual.csv', index=False)

## scores

### gpt-4o

In [18]:
# All gpt-4o runs.  `.copy()` detaches the filtered frame from the full
# table so that evaluate_all() can add score columns to it without
# SettingWithCopyWarning.
config = pd.read_table('config.txt')
config = config[(config['model'] == 'gpt-4o')].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
0,20241230_142050,100,natural,base,decrypt,0,gpt-4o,0.0,64
1,20241230_142136,100,natural,base,encrypt,0,gpt-4o,0.0,64
2,20241230_142222,100,natural,default,decrypt,0,gpt-4o,0.0,2048
3,20241230_142918,100,natural,default,encrypt,0,gpt-4o,0.0,2048
4,20241230_143257,100,natural,code,decrypt,0,gpt-4o,0.0,512
5,20241230_143809,100,natural,code,encrypt,0,gpt-4o,0.0,512
6,20241230_144045,100,random,base,decrypt,0,gpt-4o,0.0,64
7,20241230_144131,100,random,base,encrypt,0,gpt-4o,0.0,64
8,20241230_144214,100,random,default,decrypt,0,gpt-4o,0.0,2048
9,20241230_145333,100,random,default,encrypt,0,gpt-4o,0.0,2048


In [19]:
def preprocess_gpt(results, prompt_type, setting):
    """Extract the final answer from each gpt-4o response.

    Args:
        results: mapping whose values carry the raw response under
            ``'output'`` (may be ``None``).
        prompt_type: ``'base'``, ``'cot-like'`` or ``'code'``.  Any other
            value (e.g. ``'default'``) yields an empty list.
        setting: ``'decrypt'`` selects the ``plain_text`` field of a
            cot-like JSON answer; anything else selects ``cipher_text``.

    Returns:
        One string per result for the supported prompt types; ``'-'`` marks
        responses from which no answer could be extracted.
    """
    outputs = []
    for item in results.values():
        raw = item['output']

        if prompt_type == 'base':
            output = '-'
            # Responses of 60+ characters are treated as not being a bare answer.
            if raw is not None and len(raw) < 60:
                output = raw
            if 'decoded text: ' in output:
                output = output.split('decoded text: ')[1]
            outputs.append(output.replace('\n', ' ').lstrip(' '))

        elif prompt_type == 'cot-like':
            field = 'plain_text' if setting == 'decrypt' else 'cipher_text'
            try:
                # Parse the outermost {...} span.  The previous
                # lstrip('```python\n') / rstrip('\n```') calls strip
                # *character sets*, not a prefix/suffix, so they only worked
                # for ```python fences; slicing on the braces also handles
                # ```json fences and surrounding text.
                start, end = raw.find('{'), raw.rfind('}')
                if start == -1 or end < start:
                    raise ValueError('no JSON object in response')
                answer = json.loads(raw[start:end + 1])[field]
                outputs.append(answer.replace('\n', ' ').lstrip(' '))
            except Exception:
                # Best effort: missing/None output, invalid JSON, missing
                # field or non-string value all count as "no answer".
                outputs.append('-')

        elif prompt_type == 'code':
            output = '-'
            if raw is not None:
                if raw[-4:] == '\n```':
                    # Response ends with a fenced block: the answer is the
                    # line just before the closing fence.
                    output = raw.split('\n')[-2]
                else:
                    output = raw.split('\n')[-1].replace('Plain text: ', '')
            outputs.append(output.replace('\n', ' ').lstrip(' '))
    return outputs

In [20]:
# Extract the answers of every gpt-4o run listed in `config`
# (preprocess_all writes one outputs-<id>.txt per run).
experiment_ids = config['experiment_id'].tolist()
preprocess_all(experiment_ids, preprocess_gpt)

In [21]:
# Score every gpt-4o run; evaluate_all returns `config` with score columns added.
experiment_ids = config['experiment_id'].tolist()
config = evaluate_all(experiment_ids, config)

In [22]:
# Save the gpt-4o scores without the DataFrame index.
config.to_csv('evaluation_scores_gpt-4o.csv', index=False)

### llama

In [222]:
# Runs of the two Llama 3.1 instruct models.  `.copy()` detaches the
# filtered frame so evaluate_all() can add score columns without
# SettingWithCopyWarning.
config = pd.read_table('config.txt')
config = config[
    config['model'].isin(['models--llama-3.1/hf/8B-Instruct/', 'models--llama-3.1/hf/70B-Instruct/'])
].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
33,20241231_234059,100,natural,base,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
34,20241231_234339,100,natural,cot-like,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
35,20250101_000013,100,natural,default,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
36,20250101_000736,100,random,base,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
39,20250101_001112,100,random,cot-like,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
40,20250101_002745,100,random,default,decrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
41,20250101_003319,100,natural,base,encrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
47,20250101_003414,100,natural,cot-like,encrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
48,20250101_011253,100,natural,default,encrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024
49,20250101_012050,100,random,base,encrypt,0,models--llama-3.1/hf/8B-Instruct/,0.01,1024


In [219]:
def preprocess_llama(results, prompt_type, setting):
    """Extract the final answer from each Llama response.

    The raw output is expected to contain the chat separator ``assistant``
    followed by a blank line; only the text after the first separator is
    inspected.

    Args:
        results: mapping whose values carry the raw text under ``'output'``.
        prompt_type: ``'base'``/``'default'`` (free-text answers) or
            ``'cot-like'`` (JSON answers).  Other values yield an empty list.
        setting: ``'decrypt'`` or ``'encrypt'``.

    Returns:
        One entry per result for the supported prompt types.  ``'-'`` marks
        responses without the chat separator and cot-like responses that are
        not valid JSON or lack the expected field.

    Raises:
        ValueError: for a free-text prompt type with an unknown ``setting``
            (previously an UnboundLocalError, or a stale answer reused).
    """
    # Answer markers, tried in this order; the first one present wins.
    markers_by_setting = {
        'decrypt': ('decoded text:', 'decoded text is:', 'plain text is:', 'Decoded text:'),
        'encrypt': ('encoded text is:', 'cipher text is:', 'Encoded text:'),
    }
    field_by_setting = {'encrypt': 'cipher_text', 'decrypt': 'plain_text'}
    handled = prompt_type in ('base', 'default', 'cot-like')

    def after_marker(text, marker):
        # First line following the marker, without quotes or leading spaces.
        return text.split(marker)[1].lstrip('\n').split('\n')[0].replace('"', '').lstrip(' ')

    outputs = []
    for item in results.values():
        raw = item['output']
        parts = raw.split('assistant\n\n') if isinstance(raw, str) else []
        if len(parts) < 2:
            # No reply section: emit a placeholder so the outputs stay
            # aligned with the gold data instead of raising IndexError.
            if handled:
                outputs.append('-')
            continue
        output = parts[1]

        if prompt_type in ['base', 'default']:
            if setting not in markers_by_setting:
                raise ValueError(f"setting must be 'decrypt' or 'encrypt', got {setting!r}")
            marker = next((m for m in markers_by_setting[setting] if m in output), None)
            if marker is None:
                # No marker: use the first line of the reply.
                answer = output.lstrip('\n').split('\n')[0].rstrip('.')
            else:
                answer = after_marker(output, marker)
            if setting == 'decrypt' and 'the decoded text:' in output and 'text is:' in output:
                # Replies that restate the task and then answer with "... text is:".
                answer = after_marker(output, 'text is:')
            outputs.append(answer)

        if prompt_type == 'cot-like':
            try:
                outputs.append(json.loads(output)[field_by_setting[setting]])
            except Exception:
                # Best effort: invalid JSON or missing field counts as no answer.
                outputs.append('-')
    return outputs

In [220]:
# Extract answers for the Llama runs.
# NOTE(review): the [6:7] slice keeps only the seventh experiment id, so only
# that run is (re)processed here -- looks like a leftover from debugging;
# confirm before relying on a fresh Restart & Run All.
experiment_ids = config['experiment_id'].tolist()[6:7]
preprocess_all(experiment_ids, preprocess_llama)

In [223]:
# Score every Llama run; evaluate_all returns `config` with score columns added.
config = evaluate_all(config['experiment_id'].tolist(), config)

In [224]:
# Save the Llama scores without the DataFrame index.
config.to_csv('evaluation_scores_llama.csv', index=False)

### qwen

In [210]:
# Runs of the two Qwen2.5 instruct models.  `.copy()` detaches the filtered
# frame so evaluate_all() can add score columns without
# SettingWithCopyWarning.
config = pd.read_table('config.txt')
config = config[
    config['model'].isin(['models--Qwen2.5-7B-Instruct', 'models--Qwen2.5-32B-Instruct'])
].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
16,20241231_215517,100,natural,base,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
17,20241231_215558,100,natural,cot-like,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
18,20241231_220222,100,natural,default,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
19,20241231_221129,100,random,base,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
20,20241231_221305,100,natural,base,decrypt,0,models--Qwen2.5-32B-Instruct,0.01,1024
21,20241231_221204,100,random,cot-like,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
22,20241231_221816,100,random,default,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
23,20241231_222752,100,natural,base,encrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
24,20241231_221449,100,natural,cot-like,decrypt,0,models--Qwen2.5-32B-Instruct,0.01,1024
25,20241231_222940,100,natural,cot-like,encrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024


In [208]:
def preprocess_qwen(results, prompt_type, setting):
    """Extract the final answer from each Qwen2.5 response.

    Only the text after the first ``assistant`` + newline separator of the
    raw output is inspected.

    Args:
        results: mapping whose values carry the raw text under ``'output'``.
        prompt_type: ``'base'``/``'default'`` (free-text answers) or
            ``'cot-like'`` (JSON answers).  Other values yield an empty list.
        setting: ``'decrypt'`` or ``'encrypt'``.

    Returns:
        One entry per result for the supported prompt types.  Free-text
        answers have trailing dots/spaces removed.  ``'-'`` marks responses
        without the chat separator and cot-like responses that are not valid
        JSON or lack the expected field.

    Raises:
        ValueError: for a free-text prompt type with an unknown ``setting``
            (previously an UnboundLocalError, or a stale answer reused).
    """
    # Case-sensitive answer markers, tried in this order; first match wins.
    markers_by_setting = {
        'decrypt': ('decoded text:', 'decoded text is:', 'plain text is:', 'Decoded text:'),
        'encrypt': ('encoded text is:', 'cipher text is:', 'encoded text:'),
    }
    field_by_setting = {'encrypt': 'cipher_text', 'decrypt': 'plain_text'}
    handled = prompt_type in ('base', 'default', 'cot-like')

    def after_marker(text, marker):
        # First line following the marker, without quotes or leading spaces.
        return text.split(marker)[1].lstrip('\n').split('\n')[0].replace('"', '').lstrip(' ')

    outputs = []
    for item in results.values():
        raw = item['output']
        parts = raw.split('assistant\n') if isinstance(raw, str) else []
        if len(parts) < 2:
            # No reply section: emit a placeholder so the outputs stay
            # aligned with the gold data instead of raising IndexError.
            if handled:
                outputs.append('-')
            continue
        output = parts[1]

        if prompt_type in ['base', 'default']:
            if setting not in markers_by_setting:
                raise ValueError(f"setting must be 'decrypt' or 'encrypt', got {setting!r}")
            marker = next((m for m in markers_by_setting[setting] if m in output), None)
            if marker is not None:
                answer = after_marker(output, marker)
            elif setting == 'decrypt' and 'the decoded message is:' in output.lower():
                # Case-insensitive marker; the answer is taken from the
                # lower-cased text.
                answer = after_marker(output.lower(), 'the decoded message is:')
            else:
                # No marker: use the first line of the reply.
                answer = output.lstrip('\n').split('\n')[0].rstrip('.')
            if setting == 'decrypt' and 'the decoded text:' in output and 'text is:' in output:
                # Replies that restate the task and then answer with "... text is:".
                answer = after_marker(output, 'text is:')
            outputs.append(answer.rstrip('. '))

        if prompt_type == 'cot-like':
            try:
                outputs.append(json.loads(output)[field_by_setting[setting]])
            except Exception:
                # Best effort: invalid JSON or missing field counts as no answer.
                outputs.append('-')
    return outputs

In [209]:
# Extract the answers of every Qwen2.5 run listed in `config`
# (preprocess_all writes one outputs-<id>.txt per run).
experiment_ids = config['experiment_id'].tolist()
preprocess_all(experiment_ids, preprocess_qwen)

In [211]:
# Score every Qwen2.5 run; evaluate_all returns `config` with score columns added.
config = evaluate_all(config['experiment_id'].tolist(), config)

In [212]:
# Save the Qwen2.5 scores without the DataFrame index.
config.to_csv('evaluation_scores_qwen.csv', index=False)

### qwq

In [193]:
# QwQ-32B-Preview runs, excluding the 'default' prompt.  `.copy()` detaches
# the filtered frame so evaluate_all() can add score columns without
# SettingWithCopyWarning.
config = pd.read_table('config.txt')
config = config[(config['model'] == 'Qwen/QwQ-32B-Preview') & (config['prompt_type'] != 'default')].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
32,20241231_214732,100,natural,base,decrypt,0,Qwen/QwQ-32B-Preview,0.01,1024
64,20250101_024404,100,random,base,decrypt,0,Qwen/QwQ-32B-Preview,0.01,1024
68,20250101_081941,100,natural,base,encrypt,0,Qwen/QwQ-32B-Preview,0.01,1024
70,20250101_114522,100,random,base,encrypt,0,Qwen/QwQ-32B-Preview,0.01,1024


In [183]:
def preprocess_qwq(results, prompt_type, setting):
    """Extract the final answer from each QwQ response.

    Only the text after the first ``assistant`` + newline separator of the
    raw output is inspected.

    Args:
        results: mapping whose values carry the raw text under ``'output'``.
        prompt_type: ``'base'`` (free-text answers) or ``'cot-like'`` (JSON
            answers).  Other values yield an empty list.
        setting: ``'decrypt'`` or ``'encrypt'``.

    Returns:
        One entry per result for the supported prompt types.  ``'-'`` marks
        responses without the chat separator and cot-like responses that are
        not valid JSON or lack the expected field.

    Raises:
        ValueError: for ``'base'`` with an unknown ``setting`` (previously an
            UnboundLocalError, or a stale answer reused).
    """
    # Decrypt markers matched case-insensitively, tried in this order.  The
    # flag says whether colons are removed from the text before splitting
    # (needed for the one marker that is written without its colon).
    decrypt_markers = (
        ('putting it all together:', False),
        ('the plaintext is:', False),
        ('the decoded plaintext is', True),
        ('the decoded text is:', False),
    )
    encrypt_markers = ('encoded text is:', 'cipher text is:', 'encoded text:')
    field_by_setting = {'encrypt': 'cipher_text', 'decrypt': 'plain_text'}
    handled = prompt_type in ('base', 'cot-like')

    def first_line(text):
        # Fallback when no marker is present: first line of the reply.
        return text.lstrip('\n').split('\n')[0].rstrip('.')

    outputs = []
    for item in results.values():
        raw = item['output']
        parts = raw.split('assistant\n') if isinstance(raw, str) else []
        if len(parts) < 2:
            # No reply section: emit a placeholder so the outputs stay
            # aligned with the gold data instead of raising IndexError.
            if handled:
                outputs.append('-')
            continue
        output = parts[1]

        if prompt_type == 'base':
            if setting == 'decrypt':
                lowered = output.lower()
                if '**Final Answer**' in output:
                    # Unwrap the LaTeX \boxed{\text{...}} answer.
                    answer = (
                        output.split('**Final Answer**')[1]
                        .replace('\n\n\\[ \\boxed{\\text{', '')
                        .replace('}} \\]', '')
                        .rstrip('\n')
                    )
                else:
                    for marker, drop_colons in decrypt_markers:
                        if marker in lowered:
                            haystack = lowered.replace(':', '') if drop_colons else lowered
                            answer = haystack.split(marker)[1].lstrip('\n').split('\n')[0]
                            break
                    else:
                        answer = first_line(output)
            elif setting == 'encrypt':
                marker = next((m for m in encrypt_markers if m in output), None)
                if marker is None:
                    answer = first_line(output)
                else:
                    answer = (
                        output.split(marker)[1].lstrip('\n').split('\n')[0]
                        .replace('"', '').lstrip(' ')
                    )
            else:
                raise ValueError(f"setting must be 'decrypt' or 'encrypt', got {setting!r}")
            outputs.append(answer.lstrip(' ').replace('"', ''))

        if prompt_type == 'cot-like':
            try:
                outputs.append(json.loads(output)[field_by_setting[setting]])
            except Exception:
                # Best effort: invalid JSON or missing field counts as no answer.
                outputs.append('-')
    return outputs

In [184]:
# Extract the answers of every QwQ run listed in `config`
# (preprocess_all writes one outputs-<id>.txt per run).
experiment_ids = config['experiment_id'].tolist()
preprocess_all(experiment_ids, preprocess_qwq)

In [None]:
# Score the QwQ runs.
# NOTE(review): only the FIRST experiment id is evaluated, and `config` is
# replaced by the one-row frame for that id -- the CSV written in the next
# cell will therefore hold a single run.  Confirm this is intended.
config = evaluate_all(config['experiment_id'].tolist()[:1], config[config['experiment_id'] == config['experiment_id'].tolist()[0]])

In [195]:
# Save the QwQ scores without the DataFrame index.
config.to_csv('evaluation_scores_qwq.csv', index=False)

In [196]:
# Dump every raw response of one results file for eyeballing:
# key, response text, then a separator line.
with open('results-20250101_130617.json') as file:
    kk = json.load(file)

separator = '-' * 20
for key, tt in kk.items():
    print(key, tt['output'], separator, sep='\n')

0
system
Decode the following Caesar cipher text. The shift is 3. Output a lookup table and the plain text in a json dictionary: {"lookup_table": {}, "plain_text":...}. Output the dictionary only.
user
jrrg ghhgv eulqj mrb
assistant
To decode the Caesar cipher with a shift of 3, we need to shift each letter in the ciphertext back by 3 positions in the alphabet. We'll create a lookup table that maps each ciphertext character to its corresponding plaintext character and then use this table to decode the given ciphertext.

First, let's create the lookup table for the Caesar cipher with a shift of 3.

### Step 1: Create the Lookup Table

The English alphabet has 26 letters. A Caesar cipher with a shift of 3 means that each letter is shifted three positions back in the alphabet.

- 'a' -> 'x'
- 'b' -> 'y'
- 'c' -> 'z'
- 'd' -> 'a'
- and so on...

We need to create a mapping for all 26 letters, both uppercase and lowercase, and consider non-alphabetic characters remain unchanged.

### Step 2

## cot-like: lookup table

In [226]:
def generate_lookup_table(shift, setting, alphabet='abcdefghijklmnopqrstuvwxyz'):
    """Build the reference Caesar-cipher lookup table.

    Args:
        shift: number of positions to move each letter.
        setting: ``'encrypt'`` maps each letter ``shift`` positions forward,
            ``'decrypt'`` maps it ``shift`` positions backward (both wrap
            around the alphabet).
        alphabet: ordered symbols the cipher operates on.

    Returns:
        dict mapping every symbol of ``alphabet`` to its shifted counterpart.

    Raises:
        ValueError: if ``setting`` is neither ``'encrypt'`` nor ``'decrypt'``
            (previously this surfaced as an UnboundLocalError).
    """
    directions = {'encrypt': 1, 'decrypt': -1}
    if setting not in directions:
        raise ValueError(f"setting must be 'encrypt' or 'decrypt', got {setting!r}")
    offset = directions[setting] * shift
    size = len(alphabet)
    return {
        letter: alphabet[(alphabet.index(letter) + offset) % size]
        for letter in alphabet
    }

In [241]:
# All 'cot-like' runs except those of the QwQ model.
model = 'Qwen/QwQ-32B-Preview'
config = pd.read_table('config.txt').loc[
    lambda runs: (runs['model'] != model) & (runs['prompt_type'] == 'cot-like')
]
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
12,20241230_150644,100,natural,cot-like,decrypt,0,gpt-4o,0.0,256
13,20241230_151022,100,natural,cot-like,encrypt,0,gpt-4o,0.0,256
14,20241230_151431,100,random,cot-like,decrypt,0,gpt-4o,0.0,256
15,20241230_151832,100,random,cot-like,encrypt,0,gpt-4o,0.0,256
17,20241231_215558,100,natural,cot-like,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
21,20241231_221204,100,random,cot-like,decrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
24,20241231_221449,100,natural,cot-like,decrypt,0,models--Qwen2.5-32B-Instruct,0.01,1024
25,20241231_222940,100,natural,cot-like,encrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
28,20241231_223901,100,random,cot-like,encrypt,0,models--Qwen2.5-7B-Instruct,0.01,1024
32,20241231_231307,100,random,cot-like,decrypt,0,models--Qwen2.5-32B-Instruct,0.01,1024


In [245]:
# Score the lookup tables of the cot-like runs.
#   legal    : share of responses whose lookup table could be parsed and
#              only uses symbols of the reference alphabet
#   accurate : share of legal tables in which every listed mapping is right
# NOTE(review): a table that lists only some letters (or none) still counts
# as accurate -- confirm that completeness is not meant to be required.
experiment_ids = config['experiment_id'].tolist()
llama_models = ['models--llama-3.1/hf/8B-Instruct/', 'models--llama-3.1/hf/70B-Instruct/']
qwen_models = ['models--Qwen2.5-7B-Instruct', 'models--Qwen2.5-32B-Instruct']
outputs = defaultdict(list)
for experiment_id in experiment_ids:
    with open(f'results/results-{experiment_id}.json') as file:
        results = json.load(file)
    row_mask = config['experiment_id'] == experiment_id
    type_of_plain_text = config['type_of_plain_text'][row_mask].to_list()[0]
    model = config['model'][row_mask].to_list()[0]
    setting = config['setting'][row_mask].tolist()[0]

    legal, corrects = 0, 0
    for item in results.values():
        reference = generate_lookup_table(item['shift'], setting)
        try:
            raw = item['output']
            if model in ['gpt-4o']:
                # Parse the outermost {...} span; lstrip/rstrip with a fence
                # string strip character sets, not a prefix/suffix.
                payload = raw[raw.find('{'):raw.rfind('}') + 1]
            elif model in llama_models:
                payload = raw.split('assistant\n\n')[1]
            elif model in qwen_models:
                payload = raw.split('assistant\n')[1]
            else:
                # No parser for this model: treat as unparseable rather than
                # silently reusing the previous item's table.
                raise ValueError(f'no lookup-table parser for model {model!r}')
            lookup = json.loads(payload)['lookup_table']
            # Visit every entry (no short-circuit): a symbol missing from the
            # reference raises KeyError and makes the whole table illegal.
            mismatches = [token for token in lookup if reference[token] != lookup[token]]
        except Exception:
            # Best effort: anything that cannot be parsed/compared is not legal.
            continue
        legal += 1
        if not mismatches:
            corrects += 1

    outputs['type'].append(type_of_plain_text)
    outputs['model'].append(model)
    # Use the real number of examples and avoid dividing by zero.
    outputs['legal'].append(legal / len(results) if results else 0.0)
    outputs['accurate'].append(round(corrects / legal, 2) if legal else float('nan'))

In [246]:
pd.DataFrame(outputs)

Unnamed: 0,type,model,legal,accurate
0,natural,gpt-4o,0.99,1.0
1,natural,gpt-4o,0.99,1.0
2,random,gpt-4o,0.98,1.0
3,random,gpt-4o,0.97,1.0
4,natural,models--Qwen2.5-7B-Instruct,0.6,0.02
5,random,models--Qwen2.5-7B-Instruct,0.7,0.01
6,natural,models--Qwen2.5-32B-Instruct,0.97,0.26
7,natural,models--Qwen2.5-7B-Instruct,0.24,0.29
8,random,models--Qwen2.5-7B-Instruct,0.43,0.37
9,random,models--Qwen2.5-32B-Instruct,0.99,0.3


## code

In [479]:
# 'code' prompt runs of the selected model.
model = '1-GPT-4o'
config = pd.read_table('config.txt').loc[
    lambda runs: (runs['model'] == model) & (runs['prompt_type'] == 'code')
]
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
4,20240826_121214,100,natural,code,decrypt,0,1-GPT-4o,0.0,512
5,20240826_121553,100,natural,code,encrypt,0,1-GPT-4o,0.0,512
13,20240826_125147,100,random,code,decrypt,0,1-GPT-4o,0.0,512
14,20240826_125531,100,random,code,encrypt,0,1-GPT-4o,0.0,512


In [580]:
# Execute the code each response contains and compare the resulting
# `plain_text` / `cipher_text` variable with the gold answer.
#   legal    : share of responses whose code ran and defined the variable
#   accuracy : exact-match accuracy of the produced values ('-' on failure)
#
# SECURITY NOTE: exec() runs model-generated code with the full privileges
# of this kernel.  Only run this cell on outputs you trust, ideally inside a
# sandbox / throw-away environment.
experiment_ids = config['experiment_id'].tolist()
field_by_setting = {'decrypt': 'plain_text', 'encrypt': 'cipher_text'}
outputs = defaultdict(dict)
for experiment_id in experiment_ids:
    setting = config['setting'][config['experiment_id'] == experiment_id].tolist()[0]
    # None for an unknown setting -> lookup below fails -> counted as not legal.
    field = field_by_setting.get(setting)
    with open(f'results/results-{experiment_id}.json') as file:
        results = json.load(file)
    output = {'golds': [], 'preds': []}
    legal = 0
    for key, item in results.items():
        output['golds'].append(item['gold'])
        try:
            # First fenced block of the response, minus the fence's language
            # tag only (a blanket replace would also hit 'python\n' in code).
            code = re.sub(r'\A[A-Za-z0-9_+-]*\n', '', item['output'].split('```')[1])
            # Keep a copy on disk for the debug cell below; UTF-8 so that
            # non-ASCII text survives on every platform.
            with open('test_code.py', 'w', encoding='utf-8') as file:
                file.write(code)
            # A single namespace: with separate globals/locals dicts,
            # functions defined by the code cannot see each other.
            namespace = {}
            exec(code, namespace)
            output['preds'].append(namespace[field])
            legal += 1
        except Exception:
            output['preds'].append('-')
            # Save the failing response for manual inspection.
            with open(f'tt-{experiment_id}-{key}.py', 'w', encoding='utf-8') as file:
                file.write(str(item['output']))
    outputs[experiment_id]['legal'] = legal / len(results) if results else 0.0
    outputs[experiment_id]['accuracy'] = round(accuracy_score(output['golds'], output['preds']), 4)

good deeds bring joy
embrace the unknown
chasing your dreams
walk in someone else’s shoes
finding peace within
laughter is the best medicine
reach for the stars
beauty in simplicity
time heals all wounds
cherish the little moments
turning over a new leaf
actions speak louder than words
nurture your relationships
every cloud has a silver lining
kindness costs nothing
a friend in need is a friend indeed
believe in the future
learning from mistakes
a heart full of gratitude
love conquers all
appreciate the small things
change is the only constant
hard work pays off
follow your heart
living in the moment
spread love and positivity
be your own hero
the best is yet to come
seek out new adventures
finding strength in adversity
stay true to yourself
overcoming obstacles
mind over matter
make every day count
rise above the rest
gratitude changes everything
experiment new happiness
the joy of giving
keep moving forward
find your inner peace
eyes on the prize
take the road less traveled
cherish e

In [581]:
outputs

defaultdict(dict,
            {'20240807_120559': {'legal': 1.0, 'accuracy': 1.0},
             '20240807_121723': {'legal': 0.99, 'accuracy': 0.99},
             '20240809_112140': {'legal': 0.99, 'accuracy': 0.99},
             '20240809_112558': {'legal': 0.99, 'accuracy': 0.99}})

In [568]:
# Debug helper: execute the current contents of test_code.py (the file the
# code-evaluation cell writes) and keep the resulting names in `exec_locals`
# for inspection.
# NOTE(review): exec() runs the file's code with the full privileges of this
# kernel -- only use on trusted content.
with open('test_code.py', 'r', encoding='utf-8') as file:
    code = file.read()
    exec_globals = {}
    exec_locals = {}
    exec(code, exec_globals, exec_locals)

ljoyq vfqd


In [569]:
exec_locals

{'caesar_cipher': <function caesar_cipher(text, shift)>,
 'cipher_text': 'ljoyq vfqd'}

In [582]:
print("```python\ndef caesar_cipher_decrypt(cipher_text, shift):\n    decrypted_text = \"\"\n    for char in cipher_text:\n        if char.isalpha():\n            shifted = ord(char) - shift\n            if char.islower():\n                if shifted < ord('a'):\n                    shifted += 26\n            elif char.isupper():\n                if shifted < ord('A'):\n                    shifted += 26\n            decrypted_text += chr(shifted)\n        else:\n            decrypted_text += char\n    return decrypted_text\n\ncipher_text = \"vxuhjg dlng mhli\"\nshift = 3\nplain_text = caesar_cipher_decrypt(cipher_text, shift)\nprint(plain_text)\n```\n\nOutput:\n```\nsurgej aing jeif\n```")

```python
def caesar_cipher_decrypt(cipher_text, shift):
    decrypted_text = ""
    for char in cipher_text:
        if char.isalpha():
            shifted = ord(char) - shift
            if char.islower():
                if shifted < ord('a'):
                    shifted += 26
            elif char.isupper():
                if shifted < ord('A'):
                    shifted += 26
            decrypted_text += chr(shifted)
        else:
            decrypted_text += char
    return decrypted_text

cipher_text = "vxuhjg dlng mhli"
shift = 3
plain_text = caesar_cipher_decrypt(cipher_text, shift)
print(plain_text)
```

Output:
```
surgej aing jeif
```


## position

In [247]:
# Load the experiment table and keep only the zero-shot gpt-4o runs.
config = pd.read_table('config.txt')
# .copy() makes the selection an independent frame: the scoring cell below adds
# new columns to `config`, which would otherwise write into a slice of the full
# table (SettingWithCopyWarning on pandas < 3).
config = config[(config['model'] == 'gpt-4o') & (config['shot'] == 0)].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
0,20241230_142050,100,natural,base,decrypt,0,gpt-4o,0.0,64
1,20241230_142136,100,natural,base,encrypt,0,gpt-4o,0.0,64
2,20241230_142222,100,natural,default,decrypt,0,gpt-4o,0.0,2048
3,20241230_142918,100,natural,default,encrypt,0,gpt-4o,0.0,2048
4,20241230_143257,100,natural,code,decrypt,0,gpt-4o,0.0,512
5,20241230_143809,100,natural,code,encrypt,0,gpt-4o,0.0,512
6,20241230_144045,100,random,base,decrypt,0,gpt-4o,0.0,64
7,20241230_144131,100,random,base,encrypt,0,gpt-4o,0.0,64
8,20241230_144214,100,random,default,decrypt,0,gpt-4o,0.0,2048
9,20241230_145333,100,random,default,encrypt,0,gpt-4o,0.0,2048


In [248]:
# Per-shift accuracy of the 1st, 2nd and 3rd output character for every
# experiment in `config`; the scores are added to `config` as new columns.
experiment_ids = config['experiment_id'].tolist()
shifts = [3, 6, 9, 12]
accuracies_1_by_shift, accuracies_2_by_shift, accuracies_3_by_shift = defaultdict(list), defaultdict(list), defaultdict(list)
for experiment_id in experiment_ids:
    with open(f'results/results-{experiment_id}.json') as file:
        results = json.load(file)
    with open(f'results/outputs-{experiment_id}.txt') as file:
        outputs = [line.rstrip('\n') for line in file]
    shot = config['shot'][config['experiment_id'] == experiment_id].tolist()[0]
    for shift in shifts:
        if shot != 0:
            # Keep one entry per experiment so the column assignment below
            # still matches the number of rows in `config`.
            accuracies_1_by_shift[shift].append('-')
            accuracies_2_by_shift[shift].append('-')
            accuracies_3_by_shift[shift].append('-')
            continue
        # Outputs are aligned with results by position. Select both through the
        # same shift filter instead of assuming each shift occupies a fixed block
        # of 25 consecutive lines (0-25, 25-50, ...).
        picked = [
            (key, item, outputs[index])
            for index, (key, item) in enumerate(results.items())
            if item['shift'] == shift
        ]
        subset = {key: item for key, item, _ in picked}
        subset_outputs = [output for _, _, output in picked]
        # evaluate() returns (accuracy, accuracy_1, accuracy_2, accuracy_3, mean TER);
        # only the per-character accuracies are needed here.
        _, accuracy_1, accuracy_2, accuracy_3, _ = evaluate(subset, subset_outputs)
        accuracies_1_by_shift[shift].append(accuracy_1)
        accuracies_2_by_shift[shift].append(accuracy_2)
        accuracies_3_by_shift[shift].append(accuracy_3)
for shift in shifts:
    config[f'accuracy_1_shift_{shift}'] = accuracies_1_by_shift[shift]
    config[f'accuracy_2_shift_{shift}'] = accuracies_2_by_shift[shift]
    config[f'accuracy_3_shift_{shift}'] = accuracies_3_by_shift[shift]

In [249]:
# Save the table with the per-position accuracy columns; the DataFrame index is not written.
config.to_csv('evaluation_scores_position.csv', index=False)

## relaxed evaluation

In [512]:
# Load the experiment table and keep only the zero-shot 1-GPT-4o runs.
config = pd.read_table('config.txt')
# .copy() makes the selection an independent frame: the evaluation functions add
# score columns to `config`, which would otherwise write into a slice of the
# full table (SettingWithCopyWarning on pandas < 3).
config = config[(config['model'] == '1-GPT-4o') & (config['shot'] == 0)].copy()
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens
0,20240826_120207,100,natural,base,decrypt,0,1-GPT-4o,0.0,64
1,20240826_120303,100,natural,base,encrypt,0,1-GPT-4o,0.0,64
2,20240826_120356,100,natural,cot-like,decrypt,0,1-GPT-4o,0.0,256
3,20240826_120804,100,natural,cot-like,encrypt,0,1-GPT-4o,0.0,256
4,20240826_121214,100,natural,code,decrypt,0,1-GPT-4o,0.0,512
5,20240826_121553,100,natural,code,encrypt,0,1-GPT-4o,0.0,512
6,20240826_121851,100,natural,default,decrypt,0,1-GPT-4o,0.0,2048
7,20240826_123049,100,natural,default,encrypt,0,1-GPT-4o,0.0,2048
8,20240826_124202,100,random,base,decrypt,0,1-GPT-4o,0.0,64
9,20240826_124300,100,random,base,encrypt,0,1-GPT-4o,0.0,64


In [513]:
def caesar_cipher(plain_text, shift, alphabet='abcdefghijklmnopqrstuvwxyz'):
    """Apply a Caesar shift to the lower-cased text.

    Every character found in ``alphabet`` is replaced by the character
    ``shift`` positions further along, wrapping around at the end; a negative
    ``shift`` moves in the opposite direction. Characters that are not in
    ``alphabet`` (spaces, punctuation, ...) are copied through unchanged.
    """
    size = len(alphabet)

    def rotate(symbol):
        # Leave anything outside the alphabet untouched.
        if symbol not in alphabet:
            return symbol
        return alphabet[(alphabet.index(symbol) + shift) % size]

    return ''.join(rotate(symbol) for symbol in plain_text.lower())

def evaluate_relax(results, outputs, setting):
    """Score outputs while also accepting a shift applied in the opposite direction.

    The reference for each example is normally ``v['gold']``. When an output is
    exactly equal to the "alternative" reference -- ``v['input']`` shifted the
    other way round -- that alternative is used as the reference instead, so the
    example is counted as correct.

    Args:
        results: dict whose values provide the keys 'gold', 'input' and 'shift'.
        outputs: list of output strings, aligned by position with ``results``.
        setting: 'encrypt' or 'decrypt'.

    Returns:
        A tuple ``(accuracy, accuracy_1, accuracy_2, accuracy_3, ter)``:
        exact-match accuracy, accuracy on the 1st/2nd/3rd character only
        (a missing character is treated as '-'), and the mean token error rate.

    Raises:
        ValueError: if ``setting`` is neither 'encrypt' nor 'decrypt'.
    """
    golds_normal = [v['gold'] for _, v in results.items()]

    # The alternative reference is the input shifted in the direction opposite
    # to the one the task asks for.
    # NOTE(review): this assumes gold is input shifted by +shift for 'encrypt'
    # and by -shift for 'decrypt' -- confirm against the data generator.
    # Both branches used to test for 'encrypt', so the second overwrote the
    # first and 'decrypt' left golds_alternative undefined.
    if setting == 'encrypt':
        golds_alternative = [caesar_cipher(v['input'], -int(v['shift'])) for _, v in results.items()]
    elif setting == 'decrypt':
        golds_alternative = [caesar_cipher(v['input'], int(v['shift'])) for _, v in results.items()]
    else:
        raise ValueError(f"setting must be 'encrypt' or 'decrypt', got {setting!r}")

    # Use the alternative reference only where the output matches it exactly.
    golds = []
    for i, (normal, alternative) in enumerate(zip(golds_normal, golds_alternative)):
        golds.append(alternative if alternative == outputs[i] else normal)

    ters = [token_error_rate(golds[i], outputs[i]) for i in range(len(golds))]

    def char_at(strings, position):
        # Character at `position` of every string, '-' where the string is too short.
        return [s[position] if len(s) > position else '-' for s in strings]

    return (
        accuracy_score(golds, outputs),
        accuracy_score(char_at(golds, 0), char_at(outputs, 0)),
        accuracy_score(char_at(golds, 1), char_at(outputs, 1)),
        accuracy_score(char_at(golds, 2), char_at(outputs, 2)),
        np.average(ters),
    )

def evaluate_all_relax(experiment_ids, config):
    """Run the relaxed evaluation for every experiment and attach the scores to ``config``.

    For each experiment id, ``results/results-<id>.json`` and
    ``results/outputs-<id>.txt`` are read and scored with ``evaluate_relax``.
    Zero-shot experiments are additionally scored per shift (3, 6, 9, 12);
    for other shot counts the per-shift entries are the placeholder '-'.

    Args:
        experiment_ids: experiment ids to score, in the row order of ``config``.
        config: DataFrame with at least 'experiment_id', 'shot' and 'setting'.

    Returns:
        ``config`` itself, modified in place, with the added columns 'accuracy',
        'accuracy_1', 'accuracy_2', 'accuracy_3', 'token_error_rate',
        'accuracy_shift_<s>' and 'ter_shift_<s>'.
    """
    shifts = [3, 6, 9, 12]
    accuracies, accuracies_1, accuracies_2, accuracies_3, ters = [], [], [], [], []
    accuracies_by_shift, ters_by_shift = defaultdict(list), defaultdict(list)
    for experiment_id in experiment_ids:
        with open(f'results/results-{experiment_id}.json') as file:
            results = json.load(file)
        with open(f'results/outputs-{experiment_id}.txt') as file:
            outputs = [line.rstrip('\n') for line in file]
        row = config['experiment_id'] == experiment_id
        shot = config['shot'][row].tolist()[0]
        setting = config['setting'][row].tolist()[0]

        accuracy, accuracy_1, accuracy_2, accuracy_3, ter = evaluate_relax(results, outputs, setting)
        accuracies.append(accuracy)
        accuracies_1.append(accuracy_1)
        accuracies_2.append(accuracy_2)
        accuracies_3.append(accuracy_3)
        ters.append(ter)

        for shift in shifts:
            if shot != 0:
                # No per-shift breakdown for few-shot runs; keep the columns aligned.
                accuracies_by_shift[shift].append('-')
                ters_by_shift[shift].append('-')
                continue
            # Outputs are aligned with results by position. Select both through
            # the same shift filter instead of assuming each shift occupies a
            # fixed block of 25 consecutive lines (0-25, 25-50, ...).
            picked = [
                (key, item, outputs[index])
                for index, (key, item) in enumerate(results.items())
                if item['shift'] == shift
            ]
            subset = {key: item for key, item, _ in picked}
            subset_outputs = [output for _, _, output in picked]
            shift_accuracy, _, _, _, shift_ter = evaluate_relax(subset, subset_outputs, setting)
            accuracies_by_shift[shift].append(shift_accuracy)
            ters_by_shift[shift].append(shift_ter)

    config['accuracy'] = accuracies
    config['accuracy_1'] = accuracies_1
    config['accuracy_2'] = accuracies_2
    config['accuracy_3'] = accuracies_3
    config['token_error_rate'] = ters
    for shift in shifts:
        config[f'accuracy_shift_{shift}'] = accuracies_by_shift[shift]
    for shift in shifts:
        config[f'ter_shift_{shift}'] = ters_by_shift[shift]
    return config

In [514]:
# Take the ids from the config selected in this section; otherwise the ids left
# over from an earlier section (a different model filter) would be scored and
# silently attached to these rows.
experiment_ids = config['experiment_id'].tolist()
# NOTE(review): this calls the strict evaluate_all, although this section defines
# evaluate_all_relax and the result is saved as evaluation_scores_relaxed.csv --
# confirm which function is intended.
config = evaluate_all(experiment_ids, config)

In [515]:
# Display the experiment table with the score columns added by the previous cell.
config

Unnamed: 0,experiment_id,number_of_data,type_of_plain_text,prompt_type,setting,shot,model,temperature,max_tokens,accuracy,...,accuracy_3,token_error_rate,accuracy_shift_3,accuracy_shift_6,accuracy_shift_9,accuracy_shift_12,ter_shift_3,ter_shift_6,ter_shift_9,ter_shift_12
0,20240826_120207,100,natural,base,decrypt,0,1-GPT-4o,0.0,64,0.22,...,0.5,0.515692,0.76,0.08,0.0,0.04,0.110327,0.595297,0.637902,0.719241
1,20240826_120303,100,natural,base,encrypt,0,1-GPT-4o,0.0,64,0.16,...,0.74,0.311455,0.64,0.0,0.0,0.0,0.117021,0.355676,0.432167,0.340958
2,20240826_120356,100,natural,cot-like,decrypt,0,1-GPT-4o,0.0,256,0.22,...,0.61,0.456984,0.76,0.08,0.04,0.0,0.10441,0.465133,0.613474,0.644918
3,20240826_120804,100,natural,cot-like,encrypt,0,1-GPT-4o,0.0,256,0.21,...,0.75,0.309935,0.76,0.04,0.04,0.0,0.063175,0.365997,0.435964,0.374605
4,20240826_121214,100,natural,code,decrypt,0,1-GPT-4o,0.0,512,0.22,...,0.59,0.440373,0.76,0.12,0.0,0.0,0.079209,0.424592,0.606539,0.65115
5,20240826_121553,100,natural,code,encrypt,0,1-GPT-4o,0.0,512,0.19,...,0.73,0.280998,0.72,0.04,0.0,0.0,0.091431,0.298944,0.457253,0.276362
6,20240826_121851,100,natural,default,decrypt,0,1-GPT-4o,0.0,2048,0.72,...,0.98,0.086029,0.88,0.68,0.56,0.76,0.003571,0.111497,0.090865,0.138183
7,20240826_123049,100,natural,default,encrypt,0,1-GPT-4o,0.0,2048,0.21,...,0.79,0.299202,0.64,0.04,0.12,0.04,0.112589,0.328487,0.347634,0.408097
8,20240826_124202,100,random,base,decrypt,0,1-GPT-4o,0.0,64,0.0,...,0.37,0.543129,0.0,0.0,0.0,0.0,0.454318,0.499523,0.625692,0.592985
9,20240826_124300,100,random,base,encrypt,0,1-GPT-4o,0.0,64,0.04,...,0.75,0.279466,0.08,0.04,0.04,0.0,0.125944,0.268329,0.381823,0.341768


In [516]:
# Save the scored table; the DataFrame index is not written.
config.to_csv('evaluation_scores_relaxed.csv', index=False)