In [11]:
import os
import json
from collections import defaultdict

# --- Run configuration -------------------------------------------------------
# Language of the question set; it selects the `<lang>-questions-*.json` files.
lang = 'english'

# Directory holding the question files. NOTE: this is a machine-specific
# absolute path, and it must keep its trailing slash because file names are
# appended to it directly.
data_path = "/home/bhux/workplace/hamming/hamming-examples/ts/src/createM3Dataset/m3/data/text-question/"
test_file_path = f"{data_path}{lang}-questions-test.json"

num_samples = 'all'                       # 'all', or an integer cap on the number of test questions
selected_levels = ['low', 'mid', 'high']  # any subset of 'low', 'mid', 'high'
method = 'default'                        # 'default', 'en-instruct', or 'en-trans'
setting = 'zero-shot'                     # 'zero-shot' or 'few-shot'
model = 'chat'                            # 'chat'

In [12]:
def generate_one_example(question, lang, method, fill_answer=False, return_answer=False):
    """Render a single question as prompt text.

    The pieces of the prompt (background, question text, options, answer cue)
    are separated by a backslash character followed by a newline.

    Args:
        question: Question dict. 'background_description' (a list of strings)
            and 'answer_text' are always read. The 'default' and 'en-instruct'
            methods read 'question_text' and 'options'; 'en-trans' reads the
            '*_english' fields instead and concatenates them as strings.
        lang: Language key used to pick the localized answer cue; only used by
            the 'default' method.
        method: 'default', 'en-instruct', or 'en-trans'.
        fill_answer: If true, append the answer to the prompt (used to build
            few-shot examples). Ignored when ``return_answer`` is true.
        return_answer: If true, return ``(prompt, answer)`` instead of the
            prompt alone.

    Returns:
        The prompt string, or a ``(prompt, answer)`` tuple when
        ``return_answer`` is true.

    Raises:
        NotImplementedError: ``method`` is not one of the supported values.
        KeyError: ``lang`` has no answer cue (method 'default'), or a required
            question field is missing.
    """
    answer_word = {'english': "Answer:", 'chinese': '答案：', 'vietnamese': 'Câu trả lời:', 'thai': 'คำตอบ:', 'italian': 'La risposta:',
                   'javanese': 'Wangsulan:', 'swahili': 'Jibu:', 'afrikaans': 'Antwoord:' ,'portuguese': 'Responder:'}
    sep = '\\\n'  # backslash + newline: the separator used throughout the prompt

    if question['background_description'] != []:
        background = sep + sep.join(question['background_description'])
    else:
        background = ''

    if method == 'default':
        # Options are joined with an extra space after the separator.
        prompt = background + sep + question['question_text'] + sep + '\\\n '.join(question['options']) + f'\\\n{answer_word[lang]}'
    elif method == 'en-instruct':
        prompt = background + sep + question['question_text'] + sep + '\\\n '.join(question['options']) + '\\\n Answer:'
    elif method == 'en-trans':
        prompt = question['background_description_english'] + sep + question['question_text_english'] + sep + question['options_english'] + '\\\nAnswer:'
    else:
        # Without this branch `prompt` would be unassigned and the function
        # would fail below with an unhelpful UnboundLocalError.
        raise NotImplementedError(f"unsupported method: {method!r}")

    if return_answer:
        return prompt, str(question['answer_text'])

    if fill_answer:
        prompt += str(question['answer_text'])

    return prompt

def generate_prompt(lang, method, setting, model, test_question, dev_question):
    """Build the full prompt for one test question and return it with its answer.

    The prompt consists of an instruction line (the "hint"), optionally the
    few-shot examples for the question's level and subject, and the question
    itself as rendered by ``generate_one_example``.

    Args:
        lang: Language key, e.g. 'english'; must be a key of ``subject2target``.
        method: 'default' writes the hint in ``lang``; 'en-instruct' and
            'en-trans' always use the English hint.
        setting: 'zero-shot' or 'few-shot'.
        model: Model kind. With method 'default', the "only give the option"
            instruction is appended when this is 'chat' or 'fake', or when the
            setting is 'zero-shot'.
        test_question: Question dict; 'subject_category' is always read, and
            'level' is read for Javanese hints and for few-shot lookups.
        dev_question: Mapping level -> subject category -> list of already
            rendered example strings; only read in the few-shot setting.

    Returns:
        A ``(prompt, answer)`` tuple.

    Raises:
        NotImplementedError: Unsupported language for the 'default' method, or
            an unsupported method or setting.
        KeyError: ``lang`` or the question's subject category is missing from
            ``subject2target``.
    """
    subject2target = {'english': {'language': 'English', 'math': "Math", 'social-science': "Social Science", 'natural-science': 'Natural Science'},
                      'english4all': {'language': 'Language', 'math': "Math", 'social-science': "Social Science", 'natural-science': 'Natural Science'},
                      'chinese':  {'language': '语文', 'math': "数学", 'social-science': "社会科学", 'natural-science': '自然科学'},
                      'javanese': {'language': 'Bahasa Jawa'},
                      'swahili': {'language': 'KISWAHILI'},
                      'thai': {'language': 'ภาษาไทย', 'math': 'คณิตศาสตร์', 'social-science': 'สังคมศึกษา', 'natural-science': 'วิทยาศาสตร์'},
                      'vietnamese': {'language': 'Tiếng Việt', 'math': "Toán", 'social-science': "Khoa học xã hội", 'natural-science': 'Khoa học tự nhiên'},
                      'italian': {'language': 'Italiano', 'math': "Matematica", 'social-science': "Scienze sociali", 'natural-science': 'Scienze naturali'},
                      'afrikaans': {'language': 'Afrikaans Huistaal', 'math': "Wiskunde", 'social-science': "Sosiale Wetenskappe", 'natural-science': 'Natuurwetenskap'},
                      'portuguese': {'language': 'Linguagens', 'math': 'Matemática', 'social-science': 'Ciências Humanas', 'natural-science': 'Ciências da Natureza'},
                      }
    subject = subject2target[lang][test_question['subject_category']]

    # By default the instruction is written in the target language (monolingual setting).
    if method == 'default':
        # Javanese is handled separately below because its wording depends on the level.
        hint_templates = {
            'english': "The following is a multiple choice question about {subject}.",
            'chinese': "以下是关于{subject}的单项选择题。",
            'thai': "ต่อไปนี้เป็นคำถามแบบปรนัย วิชา{subject}.",
            'vietnamese': "Sau đây là các câu hỏi trắc nghiệm về {subject}.",
            'italian': "Le seguenti sono domande a risposta multipla su {subject}.",
            'afrikaans': "Die volgende is veelvuldige keuse vrae oor {subject}.",
            'swahili': "Yafuatayo ni maswali ya chaguo nyingi kuhusu Kiswahili.",
            'portuguese': "A seguir estão questões de múltipla escolha sobre {subject}.",
        }
        if lang == 'javanese':
            # Javanese uses a different register depending on the level.
            if test_question['level'] == 'low':
                hint = "Ing ngisor iki ana pitakon pilihan ganda babagan Bahasa Jawa."
            else:
                hint = "Menika soal pilihan ganda babagan Bahasa Jawa."
        elif lang in hint_templates:
            hint = hint_templates[lang].format(subject=subject)
        else:
            # `NotImplemented` is a comparison sentinel, not an exception;
            # raising it would surface as a TypeError.
            raise NotImplementedError(f"no instruction hint for language: {lang!r}")

        # The model must be told to output only the option text.
        if model in ['chat', 'fake'] or setting == 'zero-shot':
            only_option_instruction = {
                'english': ' Please only give the correct option, without any other details or explanations.',
                'chinese': ' 请仅给出正确选项对应的选项序号而非其他细节。',
                'thai': ' โปรดระบุคำตอบเป็นตัวเลือกที่ถูกต้องโดยไม่ต้องให้รายละเอียดอื่นเพิ่มเติม.',
                'vietnamese': ' Vui lòng chỉ đưa ra phương án đúng, không có bất kỳ chi tiết hay giải thích nào khác.',
                'italian': " Dai solo l'opzione corretta, senza altri dettagli o spiegazioni",
                'javanese': ' Nyuwun paringaken pilihan wangsulan ingkang leres mawon, tanpa detail utawi penjelasan sanesipun.',
                'afrikaans': ' Gee asseblief net die korrekte opsie, sonder enige ander besonderhede of verduidelikings.',
                'swahili': ' Tafadhali toa chaguo sahihi pekee, bila maelezo yoyote au maelezo.',
                'portuguese': ' Por favor, dê apenas a opção correta, sem quaisquer outros detalhes ou explicações.',
            }
            if lang not in only_option_instruction:
                raise NotImplementedError(f"no answer-format instruction for language: {lang!r}")
            hint += only_option_instruction[lang]

    # For any language, just use the English instruction.
    elif method == 'en-instruct' or method == 'en-trans':
        subject = subject2target['english4all'][test_question['subject_category']]
        hint = f"The following is a multiple choice question about {subject}."
        hint += ' Please only give the correct option, without any other details or explanations.'
    else:
        raise NotImplementedError(f"unsupported method: {method!r}")

    # Space, backslash, space. The backslash is escaped explicitly; the
    # unescaped form is an invalid escape sequence with the same value.
    hint += " \\ "

    block_sep = '\\\n\\\n'  # two backslash+newline pairs between prompt sections
    if setting == 'zero-shot':
        prompt, answer = generate_one_example(test_question, lang, method, return_answer=True)
        prompt = hint + block_sep + prompt
    elif setting == 'few-shot':
        # Few-shot examples are selected by the test question's level and subject.
        dev_questions_list = dev_question[test_question['level']][test_question['subject_category']]
        prompt, answer = generate_one_example(test_question, lang, method, return_answer=True)
        prompt = hint + block_sep + block_sep.join(dev_questions_list) + block_sep + prompt
    else:
        raise NotImplementedError(f"unsupported setting: {setting!r}")

    return prompt, answer

def generate_dev_examples(dev_questions, lang, method):
    """Group rendered dev-set examples by level and subject category.

    Each question is rendered with its answer filled in and stored under
    ``result[level][subject_category]``.

    Args:
        dev_questions: Iterable of question dicts; each must provide 'level'
            and 'subject_category'.
        lang: Language key passed through to ``generate_one_example``.
        method: Prompt method passed through to ``generate_one_example``.

    Returns:
        A nested ``defaultdict`` mapping level -> subject category -> list of
        rendered example strings, in input order.
    """
    examples_by_level = defaultdict(lambda: defaultdict(list))

    for item in dev_questions:
        level_key, category_key = item['level'], item['subject_category']
        rendered = generate_one_example(item, lang, method, fill_answer=True)
        examples_by_level[level_key][category_key].append(rendered)

    return examples_by_level

In [13]:
# Few-shot prompts need worked examples from the dev split; zero-shot needs none.
if setting == 'few-shot':
    dev_file_path = data_path + f"{lang}-questions-dev.json"
    if not os.path.exists(dev_file_path):
        raise FileNotFoundError(dev_file_path)
    # The question files contain multilingual text, so the encoding is pinned to UTF-8.
    with open(dev_file_path, "r", encoding="utf-8") as f:
        dev_questions = json.load(f)
    dev_examples = generate_dev_examples(dev_questions, lang, method)
else:
    dev_examples = {}

# Fail here with the offending path; silently skipping a missing test file
# would only surface later as a NameError on `test_questions`.
if not os.path.exists(test_file_path):
    raise FileNotFoundError(test_file_path)
with open(test_file_path, "r", encoding="utf-8") as f:
    test_questions = json.load(f)

# Optionally cap the number of test questions (applied before level filtering).
if num_samples != 'all':
    num_samples = int(num_samples)
    test_questions = test_questions[:num_samples]

# Only filter when fewer than all three levels were selected.
if len(selected_levels) < 3:
    test_questions = [q for q in test_questions if q['level'] in selected_levels]

# One (prompt, answer) pair per remaining test question.
all_prompts = []
for question in test_questions:
    prompt_pair = generate_prompt(lang, method, setting, model, question, dev_examples)
    all_prompts.append(prompt_pair)

In [14]:
# Sanity check: show the first generated (prompt, answer) pair.
print(all_prompts[0])

('The following is a multiple choice question about Social Science. Please only give the correct option, without any other details or explanations. \\ \\\n\\\n\\\nWhich statement is an opinion about World War II?\\\n(1) Dropping atomic bombs was not necessary to end the war.\\\n (2) The economies of many countries were damaged by the war.\\\n (3) Many families suffered the loss of loved ones during the war.\\\n (4) Italy and Germany were members of the Axis powers.\\\nAnswer:', '1')


In [15]:
# Fixed TypeScript text emitted before and after the generated dataset entries.
header = "import { DatasetItemValue } from \"@hamming/hamming-sdk\"; \n\n const sampleData: DatasetItemValue[] = ["
footer = "];export default sampleData;"


def _ts_string_literal(text):
    """Return ``text`` as a double-quoted TypeScript string literal."""
    # A JSON string is a valid JS/TS string literal: json.dumps escapes quotes,
    # backslashes, newlines and other control characters in one pass.
    literal = json.dumps(text, ensure_ascii=False)
    # U+2028 / U+2029 are line terminators in older ECMAScript grammars, so
    # they are escaped explicitly.
    return literal.replace("\u2028", "\\u2028").replace("\u2029", "\\u2029")


def create_data_file(prompt_pairs):
    """Render (prompt, answer) pairs as the source text of a TypeScript module.

    Each pair becomes one object literal with the prompt under ``input.query``
    and the answer under ``output.response``; the entries are wrapped between
    the module-level ``header`` and ``footer``.

    Args:
        prompt_pairs: Iterable of ``(prompt, answer)`` string pairs.

    Returns:
        The complete file content as a single string.
    """
    parts = [header]
    for prompt, answer in prompt_pairs:
        parts.append(
            "\n{ \n"
            "            input: { \n"
            f"            query:{_ts_string_literal(prompt)}, \n"
            "            }, \n"
            "            output: { \n"
            "            response: \n"
            f"                {_ts_string_literal(answer)},\n"
            "            }, \n"
            "            metadata: {},\n"
            "        },"
        )
    parts.append(footer)
    # Join once instead of growing a string inside the loop.
    return "".join(parts)

# Render every (prompt, answer) pair into the TypeScript module text.
l = create_data_file(all_prompts)

# Prompts can contain non-ASCII text, so the output encoding is pinned to UTF-8
# instead of depending on the platform default.
with open('./m3-data.ts', 'w', encoding='utf-8') as writer:
    # `l` is a single string: write() emits it in one call, whereas
    # writelines() would iterate over it character by character.
    writer.write(l)