In [1]:
from pathlib import Path
import pandas as pd
import json

In [4]:
# English prompt template for the X-CSQA multiple-choice task.
# The placeholders {question} and {answer_a}..{answer_e} are filled in with
# str.format(); the leading/trailing newlines of the literal are removed with
# .strip() when a prompt is built.
# The template deliberately ends on an opening quotation mark — presumably so
# that the model's continuation starts with the quoted answer letter (TODO confirm).
csqa_task_description_en = """
### Human: You will be presented with a question that has multiple possible answers. Choose the most suitable option out of "A", "B", "C", "D" or "E" based on your commonsense knowledge. Please provide your answer in the form of a single letter in quotation marks.

Question:

{question}

Options:

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# fabian — presumably the person who wrote/reviewed this translation (confirm).
# NOTE(review): the original note here also mentioned „D“, possibly a suggestion
# to use German-style quotation marks around the option letters; the template
# keeps the straight quotes used by every other language.
# NOTE(review): "Bitte gebe" is a non-standard imperative (standard: "gib");
# left untouched so the prompt wording stays as authored — confirm with the translator.
#
# German prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# The option lines carry no trailing whitespace, so the formatted prompt has
# the same layout as the templates for the other languages.
csqa_task_description_de = """
### Human: Dir wird eine Frage vorgelegt, die mehrere Antwortmöglichkeiten hat. Wähle die am besten geeignete Option aus "A", "B", "C", "D" oder "E" basierend auf deinem gesunden Menschenverstand. Bitte gebe deine Antwort in Form eines einzelnen Buchstabens in Anführungszeichen an.

Frage:

{question}

Antwortmöglichkeiten:

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# janine — presumably the person who wrote/reviewed this translation (confirm).
# Spanish prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# NOTE(review): the instruction mixes formal ("Elija", "Responda") and informal
# ("basándote", "tus") address, and "Te presenta" appears to lack its "Se";
# the text is left as authored — confirm with the translator before changing.
csqa_task_description_es = """
### Human: Te presenta una pregunta que tiene múltiples respuestas posibles. Elija la opción más adecuada entre "A", "B", "C", "D" o "E" basándote en tus conocimientos de sentido común. Responda con una sola letra entre comillas.

Pregunta:

{question}

Opciones:

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# anastassia — presumably the person who wrote/reviewed this translation (confirm).
# Russian prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# The option letters stay Latin "A".."E" in every language.
csqa_task_description_ru = """
### Human: Тебе будет предложен вопрос, имеющий несколько возможных ответов. Выбери наиболее подходящий вариант ответа из "A", "B", "C", "D" или "E" основываясь на своих знаниях. Пожалуйста, дай ответ одной буквой в кавычках.

Вопрос:

{question}

Варианты:

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# thea — presumably the person who wrote/reviewed this translation (confirm).
# French prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# The section headers keep a space before the colon; the option lines use
# the same "A: ..." layout as the other templates.
csqa_task_description_fr = """
### Human: On te posera une question comportant de multiples réponses possibles. Choisis l'option qui convient le mieux parmi "A", "B", "C", "D" ou "E" en fonction de ton bon sens. Merci d'envoyer ta réponse sous la forme d'une seule lettre entre guillemets.

Question :

{question}

Options de réponse :

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# cui — presumably the person who wrote/reviewed this translation (confirm).
# Chinese prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# The option lines keep the ASCII "A: ..." layout used by the other templates,
# while the section headers use the full-width colon.
csqa_task_description_zh = """
### Human: 您将看到一个单项选择题。 根据您的常识，从"A"、"B"、"C"、"D"或"E"中选择最合适的选项。 请以带引号的单个字母的形式作答。

问题：

{question}

选项：

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# Hindi prompt template for the X-CSQA multiple-choice task. Placeholders
# {question} and {answer_a}..{answer_e} are filled in with str.format().
# Unlike the other translations, no translator/reviewer is credited here.
csqa_task_description_hi = """
### Human: आपको एक प्रश्न प्रस्तुत किया जाएगा जिसके कई संभावित उत्तर होंगे। अपने सामान्य ज्ञान के आधार पर "A", "B", "C", "D" या "E" में से सबसे उपयुक्त विकल्प चुनें। कृपया अपना उत्तर उद्धरण चिह्नों में एक अक्षर के रूप में दें।

सवाल:

{question}

विकल्प:

A: {answer_a}
B: {answer_b}
C: {answer_c}
D: {answer_d}
E: {answer_e}

### Assistant: "
"""

# Lookup table: language tag -> prompt template for that language.
# The tags are also used as sub-directory names of the raw dataset and as
# parts of the output file names, and the insertion order is the order in
# which the languages are processed.
language_prompts = dict(
    en=csqa_task_description_en,
    es=csqa_task_description_es,
    de=csqa_task_description_de,
    ru=csqa_task_description_ru,
    fr=csqa_task_description_fr,
    zh=csqa_task_description_zh,
    hi=csqa_task_description_hi,
)

In [5]:
# NOTE(review): absolute, machine-specific location of the raw X-CSQA files;
# adjust when running the notebook elsewhere.
raw_data = Path('/srv/scratch1/kew/X-CSR_datasets/X-CSQA')
output_dir = Path('../data/xcsqa')
# open(..., 'w') does not create missing parent directories, so create the
# output directory up front instead of failing on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)


def _format_item(template, row):
    """Build one output record by filling a prompt template from a dataset row.

    Args:
        template: prompt string with the placeholders {question} and
            {answer_a}..{answer_e}.
        row: one dataset record providing row['question']['stem'],
            row['question']['choices'][0..4]['text'] and row['answerKey'].

    Returns:
        dict with the keys 'instruction' (the stripped, filled-in prompt),
        'answer' ([answer key, answer text]), 'question' and 'options'.

    Raises:
        IndexError: if the row has fewer than five choices.
    """
    question = row['question']['stem']
    choices = row['question']['choices']
    options = [choices[idx]['text'] for idx in range(5)]

    answer_key = row['answerKey']
    # Maps 'A' -> 0, 'B' -> 1, ...; assumes the choices are stored in A..E
    # order and the key is a single upper-case letter (TODO confirm).
    answer_text = choices[ord(answer_key) - ord('A')]['text']

    instruction = template.format(
        question=question,
        answer_a=options[0],
        answer_b=options[1],
        answer_c=options[2],
        answer_d=options[3],
        answer_e=options[4],
    ).strip()
    return {
        'instruction': instruction,
        'answer': [answer_key, answer_text],
        'question': question,
        'options': options,
    }


for lang_tag, x_template in language_prompts.items():
    data = pd.read_json(raw_data / lang_tag / 'dev.jsonl', lines=True)

    # Two variants per language: the English instruction wrapped around the
    # language's questions, and the instruction in the language itself.
    en_lines = []
    x_lines = []
    for _, row in data.iterrows():
        en_lines.append(json.dumps(_format_item(csqa_task_description_en, row), ensure_ascii=False))
        x_lines.append(json.dumps(_format_item(x_template, row), ensure_ascii=False))

    # Keyed by path on purpose: for 'en' both names resolve to the same file
    # (xcsqa_dev_en_en.json) and both variants are identical, so the file is
    # written once instead of being opened twice for writing at the same time.
    targets = {
        output_dir / f'xcsqa_dev_en_{lang_tag}.json': en_lines,
        output_dir / f'xcsqa_dev_{lang_tag}_{lang_tag}.json': x_lines,
    }
    for out_path, lines in targets.items():
        # One JSON object per line (JSON Lines), despite the .json suffix.
        with open(out_path, 'w', encoding='utf8') as outf:
            outf.writelines(f'{line}\n' for line in lines)

    print(f'Finished writing {lang_tag} data. Wrote {len(data)} items.')

Finished writing hi data. Wrote 1000 items.
