In [24]:
from pathlib import Path
import pandas as pd
import json

In [25]:
# Prompt template for X-CSQA multiple-choice items. The instruction wording is
# English; `{language}` names the language the question is written in.
# Placeholders: language, question, answer_a, answer_b, answer_c, answer_d, answer_e.
# Spelled out as adjacent literals so the exact whitespace is visible: the
# template starts with a newline, keeps a trailing space after `{answer_e}`,
# and ends with an opening double quote (presumably for the model to complete
# with the answer letter).
csqa_task_description_en = (
    '\n'
    '### Human: You will be presented with a question in {language} that has multiple possible answers. '
    'Choose the most suitable option out of "A", "B", "C", "D", and "E", based on your commonsense knowledge.\n'
    '\n'
    'Question:\n'
    '\n'
    '{question}\n'
    '\n'
    'Options:\n'
    '\n'
    'A: {answer_a}\n'
    'B: {answer_b}\n'
    'C: {answer_c}\n'
    'D: {answer_d}\n'
    'E: {answer_e} \n'
    '\n'
    '### Assistant: Given the answers "A", "B", "C", "D", or "E", the most logical answer is "'
)

# csqa_task_description_de = """
# ### Human: Es wird eine Frage gestellt, die mehrere Antwortmöglichkeiten hat. Wähle aus den Antwortmöglichkeiten "A", "B", "C", "D" und "E" diejenige aus, die deinem gesunden Menschenverstand am ehesten entspricht.

# Frage:

# {question}

# Optionen:

# A: {answer_a}
# B: {answer_b}
# C: {answer_c}
# D: {answer_d}
# E: {answer_e} 

# ### Assistant: Die logischste Antwort ist """


# language_prompt = {
#     'en': csqa_task_description_en,
#     'es': csqa_task_description_en,
#     'de': csqa_task_description_en
# }

In [26]:
# Export the X-CSQA dev split of each language as instruction-style records,
# one JSON object per line, using the English prompt template for every language.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
raw_data = Path('/srv/scratch1/kew/X-CSR_datasets/X-CSQA')
output_dir = Path('data')
languages = {
    'en': 'English',
    'de': 'German',
    'es': 'Spanish',
    'fr': 'French',
    'hi': 'Hindi',
    'ru': 'Russian',
    # 'sw': 'Swahili'
}


def _format_csqa_item(question_record, answer_key, lang_name):
    """Build one output record from a raw X-CSQA question.

    Args:
        question_record: dict with a 'stem' string and a 'choices' list whose
            entries each carry a 'label' and a 'text'.
        answer_key: label of the gold choice (e.g. 'A').
        lang_name: language name inserted into the prompt.

    Returns:
        dict with the keys 'instruction', 'answer' ([label, text]), 'question'
        and 'options' (texts of the first five choices, in file order).

    Raises:
        IndexError: if fewer than five choices are present.
        ValueError: if `answer_key` matches no choice label.
    """
    question = question_record['stem']
    choices = question_record['choices']
    # The template has exactly five answer slots, filled positionally.
    options = [choices[position]['text'] for position in range(5)]

    # Resolve the gold answer through its label instead of assuming the
    # choices are stored in A..E order; fail loudly on an unknown key.
    text_by_label = {choice['label']: choice['text'] for choice in choices}
    if answer_key not in text_by_label:
        raise ValueError(
            f'answerKey {answer_key!r} matches none of the choice labels {list(text_by_label)!r}'
        )
    answer_text = text_by_label[answer_key]

    instruction = csqa_task_description_en.format(
        language=lang_name,
        question=question,
        answer_a=options[0],
        answer_b=options[1],
        answer_c=options[2],
        answer_d=options[3],
        answer_e=options[4],
    ).strip()
    return {
        'instruction': instruction,
        'answer': [answer_key, answer_text],
        'question': question,
        'options': options,
    }


# Make sure the target directory exists so a fresh checkout can run this cell.
output_dir.mkdir(parents=True, exist_ok=True)

for lang_tag, lang_name in languages.items():
    # `data` stays a notebook-level name: a later cell inspects it.
    data = pd.read_json(raw_data / lang_tag / 'dev.jsonl', lines=True)
    # The file holds JSON Lines despite its `.json` suffix; the name is kept as is.
    out_path = output_dir / f'xcsr_dev_en_{lang_tag[:2]}.json'
    with open(out_path, 'w', encoding='utf8') as outf:
        for question_record, answer_key in zip(data['question'], data['answerKey']):
            formatted_item = _format_csqa_item(question_record, answer_key, lang_name)
            outf.write(json.dumps(formatted_item, ensure_ascii=False) + '\n')

    print(f'Finished writing {lang_tag} data. Wrote {len(data)} items.')

Finished writing en data. Wrote 1000 items.
Finished writing de data. Wrote 1000 items.
Finished writing es data. Wrote 1000 items.
Finished writing fr data. Wrote 1000 items.
Finished writing hi data. Wrote 1000 items.
Finished writing ru data. Wrote 1000 items.


In [4]:
# Show the first question record of the frame currently bound to `data`.
# NOTE(review): the saved output is English although the export loop ends on 'ru',
# and this cell's execution count (4) is lower than the loop's (26). The output
# looks stale; confirm after Restart & Run All.
data.loc[0, 'question']

{'stem': 'The dental office handled a lot of patients who experienced traumatic mouth injury, where were these patients coming from?',
 'choices': [{'label': 'A', 'text': 'town'},
  {'label': 'B', 'text': 'michigan'},
  {'label': 'C', 'text': 'hospital'},
  {'label': 'D', 'text': 'schools'},
  {'label': 'E', 'text': 'office building'}]}