**Transforming our data**

Note: This code is written for the TOEFL and RACE datasets in our specific JSON format, in which instances are separated by a comma (","). Within each instance, the elements are separated by "<delimiter!>", except for the last one (the explanation for the correct answer), which is separated by ":".

<br>


If your data file is in the same format, you can use the following code without alterations:

In [None]:
import numpy as np
import json
import re

This approach separates each instance into 6 parts - id, context, question, answer choices, correct answer, and explanation. The output file contains the same information (with a numerical id), but in this file **all** elements (including the explanation for the correct answer) are separated by "<delimiter!>".

In [None]:
def process_file(file_path):
    """Parse a delimiter-encoded Q&A JSON file into a list of records.

    The file must hold a single JSON object. Each key packs four fields
    separated by ``<delimiter!>`` -- context, question, answer choices and
    the correct answer -- and the value mapped to that key is stored as the
    explanation.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        A list of dicts, one per entry in the iteration order of the loaded
        object, with the keys ``id`` (1-based position), ``context``,
        ``question``, ``answer_choices`` (list of strings),
        ``correct_answer`` and ``explanation``.

    Raises:
        IndexError: If a key contains fewer than four delimited fields.
    """
    # JSON text is UTF-8 by specification; relying on the platform's default
    # encoding can corrupt or reject non-ASCII passages on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Split at whitespace that is immediately followed by a choice label
    # ("A." .. "D."); the lookahead keeps the label attached to its choice.
    choice_boundary = re.compile(r'\s(?=[A-D]\.)')

    processed_data = []
    for idx, (key, value) in enumerate(data.items(), start=1):
        parts = key.split('<delimiter!>')
        context = parts[0].strip()
        question = parts[1].strip()

        answer_choices = [
            choice.strip()
            for choice in choice_boundary.split(parts[2].strip())
        ]

        correct_answer = parts[3].strip()

        processed_data.append({
            'id': idx,
            'context': context,
            'question': question,
            'answer_choices': answer_choices,
            'correct_answer': correct_answer,
            'explanation': value,
        })

    return processed_data

# Load and parse the dataset.
# Set file_path to the location of your own dataset file.
file_path = '/content/train_gpt_TOEFL_verified.json'
processed_data = process_file(file_path)
# Report how many Q&A instances were parsed.
print(len(processed_data))

690


In [None]:
# Flatten every parsed record into one "<delimiter!>"-separated string.
output = []

for item in processed_data:
    # Named item_id rather than id so the built-in id() is not shadowed
    # for the rest of the session.
    item_id = item['id']
    # Newlines inside the passage are replaced with spaces.
    context = item['context'].replace('\n', ' ')
    question = item['question']
    correct_answer = item['correct_answer']
    explanation = item['explanation']

    # The list of choices is joined back into one space-separated string.
    answer_choices_string = ' '.join(item['answer_choices'])

    formatted_string = f"{item_id}<delimiter!>{context}<delimiter!>{question}<delimiter!>{answer_choices_string}<delimiter!>{correct_answer}<delimiter!>{explanation}"
    output.append(formatted_string)

In [None]:
# Persist the flattened records as a JSON array of strings.
serialized_records = json.dumps(output)
with open('train_gpt_TOEFL_verified_processed_all.json', 'w') as file:
    file.write(serialized_records)

This approach separates each instance into 3 parts, as required by contrastive learning: the key (the context and the question), the positive answer (the correct answer choice together with its explanation), and the negative answers (the incorrect answer choices). The resulting file contains elements separated by "," as in the input file, but here the 3 parts of each element are separated by "<delimiter!>".

In [None]:
# Maps the correct-answer letter to its position in the parsed choice list.
letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}


def process_file(file_path):
    """Parse a Q&A JSON file into key / positive / negative records.

    The file must hold a single JSON object. Each key packs four fields
    separated by ``<delimiter!>`` -- context, question, answer choices and
    the correct answer letter -- and the value mapped to that key is
    appended to the correct choice as its explanation.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        A list of dicts, one per entry, with the keys ``key``
        (``context:question``), ``positive_answers`` (the correct choice,
        a colon, then the explanation) and ``negative_answers`` (list of
        the choices whose text differs from the correct one).

    Raises:
        IndexError: If a key contains fewer than four delimited fields, or
            the answer letter points past the parsed choices.
        KeyError: If the correct answer is not one of ``A``-``D``.
    """
    # JSON text is UTF-8 by specification; relying on the platform's default
    # encoding can corrupt or reject non-ASCII passages on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Split at whitespace that is immediately followed by a choice label
    # ("A." .. "D."); the lookahead keeps the label attached to its choice.
    choice_boundary = re.compile(r'\s(?=[A-D]\.)')

    processed_data = []
    for key, value in data.items():
        parts = key.split('<delimiter!>')
        context = parts[0].strip()
        question = parts[1].strip()

        answer_choices = [
            choice.strip()
            for choice in choice_boundary.split(parts[2].strip())
        ]

        correct_answer = parts[3].strip()

        positive_answer = answer_choices[letter_to_num[correct_answer]]

        # Filtering by text (not by index) also drops any choice whose text
        # is identical to the correct one.
        negative_answer = [x for x in answer_choices if x != positive_answer]

        processed_data.append({
            'key': context + ":" + question,
            'positive_answers': positive_answer + ":" + value,
            'negative_answers': negative_answer,
        })

    return processed_data

# Load the TOEFL training set and parse it into key / positive / negative records.
file_path = '/content/train_gpt_TOEFL_verified.json'
processed_data = process_file(file_path)
# Report how many Q&A instances were parsed.
print(len(processed_data))

690


In [None]:
# Flatten each record into "key<delimiter!>positive<delimiter!>negatives".
output = []

for record in processed_data:
    # The incorrect choices are joined into one space-separated string.
    joined_negatives = ' '.join(record['negative_answers'])
    output.append(
        f"{record['key']}<delimiter!>{record['positive_answers']}<delimiter!>{joined_negatives}"
    )

In [None]:
# Persist the key/positive/negative records as a JSON array of strings.
serialized_records = json.dumps(output)
with open('train_gpt_TOEFL_verified_processed_key_pos_neg.json', 'w') as file:
    file.write(serialized_records)