# Remove Answer Option Prefixes and Convert Answer Letters to Numbers

This notebook removes the "A) ", "B) ", "C) ", "D) ", "E) " prefixes from choices in the question data and converts answer letters to numbers (A->1, B->2, etc.).

In [2]:
import json
import pandas as pd
import re
import os

In [3]:
# Define paths to data files
tyt_path = 'd:/UserData/Desktop/python_projects/lm_harness_data_prepare/data/2024_YKS_TYT_questions.json'
ayt_path = 'd:/UserData/Desktop/python_projects/lm_harness_data_prepare/data/2024_YKS_AYT_questions.json'

# Function to load JSON data
def load_json_data(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # The files aren't standard JSON arrays - each line is a separate JSON object
            data = []
            for line in f:
                line = line.strip()
                if line and not line.startswith('//'):
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Some lines might be partial or malformed
                        pass
            return data
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return []

# Load data
tyt_data = load_json_data(tyt_path)
ayt_data = load_json_data(ayt_path)

print(f"Loaded {len(tyt_data)} TYT questions and {len(ayt_data)} AYT questions")

Loaded 125 TYT questions and 166 AYT questions


In [4]:
# Convert to pandas DataFrames
tyt_df = pd.DataFrame(tyt_data)
ayt_df = pd.DataFrame(ayt_data)

# Display sample data
print("TYT Sample:")
if not tyt_df.empty:
    display(tyt_df.head(2))
else:
    print("TYT DataFrame is empty")

print("\nAYT Sample:")
if not ayt_df.empty:
    display(ayt_df.head(2))
else:
    print("AYT DataFrame is empty")

TYT Sample:


Unnamed: 0,0
0,E) V
1,E) geleneği aktarmasını - mantıksal bir akışın



AYT Sample:


Unnamed: 0,0
0,E) karmaşıklığını - niteliğini
1,"E) V. cümlede, bir bilim insanının yaptığı ica..."


In [None]:
# Function to remove option prefixes (A), B), etc.) from choices
def clean_choices(choices):
    if not isinstance(choices, list):
        return []
    
    # Using a regex pattern to match options like "A) ", "B) ", etc.
    pattern = re.compile(r'^[A-E])\s')
    
    cleaned_choices = []
    for choice in choices:
        if isinstance(choice, str):
            # Remove the option prefix
            cleaned = pattern.sub('', choice)
            cleaned_choices.append(cleaned)
        else:
            cleaned_choices.append(choice)
    
    return cleaned_choices

# Function to convert answer letter to number (A->1, B->2, etc.)
def convert_answer_to_number(answer):
    if isinstance(answer, str) and len(answer) == 1 and 'A' <= answer <= 'E':
        # Convert A->1, B->2, etc.
        return ord(answer) - ord('A') + 1
    return answer

# Process each DataFrame
def process_dataframe(df):
    if df.empty:
        return df
    
    # Create a copy to avoid modifying the original
    processed_df = df.copy()
    
    # Clean choices
    if 'choices' in processed_df.columns:
        processed_df['choices'] = processed_df['choices'].apply(clean_choices)
    
    # Convert answer letters to numbers
    if 'answer' in processed_df.columns:
        processed_df['answer'] = processed_df['answer'].apply(convert_answer_to_number)
    
    # Remove answer_key field if it exists
    if 'answer_key' in processed_df.columns:
        processed_df = processed_df.drop('answer_key', axis=1)
    
    return processed_df

# Process the DataFrames
tyt_processed = process_dataframe(tyt_df)
ayt_processed = process_dataframe(ayt_df)

# Display processed data
print("Processed TYT Sample:")
if not tyt_processed.empty:
    display(tyt_processed.head(2))
else:
    print("Processed TYT DataFrame is empty")

print("\nProcessed AYT Sample:")
if not ayt_processed.empty:
    display(ayt_processed.head(2))
else:
    print("Processed AYT DataFrame is empty")

In [None]:
# Function to save processed data back to JSON files
def save_processed_data(df, output_path):
    if df.empty:
        print(f"No data to save to {output_path}")
        return
    
    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Convert DataFrame to list of dictionaries
    records = df.to_dict('records')
    
    # Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)
    
    print(f"Saved processed data to {output_path}")

# Define output paths
tyt_output_path = 'd:/UserData/Desktop/python_projects/lm_harness_data_prepare/data/processed_TYT_questions.json'
ayt_output_path = 'd:/UserData/Desktop/python_projects/lm_harness_data_prepare/data/processed_AYT_questions.json'

# Save processed data
save_processed_data(tyt_processed, tyt_output_path)
save_processed_data(ayt_processed, ayt_output_path)

In [None]:
# Verify the saved data
def load_and_verify(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            print(f"Successfully loaded {file_path}")
            print(f"Number of questions: {len(data)}")
            
            # Verify first record
            if data:
                print("Sample question:")
                sample = data[0]
                print(f"Question: {sample.get('question', 'N/A')}")
                print(f"Answer: {sample.get('answer', 'N/A')}")
                print(f"Choices: {sample.get('choices', 'N/A')}")
                return data
            else:
                print("No data found in the file")
                return []
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return []

print("Verifying TYT processed data:")
tyt_verified = load_and_verify(tyt_output_path)

print("\nVerifying AYT processed data:")
ayt_verified = load_and_verify(ayt_output_path)

## Conclusion

The notebook has successfully:
1. Loaded question data from TYT and AYT JSON files
2. Removed option prefixes (A), B), etc.) from choices
3. Converted answer letters to numbers (A->1, B->2, etc.)
4. Removed the answer_key field
5. Saved the processed data to new JSON files

The processed data is now ready for further use in language model training or evaluation.