In [3]:
import json
from typing import Dict
from copy import deepcopy

def load_json_file(filepath: str) -> dict:
    """Read and parse the JSON document stored at ``filepath``.

    The file is always decoded as UTF-8 (the standard JSON interchange encoding)
    rather than with the platform's locale encoding, so non-ASCII text loads the
    same way on every operating system.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The parsed JSON value. Callers in this notebook expect a dict, i.e. a
        JSON object at the top level.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json_file(data: dict, filepath: str):
    """Write ``data`` to ``filepath`` as JSON indented by two spaces.

    The target file is opened in write mode, so any existing content is replaced.
    """
    with open(filepath, mode='w') as sink:
        json.dump(data, sink, indent=2)

def _standardized_output_path(path: str) -> str:
    """Return the output path for ``path``: ``<stem>_standardized.json``.

    Only a trailing ``.json`` is treated as the extension. A path that does not
    end in ``.json`` gets ``_standardized.json`` appended, so the result is never
    equal to the input path and the source file is never overwritten.
    """
    suffix = '.json'
    stem = path[:-len(suffix)] if path.endswith(suffix) else path
    return f"{stem}_standardized{suffix}"


def _standardize_records(records: Dict[str, dict], source: str,
                         field_mappings: Dict[str, str]) -> Dict[str, dict]:
    """Return deep copies of ``records`` tagged with ``source`` and with fields renamed."""
    standardized = {}
    for problem_id, data in records.items():
        new_data = deepcopy(data)  # leave the loaded input untouched
        new_data['source'] = source

        # Rename fields according to mapping. A renamed field replaces any
        # existing field that already uses the standard name.
        for old_name, new_name in field_mappings.items():
            if old_name in new_data:
                new_data[new_name] = new_data.pop(old_name)

        standardized[problem_id] = new_data
    return standardized


def standardize_dictionaries(leetcode_path: str, usaco_path: str):
    """Rewrite the LeetCode and USACO problem dictionaries with standard field names.

    Both files are loaded, every record is deep-copied, given a ``'source'`` field
    (``'leetcode'`` or ``'usaco'``) and has its fields renamed according to the
    mapping below. Each result is saved next to its input as
    ``<stem>_standardized.json`` and a summary is printed.

    Args:
        leetcode_path: Path of the LeetCode problem dictionary (JSON).
        usaco_path: Path of the USACO problem dictionary (JSON).

    Returns:
        None. If either file contains invalid JSON, an error message is printed
        and nothing is written.
    """
    # Field mappings (old_name: standard_name)
    field_mappings = {
        # USACO -> Standard
        'name': 'title',                    # Standardize name to title
        'problem_link': 'problem_link',     # Keep as is
        'test_data_link': 'test_data_link', # Keep as is
        'solution_link': 'solution_link',    # Keep as is
        'contest_link': 'contest_link',      # Keep as is
        'inner_contest_link': 'inner_contest_link', # Keep as is
        'problem_level': 'difficulty',      # Standardize problem_level to difficulty
        'cp_id': 'cp_id',                  # Keep as is
        'problem_id': 'id',                # Standardize problem_id to id
        'description': 'description',       # Keep as is
    }

    try:
        leetcode_dict = load_json_file(leetcode_path)
        usaco_dict = load_json_file(usaco_path)
    except json.JSONDecodeError as e:
        print(f"Error loading JSON files: {e}")
        return

    # The same mapping is applied to both sources; only the 'source' tag differs.
    new_leetcode_dict = _standardize_records(leetcode_dict, 'leetcode', field_mappings)
    new_usaco_dict = _standardize_records(usaco_dict, 'usaco', field_mappings)

    # Compute each output path once so the saved file and the printed summary agree.
    leetcode_output_path = _standardized_output_path(leetcode_path)
    usaco_output_path = _standardized_output_path(usaco_path)

    # Save standardized dictionaries
    save_json_file(new_leetcode_dict, leetcode_output_path)
    save_json_file(new_usaco_dict, usaco_output_path)

    # Print summary of changes (identity mappings are not listed)
    print("=== Standardization Summary ===")
    print("\nStandard fields used:")
    for old_name, new_name in field_mappings.items():
        if old_name != new_name:
            print(f"- '{old_name}' → '{new_name}'")

    print("\nFiles created:")
    print(f"- {leetcode_output_path}")
    print(f"- {usaco_output_path}")

In [None]:
# Standardize the LeetCode and USACO problem dictionaries.
# Both inputs are given as relative paths; standardize_dictionaries writes a
# "*_standardized.json" file for each one and prints a summary.
leetcode_path = "leetcode_problem_dict.json"
usaco_path = "usaco_subset307_dict.json"
standardize_dictionaries(leetcode_path, usaco_path)

In [4]:
def standardize_livebench_dictionary(livebench_path: str):
    """Convert a Livebench question dictionary to the standard problem schema.

    Each record is rebuilt with the fields ``source``, ``id``, ``title``,
    ``description``, ``cp_id``, ``ground_truth``, ``category``, ``task``,
    ``subtask`` and ``difficulty``. The result is saved next to the input as
    ``<stem>_standardized.json`` and a summary is printed.

    Args:
        livebench_path: Path of the Livebench question dictionary (JSON),
            keyed by problem id.

    Returns:
        None. If the file contains invalid JSON, an error message is printed
        and nothing is written.

    Raises:
        KeyError, IndexError: If a record has no ``'turns'`` entry or it is empty.
    """
    try:
        livebench_dict = load_json_file(livebench_path)
    except json.JSONDecodeError as e:
        print(f"Error loading JSON file: {e}")
        return

    # Build the output path once. Only a trailing '.json' is rewritten, and a
    # path without that extension gets the suffix appended, so the output can
    # never be the input file itself.
    json_suffix = '.json'
    if livebench_path.endswith(json_suffix):
        output_stem = livebench_path[:-len(json_suffix)]
    else:
        output_stem = livebench_path
    output_path = f"{output_stem}_standardized{json_suffix}"

    # Checked in order; the first marker found in the lower-cased subtask wins.
    difficulty_by_marker = (
        ('amc', 'easy'),
        ('aime', 'medium'),
        ('imo', 'hard'),
    )

    # Create new standardized dictionary
    new_livebench_dict = {}

    # Standardize Livebench dictionary
    for problem_id, data in livebench_dict.items():
        subtask = data.get('subtask', '')
        new_data = {
            'source': 'livebench',
            'id': problem_id,
            'title': f"Math Question {problem_id[:8]}",  # Use first 8 chars of the id as short ID
            'description': data['turns'][0],  # First entry of 'turns' is used as the question text
            'cp_id': problem_id[:8],         # Use first 8 chars as CP ID
            'ground_truth': data.get('ground_truth', ''),
            'category': data.get('category', ''),
            'task': data.get('task', ''),
            'subtask': subtask
        }

        # Set difficulty based on competition type; default to medium if unknown
        subtask_key = subtask.lower()
        new_data['difficulty'] = next(
            (level for marker, level in difficulty_by_marker if marker in subtask_key),
            'medium',
        )

        new_livebench_dict[problem_id] = new_data

    # Save standardized dictionary
    save_json_file(new_livebench_dict, output_path)

    # Print summary
    print("=== Standardization Summary ===")
    print("\nFields standardized:")
    print("- Added 'source' field with value 'livebench'")
    print("- Created 'title' from problem ID")
    print("- Extracted 'description' from turns")
    print("- Created 'cp_id' from problem ID")
    print("- Mapped competition types to difficulties")

    print("\nFile created:")
    print(f"- {output_path}")

# Usage: standardize the Livebench math question dictionary. The input is a
# relative path; the function writes a "*_standardized.json" file and prints a summary.
livebench_path = "livebench_math_question_dict.json"
standardize_livebench_dictionary(livebench_path)

=== Standardization Summary ===

Fields standardized:
- Added 'source' field with value 'livebench'
- Created 'title' from problem ID
- Extracted 'description' from turns
- Created 'cp_id' from problem ID
- Mapped competition types to difficulties

File created:
- livebench_math_question_dict_standardized.json
