In [1]:
import os
import json
from jsonschema import validate, ValidationError

### Check for format anomalies

In [21]:
# Schema for a per-feature solution file. Every section named below must be
# present and must be an object carrying a string "explanation" and a numeric
# "score". The comprehension builds a fresh sub-schema dict for each section,
# so no two sections share the same object.
standard_schema = {
    "type": "object",
    "properties": {
        section: {
            "type": "object",
            "properties": {
                "explanation": {"type": "string"},
                "score": {"type": "number"},
            },
            "required": ["explanation", "score"],
        }
        for section in (
            "feature1",
            "feature2",
            "feature3",
            "feature4",
            "feature5",
            "overall",
        )
    },
}
# Every declared section is mandatory, listed in declaration order.
standard_schema["required"] = list(standard_schema["properties"])

In [2]:
# Flat schema: the document itself is the scored object. Only "score" (a
# number) is mandatory; "explanation", when present, must be a string.
standard_schema_cot = dict(
    type="object",
    properties=dict(
        explanation=dict(type="string"),
        score=dict(type="number"),
    ),
    required=["score"],
)

In [3]:
def validate_json(json_data, schema):
    """Report whether ``json_data`` satisfies ``schema``.

    Args:
        json_data: Already-parsed JSON value to check.
        schema: JSON Schema (as a dict) to check it against.

    Returns:
        True when ``validate`` accepts the instance, False when it raises
        ``ValidationError``. Any other exception propagates to the caller.
    """
    try:
        validate(instance=json_data, schema=schema)
    except ValidationError:
        return False
    else:
        return True

def find_non_conforming_files(directory, schema):
    """Collect the ``.json`` files under ``directory`` that do not match ``schema``.

    The tree is walked recursively. A file is reported when it cannot be
    decoded as UTF-8, cannot be parsed as JSON, or parses but fails
    ``validate_json``.

    Args:
        directory: Root of the directory tree to scan.
        schema: JSON Schema (as a dict) each file is checked against.

    Returns:
        List of paths (``root`` joined with the file name) of the offending
        files, in the order ``os.walk`` visits them.

    Raises:
        OSError: If a file found by the walk cannot be opened or read.

    Note:
        ``os.walk`` silently yields nothing for a directory it cannot list,
        so a misspelled ``directory`` returns an empty list rather than an
        error.
    """
    non_conforming_files = []
    for root, _, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(root, filename)
            try:
                # JSON text is UTF-8; do not depend on the platform's locale encoding.
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # A file that cannot be decoded or parsed is also treated as
                # not conforming to the schema.
                non_conforming_files.append(file_path)
                continue
            if not validate_json(json_data, schema):
                non_conforming_files.append(file_path)
    return non_conforming_files

In [23]:
# Scan the validation solutions against the per-feature schema and list every
# offending file, relative to the scanned root.
json_directory = 'data/validation_solution'
non_conforming_files = find_non_conforming_files(json_directory, standard_schema)

print("These files have incorrect format:")
for bad_file in non_conforming_files:
    print(os.path.relpath(bad_file, start=json_directory))

These files have incorrect format:


In [19]:
# Scan the chain-of-thought training solutions against the flat schema and
# list every offending file, relative to the scanned root.
json_directory = 'data/train_solution_cot'
non_conforming_files = find_non_conforming_files(json_directory, standard_schema_cot)

print("These files have incorrect format:")
for bad_file in non_conforming_files:
    print(os.path.relpath(bad_file, start=json_directory))

These files have incorrect format:


def rename_numeric_keys(data):
    """Return a copy of ``data`` with every number-valued key renamed to 'score'.

    Keys are processed in insertion order. A value counts as a number when it
    is an ``int`` or ``float`` but not a ``bool``. Whenever the target key
    (either 'score' or an unchanged original key) is already present in the
    result, the entry is stored under ``<key>_1``, ``<key>_2``, ... using the
    first suffix that is still free, so no value is ever overwritten.

    Args:
        data: Mapping to process (only the top level is inspected).

    Returns:
        A new dict; ``data`` itself is not modified.
    """
    new_data = {}
    for k, v in data.items():
        # bool is a subclass of int in Python, but JSON true/false are not
        # numbers, so they must keep their original key.
        is_number = isinstance(v, (int, float)) and not isinstance(v, bool)
        new_key = 'score' if is_number else k
        if new_key in new_data:
            # Key already taken: pick the first free numbered variant.
            count = 1
            while f"{new_key}_{count}" in new_data:
                count += 1
            new_key = f"{new_key}_{count}"
        new_data[new_key] = v
    return new_data

def process_json_files(directory):
    """Rewrite every ``.json`` file under ``directory`` with numeric-valued keys renamed.

    The tree is walked recursively. Each file is read as UTF-8, passed through
    ``rename_numeric_keys``, and written back to the same path (UTF-8,
    non-ASCII characters kept as-is, 4-space indent). A progress line is
    printed before and after each file.

    Args:
        directory: Root of the directory tree to process.
    """
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            # Skip anything that is not a JSON file.
            if not name.endswith(".json"):
                continue

            file_path = os.path.join(folder, name)
            print(f"Processing file: {file_path}")

            with open(file_path, 'r', encoding='utf-8') as source:
                loaded = json.load(source)

            renamed = rename_numeric_keys(loaded)

            # Overwrite the original file with the renamed content.
            with open(file_path, 'w', encoding='utf-8') as target:
                json.dump(renamed, target, ensure_ascii=False, indent=4)
            print(f"Updated file: {file_path}")

# Entry point: runs only when this code executes as the main module.
if __name__ == "__main__":
    # Root of the directory tree whose JSON files will be rewritten.
    json_directory = "data/train_solution_cot"

    # Overwrites every .json file under json_directory in place, renaming
    # number-valued keys to 'score' (see process_json_files).
    process_json_files(json_directory)


### Update overall scores

In [26]:
def calculate_overall_average(data):
    """Set ``data['overall']['score']`` to the mean of all feature scores.

    Every top-level key starting with 'feature' contributes its ``['score']``
    value. The mean is rounded to two decimal places. ``data`` is modified in
    place and also returned.

    Args:
        data: Mapping with 'feature*' entries and an 'overall' entry, each of
            which is itself a mapping holding a 'score'.

    Returns:
        The same ``data`` object, with the overall score updated.

    Raises:
        ValueError: If ``data`` has no key starting with 'feature' (previously
            this surfaced as an uninformative ZeroDivisionError).
        KeyError: If a feature entry has no 'score' or ``data`` has no 'overall'.
    """
    feature_scores = [
        entry['score'] for name, entry in data.items() if name.startswith('feature')
    ]
    if not feature_scores:
        raise ValueError("no 'feature*' entries found; cannot compute the overall average")
    data['overall']['score'] = round(sum(feature_scores) / len(feature_scores), 2)
    return data

In [27]:
def process_directory(directory):
    """Recompute the overall score of every ``.json`` file under ``directory``.

    Each file is loaded, passed through ``calculate_overall_average``, and
    written back to the same path with a 4-space indent. Any error aborts the
    whole run.

    NOTE: a later cell defines ``process_directory`` again (with progress
    output and per-file error handling); once both cells have run, that later
    definition is the one in effect.

    Args:
        directory: Root of the directory tree to process.
    """
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith('.json'):
                continue
            file_path = os.path.join(folder, name)

            with open(file_path, 'r') as source:
                loaded = json.load(source)

            rescored = calculate_overall_average(loaded)

            # Overwrite the original file with the updated content.
            with open(file_path, 'w') as target:
                json.dump(rescored, target, indent=4)
            print(f'Updated {file_path}')

In [28]:
def process_directory(directory):
    """Recompute the overall score of every ``.json`` file under ``directory``.

    The tree is walked recursively, printing each directory and file as it is
    visited. Each file is read as UTF-8, passed through
    ``calculate_overall_average``, and written back to the same path (UTF-8,
    non-ASCII characters kept as-is, 4-space indent), matching the way
    ``process_json_files`` reads and writes these files.

    Processing is best-effort: a failure on one file is printed and the walk
    continues with the next file.

    Args:
        directory: Root of the directory tree to process.
    """
    for root, _, files in os.walk(directory):
        print(f'Processing directory: {root}')
        for file in files:
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            print(f'Processing file: {file_path}')
            try:
                # Explicit UTF-8 so the result does not depend on the platform locale.
                with open(file_path, 'r', encoding='utf-8') as json_file:
                    data = json.load(json_file)
                # Compute before reopening for writing, so a bad file is
                # never truncated by a failed calculation.
                updated_data = calculate_overall_average(data)
                with open(file_path, 'w', encoding='utf-8') as json_file:
                    # ensure_ascii=False keeps non-ASCII text readable
                    # instead of rewriting it as \uXXXX escapes.
                    json.dump(updated_data, json_file, ensure_ascii=False, indent=4)
                print(f'Successfully updated {file_path}')
            except Exception as e:
                # Deliberately broad: report the failure and move on.
                print(f'Failed to process {file_path}: {e}')

In [29]:
# Root of the directory tree whose JSON files get their overall score recomputed.
directory_path = 'data/validation_solution'

# Overwrites each .json file under directory_path in place, with its 'overall'
# score replaced by the rounded mean of the feature scores.
process_directory(directory_path)

Processing directory: data/validation_solution
Processing directory: data/validation_solution/problem-294
Processing file: data/validation_solution/problem-294/para_2_and_3.json
Successfully updated data/validation_solution/problem-294/para_2_and_3.json
Processing file: data/validation_solution/problem-294/para_3_and_4.json
Successfully updated data/validation_solution/problem-294/para_3_and_4.json
Processing file: data/validation_solution/problem-294/para_1_and_2.json
Successfully updated data/validation_solution/problem-294/para_1_and_2.json
Processing directory: data/validation_solution/problem-260
Processing file: data/validation_solution/problem-260/para_6_and_7.json
Successfully updated data/validation_solution/problem-260/para_6_and_7.json
Processing file: data/validation_solution/problem-260/para_2_and_3.json
Successfully updated data/validation_solution/problem-260/para_2_and_3.json
Processing file: data/validation_solution/problem-260/para_8_and_9.json
Successfully updated da

# One-off update of a single file in the current working directory: load it,
# recompute the overall score, and overwrite the file with the result.
with open('para_1_and_2.json', 'r') as file:
    data = json.loads(file.read())

updated_data = calculate_overall_average(data)

with open('para_1_and_2.json', 'w') as file:
    file.write(json.dumps(updated_data, indent=4))