In [1]:
import json
from pprint import pprint
from pathlib import Path
import os
from collections import defaultdict

In [None]:
# Locate a single run's stats.json: runs root + "<run directory>/stats.json".
file_path = Path('benchmark_output_lite/runs/v1.5.0')
test_path = ''
model_path = r'commonsense,dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001/stats.json'

# Build the path once. The existence check, the file that is opened and the
# messages printed below must all refer to the same location; previously a
# second, separately hard-coded path was the one actually checked and opened.
full_path = file_path / model_path
path = full_path  # alias kept so the original name `path` stays defined

# Print the constructed path to verify
print(f"Constructed path: {full_path}")

# Ensure the constructed path exists before trying to open it
if full_path.exists():
    # JSON is read as UTF-8 explicitly instead of relying on the platform default
    with full_path.open('r', encoding='utf-8') as file:
        data = json.load(file)

    # Show the parsed content
    pprint(data)
else:
    print(f"The file at {full_path} does not exist.")


# quasi_exact_match = score
# exact_match

In [19]:
from pathlib import Path
import json
from collections import defaultdict

# Define the root directory
root_dir = Path('benchmark_output_lite/runs/v1.5.0')

# Initialize a dictionary to store the data
results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Exploratory pass: look at the first stats.json found and show how its path
# (relative to root_dir) splits on commas.
for json_file in root_dir.rglob('stats.json'):

    # Extract information from the directory name
    parts = str(json_file.relative_to(root_dir)).split(',')
    if len(parts) >= 4:
        test_name = parts[0]
        dataset = parts[1].split('=')[1]
        method = parts[2].split('=')[1]
        # The last field still carries "<separator>stats.json" because the whole
        # relative path was split, so cut it at the path separator. Splitting on
        # '-' (as before) returned only a fragment of the model name.
        model_name = parts[3].split('=')[1].split(os.sep)[0]
    print(parts)
    break  # one sample is enough for inspection


['commonsense', 'dataset=openbookqa', 'method=multiple_choice_joint', 'model=01-ai_yi-large-preview\\stats.json']


In [5]:

# Directory that is searched recursively for stats.json files
root_dir = Path('benchmark_output_lite') / 'runs' / 'v1.5.0'


# Metric whose score is read from stats.json, keyed by test name
test_metrics = dict(
    narrative_qa='f1_score',
    natural_qa='f1_score',
    commonsense='exact_match',
    openbook='exact_match',
    mmlu='exact_match',
    wmt_14='bleu_4',
    med_qa='quasi_exact_match',
    math='math_equiv_chain_of_thought',
    legalbench='quasi_exact_match',
    gsm='final_number_exact_match',
)

# Function to extract information and read stats.json
def process_json_file(json_file):
    
    # Extract information from the directory name
    parts = str(json_file.relative_to(root_dir)).split(',')
    test_name = parts[0]
    dataset = method = model_name = subset = subject = mode = lang_pair = None
    if test_name == 'commonsense': 
        dataset = parts[1].split('=')[1]
        method = parts[2].split('=')[1]
        model_name = parts[3].split('=')[1].split(os.sep)[0]
    elif test_name == 'gsm':       
        model_name = parts[1].split('=')[1].split(os.sep)[0]
    elif test_name == 'legalbench':
        subset = parts[1].split('=')[1]
        model_name = parts[2].split('=')[1].split(os.sep)[0]
    elif test_name == 'math':
        subject = parts[1].split('=')[1]      
        model_name = parts[5].split('=')[1].split(os.sep)[0]
    elif test_name == 'med_qa':
        model_name = parts[1].split('=')[1].split(os.sep)[0]
    elif test_name == 'mmlu':
        subject = parts[1].split('=')[1]      
        method = parts[2].split('=')[1]
        model_name = parts[3].split('=')[1].split(os.sep)[0]
    elif test_name == 'narrative_qa':
        model_name = parts[1].split('=')[1].split(os.sep)[0]
    elif test_name == 'natural_qa':
        mode = parts[1].split('=')[1]      
        model_name = parts[2].split('=')[1].split(os.sep)[0]
    elif test_name == 'wmt_14':
        lang_pair = parts[1].split('=')[1]
        model_name = parts[2].split('=')[1].split(os.sep)[0]
    else:
        print(test_name + ' isn\'t set up')

    # Read the stats.json file
    with open(json_file, 'r') as file:
        data = json.load(file)


     # Extract the score using the specific metric for the test
    metric = test_metrics.get(test_name, None)
    split = 'valid' if test_name == 'natural_qa' else 'test'
    if not metric:
        print(f"No metric defined for test {test_name}")
        return None
    
    # Extract the score
    score = None
    for entry in data:
        if entry['name']['name'] == metric and entry['name']['split'] == split:
            score = entry['mean']
            #print(f'score: {score}')
            break
   
    if score is not None:
       
         # Structure the extracted data
        test_result = {
            #'model_name': model_name,
            'test_name': test_name,
            'metric': metric,
            'score': score
        }
        
        if dataset:
            test_result['dataset'] = dataset
        if method:
            test_result['method'] = method
        if subset:
            test_result['subset'] = subset
        if subject:
            test_result['subject'] = subject
        if mode:
            test_result['mode'] = mode
        if lang_pair:
            test_result['language_pair'] = lang_pair
        
        return model_name, test_result
    else:
        print(f"No exact match test score found in {json_file}")
        return None

# model name -> test name -> list of score records (one per processed file)
model_results = defaultdict(lambda: defaultdict(list))

# Walk every stats.json under the root and file its score under model/test
for stats_path in root_dir.rglob('stats.json'):
    parsed = process_json_file(stats_path)
    if not parsed:
        continue

    model, record = parsed

    # A score record is the score plus whichever qualifiers the run carries
    score_record = {'score': record['score']}
    score_record.update(
        (key, record[key])
        for key in ('subset', 'subject', 'mode', 'language_pair')
        if key in record
    )

    model_results[model][record['test_name']].append(score_record)


# Reshape the nested mapping into a list with one entry per model
final_results = [
    {
        'model': model,
        'tests': [
            {
                'test_name': name,
                'metric': test_metrics[name],
                'subsets': score_records,
            }
            for name, score_records in per_test.items()
        ],
    }
    for model, per_test in model_results.items()
]

# Serialize once, show the document, then save it to scores_v1.5.json
json_object = json.dumps(final_results, indent=2)
print(json_object)
with open("scores_v1.5.json", "w") as outfile:
    outfile.write(json_object)

[
  {
    "model": "01-ai_yi-large-preview",
    "tests": [
      {
        "test_name": "commonsense",
        "metric": "exact_match",
        "subsets": [
          {
            "score": 0.946
          }
        ]
      },
      {
        "test_name": "gsm",
        "metric": "final_number_exact_match",
        "subsets": [
          {
            "score": 0.69
          }
        ]
      },
      {
        "test_name": "legalbench",
        "metric": "quasi_exact_match",
        "subsets": [
          {
            "score": 0.7368421052631579,
            "subset": "abercrombie"
          },
          {
            "score": 0.14489795918367346,
            "subset": "corporate_lobbying"
          },
          {
            "score": 0.23705722070844687,
            "subset": "function_of_decision_section"
          },
          {
            "score": 0.591,
            "subset": "international_citizenship_questions"
          },
          {
            "score": 0.8842105263157894,