In [5]:
import pandas as pd
import os
import json
# Root directory that holds the evaluation outputs of every model.
PARENT_DIR = "evalute_outputs"

# Review aspects that are evaluated; each one gets its own list of result rows.
aspects = [
    'actionability',
    'grounding_specificity',
    'verifiability',
    'helpfulness',
]

# Maps aspect name -> list of row dicts (filled in by a later cell).
all_outputs = {aspect_name: [] for aspect_name in aspects}

In [6]:
### Iterate over the model folders in the parent directory, collect the metrics
### for every aspect, and export one Excel sheet per aspect.

# Layout of the evaluation outputs; edit these to point the cell at another run.
MODEL_TYPES = ['adapters', 'base_model']
MODES = ['score_only', 'score_rationale']
TEMPLATE_TYPES = ['chat', 'instruction']
AUTOMATIC_RESULTS_RELPATH = "all/step-0/review_evaluation_automatic_labels/results_all_test.txt"
# Every model (including chatgpt) reads the same combined human-annotation file.
HUMAN_RESULTS_RELPATH = "all/step-0/review_evaluation_human_annotation/results_combined_main_aspects_full.txt"
AUTOMATIC_RESULTS_KEY = 'all_test'
HUMAN_RESULTS_KEY = 'combined_main_aspects_full'
ANNOTATORS = ['Annotator1', 'Annotator2', 'boda', 'total_stats']
OUTPUT_XLSX = 'evaluation_results.xlsx'


def _load_json_or_report(file_path):
    """Return the parsed JSON stored at ``file_path``.

    If the file does not exist, print a "File missing" message and return
    ``None`` so the caller can skip that source.
    """
    if not os.path.exists(file_path):
        print(f"File missing: {file_path}")
        return None
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _automatic_columns(aspect, automatic_results):
    """Build the ``*_automatic`` columns of one row for ``aspect``."""
    stats = automatic_results[AUTOMATIC_RESULTS_KEY][aspect]['total_stats']
    columns = {
        'QWK_automatic': stats['kappa_quadratic'],
        'pearson_automatic': stats['pearson'],
        'spearman_automatic': stats['spearman'],
        'original_len_automatic': stats['original_len'],
        'processed_len_automatic': stats['filtered_len'],
        # 'sucess_rate' is the key spelling read from the results file.
        'success_rate_automatic': stats['sucess_rate'],
    }
    if aspect == 'verifiability':
        # f1_X is only read for the verifiability aspect.
        columns['f1_X'] = stats['f1_X']
    return columns


def _human_columns(aspect, human_results):
    """Build the ``*_human`` columns of one row for ``aspect``."""
    per_annotator = human_results[HUMAN_RESULTS_KEY][aspect]
    columns = {}
    for annotator in ANNOTATORS:
        cur_data = per_annotator[annotator]
        columns[f'QWK_{annotator}_human'] = cur_data['kappa_quadratic']
        columns[f'pearson_{annotator}_human'] = cur_data['pearson']
        columns[f'spearman_{annotator}_human'] = cur_data['spearman']
        if aspect == 'verifiability':
            columns[f'f1_X_{annotator}_human'] = cur_data['f1_X']

        if annotator == 'total_stats':
            # Aggregate-only statistics live under the 'total_stats' entry.
            columns[f'alpha_{annotator}_human'] = cur_data['krippendorff_alpha']
            columns['original_len_human'] = cur_data['original_len']
            columns['processed_len_human'] = cur_data['filtered_len']
            columns['success_rate_human'] = cur_data['sucess_rate']
    return columns


# Start from empty row lists so re-running this cell does not append duplicates.
all_outputs = {aspect: [] for aspect in aspects}

# `model_type` (not `type`) so the builtin `type` is not shadowed in the kernel.
for model_type in MODEL_TYPES:

    path = os.path.join(PARENT_DIR, model_type)
    if not os.path.isdir(path):
        print(f"Directory missing: {path}")
        continue

    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for folder in sorted(os.listdir(path)):
        model_dir = os.path.join(path, folder)
        if not os.path.isdir(model_dir):
            # Stray files next to the model folders are not models.
            continue

        model_name = folder

        for mode in MODES:
            for temp_type in TEMPLATE_TYPES:
                automatic_score_only_path = os.path.join(
                    model_dir, f"{mode}/{temp_type}/{AUTOMATIC_RESULTS_RELPATH}")
                human_score_only_path = os.path.join(
                    model_dir, f"{mode}/{temp_type}/{HUMAN_RESULTS_RELPATH}")

                # Read the JSON files if they exist (a message is printed otherwise).
                automatic_score_only = _load_json_or_report(automatic_score_only_path)
                human_score_only = _load_json_or_report(human_score_only_path)

                # Nothing to record for this combination when both sources are absent/empty.
                if not (automatic_score_only or human_score_only):
                    continue

                for aspect in aspects:
                    row = {
                        'model_name': model_name,
                        'mode': mode,
                        'type': model_type,
                        # Column name kept as-is for compatibility with existing sheets.
                        'tempelate_type': temp_type,
                    }
                    if automatic_score_only:
                        row.update(_automatic_columns(aspect, automatic_score_only))
                    if human_score_only:
                        row.update(_human_columns(aspect, human_score_only))

                    all_outputs[aspect].append(row)


# Save the results to an Excel file, one sheet per aspect.
with pd.ExcelWriter(OUTPUT_XLSX) as writer:
    for aspect in aspects:
        df = pd.DataFrame(all_outputs[aspect])
        df.to_excel(writer, sheet_name=aspect, index=False)


File missing: evalute_outputs/adapters/Llama-3.1-8B/score_only/chat/all/step-0/review_evaluation_automatic_labels/results_all_test.txt
File missing: evalute_outputs/adapters/Llama-3.1-8B/score_only/chat/all/step-0/review_evaluation_human_annotation/results_combined_main_aspects_full.txt
File missing: evalute_outputs/adapters/Llama-3.1-8B/score_rationale/chat/all/step-0/review_evaluation_automatic_labels/results_all_test.txt
File missing: evalute_outputs/adapters/Llama-3.1-8B/score_rationale/chat/all/step-0/review_evaluation_human_annotation/results_combined_main_aspects_full.txt
File missing: evalute_outputs/adapters/SciLitLLM/score_only/chat/all/step-0/review_evaluation_automatic_labels/results_all_test.txt
File missing: evalute_outputs/adapters/SciLitLLM/score_only/chat/all/step-0/review_evaluation_human_annotation/results_combined_main_aspects_full.txt
File missing: evalute_outputs/adapters/SciLitLLM/score_rationale/chat/all/step-0/review_evaluation_automatic_labels/results_all_test