In [1]:
import json
import pandas as pd
import numpy as np
from utils import *
import datasets

# Directories that hold the raw generations of the finetuned adapter and of
# the untouched base model on the human-annotation evaluation set.
FINETUNE_MODEL_OUTPUTS = '/home/abdelrahman.sadallah/mbzuai/review_rewrite/inference/evalute_outputs/adapters/Llama-3.1-8B/score_rationale/instruction/step-843/review_evaluation_human_annotation'
BASE_MODEL_OUTPUTS = '/home/abdelrahman.sadallah/mbzuai/review_rewrite/inference/evalute_outputs/base_model/Llama-3.1-8B/score_rationale/instruction/step-0/review_evaluation_human_annotation'

# Every aspect is loaded below as its own dataset configuration.
aspects = ['actionability', 'grounding_specificity', 'verifiability', 'helpfulness']

# aspect name -> pandas DataFrame of that aspect's 'gold' split.
gold_data = {}
for aspect in aspects:
    ds = datasets.load_dataset(
        'boda/review_evaluation_human_annotation',
        name=aspect,
        split='gold',
    ).to_pandas()
    gold_data[aspect] = ds

In [2]:
from inference.inference_utils import extract_predictions


INFO 03-19 22:24:23 __init__.py:194] No platform detected, vLLM is running on UnspecifiedPlatform


In [None]:
## For every aspect, sample up to SAMPLES_PER_GROUP rows where the finetuned
## model's label equals the gold label ('correct') and up to SAMPLES_PER_GROUP
## rows where it does not ('incorrect'), then assemble one record per sampled
## row that places the gold row next to both models' labels and rationales.
## Rows are matched across the three tables by position.

SAMPLES_PER_GROUP = 5   # rows wanted for each of 'correct' / 'incorrect'
SAMPLING_SEED = 42      # fixed so that Restart & Run All reproduces the same sample
sampling_rng = np.random.default_rng(SAMPLING_SEED)


def _read_jsonl(path):
    """Return the JSON objects stored one per line in the file at `path`."""
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]


def _is_blank(value):
    """Return True for None/NaN and for any falsy value such as ''."""
    missing = pd.isna(value)
    # pd.isna returns an array for list-like input; only trust a scalar answer.
    if isinstance(missing, (bool, np.bool_)) and missing:
        return True
    return not value


def _build_row(aspect, idx, is_correct, gold_df, finetuning_df, basemodel_df):
    """Return a copy of gold row `idx` extended with both models' outputs.

    The empty-string fields are placeholders written out for later manual
    annotation.
    """
    label_col = f'{aspect}_label'
    rationale_col = f'{aspect}_rationale'
    row_data = gold_df.iloc[idx].copy()
    row_data['finetuning_label'] = finetuning_df.iloc[idx][label_col]
    row_data['basemodel_label'] = basemodel_df.iloc[idx][label_col]
    row_data['finetuning_rationale'] = finetuning_df.iloc[idx][rationale_col]
    row_data['basemodel_rationale'] = basemodel_df.iloc[idx][rationale_col]
    row_data['correct_finetuning_label'] = is_correct
    row_data['human_finetuning_rationale_score'] = ''
    # NOTE(review): 'raionale' looks like a typo for 'rationale'; the key is
    # kept unchanged because it is the emitted column header.
    row_data['human_basemodel_raionale_score'] = ''
    row_data['Notes'] = ''
    row_data['id'] = idx
    return row_data


chosen_data = {}
final_data = {}

for aspect in aspects:
    finetuning_aspect_outputs_path = f'{FINETUNE_MODEL_OUTPUTS}/raw_outputs_{aspect}_gold.jsonl'
    basemodel_aspect_outputs_path = f'{BASE_MODEL_OUTPUTS}/raw_outputs_{aspect}_gold.jsonl'

    # presumably extract_predictions returns one record per input line with
    # '<aspect>_label' / '<aspect>_rationale' keys -- TODO confirm against
    # inference.inference_utils.
    finetuning_outputs = pd.DataFrame(extract_predictions(_read_jsonl(finetuning_aspect_outputs_path)))
    basemodel_outputs = pd.DataFrame(extract_predictions(_read_jsonl(basemodel_aspect_outputs_path)))

    ## keep only the columns that have the aspect in their name
    finetuning_outputs = finetuning_outputs.filter(like=aspect, axis=1)
    basemodel_outputs = basemodel_outputs.filter(like=aspect, axis=1)

    label_col = f'{aspect}_label'
    rationale_col = f'{aspect}_rationale'
    gold_df = gold_data[aspect]

    # Only positions present in all three tables can be compared.
    n_rows = min(len(finetuning_outputs), len(basemodel_outputs), len(gold_df))
    if not (len(finetuning_outputs) == len(basemodel_outputs) == len(gold_df)):
        print(f'[{aspect}] WARNING: row counts differ '
              f'(finetuned={len(finetuning_outputs)}, base={len(basemodel_outputs)}, gold={len(gold_df)}); '
              f'sampling from the first {n_rows} rows only')

    chosen = {'correct': [], 'incorrect': []}
    # Walking a random permutation visits each row at most once in uniformly
    # random order, so the loop always ends -- even when fewer than
    # SAMPLES_PER_GROUP usable rows exist for a group, or the table is empty.
    for idx in sampling_rng.permutation(n_rows).tolist():
        if all(len(picked) >= SAMPLES_PER_GROUP for picked in chosen.values()):
            break

        finetuning_row = finetuning_outputs.iloc[idx]
        basemodel_row = basemodel_outputs.iloc[idx]

        ## skip rows where either model has a missing/empty rationale or label
        if any(_is_blank(model_row[col])
               for col in (rationale_col, label_col)
               for model_row in (finetuning_row, basemodel_row)):
            continue

        group = 'correct' if finetuning_row[label_col] == gold_df.iloc[idx][label_col] else 'incorrect'
        if len(chosen[group]) < SAMPLES_PER_GROUP:
            chosen[group].append(idx)

    for group, picked in chosen.items():
        if len(picked) < SAMPLES_PER_GROUP:
            print(f'[{aspect}] only {len(picked)} usable {group} rows found (wanted {SAMPLES_PER_GROUP})')

    chosen_data[aspect] = chosen

    # 'correct' rows first, then 'incorrect' rows.
    final_data[aspect] = [
        _build_row(aspect, idx, is_correct, gold_df, finetuning_outputs, basemodel_outputs)
        for is_correct, group in ((True, 'correct'), (False, 'incorrect'))
        for idx in chosen[group]
    ]

In [20]:
## Materialise each aspect's collected rows as a DataFrame, then export all of
## them to a single Excel workbook with one sheet per aspect.
aspect_frames = {}
for aspect_name, aspect_rows in final_data.items():
    aspect_frames[aspect_name] = pd.DataFrame(aspect_rows)
final_data = aspect_frames

with pd.ExcelWriter('finetuning_vs_basemodel_comparison.xlsx') as writer:
    for aspect in aspects:
        aspect_frame = final_data[aspect]
        # index=False: the row index is not written to the sheet.
        aspect_frame.to_excel(writer, sheet_name=aspect, index=False)
        