In [3]:
import os
import logging
import joblib
import numpy as np
import pandas as pd

In [4]:
# `logging.basicConfig()` only configures the root logger and returns None, so
# its result must not be used as the logger object. Configure first, then
# fetch a real `Logger` for the `logger.error(...)` calls in the cells below.
logging.basicConfig()
logger = logging.getLogger(__name__)

## Evaluation Results

### Training Pipeline Test Evaluation Results

In [18]:
def load_evaluation_scores(path: str, label: str) -> pd.DataFrame:
    '''Load a precision_recall_fscore_file.xlsx from the path and reshape it
    to long format.

    The sheet is read with `pd.read_excel`, every row is tagged with `label`,
    and the `Precision`, `Recall` and `fscore` columns are melted so that each
    output row holds a single score.

    Args:
        path (`str`): path to evaluation scores file
        label (`str`): label to distinguish the pipeline run
    Returns:
        Evaluation results dataframe formatted for further evaluation steps,
        with columns `label`, `Category`, `measure` (the name of the melted
        score column) and `values` (the score cast to float)
    Raises:
        FileNotFoundError: re-raised from `pd.read_excel` after being logged
    '''

    try:
        scores = pd.read_excel(path)
    except FileNotFoundError:
        # Resolve the logger here rather than relying on a module-level
        # `logger` name: if that name is unbound or None, the failed lookup
        # would raise from inside this handler and mask the FileNotFoundError
        # that callers are told to expect.
        logging.getLogger(__name__).error('Path to evaluation file is incorrect.')
        raise

    # Tag the rows so frames from different runs stay distinguishable.
    scores['label'] = label

    # Wide -> long: one row per (label, Category, measure).
    long_scores = scores.melt(
        id_vars=['label', 'Category'],
        value_vars=['Precision', 'Recall', 'fscore'],
        value_name='values',
        var_name='measure',
    )

    long_scores['values'] = long_scores['values'].astype(float)

    return long_scores
    

In [16]:
# Scores of the run tagged 'all_egm'; used as `post_scores` in the comparison.
path = './all_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'
post_df = load_evaluation_scores(
    path=path,
    label='all_egm',
)

# `path` is rebound to the scores of the run tagged 'no_egm'; used as
# `pre_scores` in the comparison.
path = './no_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'
pre_df = load_evaluation_scores(
    path=path,
    label='no_egm',
)

In [19]:
def eval_evaluation_scores(
    pre_scores: pd.DataFrame,
    post_scores: pd.DataFrame,
    pre_label: str,
    post_label: str) -> pd.DataFrame:
    '''Create an evaluation dataframe to easily compare the evaluation results from
    different pipeline runs

    Both inputs are joined on (`Category`, `measure`); only combinations
    present in both frames appear in the result (default inner merge).

    Args:
        pre_scores (`pd.DataFrame`): testing evaluation results from base run;
            must provide `Category`, `measure` and `values` columns
        post_scores (`pd.DataFrame`): testing evaluation results from new run;
            must provide `Category`, `measure` and `values` columns
        pre_label (`str`): prefix of the output column holding the base run
            values (`<pre_label>_value`)
        post_label (`str`): prefix of the output column holding the new run
            values (`<post_label>_value`)
    Return:
        Pandas dataframe for easier cross-runs comparisons of eval results,
        with columns `Category`, `measure`, `<pre_label>_value`,
        `<post_label>_value`, `delta` (post minus pre) and `improve_flag`
        (1 where `delta` > 0, else 0)
    Raises:
        TypeError: if `pre_scores` or `post_scores` is not a pandas dataframe
    '''

    # Resolve the logger locally so argument validation still ends in the
    # documented TypeError even when the module-level `logger` name is unbound
    # or None.
    log = logging.getLogger(__name__)

    for arg_name, frame in (('pre_scores', pre_scores), ('post_scores', post_scores)):
        if not isinstance(frame, pd.DataFrame):
            message = f'Argument `{arg_name}` is not a pandas dataframe.'
            log.error(message)
            # Carry the message on the exception too, not only in the log.
            raise TypeError(message)

    # Merge on category and measure. The suffixes are the pandas defaults,
    # spelled out because the column selection below depends on them.
    merged_df = pre_scores.merge(
        post_scores,
        on=['Category', 'measure'],
        suffixes=('_x', '_y'),
    )

    # Drop label columns
    merged_df = merged_df[['Category', 'measure', 'values_x', 'values_y']]

    # Add delta
    merged_df['delta'] = merged_df['values_y'] - merged_df['values_x']

    # Flag if improved (strictly greater; an unchanged score is not flagged)
    merged_df['improve_flag'] = np.where(merged_df['delta'] > 0, 1, 0)

    # Relabel columns
    return merged_df.rename(
        columns={
            'values_x': f'{pre_label}_value',
            'values_y': f'{post_label}_value'
        }
    )
    

In [20]:
# Compare the 'no_egm' scores against the 'all_egm' scores; this call is the
# cell's final expression, so the resulting frame is rendered as the output.
eval_evaluation_scores(
    pre_scores=pre_df,
    post_scores=post_df,
    pre_label='no_egm',
    post_label='all_egm',
)

Unnamed: 0,Category,measure,no_egm_value,all_egm_value,delta,improve_flag
0,Other,Precision,0.996667,0.998331,0.001664,1
1,Cardiology,Precision,1.0,1.0,0.0,0
2,Other,Recall,1.0,1.0,0.0,0
3,Cardiology,Recall,0.996283,0.998141,0.001859,1
4,Other,fscore,0.998331,0.999165,0.000834,1
5,Cardiology,fscore,0.998138,0.99907,0.000932,1
