In [2]:
import os
import logging
import typing
import joblib
import numpy as np
import pandas as pd

In [3]:
logger = logging.basicConfig()

## Evaluation Results

### Training Pipeline Test Evaluation Results

In [18]:
def load_evaluation_scores(path: str, label: str) -> pd.DataFrame:
    '''Load a precision_recall_fscore_file.xlsx from the path
    Args:
        path (`str`): path to evaluation scores file
        label (`str`): label to distinguish the pipeline run
    Returns:
        Evaluation results dataframe formatted for further evaluation steps
    Raises:
        FileNotFoundError
    '''

    try:
        df = pd.read_excel(path)
    except FileNotFoundError:
        logger.error('Path to evaluation file is incorrect.')
        raise

    df['label'] = label

    return_df = df.melt(id_vars=['label', 'Category'], value_vars=['Precision', 'Recall', 'fscore'], value_name='values', var_name='measure')

    return_df['values'] = return_df['values'].astype(float)

    return return_df
    

In [16]:
path = './all_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'

post_df = load_evaluation_scores(path=path, label='all_egm')

path = './no_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'

pre_df = load_evaluation_scores(path=path, label='no_egm')

In [19]:
def eval_evaluation_scores(
        pre_scores: pd.DataFrame,
        post_scores: pd.DataFrame,
        pre_label: str,
        post_label: str) -> pd.DataFrame:
    '''Create an evaluation dataframe to easily compare the evaluation results from
    different pipeline runs
    Args:
        pre_scores (`pd.DataFrame`): testing evaluation results from base run
        post_scores (`pd.DataFrame`): testing evaluation results from new run
        pre_label (`str`): value of the `label` column in the pre dataframe
        post_label (`str`): value of the `label` column in the post dataframe
    Return:
        Pandas dataframe for easier cross-runs comparisons of eval results
    '''

    if not isinstance(pre_scores, pd.DataFrame):
        logger.error('Argument `pre_scores` is not a pandas dataframe.')
        raise TypeError

    if not isinstance(post_scores, pd.DataFrame):
        logger.error('Argument `post_scores` is not a pandas dataframe.')
        raise TypeError

    # Merge on category and measure
    merged_df = pre_scores.merge(post_scores, on=['Category', 'measure'])

    # Drop label columns
    merged_df = merged_df[['Category', 'measure', 'values_x', 'values_y']]

    # Add delta
    merged_df['delta'] = merged_df['values_y'] - merged_df['values_x']

    # Flag if improved
    merged_df['improve_flag'] = np.where(merged_df['delta'] > 0, 1, 0)

    # Relabel columns
    merged_df = merged_df.rename(
        columns={
            'values_x': f'{pre_label}_value',
            'values_y': f'{post_label}_value'
        }
    )

    return merged_df


In [20]:
eval_evaluation_scores(pre_scores=pre_df, post_scores=post_df, pre_label='no_egm', post_label='all_egm')

Unnamed: 0,Category,measure,no_egm_value,all_egm_value,delta,improve_flag
0,Other,Precision,0.996667,0.998331,0.001664,1
1,Cardiology,Precision,1.0,1.0,0.0,0
2,Other,Recall,1.0,1.0,0.0,0
3,Cardiology,Recall,0.996283,0.998141,0.001859,1
4,Other,fscore,0.998331,0.999165,0.000834,1
5,Cardiology,fscore,0.998138,0.99907,0.000932,1


## Prediction Stats

### How Many in Each Class?

In [21]:
def load_predictions(path: str) -> pd.DataFrame:
    '''Load a prediction file
    Args:
        path (`str`): path to the prediction file
    Returns:
        Dataframe of the predictions
    '''

    try:
        df = pd.read_excel(path)
    except FileNotFoundError:
        logger.error('Could not find predictions at specified path.')
        raise

    return df

In [45]:
pred_path = './all_egm/results/propagate/prediction/prediction.xlsx'

egm_preds = load_predictions(pred_path)

pred_path = './no_egm/results/propagate/prediction/prediction.xlsx'

no_preds = load_predictions(pred_path)

In [42]:
def gen_class_counts(pred_df: pd.DataFrame, label: str) -> pd.DataFrame:
    '''Count the number of predictions for each class in the passed df
    Args:
        pred_df (`pd.DataFrame`): prediction dataframe
        label (`str`): label of the predictions' run
    Returns:
        Pandas dataframe of counts
    '''
    
    # Generate counts
    counts = pred_df['label_1'].value_counts()

    # Store counts as a dataframe
    counts_df = pd.DataFrame(counts)

    # Add label
    counts_df['label'] = label

    # Extract category from index
    counts_df['category'] = counts_df.index

    # Relabel
    counts_df = counts_df.rename(
        columns={
            'label_1': 'counts'
        }
    )

    # Reset index and reduce columns
    counts_df = counts_df.reset_index()[
        ['label', 'category', 'counts']
    ]

    return counts_df


In [43]:
r = gen_class_counts(pred_df, 'all_egm')

r

Unnamed: 0,label,category,counts
0,all_egm,Cardiology,28557
1,all_egm,Other,2785


In [48]:
def eval_class_counts(
        pre_counts: pd.DataFrame,
        post_counts: pd.DataFrame,
        pre_label: str,
        post_label: str) -> pd.DataFrame:
    '''Generate evaluation dataframe for class counts across runs
    Args:
        pre_counts (`pd.DataFrame`): class counts in initial pipeline run
        post_counts (`pd.DataFrame`): class counts in changed pipeline run
        pre_label (`str`): string label of initial run
        post_label (`str`): string label of changed run
    Returns:
        Evaluation dataframe
    '''

    # Merge dfs
    merge_df = pre_counts.merge(post_counts, on='category', how='inner')

    # Add delta
    merge_df['delta'] = merge_df['counts_y'] - merge_df['counts_x']

    # Relabel
    merge_df = merge_df.rename(
        columns={
            'counts_x': f'{pre_label}_count',
            'counts_y': f'{post_label}_count'
        }
    )

    # Select columns
    merge_df = merge_df[
        [
            'category',
            f'{pre_label}_count',
            f'{post_label}_count',
            'delta'
        ]
    ]

    return merge_df

In [49]:
pre_df = gen_class_counts(no_preds, 'no_egm')
post_df = gen_class_counts(egm_preds, 'all_egm')

eval_class_counts(pre_df, post_df, 'no_egm', 'all_egm')

Unnamed: 0,category,no_egm_count,all_egm_count,delta
0,Cardiology,28949,28557,-392
1,Other,2393,2785,392


### Changed Providers

In [55]:
def id_changed_labels(
    pre_preds: pd.DataFrame,
    post_preds: pd.DataFrame,
    pre_label: str,
    post_label: str) -> pd.DataFrame:
    '''Identify and isolate providers who changed labels between pipeline
    runs
    Args:
        pre_preds (`pd.DataFrame`): initial pipeline run predictions
        post_preds (`pd.DataFrame`): changed pipeline run predictions
        pre_label (`str`): string label of initial run
        post_label (`str`): string label of changed run
    Returns:
        Dataframe containing providers who changed and their respective labels and
        probabitlies
    '''

    # Merge together
    merge_df = pre_preds.merge(post_preds, how='inner', on='npi')

    filtered_merge_df = merge_df[merge_df['label_1_x'] != merge_df['label_1_y']]

    return_df = filtered_merge_df[
        [
            'npi',
            'label_1_x',
            'probability_1_x',
            'label_1_y',
            'probability_1_y'
        ]
    ].rename(
        columns={
            'label_1_x': f'{pre_label}_label_1',
            'probability_1_x': f'{pre_label}_probability_1',
            'label_1_y': f'{post_label}_label_1',
            'probability_1_y': f'{post_label}_probability_1'
        }
    )

    return return_df

In [60]:
changed_labels = id_changed_labels(no_preds, egm_preds, 'no_egm', 'all_egm')

changed_labels

Unnamed: 0,npi,no_egm_label_1,no_egm_probability_1,all_egm_label_1,all_egm_probability_1
18083,1669630968,Other,0.976,Cardiology,0.66800
19533,1033157714,Other,0.968,Cardiology,0.50800
21031,1114998044,Other,0.956,Cardiology,0.54200
22994,1003892779,Other,0.934,Cardiology,0.50525
23126,1871522524,Other,0.932,Cardiology,0.52000
...,...,...,...,...,...
31327,1114929809,Cardiology,0.502,Other,0.51200
31328,1861485831,Other,0.502,Cardiology,0.58200
31332,1447207436,Cardiology,0.500,Other,0.59800
31335,1205973542,Cardiology,0.500,Other,0.56000


### Get Selection of Changed Providers

In [117]:
def gen_switched_label_subset(changed_labels_df: pd.DataFrame, train_data_path: str = None) -> pd.DataFrame:
    '''Generate a randomized selection of providers who changed labels for manual evaluation
    Args:
        changed_labels_df (`pd.Dataframe`): dataframe of providers who changed labels between runs
        train_data_path (`str`): path to the joblib file storing the training dataframe. This can be used
        to get the ground truth for a selected NPI if it was available in the training set
    Returns:
        Dataframe of randomized selection of providers who changed labels
    '''

    npi_selection = list(np.random.choice(changed_labels_df['npi'], size=30, replace=False))

    return_df = changed_labels_df[changed_labels['npi'].isin(npi_selection)]
    
    if train_data_path is None:
        return_df['manual_label'] = ''
    
    # Get the ground truth if it was available in the training subset
    else:
        train_data = joblib.load(train_data_path)
        train_data = train_data[train_data['npi'].isin(npi_selection)][['npi', 'label']]
        return_df = return_df.merge(train_data, how='left', on='npi')
        return_df = return_df.rename(
            columns={
                'label': 'manual_label'
            }
        )
    return return_df

In [115]:
t = joblib.load('./all_egm/joblib/data.joblib')

t['label']

0             Other
1        Cardiology
2             Other
3        Cardiology
4             Other
            ...    
11354    Cardiology
11355         Other
11356         Other
11357         Other
11358         Other
Name: label, Length: 11359, dtype: object

In [118]:
gen_switched_label_subset(changed_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_df['manual_label'] = ''


Unnamed: 0,npi,no_egm_label_1,no_egm_probability_1,all_egm_label_1,all_egm_probability_1,manual_label
23814,1417002908,Other,0.92,Cardiology,0.65,
28317,1760559496,Cardiology,0.74,Other,0.552,
29049,1992883599,Cardiology,0.68,Other,0.608,
29085,1790746790,Cardiology,0.678,Other,0.592,
29292,1407858749,Other,0.658,Cardiology,0.574,
29320,1376639237,Cardiology,0.656,Other,0.568,
29340,1891716403,Cardiology,0.654,Other,0.548,
29375,1811908700,Cardiology,0.65,Other,0.544,
29586,1245433416,Cardiology,0.634,Other,0.594,
29682,1154398329,Cardiology,0.626,Other,0.51,


## Probability Stats

In [65]:
def gen_proba_stats(
    df: pd.DataFrame,  
    proba_col: str,
    category_col: str = None,
    category_value: str = None) -> pd.Series:
    '''Evaluate the distribution of the model's probabilities for label 1 in
    a provided dataframe.
    
    Args:
        df (`pd.DataFrame`): dataframe to evaluate. Should have class and probabilities cols
        proba_col (`str`): string name of the probability column
        category_col (`str`): string name of the category column
        category_value (`str`): category to to subset to before generating stats
    Returns:
        Series description of the probability columns
    '''

    if category_col is None:
        summary = df[proba_col].describe()
    else:
        try:
            filtered_df = df[df[category_col] == category_value]
        except KeyError:
            logger.error('Provided category value is not in provided category column.')

        summary = filtered_df[proba_col].describe()

    return summary

In [64]:
gen_proba_stats(changed_labels, 'no_egm_label_1', 'no_egm_probability_1')

count    680.000000
mean       0.583406
std        0.077750
min        0.500000
25%        0.528000
50%        0.566000
75%        0.614500
max        0.976000
Name: no_egm_probability_1, dtype: float64

In [71]:
def eval_proba_distributions(
    pre_series: pd.Series,
    post_series: pd.Series,
    pre_label: str,
    post_label: str) -> pd.DataFrame:
    '''Generate an evaluation dataframe comparing the probability stats across runs
    Args:
        pre_series (`pd.Series`): Summary stats of label 1 probabilities in initial run
        post_series (`pd.Series`): Summary stats of label 1 probabilities in changed run
        pre_label (`str`): Label for initial run
        post_label (`str`): Label for changed run
    Returns:
        Dataframe of summary stats
    '''

    return_df = pd.DataFrame()

    # Add initial run's summary stats
    return_df[pre_label] = pre_series

    # Add changed run's summary stats
    return_df[post_label] = post_series

    # Calculate deltas
    return_df['delta'] = return_df[post_label] - return_df[pre_label]

    return return_df

In [72]:
no_egm_overall_stats = gen_proba_stats(no_preds, 'probability_1')
all_egm_overall_stats = gen_proba_stats(egm_preds, 'probability_1')

eval_proba_distributions(no_egm_overall_stats, all_egm_overall_stats, 'no_egm', 'all_egm')

Unnamed: 0,no_egm,all_egm,delta
count,31342.0,31342.0,0.0
mean,0.929478,0.925833,-0.003645
std,0.116038,0.119864,0.003826
min,0.5,0.5,0.0
25%,0.924,0.918,-0.006
50%,0.984,0.984,0.0
75%,0.996,0.996,0.0
max,1.0,1.0,0.0


In [73]:
changed_stats_no_egm = gen_proba_stats(changed_labels, 'no_egm_probability_1')
changed_stats_all_egm = gen_proba_stats(changed_labels, 'all_egm_probability_1')

eval_proba_distributions(changed_stats_no_egm, changed_stats_all_egm, 'no_egm', 'all_egm')


Unnamed: 0,no_egm,all_egm,delta
count,680.0,680.0,0.0
mean,0.583406,0.57409,-0.009316
std,0.07775,0.066686,-0.011065
min,0.5,0.5,0.0
25%,0.528,0.524,-0.004
50%,0.566,0.554,-0.012
75%,0.6145,0.604,-0.0105
max,0.976,0.878,-0.098


## Generate Evaluation Report

In [120]:
# Testing subset evaluation scores
path_eval_scores_pre = './no_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'
path_eval_scores_post = './all_egm/results/train/precision_recall_fscore/precision_recall_fscore_file.xlsx'

pre_eval_scores = load_evaluation_scores(path_eval_scores_pre, 'no_egm')
post_eval_scores = load_evaluation_scores(path_eval_scores_post, 'all_egm')

compare_eval_scores = eval_evaluation_scores(pre_eval_scores, post_eval_scores, pre_label='no_egm', post_label='all_egm')

# Class counts
path_preds_pre = './no_egm/results/propagate/prediction/prediction.xlsx'
path_preds_post = './all_egm/results/propagate/prediction/prediction.xlsx'

pre_preds = load_predictions(path_preds_pre)
post_preds = load_predictions(path_preds_post)

pre_class_counts = gen_class_counts(pre_preds, 'no_egm')
post_class_counts = gen_class_counts(post_preds, 'all_egm')

compare_eval_counts = eval_class_counts(
    pre_class_counts, 
    post_class_counts,
    'no_egm',
    'all_egm'
)

# Changed Labels
changed_labels = id_changed_labels(
    pre_preds,
    post_preds,
    'no_egm',
    'all_egm'
)

# Get a subset of NPIs to check
changed_labels_to_check = gen_switched_label_subset(
    changed_labels,
    train_data_path='./all_egm/joblib/data.joblib'
)

# Probability distributions

## Overall
pre_overall_stats = gen_proba_stats(
    pre_preds,
    'probability_1'
)

post_overall_stats = gen_proba_stats(
    post_preds,
    'probability_1'
)

compare_eval_proba_dist_overall = eval_proba_distributions(
    pre_overall_stats,
    post_overall_stats,
    'no_egm',
    'all_egm'
)

## Classes - Cardiology
pre_cardio_stats = gen_proba_stats(
    pre_preds[pre_preds['label_1'] == 'Cardiology'],
    'probability_1'
)

post_cardio_stats = gen_proba_stats(
    post_preds[pre_preds['label_1'] == 'Cardiology'],
    'probability_1'
)

compare_eval_proba_dist_cardio = eval_proba_distributions(
    pre_cardio_stats,
    post_cardio_stats,
    'no_egm',
    'all_egm'
)

## Changed Labels
pre_switched_stats = gen_proba_stats(
    changed_labels,
    'no_egm_probability_1'
)

post_switched_stats = gen_proba_stats(
    changed_labels,
    'all_egm_probability_1'
)

compare_eval_proba_dist_switched = eval_proba_distributions(
    pre_switched_stats,
    post_switched_stats,
    'no_egm',
    'all_egm'
)

In [95]:
def gen_evaluation_report(
    comparison_df_list: list[pd.DataFrame],
    comparison_df_labels: list[str],
    output_dir: str,
    file_name: str) -> None:
    '''Write overall report based on the provided comparison dfs. Persists
    dfs to output directory as well.
    
    Args:
        comparison_df_list (`list[pd.DataFrame]`): list of evaluation dataframes to include in report
        comparison_df_labels (`list[str]`): list of labels for evaluation dataframes
        output_dir (`str`): output directory for report and persisted dataframes
        file_name (`str`): file name for evaluation report
    Returns:
        None
    '''

    # Generate output dir if doesn't exist
    if os.path.isdir(output_dir) == False:
        os.mkdir(output_dir)

    if os.path.isdir(f'{output_dir}persisted_dataframes/') == False:
        os.mkdir(f'{output_dir}persisted_dataframes/')

    if len(comparison_df_labels) != len(comparison_df_list):
        logger.error('List of dataframes must be the same length as list of dfs.')
        raise ValueError

    full_path = output_dir + file_name + '.txt'

    with open(full_path, 'w') as file:

        for df, label in zip(comparison_df_list, comparison_df_labels):
            file.write(label)
            file.write('\n')
            file.write(df.to_string())
            file.write('\n\n')
    file.close()

    for df, label in zip(comparison_df_list, comparison_df_labels):
        output_path = f'{output_dir}persisted_dataframes/{label.replace(" ", "_")}.xlsx'
        df.to_excel(output_path)

In [121]:
comparison_df_list = [
    compare_eval_scores,
    compare_eval_counts,
    compare_eval_proba_dist_overall,
    compare_eval_proba_dist_cardio,
    compare_eval_proba_dist_switched,
    changed_labels_to_check
]

comparison_df_labels = [
    'Testing Subset Evaluation Scores',
    'Category Counts',
    'Overall Probability Description',
    'Cardiology Labeled Probability Description',
    'Switched Labels Probability Description',
    'Switched Labels Manual NPI List'
]

gen_evaluation_report(
    comparison_df_list, 
    comparison_df_labels,
    output_dir='./evaluation_report/',
    file_name='evaluation_report'
)

## Confusion Matrices

In [7]:
def load_confusion_matrix(path: str) -> pd.DataFrame:
    '''Load a persisted confusion matrix
    
    Args:
        path (`str`): path to confusion matrix
    Returns:
        Pandas dataframe of the confusion matrix
    '''
    cm = pd.read_excel(path)

    cm = cm.rename(
        columns={
            'Unnamed: 0': 'Category'
        }
    )

    return cm

In [89]:
cm1 = load_confusion_matrix('./all_egm/results/train/confusion_matrix/confusion_matrix_file.xlsx')

cm1

Unnamed: 0,Category,Cardiology,Other
0,Cardiology,537,1
1,Other,0,598


In [10]:
cm2 = cm1.copy(deep=True)
cm2['Category'] = 'test_' + cm1['Category']

cm2

Unnamed: 0,Category,Cardiology,Other
0,test_Cardiology,537,1
1,test_Other,0,598


In [98]:
def eval_confusion_matrices(
    pre_matrix: pd.DataFrame,
    post_matrix: pd.DataFrame,
    pre_label: str,
    post_label: str,
    col_count: int) -> pd.DataFrame:
    '''Generate an evaluation of the confusion matrices produced between pipeline runs
    
    Args:
        pre_matrix (`pd.DataFrame`): matrix from original run
        post_matrix (`pd.DataFrame`): matrix from changed run
        pre_label (`str`): label for original run
        post_label (`str`): label from changed run
    Returns:
        Evaluation dataframe
    '''
    pre_matrix_copy = pre_matrix.copy(deep=True)
    post_matrix_copy = post_matrix.copy(deep=True)

    # Add labels to category values
    pre_matrix_copy['Category'] = pre_label + pre_matrix_copy['Category']
    post_matrix_copy['Category'] = post_label + post_matrix_copy['Category']

    # Add empty row to separate confusion matrices for greater readability
    # row = pd.DataFrame([['', '' , '']] , columns=pre_matrix_copy.columns)
    empty_array = np.empty(shape=(1, col_count))
    empty_array[:] = np.nan
    row = pd.DataFrame(empty_array, columns=pre_matrix_copy.columns)
    # row = row.fillna('')

    # Concat matrices together
    return_df = pd.concat([pre_matrix_copy, row])
    return_df = pd.concat([return_df, post_matrix_copy])

    return return_df



In [99]:
eval_confusion_matrices(
    cm1,
    cm1,
    'pre_',
    'post_',
    3
)

Unnamed: 0,Category,Cardiology,Other
0,pre_Cardiology,537.0,1.0
1,pre_Other,0.0,598.0
0,,,
0,post_Cardiology,537.0,1.0
1,post_Other,0.0,598.0
