In [None]:
# imports
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import seaborn as sns
import sys

# Resolve the parent of the current working directory to an absolute path and
# append it to sys.path (only once, so re-running the cell does not add duplicates).
# NOTE(review): presumably this is what makes the `src` package import below
# resolvable when the notebook runs from a sub-folder - confirm against repo layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import src.evaluation.util as util

In [None]:
# define path and file filter
report_log_path = '../logs/class_report'
# NOTE: `filter` shadows the Python builtin of the same name; the name is kept
# so that any other cell referring to it keeps working.
filter = 'class_report_19epoch.json'

# TODO: define metrics to be extracted as list
# Each entry is a key path into the classification report json:
# one key -> class_report[key], two keys -> class_report[key_0][key_1].
logs = [
    ['accuracy'],
    ['macro avg', 'precision'],
    ['macro avg', 'recall'],
    ['macro avg', 'f1-score'],
    ['weighted avg', 'precision'],
    ['weighted avg', 'recall'],
    ['weighted avg', 'f1-score'],
]

# column layout of the long-format result frame (one row per run x metric)
df_headers = ['run', 'model', 'ratio', 'classes', 'metric', 'value']

# iter through logs and get metrics from classification report json files
counter = 0
rows = []
for dirpath, dirnames, filenames in os.walk(report_log_path):

    if filter not in filenames:
        continue
    counter += 1

    # read the report; the context manager closes the file handle again
    file_path = os.path.join(dirpath, filter)
    with open(file_path) as report_file:
        class_report = json.load(report_file)

    # run identifier = directory below report_log_path with every path
    # separator replaced by '_' (works for both '\\' and '/' separated paths)
    run = os.path.relpath(dirpath, report_log_path).replace('\\', '_').replace('/', '_')

    # transform metrics for readability - parsed once per report, not once per metric
    model_match = re.search(r'(^[a-z]{5,6})_', run)
    ratio_match = re.search(r'(\d{5})r', run)
    classes_match = re.search(r'(\d{1,2})c', run)
    if not (model_match and ratio_match and classes_match):
        raise ValueError(f'cannot parse model / ratio / classes from run directory {run!r}')

    model = model_match.group(1)
    ratio = float(ratio_match.group(1)) / 100
    classes = classes_match.group(1)

    # get relevant metrics
    for log in logs:

        # based on log definition get metric and metric value from json
        if len(log) == 1:
            metric = log[0]
            value = class_report[log[0]]
        elif len(log) == 2:
            metric = log[0] + ' ' + log[1]
            value = class_report[log[0]][log[1]]
        else:
            # fail loudly instead of silently re-using the previous metric / value
            raise ValueError(f'unsupported metric definition {log!r}: expected 1 or 2 keys')

        rows.append([
            str(run),
            str(model),
            float(ratio),
            str(classes),
            str(metric),
            float(value)
        ])

# build the dataframe once from all collected rows (avoids row-wise enlargement
# and lets pandas infer float dtypes for 'ratio' and 'value')
df = pd.DataFrame(rows, columns=df_headers)

print(f'{counter} classification reports found - df.shape {df.shape}')
df.head()

In [None]:
# list the distinct metric names that were collected into the result frame
pd.unique(df['metric'])

In [None]:
# check which training setups have less than X runs - TODO: set X
x_runs = 20

# count rows per setup and metric; the 'accuracy' rows are used as the
# representative metric, so 'run' holds the number of runs of each setup
df_runs = (
    df.groupby(['model', 'ratio', 'classes', 'metric'])
      .count()
      .reset_index()
      .drop(columns='value')
      .query('metric == "accuracy"')
)

# setups with fewer than x_runs runs, ordered by setup
df_runs[df_runs['run'] < x_runs].sort_values(by=['model', 'ratio', 'classes'])

In [None]:
# update model naming
def return_model_name(model:str) -> str:
    """Return the display name for a model key.

    'basic' maps to 'bCNN'; every other value maps to 'ResNet'.
    """
    if model == 'basic':
        return 'bCNN'
    return 'ResNet'

In [None]:
# visualize classification metrics for all four setups >> including outliers <<
# One panel per remaining metric: metric value over corrupted-label ratio,
# one line per model/classes setup.
font_size = 15

# set the style BEFORE the figure is created, otherwise it does not apply to these axes
sns.set_style("whitegrid")

# drop the macro-averaged metrics from the overview
df_temp_main = df.query('metric not in ["macro avg f1-score", "macro avg precision", "macro avg recall"]')

# legend label per row; seaborn aligns this Series with the plotted rows by index
df_temp_hue = df_temp_main[['model', 'classes']].apply(
    lambda row: f'{return_model_name(row.model)}, {row.classes} classes', axis=1
)

# panel count follows the data instead of a hard-coded 4 (still 20x5 for four metrics)
metrics = df_temp_main['metric'].unique()
n_panels = max(len(metrics), 1)
f, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(5 * n_panels, 5), squeeze=False)
axes = axes.ravel()
sns.despine(fig=f, left=True, bottom=True)

for ind, metric in enumerate(metrics):

    # rows of the current metric; the value column is renamed so it can serve as y
    df_temp_vis = df_temp_main.query(f'metric == "{metric}"')
    df_temp_vis = df_temp_vis.rename(columns={'value': metric})

    sns.lineplot(
        data=df_temp_vis,
        x='ratio',
        y=metric,
        hue=df_temp_hue,
        palette='colorblind',
        linewidth=2,
        ax=axes[ind]
    )

    axes[ind].set_ylim(0.5, 1)
    axes[ind].set_xlabel('corrupted labels ratio (%)', size=font_size)
    axes[ind].set_ylabel(metric, size=font_size)
    axes[ind].set_title(f'{metric.title()} over\nCorrupted Labels Ratio (with Outlier)\n', size=font_size)
    # plt.xticks() would only touch the current axes - set the tick size on every panel
    axes[ind].tick_params(axis='x', labelsize=font_size-2)

    # keep a single legend on the first panel
    if ind == 0:
        axes[0].legend(loc='lower left', fontsize=font_size-1)
    elif axes[ind].get_legend() is not None:
        axes[ind].get_legend().remove()

f.subplots_adjust(wspace=0.25, hspace=0.25)
plt.draw()

f.savefig('../assets/classification_metrics_overview_withOutliers.png', dpi=300, facecolor='white', bbox_inches='tight')

In [None]:
# visualize classification metrics for all four setups >> excluding outliers <<
# One panel per remaining metric: metric value over corrupted-label ratio,
# one line per model/classes setup, after outlier filtering.
font_size = 15

# set the style BEFORE the figure is created, otherwise it does not apply to these axes
sns.set_style("whitegrid")

# drop the macro-averaged metrics from the overview
df_temp_main = df.query('metric not in ["macro avg f1-score", "macro avg precision", "macro avg recall"]')

# legend label per row; seaborn aligns this Series with the plotted rows by index
# NOTE(review): this relies on util.filter_outliers keeping the original row index - confirm
df_temp_hue = df_temp_main[['model', 'classes']].apply(
    lambda row: f'{return_model_name(row.model)}, {row.classes} classes', axis=1
)

# panel count follows the data instead of a hard-coded 4 (still 20x5 for four metrics)
metrics = df_temp_main['metric'].unique()
n_panels = max(len(metrics), 1)
f, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(5 * n_panels, 5), squeeze=False)
axes = axes.ravel()
sns.despine(fig=f, left=True, bottom=True)

for ind, metric in enumerate(metrics):

    # rows of the current metric; the value column is renamed so it can serve as y
    df_temp_vis = df_temp_main.query(f'metric == "{metric}"')
    df_temp_vis = df_temp_vis.rename(columns={'value': metric})
    # project helper; arguments passed through unchanged (semantics defined in src.evaluation.util)
    df_temp_vis = util.filter_outliers(df_temp_vis, metric, False, 3)

    sns.lineplot(
        data=df_temp_vis,
        x='ratio',
        y=metric,
        hue=df_temp_hue,
        palette='colorblind',
        linewidth=1.5,
        ax=axes[ind]
    )

    axes[ind].set_ylim(0.5, 1)
    axes[ind].set_xlabel('corrupted labels ratio (%)', size=font_size)
    axes[ind].set_ylabel(metric, size=font_size)
    # title fixed: this figure shows the outlier-filtered data
    axes[ind].set_title(f'{metric.title()} over\nCorrupted Labels Ratio (without Outlier)\n', size=font_size)
    # plt.xticks() would only touch the current axes - set the tick size on every panel
    axes[ind].tick_params(axis='x', labelsize=font_size-2)

    # keep a single legend on the first panel
    if ind == 0:
        axes[0].legend(loc='lower left', fontsize=font_size-1)
    elif axes[ind].get_legend() is not None:
        axes[ind].get_legend().remove()

f.subplots_adjust(wspace=0.25, hspace=0.25)
plt.draw()

f.savefig('../assets/classification_metrics_overview_withoutOutliers.png', dpi=300, facecolor='white', bbox_inches='tight')

In [None]:
# get base stats for classification metrics over average model x classes x ratio setup
group_columns = ['model', 'classes', 'ratio', 'metric']

# macro-averaged metrics are excluded from the statistics
df_stats = df.query('metric not in \
            ["macro avg f1-score", "macro avg precision", "macro avg recall"]')
# mean and standard deviation of 'value' per setup and metric
df_stats = df_stats.groupby(by=group_columns, as_index=False) \
            .agg({'value':['mean','std']}) \
            .sort_values(group_columns)

# one row per model/classes/ratio, one column per metric and statistic;
# the two-level column labels are flattened to 'level0 | level1' strings
df_stats_pivot = df_stats.pivot(index=df_stats[['model', 'classes', 'ratio']], columns='metric')['value'].reset_index()
df_stats_pivot.columns = df_stats_pivot.columns.map(' | '.join).str.strip(' | ')

# write the table first so that the preview below is the cell's last expression
# (a bare `.head()` in the middle of a cell is evaluated but never displayed)
df_stats_pivot.to_excel('../logs/classification_results/classification_metrics_stats_pivot.xlsx', index=False)

print(df_stats_pivot.shape)
df_stats_pivot.head()

--------------

## non-aggregated metric evaluation

--------------

#### accuracy plot per training setup

In [None]:
# filter classification results
# accuracy rows only, plus a 'color' key ("<model> <classes>") used as hue below;
# assign() builds a new frame, so no column is written into a query() result
# (which would raise SettingWithCopyWarning)
df_accuracy_setup = (
    df.query("metric == 'accuracy'")
      .assign(color=lambda frame: frame['model'] + ' ' + frame['classes'])
)

In [None]:
# visualize classification accuracy for all four setups as grid plot
# (columns = model, rows = number of classes, one colour per setup)
sns.set_style("whitegrid")

facet_options = dict(
    col='model',
    row='classes',
    col_order=['basic', 'resnet'],
    height=5,
    hue='color',
    palette='colorblind',
    ylim=(0, 1),
    sharex=True,
    sharey=True,
    despine=True,
)
g = sns.FacetGrid(df_accuracy_setup, **facet_options)

# accuracy of every run over the corrupted-label ratio in each facet
g.map(sns.scatterplot, 'ratio', 'value')
g.set_axis_labels('corrupted labels ratio', 'model accuracy')

g.savefig('../assets/train_setup_model_accuracy.png', dpi=300, facecolor='white')

--------------

#### accuracy plot

In [None]:
# keep only the accuracy rows of the long-format results
df_accuracy = df.loc[df['metric'] == 'accuracy']
print(df_accuracy.shape)
df_accuracy.head(5)

In [None]:
# visualize classification accuracy for all four setups as scatter plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_accuracy[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.scatterplot(
    data=df_accuracy,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model accuracy',
    ylim=(0.5, 1),
    title='Model Accuracy over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/model_accuracy.png', dpi=300, facecolor='white')

In [None]:
# visualize classification accuracy for all four setups as strip plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_accuracy[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

# jitter spreads the points of one ratio horizontally
sns.stripplot(
    data=df_accuracy,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    jitter=0.3,
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model accuracy',
    ylim=(0.5, 1),
    title='Model Accuracy over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/model_accuracy_strip.png', dpi=300, facecolor='white')

------------

#### weighted avg precision plot

In [None]:
# keep only the weighted-average precision rows of the long-format results
df_precision_w = df.loc[df['metric'] == 'weighted avg precision']
print(df_precision_w.shape)
df_precision_w.head(5)

In [None]:
# visualize classification weighted avg precision for all four setups

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_precision_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.scatterplot(
    data=df_precision_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model precision (weighted)',
    ylim=(0.5, 1),
    title='Model Precision (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/model_precision.png', dpi=300, facecolor='white')

------------

#### weighted avg recall plot

In [None]:
# keep only the weighted-average recall rows of the long-format results
df_recall_w = df.loc[df['metric'] == 'weighted avg recall']
print(df_recall_w.shape)
df_recall_w.head(5)

In [None]:
# visualize classification weighted avg recall for all four setups

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_recall_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.scatterplot(
    data=df_recall_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model recall (weighted)',
    ylim=(0.5, 1),
    title='Model Recall (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/model_recall.png', dpi=300, facecolor='white')

------------

#### weighted avg f1-score plot

In [None]:
# keep only the weighted-average f1-score rows of the long-format results
df_f1score_w = df.loc[df['metric'] == 'weighted avg f1-score']
print(df_f1score_w.shape)
df_f1score_w.head(5)

In [None]:
# visualize classification weighted avg f1-score for all four setups

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_f1score_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.scatterplot(
    data=df_f1score_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model f1-score (weighted)',
    ylim=(0.5, 1),
    title='Model f1-score (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/model_f1score.png', dpi=300, facecolor='white')

--------------

## aggregated (mean) metric evaluation

In [None]:
# aggregate classification results
# Mean metric value per model x classes x ratio x metric, plus 'delta': the
# difference to the mean of the same model/classes/metric at ratio 0.0.
aggr_metrics = ["accuracy", "weighted avg precision", "weighted avg recall", "weighted avg f1-score"]
aggr_keys = ['model', 'classes', 'ratio', 'metric']

# average only the numeric 'value' column: a plain groupby().mean() would also
# try to average the string column 'run', which raises on pandas >= 2.0
df_aggr = (
    df[df['metric'].isin(aggr_metrics)]
    .astype({'value': float})
    .groupby(aggr_keys, as_index=False)['value']
    .mean()
)

# baseline = mean value of each model/classes/metric at ratio 0.0, joined back
# once instead of running one string-built query per row
baseline_keys = ['model', 'classes', 'metric']
df_baseline = (
    df_aggr.loc[df_aggr['ratio'] == 0.0, baseline_keys + ['value']]
    .rename(columns={'value': 'baseline'})
)
df_aggr = df_aggr.merge(df_baseline, on=baseline_keys, how='left', validate='many_to_one')

# setups without a ratio-0.0 baseline get NaN here instead of aborting with an IndexError
df_aggr['delta'] = df_aggr['value'] - df_aggr['baseline']
df_aggr = df_aggr.drop(columns='baseline')

df_aggr

--------------

#### aggr accuracy plot

In [None]:
# keep only the accuracy rows of the aggregated results
df_aggr_accuracy = df_aggr.loc[df_aggr['metric'] == 'accuracy']
print(df_aggr_accuracy.shape)
df_aggr_accuracy.head(5)

In [None]:
# visualize classification aggregated avg accuracy for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_accuracy[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_accuracy,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model accuracy',
    ylim=(0.5, 1),
    title='Avg. Model Accuracy over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_accuracy.png', dpi=300, facecolor='white')

In [None]:
# visualize classification aggregated avg accuracy delta for all four setups as line plot
font_size = 18

# set the style BEFORE the figure is created, otherwise it does not apply to this axes
sns.set_style("whitegrid")

# legend label per row, using the display model names (bCNN / ResNet)
df_temp_hue = df_aggr_accuracy[['model', 'classes']].apply(
    lambda row: f'{return_model_name(row.model)}, {row.classes} classes', axis=1
)

f, ax1 = plt.subplots(1, 1, figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

# hue must be the labels built in THIS cell; the previous code used `df_hue`,
# a leftover from an earlier cell (hidden state, and different label text)
sns.lineplot(
    data=df_aggr_accuracy,
    x='ratio',
    y='delta',
    ax=ax1,
    hue=df_temp_hue,
    palette='colorblind',
    linewidth=2.5
)

ax1.set_ylim(-0.2, 0.05)
ax1.set_xlabel('corrupted labels ratio (%)', size=font_size)
ax1.set_ylabel('avg. accuracy delta', size=font_size)
ax1.set_title('Avg. Model Accuracy Delta over Corrupted Labels Ratio\n', size=font_size)
ax1.legend(loc='upper right', fontsize=font_size-1)
ax1.tick_params(axis='x', labelsize=font_size-2)

f.savefig('../assets/avg_model_accuracy_delta.png', dpi=300, facecolor='white')

------------

#### aggr weighted avg precision plot

In [None]:
# keep only the weighted-average precision rows of the aggregated results
df_aggr_precision_w = df_aggr.loc[df_aggr['metric'] == 'weighted avg precision']
print(df_aggr_precision_w.shape)
df_aggr_precision_w.head(5)

In [None]:
# visualize classification aggregated avg weighted precision for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_precision_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_precision_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model precision (weighted)',
    ylim=(0.5, 1),
    title='Avg. Model Precision (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_precision.png', dpi=300, facecolor='white')

In [None]:
# visualize classification aggregated avg weighted precision delta for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_precision_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_precision_w,
    x='ratio',
    y='delta',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model precision (weighted) delta',
    ylim=(-0.15, 0.15),
    title='Avg. Model Precision (weighted) Delta over Corrupted Labels Ratio',
)
ax1.legend(loc='upper right', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_precision_delta.png', dpi=300, facecolor='white')

------------

#### aggr weighted avg recall plot

In [None]:
# keep only the weighted-average recall rows of the aggregated results
df_aggr_recall_w = df_aggr.loc[df_aggr['metric'] == 'weighted avg recall']
print(df_aggr_recall_w.shape)
df_aggr_recall_w.head(5)

In [None]:
# visualize classification aggregated avg weighted recall for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_recall_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_recall_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model recall (weighted)',
    ylim=(0.5, 1),
    title='Avg. Model Recall (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_recall.png', dpi=300, facecolor='white')

In [None]:
# visualize classification aggregated avg weighted recall delta for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_recall_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_recall_w,
    x='ratio',
    y='delta',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model recall (weighted) delta',
    ylim=(-0.2, 0.2),
    title='Avg. Model Recall (weighted) Delta over Corrupted Labels Ratio',
)
ax1.legend(loc='upper right', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_recall_delta.png', dpi=300, facecolor='white')

------------

#### aggr weighted avg f1-score plot

In [None]:
# keep only the weighted-average f1-score rows of the aggregated results
df_aggr_f1score_w = df_aggr.loc[df_aggr['metric'] == 'weighted avg f1-score']
print(df_aggr_f1score_w.shape)
df_aggr_f1score_w.head(5)

In [None]:
# visualize classification aggregated avg weighted f1-score for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_f1score_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_f1score_w,
    x='ratio',
    y='value',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model f1-score (weighted)',
    ylim=(0.5, 1),
    title='Avg. Model f1-score (weighted) over Corrupted Labels Ratio',
)
ax1.legend(loc='lower left', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_f1score.png', dpi=300, facecolor='white')

In [None]:
# visualize classification aggregated avg weighted f1-score delta for all four setups as line plot

# legend label per row: "<model> CNN, <classes> classes"
df_hue = df_aggr_f1score_w[['model', 'classes']].apply(
    lambda row: f'{row.model} CNN, {row.classes} classes',
    axis='columns',
)

sns.set(font_scale=1.1)
sns.set_style("whitegrid")
f, ax1 = plt.subplots(figsize=(10, 6))
sns.despine(fig=f, left=True, bottom=True)

sns.lineplot(
    data=df_aggr_f1score_w,
    x='ratio',
    y='delta',
    hue=df_hue,
    palette='colorblind',
    ax=ax1,
)
ax1.set(
    xlabel='corrupted labels ratio (%)',
    ylabel='model f1-score (weighted) delta',
    ylim=(-0.2, 0.2),
    title='Avg. Model f1-score (weighted) Delta over Corrupted Labels Ratio',
)
ax1.legend(loc='upper right', title='Model Setup', facecolor='white', framealpha=1)

f.savefig('../assets/avg_model_f1score_delta.png', dpi=300, facecolor='white')