# Characterising Pre-processing Parameter Importance for Model Performance and Fairness in Embedded Machine Learning
## Results Analysis Notebook
Wiebke Hutiri and Akhil Mathur

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

import seaborn as sns
sns.set_context("paper")
sns.set_theme(style="whitegrid")
import matplotlib.pyplot as plt
plt.show()
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display
    
from fair_embedded_ml.metrics import domain_bias, model_bias
from fair_embedded_ml import results_analysis 
from fair_embedded_ml.results_analysis import exp_names
from fair_embedded_ml import results_plot

from fair_embedded_ml.results_analysis import exp_names
x_dir = '~/Projects/fair_embedded_ml_results/' #'/data/experiments/'#
    
class fixed_copy(fixed):
    """A `fixed` interact value that is handed out as a copy.

    The wrapped value must provide a ``copy()`` method (the notebook wraps
    DataFrames and the `exp_names` mapping).
    """

    def get_interact_value(self):
        """Return ``self.value.copy()`` instead of the stored object itself."""
        # presumably so that plotting callbacks cannot mutate the shared object -- confirm
        snapshot = self.value.copy()
        return snapshot

## Get Results

In [None]:
# Load the training and compression experiment results found under `x_dir`.
results = results_analysis.get_results(x_dir)
compression_results = results_analysis.get_compression_results(x_dir)
# Derived tables: `results_mcc` is built from `results`, and `results_bias` from `results_mcc`.
# NOTE(review): presumably per-domain MCC scores and the model bias computed from them -- confirm in results_analysis.
results_mcc = results_analysis.get_results_for_domains(results)
results_bias = results_analysis.get_bias_results(results_mcc)

In [None]:
# compression_results.groupby(['dataset_name','exp_name']).agg({'run_name':'count'})
# 648, 576, 504, 432 --> expected run_name count for 9, 8 ,7 and 6 experiments

In [None]:
# Dataset name -> short prefix (used further down to build importance-table keys).
datasets = {
    'speech_commands_gender': 'sc',
    'audio_mnist_gender': 'am',
    'speech_commands_rain': 'scr',
    'speech_commands_wind': 'scw',
    'speech_commands_laughter': 'scl',
    'mswc_de': 'mswc_de',
    'mswc_en': 'mswc_en',
    'mswc_fr': 'mswc_fr',
    'mswc_rw': 'mswc_rw',
}
sample_rates = {
    '8kHz': [8, 8],
    '16kHz': [16, 16],
    '8kHz-16kHz': [8, 16],
    '16kHz-8kHz': [16, 8],
}
model_arch = {'cnn': 'cnn', 'low_latency_cnn': 'llcnn'}
models = {
    '8kHz CNN': 'sc8_cnn',
    '8kHz low latency CNN': 'sc8_llcnn',
    '16kHz CNN': 'sc16_cnn',
    '16kHz low latency CNN': 'sc16_llcnn',
}
domains = ['all', 'female', 'male']
# Sorted alphabetically; these two lists populate the radio-button options below.
metrics = sorted(['mcc', 'cohen_kappa', 'f1_weighted', 'precision', 'recall'])
hparams = sorted(['frame_length', 'frame_step', 'window_fn', 'input_features', 'mfccs', 'mel_bins'])
compression_params = [
    'pruning_schedule',
    'pruning_learning_rate',
    'pruning_frequency',
    'pruning_final_sparsity',
    'quantization_optimization',
]
# Every experiment group in `exp_names` whose name does not contain 'compress'.
results_filter = [name for name in exp_names.keys() if 'compress' not in name]

In [None]:
# Experiment names are "<dataset prefix><model variant>", e.g. 'sc8_cnn' or 'mswc35_de16_llcnn'.
experiment_prefixes = ['sc', 'mswc35_de', 'mswc35_en', 'mswc35_fr', 'mswc35_rw']
model_arches = ['8_cnn', '16_cnn', '8_llcnn', '16_llcnn']
# Prefix-major order: all model variants of one dataset before moving to the next dataset.
experiment_names = [
    ''.join((prefix, arch))
    for prefix in experiment_prefixes
    for arch in model_arches
]

In [None]:
results_bias = results_bias[results_bias.exp_name.isin(experiment_names)] #two sets of mswc runs, which impacts preprocessing parameter importance analysis

In [None]:
results_bias.groupby(['dataset_name','model_arch','resample_rate'])['run_name'].nunique()

## Analysis of Predictive Performance

### Distribution of experiment results

In [None]:
def plot_results_distribution(results, metric='all_mcc', selection='all_results'):
    """
    Draw a histogram (with KDE) of `metric` for the experiments picked by `selection`, coloured by exp_name.

    results:   DataFrame of experiment results
    metric:    column of `results` shown on the x axis
    selection: experiment group name, resolved with results_analysis.select_results_by_name and exp_names
    """
    _, axis = plt.subplots(figsize=(8, 3))
    label_style = {'fontsize': 'large'}
    axis.set_title('Distribution of Model Performance Scores', **label_style)
    # NOTE: the x label is fixed to MCC even when another metric is selected.
    axis.set_xlabel('Model performance (MCC)', **label_style)
    axis.set_ylabel('Experiment count', **label_style)
    axis.tick_params(axis='both', which='major', labelsize=14)
    sns.despine(bottom=True, left=True)

    selected = results_analysis.select_results_by_name(results, exp_names, selection)
    sns.histplot(selected, x=metric, hue='exp_name', palette='Set1', stat='count', kde=True)
    
# Radio buttons for the experiment group and for the "all_<metric>" column to plot.
results_buttons0 = widgets.RadioButtons(
    description='Results:', options=results_filter, value='sc_train', rows=5)
metric_buttons0 = widgets.RadioButtons(
    description='Metrics:', options=['all_' + m for m in metrics], value='all_mcc', rows=len(metrics))

ui0 = widgets.HBox([results_buttons0, metric_buttons0])
# Bind the buttons to the plotting function; `results` is handed over as a copy.
out0 = widgets.interactive_output(
    plot_results_distribution,
    {
        'results': fixed_copy(results),
        'metric': metric_buttons0,
        'selection': results_buttons0,
    },
)
display(ui0, out0)

### Performance across hyperparameter settings

In [None]:
# Controls: experiment group, metric, pre-processing parameter and what goes on the x axis.
results_buttons1 = widgets.RadioButtons(
    description='Results:', options=results_filter, value='sc_train', rows=5)
metric_buttons1 = widgets.RadioButtons(
    description='Metrics:', options=metrics, value='mcc', rows=len(metrics))
hparam_buttons1 = widgets.RadioButtons(
    description='Hparams:', options=hparams, value='input_features', rows=len(hparams))
xaxis_buttons1 = widgets.RadioButtons(
    description='xaxis', options=['exp', 'hparam'], value='exp', rows=2)

ui1 = widgets.HBox([widgets.VBox([results_buttons1]), metric_buttons1, hparam_buttons1, xaxis_buttons1])
# To look at the 16 kHz Speech Commands runs only, pass
# fixed_copy(results[results['exp_name'].isin(['sc16_cnn','sc16_llcnn'])]) as 'results'.
out1 = widgets.interactive_output(
    results_plot.plot_param_performance,
    {
        'results': fixed_copy(results),
        'hparam': hparam_buttons1,
        'metric': metric_buttons1,
        'exp_names': fixed_copy(exp_names),
        'selection': results_buttons1,
        'x_axis': xaxis_buttons1,
    },
)

display(ui1, out1)

In [None]:
def plot_parameters_top_results(df):
    """
    Plot the frequency count of pre-processing parameters per dataset across the experiments in df.

    Only rows whose exp_name is listed in the notebook-level `experiment_names` are used.
    Two figures are drawn:
      1. a 2x3 grid with one stacked histogram per parameter
         (input_features, mel_bins, mfccs / window_fn, frame_length, frame_step);
      2. a 1x2 figure for the combined "mel_bins; mfccs" and "frame_length; frame_step" settings.

    df is left unmodified and nothing is returned.
    """
    # Work on an explicit copy: the boolean selection is a slice of the caller's frame, and
    # writing into it raises SettingWithCopyWarning (and may or may not touch `df`).
    plot_data = df[df['exp_name'].isin(experiment_names)].copy()

    # Assign whole columns with `frame[col] = ...`. On pandas >= 2.0, `frame.loc[:, col] = ...`
    # writes the values into the existing column and keeps its old dtype, which would drop the
    # categorical dtype (and with it the fixed category order on the x axis).
    plot_data['input_features'] = pd.Categorical(
        plot_data['input_features'].astype(str), ['log_mel_spectrogram','mfcc'])
    plot_data['mel_bins'] = pd.Categorical(
        plot_data['mel_bins'].astype(int).astype(str), ['20','26','32','40','60','80'])
    plot_data['mfccs'] = pd.Categorical(
        plot_data['mfccs'].astype(int).astype(str), ['-1','10','11','12','13','14'])
    plot_data['frame_length'] = pd.Categorical(
        plot_data['frame_length'].astype(str), ['0.02','0.025','0.03','0.04'])
    plot_data['frame_step'] = pd.Categorical(
        plot_data['frame_step'].astype(str), ['0.4','0.5','0.6'])

    # Combined settings, rendered as "<first>; <second>".
    plot_data['frame_length_and_step'] = plot_data[['frame_length','frame_step']].astype(str).agg('; '.join, axis=1)
    plot_data['mel_bins_and_mfccs'] = plot_data[['mel_bins','mfccs']].astype(str).agg('; '.join, axis=1)

    # Fixed, sorted hue order so every panel stacks the datasets in the same order.
    hues = sorted(plot_data['dataset_name'].unique())
    stacked = dict(palette="rocket", hue='dataset_name', hue_order=hues, multiple="stack")

    # Figure 1: individual pre-processing parameters.
    f, ((ax_features, ax_mel, ax_mfcc), (ax_window, ax_length, ax_step)) = plt.subplots(2, 3, figsize=(25, 5))
    f.suptitle('Frequency count of preprocessing parameters across top experiments')

    sns.histplot(data=plot_data, x='input_features', legend=False, ax=ax_features, **stacked)
    sns.histplot(data=plot_data, x='mel_bins', legend=False, binwidth=3, ax=ax_mel, **stacked)
    # Only this panel carries the legend; it is moved outside the axes below.
    sns.histplot(data=plot_data, x='mfccs', legend=True, ax=ax_mfcc, **stacked)

    sns.histplot(data=plot_data, x='window_fn', legend=False, ax=ax_window, **stacked)
    sns.histplot(data=plot_data, x='frame_length', legend=False, ax=ax_length, **stacked)
    sns.histplot(data=plot_data, x='frame_step', legend=False, ax=ax_step, **stacked)

    sns.move_legend(ax_mfcc, "center right", bbox_to_anchor=(1.5, 0))
    sns.despine(bottom=True)
    plt.setp(f.axes, yticks=[])
    plt.tight_layout(h_pad=2)

    # Figure 2: combined parameter settings; rows are sorted first so the bars appear in order.
    f, (ax_mel_mfcc, ax_length_step) = plt.subplots(1, 2, figsize=(25, 3.5))

    sns.histplot(data=plot_data.sort_values(['mel_bins','mfccs']), x='mel_bins_and_mfccs',
                 shrink=.9, legend=False, ax=ax_mel_mfcc, **stacked).tick_params(rotation=30)

    sns.histplot(data=plot_data.sort_values(['frame_length','frame_step']), x='frame_length_and_step',
                 shrink=.9, legend=False, ax=ax_length_step, **stacked)

    sns.despine(bottom=True)
    plt.setp(f.axes, yticks=[])
    plt.tight_layout(h_pad=2)

#### Pre-processing parameter frequency count for highest performance runs for each model architecture and resample_rate

In [None]:
the_best = results_analysis.get_top_result(results_bias, metric='all_mcc', best=50).reset_index(drop=True)
# print(the_best[the_best.dataset_name=='mswc_rw']['exp_id'])

In [None]:
plot_parameters_top_results(the_best)

## Analysis of Model Fairness

### Female VS male performance across pre-processing parameter settings 

In [None]:
# Controls: experiment group, domain pair to compare, architecture filter, metric and hue parameter.
results_buttons2 = widgets.RadioButtons(
    description='Results:', options=results_filter, value='sc_train', rows=5)
domain_pair_buttons2 = widgets.RadioButtons(
    description='Compare:', options=['male, female','all, female','all, male'], value='male, female', rows=3)
filter_buttons2 = widgets.RadioButtons(
    description='Filter Architecture:', options=['all','cnn','low_latency_cnn'], value='all', rows=3)
metric_buttons2 = widgets.RadioButtons(
    description='Metrics:', options=metrics, value='mcc', rows=len(metrics))
hue_buttons2 = widgets.RadioButtons(
    description='Hparams:', options=hparams, value='input_features', rows=len(hparams))

ui2 = widgets.HBox([results_buttons2, widgets.VBox([filter_buttons2, domain_pair_buttons2]), metric_buttons2, hue_buttons2])
# 'filter_key' is derived from the button description and evaluates to 'filter architecture'.
# NOTE(review): confirm that plot_domain_performance expects this key rather than a column name.
out2 = widgets.interactive_output(
    results_plot.plot_domain_performance,
    {
        'results': fixed_copy(results),
        'metric': metric_buttons2,
        'domain': domain_pair_buttons2,
        'filter_key': fixed(filter_buttons2.description.replace(':','').lower()),
        'exp_names': fixed_copy(exp_names),
        'filter_value': filter_buttons2,
        'hue': hue_buttons2,
        'results_selection': results_buttons2,
    },
)

display(ui2, out2)

### Domain performance across pre-processing parameter settings 

In [None]:
# Controls: experiment group, input-feature filter, and which column drives colours / markers.
results_buttons3 = widgets.RadioButtons(
    description='Results:', options=results_filter, value='sc_train', rows=5)
filter_buttons3 = widgets.RadioButtons(
    description='Filter Input:', options=['all','log_mel_spectrogram','mfcc'], value='all', rows=3)
hue_buttons3 = widgets.RadioButtons(
    description='Colours:', options=['domain']+hparams, value='domain', rows=len(hparams)+1)
style_buttons3 = widgets.RadioButtons(
    description='Markers:', options=['domain']+hparams, value='input_features', rows=len(hparams)+1)

ui3 = widgets.HBox([results_buttons3, filter_buttons3, hue_buttons3, style_buttons3])
# Metric and domain are fixed here; 'filter_key' evaluates to 'filter input'.
out3 = widgets.interactive_output(
    results_plot.plot_domain_performance,
    {
        'results': fixed_copy(results_mcc),
        'metric': fixed('mcc'),
        'domain': fixed('all, domain'),
        'results_selection': results_buttons3,
        'hue': hue_buttons3,
        'filter_key': fixed(filter_buttons3.description.replace(':','').lower()),
        'exp_names': fixed_copy(exp_names),
        'filter_value': filter_buttons3,
        'style': style_buttons3,
    },
)

display(ui3, out3)

### Model Fairness across pre-processing parameter settings

In [None]:
# Controls: experiment group, axis layout and pre-processing parameter for the fairness plot.
results_buttons4 = widgets.RadioButtons(
    description='Results:', options=results_filter, value='sc_train', rows=5)
hparam_buttons4 = widgets.RadioButtons(
    description='Hparams:', options=hparams, value='input_features', rows=len(hparams))
# NOTE(review): labelled 'y-axis' but passed to the plot function as 'x_axis' -- confirm the label.
xaxis_buttons4 = widgets.RadioButtons(
    description='y-axis', options=['exp','hparam'], value='exp', rows=2)

ui4 = widgets.HBox([results_buttons4, xaxis_buttons4, hparam_buttons4])
# Same plot function as for predictive performance, with the metric fixed to 'fairness'.
out4 = widgets.interactive_output(
    results_plot.plot_param_performance,
    {
        'results': fixed_copy(results_bias),
        'hparam': hparam_buttons4,
        'metric': fixed('fairness'),
        'exp_names': fixed_copy(exp_names),
        'selection': results_buttons4,
        'domain': fixed('model'),
        'x_axis': xaxis_buttons4,
    },
)

display(ui4, out4)

### Fairest runs for each model architecture and resample_rate

In [None]:
the_fairest = results_analysis.get_top_result(results_bias, 'model_bias', best=3)
print(the_fairest[the_fairest.dataset_name=='mswc_rw'][['run_name']])

In [None]:
output = widgets.Output()

def unique_sorted_values(array):
    """Return the distinct values of `array` (anything with ``.unique()``) as an ascending list."""
    return sorted(array.unique().tolist())

# Columns kept in the filtered table that feeds the scatter plot.
cols = [
    'exp_name', 'exp_id', 'run_name', 'dataset_name', 'model_arch', 'model_base', 'resample_rate',
    'input_features', 'frame_length', 'frame_step', 'mel_bins', 'mfccs', 'window_fn',
    'all_mcc', 'male_mcc', 'female_mcc', 'model_bias',
]

# Multi-select filters; all but `dataset_name` start with every option selected.
model_arch_ = widgets.SelectMultiple(
    description='models', options=model_arches, value=model_arches, rows=4)
resample_rate = widgets.SelectMultiple(
    description='resample rate',
    options=unique_sorted_values(results['resample_rate']),
    value=unique_sorted_values(results['resample_rate']),
    rows=2)
input_features = widgets.SelectMultiple(
    description='input features',
    options=unique_sorted_values(results['input_features']),
    value=unique_sorted_values(results['input_features']),
    rows=2)
dataset_name = widgets.SelectMultiple(
    description='dataset',
    options=unique_sorted_values(results['dataset_name']),
    value=['speech_commands_gender'],
    rows=5)

# Plot layout: which column drives colour, facet columns and facet rows.
hue_options = widgets.RadioButtons(
    description='colour',
    options=['exp_name','input_features','frame_length','frame_step','mel_bins','mfccs','window_fn'],
    value='input_features',
    rows=5)
column_options = widgets.RadioButtons(
    description='columns', options=[None, 'model_arch','model_base'], value='model_base', rows=2)
row_options = widgets.RadioButtons(
    description='rows', options=[None, 'resample_rate', 'dataset_name'], value='dataset_name', rows=3)

input_widgets = widgets.HBox([
    hue_options,
    widgets.VBox([column_options, row_options]),
    widgets.VBox([dataset_name, model_arch_]),
    widgets.VBox([resample_rate, input_features]),
])

def common_filtering(dataset_name, model_arch_, resample_rate, input_features, hue_options, column_options, row_options):
    """
    Filter `results_bias` by the widget selections and plot model bias against overall MCC.

    dataset_name, resample_rate, input_features: selected values; rows must match one of them
    model_arch_:    selected model variants (entries of `model_arches`, e.g. '8_cnn'); an experiment
                    is kept when its name contains one of them
    hue_options:    column used for the point colour
    column_options: column used for the facet columns (or None)
    row_options:    column used for the facet rows (or None)

    Displays the scatter plot and the single best run (by 'all_mcc') of the filtered table.
    Reads the notebook-level `results`, `results_bias`, `experiment_names`, `cols` and `output`.
    """
    output.clear_output()

    # The "models" selection used to be overwritten with every experiment of the study, so the
    # widget had no effect. Keep the original restriction to `experiment_names` and additionally
    # require one of the selected model variants; with all variants selected (the default) this
    # yields the same experiments as before.
    selected_experiments = [
        exp for exp in unique_sorted_values(results['exp_name'])
        if any(name in exp for name in experiment_names)
        and any(arch in exp for arch in model_arch_)
    ]

    selections = {
        'dataset_name': dataset_name,
        'exp_name': selected_experiments,
        'resample_rate': resample_rate,
        'input_features': input_features,
    }
    common_filter = results_bias
    for column, allowed in selections.items():
        common_filter = common_filter.loc[common_filter[column].isin(allowed), cols]
    # Rebind instead of resetting in place: `common_filter` is a selection of `results_bias`.
    common_filter = common_filter.reset_index(drop=True)

    display(sns.relplot(data=common_filter, x="all_mcc", y="model_bias", hue=hue_options,
                        col=column_options, row=row_options, palette="tab10", height=4.5, aspect=1.3))
    display(results_analysis.get_top_result(common_filter, best=1, metric='all_mcc'))
    
# Bind every control to the like-named argument of common_filtering.
output = widgets.interactive_output(
    common_filtering,
    {
        'dataset_name': dataset_name,
        'model_arch_': model_arch_,
        'resample_rate': resample_rate,
        'input_features': input_features,
        'hue_options': hue_options,
        'column_options': column_options,
        'row_options': row_options,
    },
)

display(input_widgets, output)

In [None]:
sns.jointplot(data = results_bias[results_bias['dataset_name']=='mswc_fr'], x="all_mcc", y="model_bias", hue='model_base')

In [None]:
sns.jointplot(data = results_bias[results_bias['exp_name'].isin(['sc16_cnn','sc16_llcnn','sc8_cnn','sc8_llcnn'])], x="all_mcc", y="model_bias", hue='input_features')

## Pre-processing Parameter Importance Analysis

In [None]:
pre_processing_params = ['mel_bins','frame_step','frame_length','mfccs','input_features','window_fn']
fcrit_preprocessing = results_analysis.fcrit(results, pre_processing_params, 0.01)
fcrit_preprocessing

In [None]:
def print_importance_tables(importance_tables, dataset, keys):
    """
    Put several importance tables of one dataset side by side.

    importance_tables: dict of DataFrames keyed by "<dataset><key>"
    dataset:           key prefix of the dataset
    keys:              key suffixes to select; they also become the top column level

    Returns the column-wise concatenation, sorted by index.
    """
    tables = []
    for suffix in keys:
        tables.append(importance_tables[dataset + suffix])
    side_by_side = pd.concat(tables, axis=1, keys=keys)
    return side_by_side.sort_index()

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 'mswc_fr',
                                            'all_mcc', parameters="preprocessing", model_arch='cnn')['16000_']

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 'mswc_fr',
                                            'model_bias', parameters="preprocessing", model_arch='low_latency_cnn')['16000_']

In [None]:
# Importance tables for overall MCC, keyed "<dataset prefix><sample rate>_<architecture>",
# e.g. 'mswc_en16000_cnn'. Only equal-weighted runs are used.
importance_tables_mcc = {}
for d in ['speech_commands_gender','mswc_de','mswc_en','mswc_fr','mswc_rw']:
    for a in model_arch.keys():
        # The call does not depend on the sample rate, so compute the tables once per
        # dataset/architecture and pick the per-rate entries from the result.
        # `== True` (rather than using the column as a mask) also treats missing values as False.
        tables = results_analysis.generate_importance_tables(
            results_bias[(results_bias.equal_weighted==True)],
            d,'all_mcc', parameters="preprocessing", model_arch=a)
        for r in ['16000_', '8000_']:
            key = ''.join([datasets[d], r, a])
            importance_tables_mcc[key] = tables[r]

In [None]:
# Parameter importance for models trained and evaluated on 16k data (accuracy)
print_importance_tables(importance_tables_mcc, 'mswc_en', ['16000_cnn', '8000_low_latency_cnn'])

In [None]:
print_importance_tables(importance_tables_mcc, 'mswc_de', ['16000_cnn', '8000_low_latency_cnn'])

In [None]:
# Importance tables for model bias, keyed "<dataset prefix><sample rate>_<architecture>",
# e.g. 'mswc_en16000_cnn'. Only equal-weighted runs are used.
importance_tables_bias = {}
for d in ['speech_commands_gender','mswc_de','mswc_en','mswc_fr','mswc_rw']:
    for a in model_arch.keys():
        # The call does not depend on the sample rate, so compute the tables once per
        # dataset/architecture and pick the per-rate entries from the result.
        # `== True` (rather than using the column as a mask) also treats missing values as False.
        tables = results_analysis.generate_importance_tables(
            results_bias[(results_bias.equal_weighted==True)],
            d,'model_bias', parameters="preprocessing", model_arch=a)
        for r in ['16000_', '8000_']:
            key = ''.join([datasets[d], r, a])
            importance_tables_bias[key] = tables[r]

In [None]:
print_importance_tables(importance_tables_bias, 'mswc_en', ['16000_cnn', '8000_low_latency_cnn'])

In [None]:
print_importance_tables(importance_tables_bias, 'mswc_de', ['16000_cnn', '8000_low_latency_cnn'])

In [None]:
results_bias[results_bias['exp_name'].isin(['sc8_llcnn-sc16','sc16_llcnn-sc8'])].groupby(['exp_name','mel_bins'])[['all_mcc','model_bias']].agg(['mean','std'])

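**Note:** the next four cells currently do not run. They index an `importance_tables` dictionary (with keys such as `'16000_cnn_x'`) that is never created above; only `importance_tables_mcc` and `importance_tables_bias` are.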

In [None]:
# Parameter importance for MCC accuracy of models and inputs at different sample rates
pd.concat({'16k input on 8k model':importance_tables['16000_cnn_x'].iloc[:,:2], 
           '8k input on 16k model':importance_tables['8000_cnn_x'].iloc[:,:2]}, axis=1).sort_index()

In [None]:
# Parameter importance for fairness of models and inputs at different sample rates
pd.concat({'16k input on 8k model':importance_tables['16000_cnn_x'].iloc[:,2:4], 
           '8k input on 16k model':importance_tables['8000_cnn_x'].iloc[:,2:4]}, axis=1).sort_index()

In [None]:
# Parameter importance for MCC accuracy of models and inputs at different sample rates
pd.concat({'16k input on 8k model':importance_tables['16000_low_latency_cnn_x'].iloc[:,:2], 
           '8k input on 16k model':importance_tables['8000_low_latency_cnn_x'].iloc[:,:2]}, axis=1).sort_index()

In [None]:
# Parameter importance for fairness of models and inputs at different sample rates
pd.concat({'16k input on 8k model':importance_tables['16000_low_latency_cnn_x'].iloc[:,2:4], 
           '8k input on 16k model':importance_tables['8000_low_latency_cnn_x'].iloc[:,2:4]}, axis=1).sort_index()

## Model Selection

#### Pre-processing parameter frequency count for models selected based on acceptable performance range

In [None]:
# For every experiment, pick the 6 fairest models whose MCC lies within the accepted range
# (min_percentage_of_mcc=0.985), then stack the selections.
# DataFrame.append was removed in pandas 2.0 (and copied the frame on every iteration), so the
# per-experiment selections are collected first and concatenated once.
fair_and_accurate = pd.concat([
    results_analysis.select_fairest_models_in_mcc_range(results_bias, x, best=6, min_percentage_of_mcc=0.985)
    for x in experiment_names
])

fair_and_accurate.sort_values(by=['exp_name','resample_rate'], inplace=True)

In [None]:
print(fair_and_accurate[fair_and_accurate.dataset_name=='mswc_rw'][['exp_id','run_name']])

In [None]:
plot_parameters_top_results(fair_and_accurate)

In [None]:
results_analysis.select_fairest_models_in_mcc_range(results_bias, 'sc8_llcnn', best=3, min_percentage_of_mcc=0.99)

#### Investigating performance and fairness at different pre-processing parameter settings

Compare performance of best model against default settings typically used in tutorials:
`'frame_length':0.025`, `'frame_step':0.4`, `'input_features':'log_mel_spectrogram'`, `'mel_bins':40`

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', 'mswc35_de16_cnn','mswc35_de16_cnn', swap_param={'frame_length':0.025, 'frame_step':0.4, 'input_features':'log_mel_spectrogram',
                                                                                                                         'mfccs':-1, 'mel_bins':40, 'window_fn':'hamming'})[
    ['exp_name','model_arch','input_features','frame_length','frame_step',
     'mel_bins','mfccs','window_fn','all_mcc','model_bias'
    ]].sort_values(['exp_name','all_mcc'], ascending=False)

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='mswc35_de8_cnn', compare_model='mswc35_de8_cnn', swap_param={'frame_length':0.025, 'frame_step':0.4, 'input_features':'log_mel_spectrogram',
                                                                                                                         'mfccs':-1, 'mel_bins':40, 'window_fn':'hamming'})[
    ['exp_name','model_arch','input_features','frame_length','frame_step',
     'mel_bins','mfccs','window_fn','all_mcc','model_bias'
    ]].sort_values(['exp_name','all_mcc'], ascending=False)

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', 'sc16_llcnn','sc16_llcnn', swap_param={'frame_length':0.025, 'frame_step':0.4, 'input_features':'log_mel_spectrogram',
                                                                                                                         'mfccs':-1, 'mel_bins':40, 'window_fn':'hamming'})[
    ['exp_name','model_arch','input_features','frame_length','frame_step',
     'mel_bins','mfccs','window_fn','all_mcc','model_bias'
    ]].sort_values(['exp_name','all_mcc'], ascending=False)

Compare performance of best model for architecture and sample rate against naive selection of pre-processing parameters for best 16k CNN model

What would be the performance and fairness of an 8k model if we choose the same hyperparameter settings that produce the best 16k model with the same architecture (and vice versa)? 

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='sc8_cnn',compare_model='sc16_cnn')#[
    #['exp_name','model_arch','input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']].sort_values(['exp_name','all_mcc'], ascending=False)

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='sc16_cnn',compare_model='sc16_llcnn')[
   ['exp_name','model_arch','input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']].sort_values(['exp_name','all_mcc'], ascending=False)

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='sc16_cnn',compare_model='sc16_llcnn')[
    ['exp_name','model_arch','input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']].sort_values(['exp_name','all_mcc'], ascending=False)

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='sc16_llcnn',compare_model='sc8_llcnn')

In [None]:
results_analysis.compare_param_performance(results_bias, 'all_mcc', best_model='sc8_llcnn',compare_model='sc16_llcnn')

In [None]:
results_bias.loc[(results_bias['exp_name']=='sc16_llcnn')&
                 (results_bias['input_features']=='mfcc')&
                 (results_bias['frame_length']==0.02)&
                 (results_bias['frame_step']==0.4)&
#                  (results_bias['mel_bins']==40)&
                 (results_bias['mfccs']==10)&
                 (results_bias['window_fn']=='hamming'), ['mel_bins','all_mcc','model_bias']]

## Effects of Compression

In [None]:
def pruning_results(pruning_experiment, train_experiment, best_runs, fairest_runs, accurate_fair_runs):
    """
    Join the compression results of one experiment group with the results of the models they were built from.

    pruning_experiment: key into exp_names selecting the compression experiments
    train_experiment:   key into exp_names selecting the training experiments
    best_runs, fairest_runs, accurate_fair_runs: run names used to label why a trained model
        was selected ('best' takes precedence over 'fairest', then 'accurate_fair_runs')

    Reads the notebook-level `compression_results`, `results` and `exp_names`.

    Returns a DataFrame with one row per compression run. Columns of the trained model carry the
    suffix '_trained'; added columns are model_bias_pruned, model_bias_trained, delta_all_mcc,
    delta_male_mcc, delta_female_mcc, delta_model_bias (each "compressed minus trained") and
    model_selected_because.
    """
    pruned = compression_results[compression_results['exp_name'].isin(exp_names[pruning_experiment])]
    trained = results[results['exp_name'].isin(exp_names[train_experiment])]

    # Left join: every compression run is kept even if its trained model is not found.
    results_compress = pd.merge(pruned, trained, how='left',
                                left_on=['exp_id','trained_model_path'], right_on=['exp_id','run_name'],
                                suffixes=(None,'_trained'))

    # Assign the filled column back. The chained `df[col].fillna(..., inplace=True)` operates on
    # a temporary object and no longer updates the frame under pandas copy-on-write.
    results_compress['equal_weighted'] = results_compress['equal_weighted'].fillna(False)

    # Bias of the compressed model and of the model it was compressed from.
    results_compress['model_bias'] = results_compress.apply(
        lambda x: model_bias([x['female_mcc'], x['male_mcc']], x['all_mcc']), axis=1)
    results_compress['model_bias_trained'] = results_compress.apply(
        lambda x: model_bias([x['female_mcc_trained'], x['male_mcc_trained']], x['all_mcc_trained']), axis=1)

    # Change caused by compression.
    for column in ['all_mcc', 'male_mcc', 'female_mcc', 'model_bias']:
        results_compress['delta_' + column] = results_compress[column] - results_compress[column + '_trained']

    # NOTE(review): np.where mixes str and np.nan operands here, so unmatched rows most likely
    # hold the string 'nan' rather than a missing value -- confirm before filtering with isna().
    results_compress['model_selected_because'] = np.where(
        results_compress['trained_model_path'].isin(best_runs), 'best',
        np.where(results_compress['trained_model_path'].isin(fairest_runs), 'fairest',
                 np.where(results_compress['trained_model_path'].isin(accurate_fair_runs),
                          'accurate_fair_runs', np.nan)))

    # Only this column changes name; the other two previously listed entries mapped to themselves.
    results_compress.rename(columns={'model_bias':'model_bias_pruned'}, inplace=True)

    return results_compress

In [None]:
def duplicated_results_compress(results_compress):
    """
    Print how many rows of results_compress belong to a compression experiment that was run more than once.

    Two rows count as the same experiment when they agree on experiment name, trained model and
    all pruning / quantization settings. Every member of a duplicated group is counted
    (keep=False). Returns None.
    """
    experiment_key = ['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate',
                      'pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization']
    is_duplicate = results_compress.duplicated(subset=experiment_key, keep=False)

    # The per-group count table that used to be built here was never used or shown, so it is gone.
    print("Duplicated experiments:", int(is_duplicate.sum()))

In [None]:
def missed_experiments(results_compress, best_fairest_accuratefair_runs:dict, equal_weighted):
    """
    Report, per group of planned runs, which trained models have no compression result yet.

    results_compress: compression results with columns 'trained_model_path' and 'equal_weighted'
    best_fairest_accuratefair_runs: dict mapping a group label to the list of planned run names
    equal_weighted:   only results with this equal_weighted value count as completed

    Prints one summary line per group. Returns the missed runs of the LAST group in the dict
    (in planned order), or an empty list when the dict is empty.
    """
    # Defined up front so that an empty dict returns [] instead of raising UnboundLocalError.
    missed = []
    matches_weighting = results_compress['equal_weighted'] == equal_weighted

    for k, v in best_fairest_accuratefair_runs.items():
        planned_and_run = results_compress['trained_model_path'].isin(v) & matches_weighting
        completed = set(results_compress.loc[planned_and_run, 'trained_model_path'])
        # Keep the planned order (dropping repeats) so the report is the same on every run;
        # a plain set difference has no stable order.
        missed = [run for run in dict.fromkeys(v) if run not in completed]

        print('{} runs, equal_weighted={}: Planned {}, Missed {} {}'.format(k, equal_weighted, len(v), len(missed), missed))

    return missed

In [None]:
def stats_compression_experiments(results_compress):
    """Print ``describe()`` summaries of the bias and accuracy columns.

    Args:
        results_compress: DataFrame holding the three bias columns and the
            three MCC columns named below.

    Returns:
        None. Both summaries are written to stdout, bias first.
    """
    sections = (
        ("\nBIAS: \n", ['model_bias_pruned','model_bias_trained','delta_model_bias']),
        ("\nACCURACY: \n", ['all_mcc','all_mcc_trained','delta_all_mcc']),
    )
    for heading, columns in sections:
        summary = results_compress.loc[:, columns].describe()
        print(heading, summary)

### Pruning Results per Dataset

In [None]:
# Column names of the pruning hyper-parameters and, in the same order, the
# labels used for them in figures (the two lists are zipped into a mapping below).
pruning_params = ['pruning_learning_rate','pruning_schedule', 'pruning_frequency', 'pruning_final_sparsity']#,'trained_model_path','model_arch']
pruning_params_pretty = ['learning rate','schedule', 'frequency', 'final sparsity']#,'trained model','architecture']

#### Google Speech Commands

In [None]:
# Trained-model run ids selected for compression, one list per selection
# criterion; pruning_results() receives them as best, fairest, accurate-fair.
sc_best_runs = ['run-1628708435','run-1628757640','run-1628732129','run-1628728028','run-1628733620','run-1628787124','run-1628745492','run-1628778770','run-1628729675','run-1628795284','run-1628762389','run-1628794609']
sc_fairest_runs = ['run-1628769835','run-1628786232','run-1628785897','run-1628742549','run-1628765724','run-1628793147','run-1628763273','run-1628776241','run-1628796466','run-1628758822','run-1628753364','run-1628809058']
# NOTE(review): 11 run ids here versus 12 in each of the two lists above — confirm none is missing.
sc_accurate_fair_runs = ['run-1628726178','run-1628729666','run-1628733199','run-1628743272','run-1628759090','run-1628715119','run-1628737906','run-1628782987','run-1628799888','run-1628735838','run-1628806790']

sc_results_compress = pruning_results('compress_sc','sc_train',sc_best_runs,sc_fairest_runs,sc_accurate_fair_runs)
sc_results_compress.fillna(value={'dataset_name':'speech_commands_gender'}, inplace=True) # for a mysterious reason some datasets show up as nan
# Print the number of duplicated configurations, then keep only the last row of each.
duplicated_results_compress(sc_results_compress)
sc_results_compress.drop_duplicates(subset=['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate','pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization'], 
                                    keep='last', inplace=True)

# Print, per selection group, the planned runs that have no equal_weighted=True results.
missed_experiments(sc_results_compress, 
                   {'best':sc_best_runs, 'fairest':sc_fairest_runs, 'accurate fair':sc_accurate_fair_runs}, 
                   equal_weighted=True)

# Show only the (selection, experiment, trained model) groups whose result count differs from 72
# (presumably the expected number of compression runs per trained model — TODO confirm).
print(sc_results_compress.groupby(['model_selected_because','exp_name','trained_model_path'])['exp_id'].count()
 .pipe(lambda x: x[x != 72]))
# # stats_compression_experiments(sc_results_compress)

In [None]:
# Importance table for MCC ('all_mcc') of the low-latency CNN, computed on the
# equal_weighted rows only; the '16000_prune' entry of the returned tables is shown.
results_analysis.generate_importance_tables(sc_results_compress[sc_results_compress.equal_weighted==True], 
                                            'speech_commands_gender','all_mcc', 
                                            parameters="compression", model_arch='low_latency_cnn')['16000_prune']

In [None]:
# fcrit() is given the pruning parameter names and 0.01 (presumably the
# significance level — TODO confirm); its second return value is passed to both plots as `fcrit`.
sc_dof_pruning, sc_fcrit_pruning = results_analysis.fcrit(sc_results_compress, pruning_params, 0.01)
print('dof:', sc_dof_pruning, '\nfcrit:', sc_fcrit_pruning)

sns.set_theme(style="whitegrid", font_scale=1.8)
# First figure: MCC and bias after pruning, equal_weighted rows only; requested with save_fig=True.
results_plot.plot_param_importance(df=sc_results_compress[(sc_results_compress.equal_weighted==True)],
                                   dataset='speech_commands_gender',
                                   metrics={'all_mcc':'MCC','model_bias_pruned':'bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=sc_fcrit_pruning, 
                                   save_fig=True,
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "speech commands pruning parameter importance for reliability bias and accuracy (MCC)")

# Second figure: the delta metrics.
# NOTE(review): this call receives all rows, not only the equal_weighted ones used above — confirm this is intended.
results_plot.plot_param_importance(df=sc_results_compress,
                                   dataset='speech_commands_gender',
                                   metrics={'delta_all_mcc':'delta MCC','delta_model_bias':'delta bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=sc_fcrit_pruning,  
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "speech commands pruning parameter importance for CHANGE in accuracy and reliability bias")

In [None]:
# results_plot.relplot_pruning_results(sc_results_compress, exp_names['compress_sc'])

In [None]:
# Box-plot helper called with the speech-commands results and the experiment names registered under 'compress_sc'.
results_plot.boxplot_pruning_results(sc_results_compress, exp_names['compress_sc'])

#### MSWC de

In [None]:
# Trained-model run ids selected for compression, one list per selection
# criterion; pruning_results() receives them as best, fairest, accurate-fair.
de_best_runs = ['run-1664180817','run-1664229064','run-1664250798','run-1664183108','run-1664261879','run-1664411664','run-1664181900','run-1664206013','run-1664214242','run-1664180873','run-1664184317','run-1664323171']
de_fairest_runs = ['run-1664213870','run-1664280611','run-1664355580','run-1664269786','run-1664314689','run-1664316426','run-1664208514','run-1664302583','run-1664379577','run-1664193259','run-1664246067','run-1664329149',]
de_accurate_fair_runs = ['run-1664186536','run-1664280930','run-1664195328','run-1664262566','run-1664265796','run-1664336740','run-1664318691','run-1664305528','run-1664304376','run-1664182005','run-1664182187','run-1664323528',]

de_results_compress = pruning_results('compress_mswc_de','mswc_de',de_best_runs,de_fairest_runs,de_accurate_fair_runs)
# Print the number of duplicated configurations, then keep only the last row of each.
duplicated_results_compress(de_results_compress)

de_results_compress.drop_duplicates(subset=['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate','pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization'], 
                                    keep='last', inplace=True)

# Print, per selection group, the planned runs that have no equal_weighted=True results.
missed_experiments(de_results_compress, 
                   {'best':de_best_runs, 'fairest':de_fairest_runs, 'accurate fair':de_accurate_fair_runs}, 
                   equal_weighted=True)

# Show only the (selection, experiment, trained model) groups whose result count differs from 72
# (presumably the expected number of compression runs per trained model — TODO confirm).
print(de_results_compress.groupby(['model_selected_because','exp_name','trained_model_path'])['exp_id'].count()
 .pipe(lambda x: x[x != 72]))
# stats_compression_experiments(de_results_compress)

In [None]:
# fcrit() is given the pruning parameter names and 0.01 (presumably the
# significance level — TODO confirm); its second return value is passed to both plots as `fcrit`.
de_dof_pruning, de_fcrit_pruning = results_analysis.fcrit(de_results_compress, pruning_params, 0.01)
print('dof:', de_dof_pruning, '\nfcrit:', de_fcrit_pruning)

sns.set_theme(style="whitegrid", font_scale=1.8)
# First figure: MCC and bias after pruning, equal_weighted rows only; requested with save_fig=True.
results_plot.plot_param_importance(df=de_results_compress[(de_results_compress.equal_weighted==True)],
                                   dataset='mswc_de',
                                   metrics={'all_mcc':'MCC','model_bias_pruned':'bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=de_fcrit_pruning, 
                                   save_fig=True,
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_de pruning parameter importance for reliability bias and accuracy (MCC)")

# Second figure: the delta metrics.
# NOTE(review): this call receives all rows, not only the equal_weighted ones used above — confirm this is intended.
results_plot.plot_param_importance(df=de_results_compress,
                                   dataset='mswc_de',
                                   metrics={'delta_all_mcc':'delta MCC','delta_model_bias':'delta bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=de_fcrit_pruning,  
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_de pruning parameter importance for CHANGE in accuracy and reliability bias")

In [None]:
# results_plot.relplot_pruning_results(de_results_compress, exp_names['compress_mswc_de'])

In [None]:
# Box-plot helper called with the mswc_de results and the experiment names registered under 'compress_mswc_de'.
results_plot.boxplot_pruning_results(de_results_compress, exp_names['compress_mswc_de'])

#### MSWC fr

In [None]:
#for mswc_fr

# Trained-model run ids selected for compression, one list per selection
# criterion; pruning_results() receives them as best, fairest, accurate-fair.
fr_best_runs = ['run-1664062899','run-1664067479','run-1664122910','run-1664108246','run-1664108466','run-1664154522','run-1664021255','run-1664043976','run-1664090960','run-1664017583','run-1664116909','run-1664124001']
fr_fairest_runs = ['run-1664051632','run-1664082316','run-1664134988','run-1664018358','run-1664032036','run-1664063647','run-1664085357','run-1664086441','run-1664096317','run-1664045404','run-1664097051','run-1664103387']
fr_accurate_fair_runs = ['run-1664042826','run-1664049510','run-1664061342','run-1664121834','run-1664109926','run-1664096743','run-1664075625','run-1664028003','run-1664053130','run-1664118505','run-1664077198','run-1664118377']

fr_results_compress = pruning_results('compress_mswc_fr','mswc_fr',fr_best_runs,fr_fairest_runs,fr_accurate_fair_runs)
# Print the number of duplicated configurations, then keep only the last row of each.
duplicated_results_compress(fr_results_compress)

fr_results_compress.drop_duplicates(subset=['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate','pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization'], 
                                    keep='last', inplace=True)

# Print, per selection group, the planned runs that have no equal_weighted=True results.
missed_experiments(fr_results_compress, 
                   {'best':fr_best_runs, 'fairest':fr_fairest_runs, 'accurate fair':fr_accurate_fair_runs}, 
                   equal_weighted=True)

# Show only the (selection, experiment, trained model) groups whose result count differs from 72
# (presumably the expected number of compression runs per trained model — TODO confirm).
print(fr_results_compress.groupby(['model_selected_because','exp_name','trained_model_path'])['exp_id'].count()
 .pipe(lambda x: x[x != 72]))
# stats_compression_experiments(fr_results_compress)

In [None]:
# fcrit() is given the pruning parameter names and 0.01 (presumably the
# significance level — TODO confirm); its second return value is passed to both plots as `fcrit`.
fr_dof_pruning, fr_fcrit_pruning = results_analysis.fcrit(fr_results_compress, pruning_params, 0.01)
print('dof:', fr_dof_pruning, '\nfcrit:', fr_fcrit_pruning)

sns.set_theme(style="whitegrid", font_scale=1.8)
# First figure: MCC and bias after pruning; requested with save_fig=True.
# NOTE(review): unlike the speech-commands, mswc_de and mswc_rw cells, no equal_weighted filter is applied here — confirm.
results_plot.plot_param_importance(df=fr_results_compress,
                                   dataset='mswc_fr',
                                   metrics={'all_mcc':'MCC','model_bias_pruned':'bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=fr_fcrit_pruning, 
                                   save_fig=True,
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_fr pruning parameter importance for reliability bias and accuracy (MCC)")

# Second figure: the delta metrics, on all rows.
results_plot.plot_param_importance(df=fr_results_compress,
                                   dataset='mswc_fr',
                                   metrics={'delta_all_mcc':'delta MCC','delta_model_bias':'delta bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=fr_fcrit_pruning,  
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_fr pruning parameter importance for CHANGE in accuracy and reliability bias")

In [None]:
# results_plot.relplot_pruning_results(fr_results_compress, exp_names['compress_mswc_fr'])

In [None]:
# Box-plot helper called with the mswc_fr results and the experiment names registered under 'compress_mswc_fr'.
results_plot.boxplot_pruning_results(fr_results_compress, exp_names['compress_mswc_fr'])

#### MSWC en

In [None]:
#for mswc_en

# Trained-model run ids selected for compression, one list per selection
# criterion; pruning_results() receives them as best, fairest, accurate-fair.
en_best_runs = ['run-1664275396','run-1664386479','run-1664491179','run-1664201605','run-1664379390','run-1664508832','run-1664214160','run-1664253756','run-1664403188','run-1664378805','run-1664502091','run-1664512368']
en_fairest_runs = ['run-1664188494','run-1664216627','run-1664494629','run-1664591096','run-1664634078','run-1664656518','run-1664313713','run-1664462393','run-1664492924','run-1664387434','run-1664569767','run-1664636409']
en_accurate_fair_runs = ['run-1664215347','run-1664198161','run-1664487562','run-1664508099','run-1664378626','run-1664508446','run-1664186500','run-1664427588','run-1664268595','run-1664217620','run-1664185374','run-1664189422']

en_results_compress = pruning_results('compress_mswc_en','mswc_en',en_best_runs,en_fairest_runs,en_accurate_fair_runs)
# Print the number of duplicated configurations, then keep only the last row of each.
duplicated_results_compress(en_results_compress)

en_results_compress.drop_duplicates(subset=['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate','pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization'], 
                                    keep='last', inplace=True)

# Print, per selection group, the planned runs that have no equal_weighted=True results.
missed_experiments(en_results_compress, 
                   {'best':en_best_runs, 'fairest':en_fairest_runs, 'accurate fair':en_accurate_fair_runs}, 
                   equal_weighted=True)

# Show only the (selection, experiment, trained model) groups whose result count differs from 72
# (presumably the expected number of compression runs per trained model — TODO confirm).
print(en_results_compress.groupby(['model_selected_because','exp_name','trained_model_path'])['exp_id'].count()
 .pipe(lambda x: x[x != 72]))
# stats_compression_experiments(en_results_compress)

In [None]:
# fcrit() is given the pruning parameter names and 0.01 (presumably the
# significance level — TODO confirm); its second return value is passed to both plots as `fcrit`.
en_dof_pruning, en_fcrit_pruning = results_analysis.fcrit(en_results_compress, pruning_params, 0.01)
print('dof:', en_dof_pruning, '\nfcrit:', en_fcrit_pruning)

sns.set_theme(style="whitegrid", font_scale=1.8)
# First figure: MCC and bias after pruning; requested with save_fig=True.
# NOTE(review): unlike the speech-commands, mswc_de and mswc_rw cells, no equal_weighted filter is applied here — confirm.
results_plot.plot_param_importance(df=en_results_compress,
                                   dataset='mswc_en',
                                   metrics={'all_mcc':'MCC','model_bias_pruned':'bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=en_fcrit_pruning, 
                                   save_fig=True,
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_en pruning parameter importance for reliability bias and accuracy (MCC)")

# Second figure: the delta metrics, on all rows.
results_plot.plot_param_importance(df=en_results_compress,
                                   dataset='mswc_en',
                                   metrics={'delta_all_mcc':'delta MCC','delta_model_bias':'delta bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=en_fcrit_pruning,  
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_en pruning parameter importance for CHANGE in accuracy and reliability bias")

In [None]:
# results_plot.relplot_pruning_results(en_results_compress, exp_names['compress_mswc_en'])

In [None]:
# Box-plot helper called with the mswc_en results and the experiment names registered under 'compress_mswc_en'.
results_plot.boxplot_pruning_results(en_results_compress, exp_names['compress_mswc_en'])

#### MSWC rw

In [None]:
# Trained-model run ids selected for compression, one list per selection
# criterion; pruning_results() receives them as best, fairest, accurate-fair.
rw_best_runs = ['run-1664017217','run-1664034824','run-1664076207','run-1664031270','run-1664052670','run-1664060100','run-1664017072','run-1664034345','run-1664060358','run-1664057681','run-1664115743','run-1664115971']
rw_fairest_runs = ['run-1664018113','run-1664036302','run-1664076284','run-1664040385','run-1664053265','run-1664104614','run-1664033769','run-1664038064','run-1664091561','run-1664042712','run-1664110816','run-1664127256']
# NOTE(review): only 2 run ids here versus 12 in each of the two lists above — confirm the list is complete.
rw_accurate_fair_runs = ['run-1664048908','run-1664049806']

rw_results_compress = pruning_results('compress_mswc_rw','mswc_rw',rw_best_runs,rw_fairest_runs,rw_accurate_fair_runs)
# Print the number of duplicated configurations, then keep only the last row of each.
duplicated_results_compress(rw_results_compress)
rw_results_compress.drop_duplicates(subset=['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate','pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization'], 
                                    keep='last', inplace=True)

# Print, per selection group, the planned runs that have no equal_weighted=True results.
missed_experiments(rw_results_compress, 
                   {'best':rw_best_runs, 'fairest':rw_fairest_runs, 'accurate fair':rw_accurate_fair_runs}, 
                   equal_weighted=True)

# Show only the (selection, experiment, trained model) groups whose result count differs from 72
# (presumably the expected number of compression runs per trained model — TODO confirm).
print('\n',rw_results_compress.groupby(['model_selected_because','exp_name','trained_model_path'])['exp_id'].count()
 .pipe(lambda x: x[x != 72]))
# stats_compression_experiments(rw_results_compress)

In [None]:
# fcrit() is given the pruning parameter names and 0.01 (presumably the
# significance level — TODO confirm); its second return value is passed to both plots as `fcrit`.
rw_dof_pruning, rw_fcrit_pruning = results_analysis.fcrit(rw_results_compress, pruning_params, 0.01)
print('dof:', rw_dof_pruning, '\nfcrit:', rw_fcrit_pruning)

sns.set_theme(style="whitegrid", font_scale=1.8)
# First figure: MCC and bias after pruning, equal_weighted rows only; requested with save_fig=True.
results_plot.plot_param_importance(df=rw_results_compress[(rw_results_compress.equal_weighted==True)],
                                   dataset='mswc_rw',
                                   metrics={'all_mcc':'MCC','model_bias_pruned':'bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=rw_fcrit_pruning, 
                                   save_fig=True,
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_rw pruning parameter importance for reliability bias and accuracy (MCC)")

# Second figure: the delta metrics.
# NOTE(review): this call receives all rows, not only the equal_weighted ones used above — confirm this is intended.
results_plot.plot_param_importance(df=rw_results_compress,
                                   dataset='mswc_rw',
                                   metrics={'delta_all_mcc':'delta MCC','delta_model_bias':'delta bias'}, 
                                   parameters="compression", 
                                   select_tables=['8000_prune','16000_prune'],
                                   fcrit=rw_fcrit_pruning,  
                                   pretty_params=dict(zip(pruning_params, pruning_params_pretty)),
                                   plot_title = "mswc_rw pruning parameter importance for CHANGE in accuracy and reliability bias")

In [None]:
# results_plot.relplot_pruning_results(rw_results_compress, exp_names['compress_mswc_rw'])

In [None]:
# Box-plot helper called with the mswc_rw results and the experiment names registered under 'compress_mswc_rw'.
results_plot.boxplot_pruning_results(rw_results_compress, exp_names['compress_mswc_rw'])

### Pruning Results Across Datasets

In [None]:
# Stack the five per-dataset result frames row-wise; the original row indices are kept (no ignore_index).
results_compress_compare = pd.concat([sc_results_compress, de_results_compress, fr_results_compress,
                                     en_results_compress, rw_results_compress])

In [None]:
# Spot check: '16000_prune' importance table for MCC of the CNN on mswc_de, taken from the combined frame.
results_analysis.generate_importance_tables(results_compress_compare, 'mswc_de', 'all_mcc', parameters="compression", 
                                                    model_arch='cnn')['16000_prune']

In [None]:
# Importance tables for MCC after pruning, keyed by
# <dataset short name><table name><architecture>, e.g. 'sc16000_prunecnn'.
importance_tables_mcc_pruned = {}
for d in ['speech_commands_gender','mswc_de','mswc_en','mswc_fr','mswc_rw']:
    for a in model_arch:
        # A single call returns both tables, so it is made once per
        # (dataset, architecture) rather than once per table.
        tables = results_analysis.generate_importance_tables(
            results_compress_compare, 
            d,'all_mcc', parameters="compression", model_arch=a)
        for r in ['16000_prune', '8000_prune']:
            key = ''.join([datasets[d], r, a])
            importance_tables_mcc_pruned[key] = tables[r]

In [None]:
# print_importance_tables is not defined in this part of the notebook; it is called here
# with the MCC tables, 'mswc_en' and '16000_prune'.
print_importance_tables(importance_tables_mcc_pruned, 'mswc_en', '16000_prune')

In [None]:
# Importance tables for model bias after pruning, keyed by
# <dataset short name><table name><architecture>, e.g. 'sc16000_prunecnn'.
importance_tables_bias_pruned = {}
for d in ['speech_commands_gender','mswc_de','mswc_en','mswc_fr','mswc_rw']:
    for a in model_arch:
        # A single call returns both tables, so it is made once per
        # (dataset, architecture) rather than once per table.
        tables = results_analysis.generate_importance_tables(
            results_compress_compare, 
            d,'model_bias_pruned', parameters="compression", model_arch=a)
        for r in ['16000_prune', '8000_prune']:
            key = ''.join([datasets[d], r, a])
            importance_tables_bias_pruned[key] = tables[r]

In [None]:
# print_importance_tables is not defined in this part of the notebook; it is called here
# with the bias tables, 'mswc_en' and '16000_prune'.
print_importance_tables(importance_tables_bias_pruned, 'mswc_en', '16000_prune')

In [None]:
# Collect the '16000_prune' CNN importance table of every (metric, dataset)
# pair and stack them once into a long-format frame for plotting.
imp_tabs = []
for m in ['all_mcc','model_bias_pruned']:
    for d in ['speech_commands_gender','mswc_de','mswc_fr','mswc_en','mswc_rw']:
        imp_tab = results_analysis.generate_importance_tables(results_compress_compare, d, m, parameters="compression", 
                                                              model_arch='cnn')['16000_prune']
        imp_tab['dataset']=d
        imp_tab['metric']=m
        imp_tabs.append(imp_tab)
data = pd.concat(imp_tabs, axis=0)
# The table index holds the parameter names; expose it as a 'parameters' column.
data = data.reset_index().rename(columns={'index': 'parameters'})
cat_order = np.sort(data['parameters'].unique())
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='F Score', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.2,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    ax.set_yscale('log')
    xlabels = [dict(zip(pruning_params, pruning_params_pretty))[x.get_text()] for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    # NOTE(review): the speech-commands critical F value is drawn in every
    # dataset facet and used for greying out bars — confirm that the
    # per-dataset values (de/fr/en/rw_fcrit_pruning) are not intended.
    ax.hlines(y=sc_fcrit_pruning, xmin=-0.5, xmax=0.9*len(cat_order), color="black", ls='--', linewidth=2)
    ax.set_xlabel('')
    # Raw string: '\ ' is a mathtext space, not a Python escape sequence.
    ax.set_ylabel(r'$F\ Score$ (log scale)')
    ax.set_title('{}{}'.format(ax.get_title().split(' = ')[-1][:-3], ax.get_title().split(' = ')[-1][-3:]))
    # Grey out bars that stay below the critical F value.
    for bar in ax.patches:
        if bar.get_height() < sc_fcrit_pruning:
            bar.set_color('lightgrey') 
            
plt.suptitle("Pruning parameter importance for 16k CNN models trained on 5 datasets ", fontsize='x-large', va='top', y=1.1);

In [None]:
# Same layout as the F-score figure, but showing the 'Ridge coef' column;
# relies on `data` and `cat_order` from the previous cell.
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='Ridge coef', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.1,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    # Replace the column names on the x axis by their pretty labels.
    xlabels = [dict(zip(pruning_params, pruning_params_pretty))[x.get_text()] for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    ax.set_xlabel('')
    ax.set_ylabel('Ridge coefficient')
    # Keep only the part of the facet title after ' = ' (the dataset name).
    ax.set_title('{}{}'.format(ax.get_title().split(' = ')[-1][:-3], ax.get_title().split(' = ')[-1][-3:]))
    # Grey out bars whose magnitude is below 0.01.
    for bar in ax.patches:
        if -0.01 < bar.get_height() < 0.01:
            bar.set_color('lightgrey') 
            
plt.suptitle("Pruning parameter importance for 16k CNN models trained on 5 datasets ", fontsize='x-large', va='top', y=1.1);

In [None]:
# Same layout as the F-score figure, but showing the 'MI' column;
# relies on `data` and `cat_order` from the F-score cell.
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='MI', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.1,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    # Replace the column names on the x axis by their pretty labels.
    xlabels = [dict(zip(pruning_params, pruning_params_pretty))[x.get_text()] for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    ax.set_xlabel('')
    ax.set_ylabel('Mutual Information')
    # Keep only the part of the facet title after ' = ' (the dataset name).
    ax.set_title('{}{}'.format(ax.get_title().split(' = ')[-1][:-3], ax.get_title().split(' = ')[-1][-3:]))
    # Grey out bars whose magnitude is below 0.01.
    for bar in ax.patches:
        if -0.01 < bar.get_height() < 0.01:
            bar.set_color('lightgrey') 
            
plt.suptitle("Pruning parameter importance for 16k CNN models trained on 5 datasets ", fontsize='x-large', va='top', y=1.1);

In [None]:
# Box-plot helper called with the combined frame and the concatenated experiment-name lists of all five datasets.
results_plot.boxplot_pruning_results(results_compress_compare, 
                        exp_names['compress_sc']+exp_names['compress_mswc_de']+exp_names['compress_mswc_fr']+exp_names['compress_mswc_en']+exp_names['compress_mswc_rw'])

In [None]:
sns.set_context('paper', font_scale=2.5)
# Long format: one row per (result, domain), with the per-domain change in MCC
# in 'value'. id_vars are all columns except the last four, plus two of those
# four that must be kept as identifiers.
df = results_compress_compare.melt(id_vars=results_compress_compare.columns.to_list()[:-4]+['delta_model_bias','model_selected_because'], 
                            value_vars=['delta_female_mcc','delta_male_mcc'], var_name='domain')

df = df.replace({'domain': {'delta_male_mcc': 'male', 'delta_female_mcc': 'female'}, 
                           'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['exp_name', 'model_arch','domain'])
# df.rename(columns={'domain_bias':'domain_bias'}, inplace=True)
# ndarray.sort() sorts in place and returns None, which would leave the hue
# order unset; sorted() returns the ordered levels so that every facet and
# both plot layers use the same hue assignment.
hue_order=sorted(df["domain"].unique())

# .sort_values(by=['exp_name', 'model_arch'])
g = sns.FacetGrid(data=df, height=8, aspect=1.8, row="model_arch", col="resample_rate", #col_order=[16000,8000],
                  sharey=True, sharex=False)
# Translucent boxes first, then the individual points on top of them.
g.map(sns.boxplot, "dataset_name", "value", "domain", palette='tab10', showfliers=False,
      hue_order=hue_order, 
      order=df.dataset_name.unique(), 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

g.map(sns.stripplot, "dataset_name", "value", "domain", palette='tab10', dodge=True,
      hue_order=hue_order, 
      order=df.dataset_name.unique())

g.fig.tight_layout()
g.add_legend()
for ax in g.fig.axes:
    ax.xaxis.grid(True, which='major', linestyle='-')
    # Ticks at x.5 put the vertical grid lines between neighbouring categories.
    ax.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5])
g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
display(g)

In [None]:
sns.set_context('paper', font_scale=2)

# One KDE panel per experiment of the per-domain MCC values ('male_mcc' and
# 'female_mcc' melted into 'value'), coloured by model_selected_because.
# NOTE(review): `results_compress` is not assigned in the cells shown in this part of the
# notebook — confirm it exists at notebook scope (results_compress_compare may be intended).
# NOTE(review): the legend is titled 'Subgroup' although the hue is model_selected_because — confirm.
g = sns.FacetGrid(pd.melt(results_compress[results_compress['exp_name'].isin(exp_names['compress_sc'])
                ].sort_values(by=['model_arch','exp_name']), id_vars=['exp_id', 'exp_name', 'model_arch', 'dataset_name', 'resample_rate',
       'domains', 'trained_model_path', 'quantize', 'prune', 'run_name',
       'compression_time', 'pruning_learning_rate', 'pruning_schedule',
       'pruning_frequency', 'pruning_final_sparsity', 'model_selected_because',
       'all_mcc', ], value_vars=['male_mcc', 'female_mcc']),
                  col="exp_name", col_wrap=4, height=5, aspect=1.2, hue="model_selected_because",) #palette=['fuchsia','green'])
g.map(sns.kdeplot, "value",)
g.add_legend(title='Subgroup', frameon=True)
legend = g._legend
# legend.texts[0].set_text('male')
# legend.texts[1].set_text('female')
# g.map(sns.kdeplot, "all_mcc", color='black', ls='--')
# g.add_legend(title='', handles= [g._legend_data['male_mcc']], labels=['all'], bbox_to_anchor=(0.925, 0.41))

## Tables and Figures for SenSys AI Challenge IoT workshop paper

In [None]:
# MCC of the 16 kHz 'sc_train' experiments, split by the 'input_features' hyper-parameter.
# NOTE(review): the title mentions '# Mel bins' and 'MFCC Features' while hparam is 'input_features' — confirm the title.
f, ax = results_plot.plot_hparam_performance(results[(results['resample_rate']==16000)], hparam='input_features', metric='mcc', exp_names=exp_names,
                                selection='sc_train', domain='all', x_axis='exp', save=False)
ax.set_title('Effect of # Mel bins on Model Performance for MFCC Features', fontsize='large', pad=30)
ax.set_xlabel('Model performance (MCC)', fontsize='large', labelpad=10)
ax.set_yticklabels(['16k CNN','16k llCNN'])
ax.tick_params(axis='both', which='major', labelsize=20)
ax.set_xlim([0.65,0.89])
ax.set_ylabel('Experiments', fontsize='large', labelpad=10)
# Legend anchored at (0.9, 0.5) in axes coordinates.
ax.legend(title='feature type', loc='upper left', bbox_to_anchor=(0.9,0.5))
# ax.hlines(y=0.5, xmin=0.52, xmax=0.88, color="black", ls='--', linewidth=0.8)
sns.despine(bottom=True, left=True)
f.set_size_inches(11,5)

In [None]:
# MCC of the 16 kHz 'sc_train' experiments with mfccs != -1, split by the 'mel_bins' hyper-parameter.
# NOTE(review): the legend is titled 'feature type' while hparam is 'mel_bins' — confirm the legend title.
f, ax = results_plot.plot_hparam_performance(results[(results['resample_rate']==16000)&(results['mfccs']!=-1)], hparam='mel_bins', metric='mcc', 
                                 exp_names=exp_names, selection='sc_train', domain='all', x_axis='exp', save=False)
# f, ax = plot_hparam_performance(results, hparam='input_features', metric='mcc', selection='sc_resample', domain='all', x_axis='exp', save=False)
# ax.set_title('Effect of Feature Type on Model Performance when Varying Sample Rate', fontsize='large')
ax.set_title('Effect of # Mel bins on Model Performance for MFCC Features', fontsize='large')
ax.set_xlabel('Model performance (MCC)', fontsize='large')
ax.set_yticklabels(['16k CNN','16k llCNN'])#,'8k CNN','8k llCNN'])
# ax.set_yticklabels(['8k input|16k CNN','8k input|16k llCNN','16k input|8k CNN','16k input|8k llCNN'])
ax.tick_params(axis='both', which='major', labelsize=14)
# ax.set_xlim([0.65,0.89])
ax.set_ylabel('Experiments', fontsize='large')
ax.legend(title='feature type', loc='lower right')
# Dashed horizontal line at y=0.5, i.e. between the two experiment rows.
ax.hlines(y=0.5, xmin=0.52, xmax=0.88, color="black", ls='--', linewidth=0.8)
sns.despine(bottom=True, left=True)
f.set_size_inches(11,5)

In [None]:
# Model bias of the 16 kHz 'sc_train' experiments with mfccs == -1, split by the 'mel_bins' hyper-parameter.
# NOTE(review): the title says 'Model Performance' while the plotted metric is bias, and the legend is
# titled 'feature type' while hparam is 'mel_bins' — confirm both strings.
f, ax = results_plot.plot_hparam_performance(results_bias[(results_bias['resample_rate']==16000)&(results_bias['mfccs']==-1)], 
                                                 hparam='mel_bins', metric='bias', selection='sc_train', domain='model', exp_names=exp_names,
                                                 x_axis='exp', save=False)
# f, ax = plot_hparam_performance(results, hparam='input_features', metric='mcc', selection='sc_resample', domain='all', x_axis='exp', save=False)
# ax.set_title('Effect of Feature Type on Model Performance when Varying Sample Rate', fontsize='large')
ax.set_title('Effect of # Mel bins on Model Performance for Log Mel Spectrogram Features', fontsize='large')
ax.set_xlabel('Model bias', fontsize='large')
ax.set_yticklabels(['16k CNN','16k llCNN'])#,'8k CNN','8k llCNN'])
# ax.set_yticklabels(['8k input|16k CNN','8k input|16k llCNN','16k input|8k CNN','16k input|8k llCNN'])
ax.tick_params(axis='both', which='major', labelsize=14)
# ax.set_xlim([0.65,0.89])
ax.set_ylabel('Experiments', fontsize='large')
ax.legend(title='feature type', loc='lower right')
sns.despine(bottom=True, left=True)
f.set_size_inches(11,5)

In [None]:
# Scatter of model fairness against MCC for the 16k CNN, evaluated on 16k input ('sc16_cnn')
# and on 8k input ('sc16_cnn-sc8'), coloured by input feature type.
# NOTE(review): `results_fairness` and `row_options` are not assigned in the cells shown in this
# part of the notebook — confirm they exist at notebook scope before running this cell.
g = sns.relplot(data=results_fairness[results_fairness['exp_name'].isin(['sc16_cnn','sc16_cnn-sc8'])].sort_values('frame_length'), x="all_mcc", y="model_fairness", hue='input_features',
                              col='exp_name', row=row_options, palette="tab10", height=5, aspect=1.1)

# NOTE(review): ax_titles is not used below; the titles are derived from the facet titles instead.
ax_titles = ['16k CNN', '8k input on 16k CNN']
for ax in g.axes.flat:
    # The facet title ends in the experiment name, e.g. 'sc16_cnn-sc8':
    # architecture 'CNN', training rate '16', input rate '8'.
    ax_title = ax.get_title().split(' = ')[-1]
    arch = ax_title.split('-')[0].split('_')[-1].upper()    
    train_sr = ax_title.split('-')[0].split('_')[0].strip('sc')
    input_sr = ax_title.split('-')[-1].strip('sc')
    # Without a '-<input>' suffix the last split part is the whole name, which is longer
    # than two characters, so the plain '<rate>k <arch> model' title is used.
    if len(input_sr) <= 2:
        ax.set_title('{}k input on {}k {} model'.format(input_sr, train_sr, arch), fontsize='large')
    else:
        ax.set_title('{}k {} model'.format(train_sr, arch), fontsize='large')
    ax.tick_params(axis='both', which='major', labelsize=14)
#     ax.set_xlim([0.55,0.89])
#     ax.set_ylim([0.55,0.89])
    ax.set_ylabel('Model fairness', fontsize='large')
    ax.set_xlabel('Model performance (MCC)', fontsize='large')
    # The legend and despine calls do not depend on `ax`; they are simply repeated for every facet.
    g._legend.set_bbox_to_anchor((0.85, 0.826))
    g._legend.set_title('feature type')
    sns.despine(bottom=True, left=True)
    

Coswara dataset submission ID: omPMrLojwZgpYrhycAOXQ3QSfV63
Date submitted: 27 October 2021