# Results Analysis for TOSEM 2022 paper

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy

import seaborn as sns
sns.set_context("paper")
sns.set_theme(style="whitegrid")
import matplotlib.pyplot as plt
plt.show()
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display
    
from fair_embedded_ml.metrics import domain_bias, model_bias
from fair_embedded_ml import results_analysis 
from fair_embedded_ml.results_analysis import exp_names
from fair_embedded_ml import results_plot

from scipy.stats import f, shapiro
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices

# Root directory that is handed to the results_analysis loaders below.
# NOTE(review): the path starts with '~'; confirm that the loaders expand it
# (e.g. via os.path.expanduser) before touching the file system.
x_dir = '~/Projects/fair_embedded_ml_results/' #'/data/experiments/'
    
class fixed_copy(fixed):
    """Variant of ``fixed`` whose interact value is a copy of the wrapped value.

    ``get_interact_value`` returns the result of ``self.value.copy()`` instead
    of the stored object itself, so the wrapped value must provide ``copy()``.
    """

    def get_interact_value(self):
        duplicate = self.value.copy()
        return duplicate

In [None]:
# Load everything the rest of the notebook works on. Each step feeds the next:
# raw results -> per-domain results -> bias results.
# NOTE(review): the exact schema of these frames is defined in
# fair_embedded_ml.results_analysis and is not visible here.
results = results_analysis.get_results(x_dir)
compression_results = results_analysis.get_compression_results(x_dir)
results_mcc = results_analysis.get_results_for_domains(results)
results_bias = results_analysis.get_bias_results(results_mcc)

In [None]:
# Experiment names are the cross product of dataset prefixes and model
# architectures, prefix-major (all architectures of the first prefix first).
# NOTE(review): prefix and architecture are joined without a separator
# (e.g. 'sc8_cnn'); confirm this matches the exp_name values in the results.
experiment_prefixes = [
    'sc',
    'mswc35_de',
    'mswc35_en',
    'mswc35_fr',
    'mswc35_rw',
]
model_arches = [
    '8_cnn',
    '16_cnn',
    '8_llcnn',
    '16_llcnn',
]
experiment_names = [
    ''.join((prefix, arch))
    for prefix in experiment_prefixes
    for arch in model_arches
]

In [None]:
# Keep only the rows whose exp_name is one of the experiment names built above.
results_bias = results_bias[results_bias.exp_name.isin(experiment_names)]

In [None]:
# Build the plotting frame: one selection per experiment, stacked row-wise.
# The selections are collected first and concatenated once, which avoids
# re-copying the growing frame on every iteration and avoids concatenating
# with an empty DataFrame.
selected_frames = [
    results_analysis.select_fairest_models_in_mcc_range(results_bias,
                                                        x, best=3456,
                                                        min_percentage_of_mcc=0)
    for x in experiment_names
]
fig_data = pd.concat(selected_frames) if selected_frames else pd.DataFrame()

# exp_cat is a display label built from model_arch and resample_rate:
# underscores become spaces, the last five characters are dropped and 'k' is appended.
# NOTE(review): dropping exactly five characters presumably strips '000.0'
# from a float-formatted rate such as '16000.0' - confirm the dtype of resample_rate.
fig_data['exp_cat'] = fig_data[['model_arch','resample_rate']
                              ].apply(lambda row: ' '.join(row.astype(str)).replace('_',' ')[:-5]+'k', axis=1)
# Move the freshly added exp_cat column (currently last) to the front.
cols = fig_data.columns.to_list()[-1:] + fig_data.columns.to_list()[:-1]
fig_data = fig_data[cols]

# Human-readable column and dataset names used by the plots below.
fig_data.rename(columns={'model_bias':'reliability bias', 'all_mcc':'MCC (accuracy)',
                         'male_fairness':'male_bias','female_fairness':'female_bias'}, inplace=True)
fig_data.replace({'dataset_name': {'speech_commands_gender':'google_sc'}}, inplace=True)

# Statistical Analysis

Note: we have a balanced study design - by design (we set our experiments up to iterate through parameters)  
Read more about factorial anova: https://learningstatisticswithr.com/book/anova2.html   
Read more about how to do a factorial anova analysis: https://www.pythonfordatascience.org/factorial-anova-python/#test_with_python  
Check if assumptions hold for model before doing anova: Levene test (homogeneity of variance), normality of residuals, independence of observations
1. check overall effect (H0: the overall model is not significant)
2. check interaction effect (only if overall effect is significant)
3. check main effect (only if interaction effect is insignificant)

## Effect of pre-processing parameters on MCC (accuracy)

In [None]:
# Mean, standard deviation and variance of all_mcc per (dataset, architecture,
# resample rate) group, then summary statistics of those three columns across groups.
results_bias.groupby(['dataset_name','model_arch','resample_rate'])['all_mcc'].agg(['mean','std','var']).describe()

### Factorial Anova Tests

In [None]:
def model_stats(model):
    """Print the headline fit statistics of a fitted regression model.

    Reads ``rsquared``, ``rsquared_adj``, ``fvalue``, ``f_pvalue`` and
    ``df_model`` from ``model`` and prints them one per line. Returns None.
    """
    template = 'R_squared: {}\nR_squared_adj: {}\nF_val: {}\np_val: {}\ndof: {}'
    fit_summary = (
        model.rsquared,
        model.rsquared_adj,
        model.fvalue,
        model.f_pvalue,
        model.df_model,
    )
    print(template.format(*fit_summary))

# Model formulas for the factorial ANOVA on all_mcc. Every factor is wrapped in
# C(..., Sum), i.e. treated as categorical with sum (deviation) coding.
# `formula` holds main effects only; the f_modx* variants add interaction terms
# and document, in the comment under each formula, which interactions turned
# out significant. Only `formula`, `f_modx8` and `f_modx8a` are fitted below;
# the others record the intermediate steps of the model selection.

formula = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'

f_modx1 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(frame_length, Sum)*C(frame_step, Sum)*C(window_fn, Sum)'

#### These models result in more complex interaction effects ~~~~~~~~~~~~~~~~~~~~~~~

f_modx2a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(resample_rate, Sum)*C(frame_length, Sum)*C(frame_step, Sum) + C(model_arch, Sum)*C(window_fn, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch * mfccs * mel_bins interaction significant at 1% level
# dataset_name * model_arch * resample_rate * mfccs interaction significant at 5% level

f_modx2b = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(frame_length, Sum)*C(frame_step, Sum) + C(dataset_name, Sum)*C(window_fn, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# no significant interactions

f_modx3a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)   \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum) + C(window_fn, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch * mfccs * mel_bins interaction significant at 5% level only
# dataset_name * resample_rate * mfccs interaction significant at 1% level
# model_arch * frame_length * frame_step interaction significant at 1% level

f_modx4a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(dataset_name, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mel_bins, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch * mfccs interaction significant at 1% level
# model_arch * mfccs * mel_bins interaction significant at 1% level
# dataset_name * model_arch * mel_bins interaction significant at 5% level only
# dataset_name * mfccs * mel_bins interaction significant at 5% level only

f_modx5a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) + C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch * resample_rate interaction significant at 5%

f_modx6a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(model_arch, Sum)*C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) + C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# no significant interactions 

f_modx7a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) + C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# window_fn no statistically significant effect

f_modx8a = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) + C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# final model of interaction effects of pre-processing parameters on MCC at 1% significance level

#### These models result in less complex interaction effects ~~~~~~~~~~~~~~

f_modx2 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(resample_rate, Sum)*C(frame_length, Sum)*C(frame_step, Sum) + C(model_arch, Sum)*C(window_fn, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# no significant interactions

f_modx3 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(model_arch, Sum)*C(frame_length, Sum)*C(frame_step, Sum) + C(window_fn, Sum)*C(frame_length, Sum)*C(frame_step, Sum)'
# model_arch * mfccs * mel_bins interaction significant at 1% level

f_modx4 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum) + C(resample_rate, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)\
+ C(frame_length, Sum)*C(frame_step, Sum) + C(model_arch, Sum)*C(frame_step, Sum) + C(model_arch, Sum)*C(frame_length, Sum)\
+ C(window_fn, Sum)*C(frame_step, Sum) + C(window_fn, Sum)*C(frame_length, Sum)'
# frame_length * frame_step interaction significant at 5% level
# resample_rate * mfccs interaction significant at 1% level

f_modx5 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'
#+ C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch * resample_rate interaction significant at 5%

f_modx6 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(model_arch, Sum)*C(resample_rate, Sum) + C(dataset_name, Sum)*C(resample_rate, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'
#+ C(frame_length, Sum)*C(frame_step, Sum)'
# dataset_name * model_arch interaction significant at 1%
# dataset_name * resample_rate interaction significant at 1%

f_modx6_5percent = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)\
+ C(frame_length, Sum)*C(frame_step, Sum)'
# window_fn no statistically significant effect

f_modx7 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(resample_rate, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'
#+ C(frame_length, Sum)*C(frame_step, Sum)'
# window_fn & frame_length no statistically significant effect

f_modx7_5percent = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum)\
+ C(frame_length, Sum)*C(frame_step, Sum)'
# final model of interaction effects of pre-processing parameters on MCC at 5% significance level

f_modx8 = 'all_mcc ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(resample_rate, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(model_arch, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) \
+ C(resample_rate, Sum)*C(mfccs, Sum)\
+ C(frame_step, Sum)'
# final model of interaction effects of pre-processing parameters on MCC at 1% significance level

# Fit ordinary least squares models on results_bias. Only the main-effects
# model (lm) and the two final interaction models (lm_X8, lm_X8a) are fitted;
# the intermediate models of the selection process are kept commented out.
lm = smf.ols(formula, data = results_bias).fit() #[results_bias.dataset_name=='mswc_de']
# lm_X1 = smf.ols(f_modx1, data = results_bias).fit()
# lm_X2 = smf.ols(f_modx2, data = results_bias).fit()
# lm_X2a = smf.ols(f_modx2a, data = results_bias).fit()
# lm_X3 = smf.ols(f_modx3, data = results_bias).fit()
# lm_X3a = smf.ols(f_modx3a, data = results_bias).fit()
# lm_X4 = smf.ols(f_modx4, data = results_bias).fit()
# lm_X4a = smf.ols(f_modx4a, data = results_bias).fit()
# lm_X5 = smf.ols(f_modx5, data = results_bias).fit()
# lm_X5a = smf.ols(f_modx5a, data = results_bias).fit()
# lm_X6 = smf.ols(f_modx6, data = results_bias).fit()
# lm_X6a = smf.ols(f_modx6a, data = results_bias).fit()
# lm_X6_5percent = smf.ols(f_modx6_5percent, data = results_bias).fit()
# lm_X7 = smf.ols(f_modx7, data = results_bias).fit()
# lm_X7a = smf.ols(f_modx7a, data = results_bias).fit()
# lm_X7_5percent = smf.ols(f_modx7_5percent, data = results_bias).fit()
lm_X8 = smf.ols(f_modx8, data = results_bias).fit()
lm_X8a = smf.ols(f_modx8a, data = results_bias).fit()
# print(lm_X.summary())

In [None]:
# Table of critical F values keyed by degrees of freedom. It is built here
# because this cell needs it, while the notebook otherwise only creates
# `sig_f_df` much further down; without this a top-to-bottom run fails with a
# NameError.
# NOTE(review): the critical values are computed with dfn=1 and dfd=<term df>;
# confirm that this numerator/denominator assignment is the intended one.
sig_f_df = pd.DataFrame(
    [[dof, f.ppf(q=1-0.01, dfn=1, dfd=dof), f.ppf(q=1-0.05, dfn=1, dfd=dof)]
     for dof in [1, 2, 4, 5, 8, 10, 20, 40, 51, 55]],
    columns=['df', 'F_0.01', 'F_0.05'])

# Type-3 ANOVA table of the final MCC model.
aov_table = sm.stats.anova_lm(lm_X8a, typ=3)
# Attach the critical F values by degrees of freedom (inner merge: rows whose
# df is not in sig_f_df are dropped).
aov_table_sig = aov_table.reset_index().merge(sig_f_df, on='df')
# Number of factors in a term = number of ':'-separated parts of its name.
aov_table_sig['factors'] = aov_table_sig['index'].apply(lambda x : len(x.split(':')))
aov_table_sig.set_index('index', inplace=True)
# Keep terms with p <= 0.05 whose F statistic also reaches the 5% critical value.
aov_table_sig = aov_table_sig[(aov_table_sig['PR(>F)']<=0.05) & (aov_table_sig['F']>=aov_table_sig['F_0.05'])].sort_values(by='factors')
# Show the significant terms together with the residual row of the full table.
pd.concat([aov_table_sig, aov_table.loc[['Residual']]])

In [None]:
# Print R-squared, adjusted R-squared, F statistic, its p-value and the model
# degrees of freedom for the final MCC model.
model_stats(lm_X8a)

In [None]:
# NOTE(review): this repeats the call on lm_X8a made in the preceding cell;
# possibly lm_X8 (the other fitted final model) was intended - confirm.
model_stats(lm_X8a)

In [None]:
# 95th percentile of the F distribution with (1, 1) degrees of freedom.
f.ppf(q=1-0.05, dfn=1, dfd=1)

In [None]:
# Compare the main-effects model (lm) against the interaction model lm_X8.
sm.stats.anova_lm(lm, lm_X8)

In [None]:
# An ANOVA requires 3 assumptions:
# https://www.spss-tutorials.com/spss-anova-levenes-test-significant/

# 1. independent observations (groups are mutually exclusive, no longitudinal measures) 
# 2. normality: the dependent variable must follow a normal distribution within each subpopulation (check normality of residuals)
# 3. homogeneity: the variance of the dependent variable must be equal over all subpopulations.
# Normality is only needed for small sample sizes of, say, N < 25 per subgroup.
# Homogeneity is only needed if sample sizes are sharply unequal. If so, run Levene's test. 

# 1. independence [x]
# 2. normality [x]
# 3. homogeneity

# Histogram of the residuals of the final MCC model.
sns.histplot(lm_X8a.resid)

# Separate figure for the probability plot of the same residuals.
fig = plt.figure(figsize= (5, 5))
ax = fig.add_subplot(111)

# probplot is given the pyplot module as plotting target, so it draws on the
# current axes, i.e. the subplot created just above.
normality_plot, stat = scipy.stats.probplot(lm_X8a.resid, plot= plt, rvalue= True)
ax.set_title("Probability plot of model residual's", fontsize= 20)

plt.show()

### Plots

In [None]:
# Long-format frame for plotting: the two metric columns are stacked into a
# 'metric' / 'value' pair, all columns except the last two are kept as ids.
# NOTE(review): id_vars is chosen by position ([:-2]); confirm the last two
# columns of fig_data are the ones meant to be left out.
sns.set_context('paper', font_scale=1.6)
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2], 
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
# Integer-typed levels for the two feature dimensions used on axes / as hue.
df['mel_bins'] = df['mel_bins'].astype(int)
df['mfccs'] = df['mfccs'].astype(int)

#### Interaction of Dataset, Sample rate and MFCC dimensions --> Accuracy

In [None]:
# Hue levels and x order are fixed once so that every facet uses the same
# colour/position mapping. The previous `df["exp_cat"].unique().sort()` always
# evaluated to None (ndarray.sort() sorts in place) and referred to a column
# that is not the hue variable.
mfcc_levels = sorted(df["mfccs"].unique().tolist())
dataset_order = df.dataset_name.unique()

# One column per resample rate, one row per metric; boxes per dataset, coloured by mfccs.
g = sns.FacetGrid(df, height=4, aspect=3, col="resample_rate", row='metric', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "mfccs", palette="tab10", showfliers=False,
      hue_order=mfcc_levels,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual observations on top of the boxes.
g.map(sns.stripplot, "dataset_name", "value", "mfccs", palette="tab10", dodge=True,
      hue_order=mfcc_levels,
      order=dataset_order, size=1.5)

plt.suptitle('Effect of MFCC dimensions on MCC (accuracy) and reliability bias')
plt.xlabel('dataset name', labelpad=10)
# NOTE(review): indices 0 and 2 presumably address the first axes of each row
# (two resample-rate columns) - confirm.
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[2].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend()
display(g)

In [None]:
# Box plots of the MCC values over MFCC dimensions: one column per dataset,
# one row per resample rate.
g = sns.catplot(
    data=df[df.metric=='MCC (accuracy)'], x="mfccs", y="value", col="dataset_name", row="resample_rate",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=3), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 5 presumably address the first axes of each row
# (five dataset columns) - confirm.
g.fig.axes[0].set_ylabel('8kHz')
g.fig.axes[5].set_ylabel('16kHz')
g.fig.tight_layout()

#### Interaction of architecture, Mel bins and MFCC dimensions --> Accuracy

In [None]:
# Rebuild the long-format plotting frame (metric / value) from fig_data.
sns.set_context('paper', font_scale=1.6)
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2], 
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
df['mel_bins'] = df['mel_bins'].astype(int)
df['mfccs'] = df['mfccs'].astype(int)

# Hue levels and x order are fixed once so that every facet uses the same
# colour/position mapping. The previous `df["exp_cat"].unique().sort()` always
# evaluated to None (ndarray.sort() sorts in place) and referred to a column
# that is not the hue variable.
mel_bin_levels = sorted(df["mel_bins"].unique().tolist())
dataset_order = df.dataset_name.unique()

# MCC only: one column per mfccs value, one row per architecture; boxes per
# dataset, coloured by mel_bins.
g = sns.FacetGrid(df[df.metric=='MCC (accuracy)'], height=4, aspect=1, col='mfccs', row='model_arch', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "mel_bins", palette="tab10", showfliers=False,
      hue_order=mel_bin_levels,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual observations on top of the boxes.
g.map(sns.stripplot, "dataset_name", "value", "mel_bins", palette="tab10", dodge=True,
      hue_order=mel_bin_levels,
      order=dataset_order, size=1.5)

for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
    # re-apply the tick labels rotated so the dataset names do not overlap
    xlabels = [x.get_text() for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
# plt.xlabel('dataset name', labelpad=10)
# NOTE(review): indices 0 and 6 presumably address the first axes of each row
# (six mfccs columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[6].set_ylabel('low latency CNN')
g.fig.tight_layout()
g.add_legend()
display(g)

In [None]:
# Box plots of the MCC values over Mel bins: one column per mfccs value,
# one row per architecture.
g = sns.catplot(
    data=df[df.metric=='MCC (accuracy)'], x="mel_bins", y="value", col="mfccs", row="model_arch",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 6 presumably address the first axes of each row
# (six mfccs columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[6].set_ylabel('low latency CNN')
g.fig.tight_layout()

#### Interaction of dataset, architecture and MFCC dimensions --> Accuracy

In [None]:
# Box plots of the MCC values over MFCC dimensions: one column per dataset,
# one row per architecture.
g = sns.catplot(
    data=df[df.metric=='MCC (accuracy)'], x="mfccs", y="value", col="dataset_name", row="model_arch",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 5 presumably address the first axes of each row
# (five dataset columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[5].set_ylabel('low latency CNN')
g.fig.tight_layout()

#### Interaction of architecture, frame length and frame step --> Accuracy

In [None]:
# Box plots of the MCC values over frame length: one column per frame step,
# one row per architecture.
g = sns.catplot(
    data=df[df.metric=='MCC (accuracy)'], x="frame_length", y="value", col="frame_step", row="model_arch",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 3 presumably address the first axes of each row
# (three frame_step columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[3].set_ylabel('low latency CNN')
g.fig.tight_layout()

In [None]:
# Box plots of the MCC values over frame step: one column per frame length,
# one row per architecture.
g = sns.catplot(
    data=df[df.metric=='MCC (accuracy)'], x="frame_step", y="value", col="frame_length", row="model_arch",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=3), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 4 presumably address the first axes of each row
# (four frame_length columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[4].set_ylabel('low latency CNN')
g.fig.tight_layout()

## Effect of Pre-processing Parameters on Reliability Bias

In [None]:
# Mean, standard deviation and variance of model_bias per (dataset, architecture,
# resample rate) group, then summary statistics of those three columns across groups.
results_bias.groupby(['dataset_name','model_arch','resample_rate'])['model_bias'].agg(['mean','std', 'var']).describe()

### Factorial Anova Tests

In [None]:
def model_stats(model):
    """Print the headline fit statistics of a fitted regression model.

    Reads ``rsquared``, ``rsquared_adj``, ``fvalue``, ``f_pvalue`` and
    ``df_model`` from ``model`` and prints them one per line. Returns None.
    """
    template = 'R_squared: {}\nR_squared_adj: {}\nF_val: {}\np_val: {}\ndof: {}'
    fit_summary = (
        model.rsquared,
        model.rsquared_adj,
        model.fvalue,
        model.f_pvalue,
        model.df_model,
    )
    print(template.format(*fit_summary))

# Model formulas for the factorial ANOVA on model_bias. Every factor is wrapped
# in C(..., Sum), i.e. treated as categorical with sum (deviation) coding.
# `f_b` holds main effects only; the f_bmodx* variants add interaction terms
# and document, in the comment under each formula, what turned out significant.
# Only `f_b` and `f_bmodx4` are fitted below.

f_b = 'model_bias ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'

f_bmodx1 = 'model_bias ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) +\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(mfccs, Sum)*C(mel_bins, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(frame_length, Sum)*C(frame_step, Sum)*C(window_fn, Sum)'
# No significant interactions

f_bmodx2 = 'model_bias ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(mfccs, Sum)*C(mel_bins, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(resample_rate, Sum)*C(mfccs, Sum) + C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum)\
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'
# C(dataset_name, Sum):C(model_arch, Sum):C(mfccs, Sum) --> significant at 1%
# C(dataset_name, Sum):C(mfccs, Sum):C(mel_bins, Sum) --> significant at 5%
# C(dataset_name, Sum):C(model_arch, Sum):C(mel_bins, Sum) --> significant at 5%  
# C(dataset_name, Sum):C(resample_rate, Sum):C(mfccs, Sum) --> significant at 5%

f_bmodx3 = 'model_bias ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(resample_rate, Sum) \
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(dataset_name, Sum)*C(mel_bins, Sum) \
+ C(frame_length, Sum) + C(frame_step, Sum) + C(window_fn, Sum)'
# C(dataset_name, Sum):C(resample_rate, Sum) --> significant at 1%
# C(dataset_name, Sum):C(mel_bins, Sum) --> significant at 1%

f_bmodx4 = 'model_bias ~ C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum)\
+ C(dataset_name, Sum)*C(resample_rate, Sum) \
+ C(mfccs, Sum) + C(mel_bins, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(mfccs, Sum) \
+ C(dataset_name, Sum)*C(mel_bins, Sum) \
+ C(frame_length, Sum)'
# frame_length significant at 5% level only
# window_fn no statistically significant effect
# frame_step no statistically significant effect
# Fit ordinary least squares models on results_bias. Only the main-effects
# model (lmb) and the final interaction model (lmb_X4) are fitted; the
# intermediate models of the selection process are kept commented out.
lmb = smf.ols(f_b, data = results_bias).fit() #[results_bias.dataset_name=='mswc_de']
# lmb_X1 = smf.ols(f_bmodx1, data = results_bias).fit()
# lmb_X2 = smf.ols(f_bmodx2, data = results_bias).fit()
# lmb_X3 = smf.ols(f_bmodx3, data = results_bias).fit()
lmb_X4 = smf.ols(f_bmodx4, data = results_bias).fit()
# print(lmb_X4.summary())
# print('lmb_X1'), model_stats(lmb_X1)
# print('lmb_X2'), model_stats(lmb_X2)
# print('lmb_X3'),model_stats(lmb_X3)
# Two separate statements: joined by a comma they form a (None, None) tuple,
# which the notebook would echo as the cell result.
print('lmb_X4')
model_stats(lmb_X4)

In [None]:
# Table of critical F values keyed by degrees of freedom. It is built here
# because this cell needs it, while the notebook otherwise only creates
# `sig_f_df` further down; without this a top-to-bottom run fails with a
# NameError.
# NOTE(review): the critical values are computed with dfn=1 and dfd=<term df>;
# confirm that this numerator/denominator assignment is the intended one.
sig_f_df = pd.DataFrame(
    [[dof, f.ppf(q=1-0.01, dfn=1, dfd=dof), f.ppf(q=1-0.05, dfn=1, dfd=dof)]
     for dof in [1, 2, 4, 5, 8, 10, 20, 40, 51, 55]],
    columns=['df', 'F_0.01', 'F_0.05'])

# Type-3 ANOVA table of the final reliability-bias model.
aov_table = sm.stats.anova_lm(lmb_X4, typ=3)
# Attach the critical F values by degrees of freedom (inner merge: rows whose
# df is not in sig_f_df are dropped).
aov_table_sig = aov_table.reset_index().merge(sig_f_df, on='df')
# Number of factors in a term = number of ':'-separated parts of its name.
aov_table_sig['factors'] = aov_table_sig['index'].apply(lambda x : len(x.split(':')))
aov_table_sig.set_index('index', inplace=True)
# Keep terms with p <= 0.05 whose F statistic also reaches the 5% critical value.
aov_table_sig = aov_table_sig[(aov_table_sig['PR(>F)']<=0.05) & (aov_table_sig['F']>=aov_table_sig['F_0.05'])].sort_values(by='factors')
# Show the significant terms together with the residual row of the full table.
pd.concat([aov_table_sig, aov_table.loc[['Residual']]])

In [None]:
# 99th percentile of the F distribution with (1, 3) degrees of freedom.
f.ppf(q=1-0.01, dfn=1, dfd=3)

In [None]:
# Compare the main-effects model (lmb) against the interaction model lmb_X4.
sm.stats.anova_lm(lmb, lmb_X4)

In [None]:
# Assumptions check
# 1. independence [x]
# 2. normality [x]
# 3. homogeneity

# Histogram of the residuals of the final reliability-bias model.
sns.histplot(lmb_X4.resid)

# Separate figure for the probability plot of the same residuals.
fig = plt.figure(figsize= (5, 5))
ax = fig.add_subplot(111)

# probplot is given the pyplot module as plotting target, so it draws on the
# current axes, i.e. the subplot created just above.
normality_plot, stat = scipy.stats.probplot(lmb_X4.resid, plot= plt, rvalue= True)
ax.set_title("Probability plot of model residual's", fontsize= 20)

plt.show()

### Plots

In [None]:
# Long-format frame for plotting: the two metric columns are stacked into a
# 'metric' / 'value' pair, all columns except the last two are kept as ids.
# NOTE(review): id_vars is chosen by position ([:-2]); confirm the last two
# columns of fig_data are the ones meant to be left out.
sns.set_context('paper', font_scale=1.6)
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2], 
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
# Integer-typed levels for the two feature dimensions used on the plot axes.
df['mel_bins'] = df['mel_bins'].astype(int)
df['mfccs'] = df['mfccs'].astype(int)

#### Interaction of Dataset, architecture and MFCC dimensions --> Reliability Bias

In [None]:
# Box plots of the reliability bias over MFCC dimensions: one column per
# dataset, one row per architecture.
g = sns.catplot(
    data=df[df.metric=='reliability bias'], x="mfccs", y="value", col="dataset_name", row="model_arch",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# NOTE(review): indices 0 and 5 presumably address the first axes of each row
# (five dataset columns) - confirm.
g.fig.axes[0].set_ylabel('CNN')
g.fig.axes[5].set_ylabel('low latency CNN')
g.fig.tight_layout()

#### Interaction of dataset and Mel bins --> Reliability Bias

In [None]:
# Box plots of the reliability bias over Mel bins, one column per dataset
# (single row; the resample-rate row facet is disabled).
g = sns.catplot(
    data=df[df.metric=='reliability bias'], x="mel_bins", y="value", col="dataset_name", #row="resample_rate",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1, palette='tab10')
for ax in g.fig.axes:
    # keep only the part of the facet title after the last '|'
    ax.set_title(ax.title.get_text().split('|')[-1])
# g.fig.axes[0].set_ylabel('8kHz')
# g.fig.axes[5].set_ylabel('16kHz')
g.fig.tight_layout()

#### Interaction of dataset and sample rate --> Reliability Bias

In [None]:
#Similar to Figure 4

# Box plots of the reliability bias per dataset, one column per resample rate.
g = sns.catplot(
    data=df[df.metric=='reliability bias'], x="dataset_name", y="value", col="resample_rate", #row="resample_rate",
    kind="box", showfliers=True, flierprops=dict(marker='.', markersize=1.5), height=4, aspect=1.5, palette='tab10')
for ax in g.fig.axes:
    # Title becomes: text after the last '=', minus its last five characters, plus 'kHz'.
    # NOTE(review): presumably turns e.g. ' 16000.0' into ' 16kHz' - confirm the
    # resample rate is rendered as a float in the facet title.
    ax.set_title(ax.title.get_text().split('=')[-1][:-5]+'kHz')
g.fig.tight_layout()

In [None]:
# Per-domain MCC per dataset, one facet per resample rate, boxes with the
# individual observations overlaid.
# NOTE(review): `hue_order` is not defined anywhere earlier in this notebook,
# so this cell raises NameError on a fresh top-to-bottom run - define it
# (presumably the two 'domain' levels, matching the two palette colours) or
# confirm where it is expected to come from.
# NOTE(review): `df` as built above is the melted fig_data; nothing in the
# visible code creates the 'domain' or 'domain_mcc' columns - verify that the
# intended frame is used.
g = sns.FacetGrid(data=df, height=8, aspect=1.8, col="resample_rate", #col_order=[16000,8000],
                  sharey=True, sharex=False)
g.map(sns.boxplot, "dataset_name", "domain_mcc", "domain", palette=['green','fuchsia'], showfliers=False,
      hue_order=hue_order, 
      order=df.dataset_name.unique(), 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

g.map(sns.stripplot, "dataset_name", "domain_mcc", "domain", palette=['green','fuchsia'], dodge=True,
      hue_order=hue_order, 
      order=df.dataset_name.unique(), size=1.5)

g.fig.tight_layout()
g.add_legend()
for ax in g.fig.axes:
    # vertical grid lines at the half-way positions between the categories
    ax.xaxis.grid(True, which='major', linestyle='-')
    ax.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5])
g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
display(g)

## Effect of Pruning Hyperparameters on Delta Reliability Bias

### Results setup

In [None]:
# Columns that together identify one compression run. Rows that agree on all
# of them are treated as duplicates and only the last one is kept. The key is
# defined once so the five datasets below cannot drift apart.
compression_run_key = ['exp_name','trained_model_path','pruning_schedule','pruning_learning_rate',
                       'pruning_frequency','pruning_final_sparsity','quantize','quantization_optimization']

# For every dataset: three hand-picked lists of training runs (best, fairest,
# accurate-and-fair) are passed to results_analysis.pruning_results together
# with the experiment-name arguments, then duplicate runs are dropped.
# NOTE(review): sc_accurate_fair_runs has 11 entries and rw_accurate_fair_runs
# only 2, while the other lists have 12 - confirm this is intended.

# Google speech commands
sc_best_runs = ['run-1628708435','run-1628757640','run-1628732129','run-1628728028','run-1628733620','run-1628787124','run-1628745492','run-1628778770','run-1628729675','run-1628795284','run-1628762389','run-1628794609']
sc_fairest_runs = ['run-1628769835','run-1628786232','run-1628785897','run-1628742549','run-1628765724','run-1628793147','run-1628763273','run-1628776241','run-1628796466','run-1628758822','run-1628753364','run-1628809058']
sc_accurate_fair_runs = ['run-1628726178','run-1628729666','run-1628733199','run-1628743272','run-1628759090','run-1628715119','run-1628737906','run-1628782987','run-1628799888','run-1628735838','run-1628806790']
sc_results_compress = results_analysis.pruning_results(results, compression_results, 'compress_sc','sc_train',sc_best_runs,sc_fairest_runs,sc_accurate_fair_runs)
sc_results_compress.fillna(value={'dataset_name':'speech_commands_gender'}, inplace=True) # for a mysterious reason some datasets show up as nan
sc_results_compress.drop_duplicates(subset=compression_run_key, keep='last', inplace=True)

# mswc_de
de_best_runs = ['run-1664180817','run-1664229064','run-1664250798','run-1664183108','run-1664261879','run-1664411664','run-1664181900','run-1664206013','run-1664214242','run-1664180873','run-1664184317','run-1664323171']
de_fairest_runs = ['run-1664213870','run-1664280611','run-1664355580','run-1664269786','run-1664314689','run-1664316426','run-1664208514','run-1664302583','run-1664379577','run-1664193259','run-1664246067','run-1664329149',]
de_accurate_fair_runs = ['run-1664186536','run-1664280930','run-1664195328','run-1664262566','run-1664265796','run-1664336740','run-1664318691','run-1664305528','run-1664304376','run-1664182005','run-1664182187','run-1664323528',]
de_results_compress = results_analysis.pruning_results(results, compression_results, 'compress_mswc_de','mswc_de',de_best_runs,de_fairest_runs,de_accurate_fair_runs)
de_results_compress.drop_duplicates(subset=compression_run_key, keep='last', inplace=True)

# mswc_fr
fr_best_runs = ['run-1664062899','run-1664067479','run-1664122910','run-1664108246','run-1664108466','run-1664154522','run-1664021255','run-1664043976','run-1664090960','run-1664017583','run-1664116909','run-1664124001']
fr_fairest_runs = ['run-1664051632','run-1664082316','run-1664134988','run-1664018358','run-1664032036','run-1664063647','run-1664085357','run-1664086441','run-1664096317','run-1664045404','run-1664097051','run-1664103387']
fr_accurate_fair_runs = ['run-1664042826','run-1664049510','run-1664061342','run-1664121834','run-1664109926','run-1664096743','run-1664075625','run-1664028003','run-1664053130','run-1664118505','run-1664077198','run-1664118377']
fr_results_compress = results_analysis.pruning_results(results, compression_results, 'compress_mswc_fr','mswc_fr',fr_best_runs,fr_fairest_runs,fr_accurate_fair_runs)
fr_results_compress.drop_duplicates(subset=compression_run_key, keep='last', inplace=True)

# mswc_en
en_best_runs = ['run-1664275396','run-1664386479','run-1664491179','run-1664201605','run-1664379390','run-1664508832','run-1664214160','run-1664253756','run-1664403188','run-1664378805','run-1664502091','run-1664512368']
en_fairest_runs = ['run-1664188494','run-1664216627','run-1664494629','run-1664591096','run-1664634078','run-1664656518','run-1664313713','run-1664462393','run-1664492924','run-1664387434','run-1664569767','run-1664636409']
en_accurate_fair_runs = ['run-1664215347','run-1664198161','run-1664487562','run-1664508099','run-1664378626','run-1664508446','run-1664186500','run-1664427588','run-1664268595','run-1664217620','run-1664185374','run-1664189422']
en_results_compress = results_analysis.pruning_results(results, compression_results, 'compress_mswc_en','mswc_en',en_best_runs,en_fairest_runs,en_accurate_fair_runs)
en_results_compress.drop_duplicates(subset=compression_run_key, keep='last', inplace=True)

# mswc_rw
rw_best_runs = ['run-1664017217','run-1664034824','run-1664076207','run-1664031270','run-1664052670','run-1664060100','run-1664017072','run-1664034345','run-1664060358','run-1664057681','run-1664115743','run-1664115971']
rw_fairest_runs = ['run-1664018113','run-1664036302','run-1664076284','run-1664040385','run-1664053265','run-1664104614','run-1664033769','run-1664038064','run-1664091561','run-1664042712','run-1664110816','run-1664127256']
rw_accurate_fair_runs = ['run-1664048908','run-1664049806']
rw_results_compress = results_analysis.pruning_results(results, compression_results, 'compress_mswc_rw','mswc_rw',rw_best_runs,rw_fairest_runs,rw_accurate_fair_runs)
rw_results_compress.drop_duplicates(subset=compression_run_key, keep='last', inplace=True)

# Stack the five per-dataset frames into one comparison frame.
results_compress_compare = pd.concat([sc_results_compress, de_results_compress, fr_results_compress,
                                     en_results_compress, rw_results_compress])

# exp_cat is a display label built from model_arch and resample_rate:
# underscores become spaces, the last three characters are dropped and 'kHz' is appended.
# NOTE(review): dropping exactly three characters presumably strips '000' from
# an integer-formatted rate such as '16000' - confirm the dtype of resample_rate here.
results_compress_compare['exp_cat'] = results_compress_compare[['model_arch','resample_rate']
                                                              ].apply(lambda row: ' '.join(row.astype(str)).replace('_',' ')[:-3]+'kHz', axis=1)

### Factorial Anova Tests

In [None]:
# Lookup table of critical F values: one row per degrees-of-freedom value,
# holding the 99% and 95% quantiles of the F distribution.
# NOTE(review): the quantiles are computed with dfn=1 and dfd=dof; confirm that
# this numerator/denominator assignment is the intended one.
critical_dofs = (1, 2, 4, 5, 8, 10, 20, 40, 51, 55)
f_list = []
for dof in critical_dofs:
    crit_1pct = f.ppf(q=1-0.01, dfn=1, dfd=dof)
    crit_5pct = f.ppf(q=1-0.05, dfn=1, dfd=dof)
    f_list.append([dof, crit_1pct, crit_5pct])

sig_f_df = pd.DataFrame(data=f_list, columns=['df', 'F_0.01', 'F_0.05'])

In [None]:
# Display the table of critical F values.
sig_f_df

In [None]:
def model_stats(model):
    """Print the headline fit statistics of a fitted regression model.

    Reads ``rsquared``, ``rsquared_adj``, ``fvalue``, ``f_pvalue`` and
    ``df_model`` from ``model`` and prints one labelled value per line.
    Returns None.
    """
    print(
        f'R_squared: {model.rsquared}\n'
        f'R_squared_adj: {model.rsquared_adj}\n'
        f'F_val: {model.fvalue}\n'
        f'p_val: {model.f_pvalue}\n'
        f'dof: {model.df_model}'
    )

# Baseline main-effects formula for delta_model_bias: two terms entered as-is
# (all_mcc_trained, model_bias_trained) plus seven factors wrapped in
# C(..., Sum). The trailing backslash continues the string literal across lines.
hpformula = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)'

hpf_modx1 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)'
# no significant interaction

hpf_modx2 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)'
# C(dataset_name, Sum):C(model_arch):C(resample_rate):C(pruning_learning_rate, Sum):C(pruning_final_sparsity, Sum) --> significant at 5%

hpf_modx3a = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum) \
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)'

hpf_modx3 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum) \
+ C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum) \
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(pruning_final_sparsity, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)'
# C(dataset_name, Sum):C(model_arch):C(resample_rate):C(pruning_learning_rate, Sum) --> significant at 1%
# C(dataset_name, Sum):C(pruning_learning_rate):C(pruning_final_sparsity, Sum) --> significant at 1%
# C(dataset_name, Sum):C(model_arch):C(pruning_final_sparsity, Sum) --> significant at 1%
# C(pruning_learning_rate, Sum):C(pruning_schedule, Sum):C(pruning_final_sparsity, Sum) --> significant at 5%

hpf_modx4 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum) \
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum) \
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)'

hpf_modx5 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum) \
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum) \
+ C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)'

hpf_modx6 = 'delta_model_bias ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum) \
+ C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_final_sparsity, Sum)\
'

# Fit the OLS models for delta_model_bias on the pooled compression results.
# NOTE(review): hplm and hplm_X1 are only referenced by commented-out lines in
# this cell; hplm_X1 uses the full seven-way interaction formula and may be
# slow to fit. Confirm whether they are still needed.
hplm = smf.ols(hpformula, data = results_compress_compare).fit() #[results_bias.dataset_name=='mswc_de']
hplm_X1 = smf.ols(hpf_modx1, data = results_compress_compare).fit()
# hplm_X2 = smf.ols(hpf_modx2, data = results_compress_compare).fit()
# hplm_X3 = smf.ols(hpf_modx3a, data = results_compress_compare).fit()
# hplm_X4 = smf.ols(hpf_modx4, data = results_compress_compare).fit()
# hplm_X5 = smf.ols(hpf_modx5, data = results_compress_compare).fit()
hplm_X6 = smf.ols(hpf_modx6, data = results_compress_compare).fit()

# print(hplm.summary())
# print('hplm_X1'); model_stats(hplm_X1)
# print('\nhplm_X2'); model_stats(hplm_X2)
# print('\nhplm_X3'); model_stats(hplm_X3)
# print('\nhplm_X4'); model_stats(hplm_X4)
# print('\nhplm_X5'); model_stats(hplm_X5)

# Two separate statements: the former `print(...), model_stats(...)` built a
# tuple, which a notebook echoes as a stray `(None, None)` cell output.
print('\nhplm_X6')
model_stats(hplm_X6)

In [None]:
# Type-3 ANOVA table for the hplm_X6 model, reduced to the significant terms.
aov_table = sm.stats.anova_lm(hplm_X6, typ=3)

# Attach the critical F values by matching on the 'df' column. The merge is an
# inner join, so terms whose df is not listed in sig_f_df are dropped here.
# NOTE(review): sig_f_df is computed with dfn=1 and dfd=<df>; confirm this is
# the intended critical value for each term.
aov_table_sig = aov_table.reset_index().merge(sig_f_df, on='df')

# Number of interacting factors per term (term names are ':'-separated).
aov_table_sig['factors'] = aov_table_sig['index'].apply(lambda term: len(term.split(':')))
aov_table_sig = aov_table_sig.set_index('index')

passes_p_value = aov_table_sig['PR(>F)'] <= 0.05
passes_f_crit = aov_table_sig['F'] >= aov_table_sig['F_0.05']
aov_table_sig = aov_table_sig[passes_p_value & passes_f_crit].sort_values(by='factors')

# Show the significant terms followed by the residual row.
pd.concat([aov_table_sig, aov_table.loc[['Residual']]])

In [None]:
# Residual diagnostics for hplm_X6: a histogram of the residuals, then a
# probability plot on its own 5x5 figure.
sns.histplot(hplm_X6.resid)

fig = plt.figure(figsize= (5, 5))
ax = fig.add_subplot(111)

# plot=plt draws onto the current axes, i.e. the subplot created just above.
normality_plot, stat = scipy.stats.probplot(hplm_X6.resid, plot= plt, rvalue= True)
ax.set_title("Probability plot of model residual's", fontsize= 20)
# (a bare `ax.set` attribute access that did nothing was removed here)

plt.show()

## Effect of Pruning Hyperparameters on Delta MCC (accuracy)

### Factorial Anova

In [None]:
def model_stats(model):
    """Print R-squared, adjusted R-squared, F value, p-value and model dof.

    The values are read from the ``rsquared``, ``rsquared_adj``, ``fvalue``,
    ``f_pvalue`` and ``df_model`` attributes of ``model``; each is printed on
    its own line with a label. Returns None.
    """
    labelled_values = (
        ('R_squared', model.rsquared),
        ('R_squared_adj', model.rsquared_adj),
        ('F_val', model.fvalue),
        ('p_val', model.f_pvalue),
        ('dof', model.df_model),
    )
    print('\n'.join('{}: {}'.format(label, value) for label, value in labelled_values))

# Baseline main-effects formula for delta_all_mcc: the same right-hand side
# terms as the interaction formulas below, without any interaction products.
# The trailing backslash continues the string literal across lines.
hp_adformula = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)'

hpf_admodx1 = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)'
# no significant interaction

hpf_admodx2 = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)'
# no significant interaction

hpf_admodx3 = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)'
# C(model_arch)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(dataset_name, Sum)*C(model_arch)*C(resample_rate)*C(pruning_final_sparsity, Sum) --> significant at 5%

hpf_admodx4 = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)\
+ C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(resample_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_frequency, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(pruning_learning_rate, Sum)*C(pruning_frequency, Sum)*C(pruning_final_sparsity, Sum)'
# C(dataset_name, Sum):C(pruning_learning_rate, Sum):C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(dataset_name, Sum):C(resample_rate):C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(dataset_name, Sum):C(model_arch):C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(resample_rate):C(pruning_learning_rate, Sum):C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(pruning_learning_rate, Sum):C(pruning_schedule, Sum):C(pruning_final_sparsity, Sum)  --> significant at 1%
# C(model_arch):C(pruning_learning_rate, Sum):C(pruning_final_sparsity, Sum)   --> significant at 1%
# C(dataset_name, Sum):C(model_arch):C(resample_rate)   --> significant at 1%
# C(dataset_name, Sum):C(pruning_learning_rate, Sum):C(pruning_schedule, Sum) --> significant at 5%
# C(dataset_name, Sum):C(pruning_schedule, Sum):C(pruning_final_sparsity, Sum)  --> significant at 5%

hpf_admodx5 = 'delta_all_mcc ~ all_mcc_trained + model_bias_trained + C(dataset_name, Sum) + C(model_arch, Sum) + C(resample_rate, Sum) \
+ C(pruning_learning_rate, Sum) + C(pruning_schedule, Sum) + C(pruning_frequency, Sum) + C(pruning_final_sparsity, Sum)\
+ C(model_arch, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(model_arch, Sum)*C(resample_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_schedule, Sum)\
+ C(dataset_name, Sum)*C(pruning_schedule, Sum)*C(pruning_final_sparsity, Sum)\
+ C(dataset_name, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)\
+ C(resample_rate, Sum)*C(pruning_learning_rate, Sum)*C(pruning_final_sparsity, Sum)'

# Fit the OLS model for delta_all_mcc; only the X5 formula is currently fitted.
# hplmad_X1 = smf.ols(hpf_admodx1, data = results_compress_compare).fit()
# hplmad_X2 = smf.ols(hpf_admodx2, data = results_compress_compare).fit()
# hplmad_X3 = smf.ols(hpf_admodx3, data = results_compress_compare).fit()
# hplmad_X4 = smf.ols(hpf_admodx4, data = results_compress_compare).fit()
hplmad_X5 = smf.ols(hpf_admodx5, data = results_compress_compare).fit()

# print('\nhplmad_X1'); model_stats(hplmad_X1)
# print('\nhplmad_X2'); model_stats(hplmad_X2)
# print('\nhplmad_X3'); model_stats(hplmad_X3)
# print('\nhplmad_X4'); model_stats(hplmad_X4)

# '\h' is not a valid escape sequence (it printed a literal backslash and
# triggers an invalid-escape warning); '\n' matches the sibling bias cell.
# The print and the stats call are separate statements so the cell does not
# echo a stray `(None, None)` tuple.
print('\nhplmad_X5')
model_stats(hplmad_X5)

In [None]:
# Type-3 ANOVA table for the hplmad_X5 model, reduced to the significant terms.
aov_table = sm.stats.anova_lm(hplmad_X5, typ=3)

# Inner join on 'df': terms whose df is not listed in sig_f_df are dropped.
# NOTE(review): sig_f_df is computed with dfn=1 and dfd=<df>; confirm this is
# the intended critical value for each term.
aov_table_sig = aov_table.reset_index().merge(sig_f_df, on='df')

# Number of interacting factors per term (term names are ':'-separated).
aov_table_sig['factors'] = aov_table_sig['index'].apply(lambda term: len(term.split(':')))
aov_table_sig = aov_table_sig.set_index('index')

passes_p_value = aov_table_sig['PR(>F)'] <= 0.05
passes_f_crit = aov_table_sig['F'] >= aov_table_sig['F_0.05']
aov_table_sig = aov_table_sig[passes_p_value & passes_f_crit].sort_values(by='factors')

# Show the significant terms followed by the residual row.
pd.concat([aov_table_sig, aov_table.loc[['Residual']]])

### Plots

In [None]:
# Long-format copy of fig_data: every column except the last two is kept as an
# identifier and the two metric columns are stacked under 'metric' / 'value'.
sns.set_context('paper', font_scale=1.6)
id_columns = fig_data.columns.to_list()[:-2]
df = fig_data.melt(
    id_vars=id_columns,
    value_vars=['MCC (accuracy)', 'reliability bias'],
    var_name='metric',
)
# Cast the two feature-size columns to int.
df = df.astype({'mel_bins': int, 'mfccs': int})

#### Interaction of dataset, model architecture and MFCCs --> Delta Reliability Bias

In [None]:
# Box plots of reliability bias against the number of MFCCs, one column per
# dataset and one row per model architecture.
bias_rows = df[df.metric == 'reliability bias']
box_style = dict(kind="box", showfliers=True, flierprops={'marker': '.', 'markersize': 1.5},
                 height=4, aspect=1, palette='tab10')
g = sns.catplot(data=bias_rows, x="mfccs", y="value", col="dataset_name", row="model_arch",
                **box_style)

# Keep only the part of each facet title after the last '|'.
for ax in g.fig.axes:
    facet_title = ax.title.get_text()
    ax.set_title(facet_title.split('|')[-1])

# Label axes 0 and 5 with the architecture names.
# NOTE(review): index 5 presumably is the first axis of the second row, which
# assumes five dataset columns; confirm.
for axis_index, row_label in ((0, 'CNN'), (5, 'low latency CNN')):
    g.fig.axes[axis_index].set_ylabel(row_label)
g.fig.tight_layout()

#### Interaction of dataset and Mel bins --> Reliability Bias

In [None]:
# Box plots of reliability bias against the number of Mel bins, one column per
# dataset.
bias_rows = df[df.metric == 'reliability bias']
box_style = dict(kind="box", showfliers=True, flierprops={'marker': '.', 'markersize': 1.5},
                 height=4, aspect=1, palette='tab10')
g = sns.catplot(data=bias_rows, x="mel_bins", y="value", col="dataset_name",  # row="resample_rate",
                **box_style)

# Keep only the part of each facet title after the last '|'.
for ax in g.fig.axes:
    facet_title = ax.title.get_text()
    ax.set_title(facet_title.split('|')[-1])
# g.fig.axes[0].set_ylabel('8kHz')
# g.fig.axes[5].set_ylabel('16kHz')
g.fig.tight_layout()

#### Interaction of dataset and sample rate --> Reliability Bias

In [None]:
#Similar to Figure 4

# Box plots of reliability bias per dataset, one column per resample rate.
bias_rows = df[df.metric == 'reliability bias']
box_style = dict(kind="box", showfliers=True, flierprops={'marker': '.', 'markersize': 1.5},
                 height=4, aspect=1.5, palette='tab10')
g = sns.catplot(data=bias_rows, x="dataset_name", y="value", col="resample_rate",  # row="resample_rate",
                **box_style)

# Facet title: take the text after the last '=', drop its last five
# characters and append 'kHz'.
for ax in g.fig.axes:
    facet_value = ax.title.get_text().split('=')[-1]
    ax.set_title(facet_value[:-5] + 'kHz')
g.fig.tight_layout()

In [None]:
# g = sns.FacetGrid(data=df, height=8, aspect=1.8, col="resample_rate", #col_order=[16000,8000],
#                   sharey=True, sharex=False)
# g.map(sns.boxplot, "dataset_name", "domain_mcc", "domain", palette=['green','fuchsia'], showfliers=False,
#       hue_order=hue_order, 
#       order=df.dataset_name.unique(), 
#       showmeans=False, meanline=True, meanprops=dict(color="black"),
#       saturation=0.7,boxprops=dict(alpha=.3))

# g.map(sns.stripplot, "dataset_name", "domain_mcc", "domain", palette=['green','fuchsia'], dodge=True,
#       hue_order=hue_order, 
#       order=df.dataset_name.unique(), size=1.5)

# g.fig.tight_layout()
# g.add_legend()
# for ax in g.fig.axes:
#     ax.xaxis.grid(True, which='major', linestyle='-')
#     ax.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5])
# g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
# display(g)

# Figures

##  Impact of Dataset, Model Architecture and Sample Rate

### Figure 4: Distributions of accuracy and reliability bias

In [None]:
# Figure 4: box + strip plots of MCC (accuracy) and reliability bias per
# dataset, one facet row per metric and one hue level per 'exp_cat' value.
sns.set_context('paper', font_scale=1.6)
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2], 
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')

# ndarray.sort() sorts in place and returns None, so the previous
# `.unique().sort()` silently passed hue_order=None. sorted() yields the
# intended explicit order, which keeps hue colours consistent across facets
# and between the box and strip layers.
hue_order = sorted(df["exp_cat"].unique())
dataset_order = df.dataset_name.unique()

g = sns.FacetGrid(df, height=4, aspect=3, row='metric', #col="resample_rate", col_order=[16000,8000],
                  sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "exp_cat", palette="tab20", showfliers=False,
      hue_order=hue_order, 
      order=dataset_order, 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

g.map(sns.stripplot, "dataset_name", "value", "exp_cat", palette="tab20", dodge=True,
      hue_order=hue_order, 
      order=dataset_order, size=1)

plt.xlabel('dataset name', labelpad=10)
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[1].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend()
sns.despine(bottom=True, left=False)
display(g)
plt.savefig('figures/experiment_results.png')

In [None]:
# Rename the '*_fairness' columns of results_bias to '*_bias' in place.
results_bias.rename(columns={'model_fairness':'model_bias', 
                         'male_fairness':'male_bias','female_fairness':'female_bias'}, inplace=True)    

# Per-experiment mean and standard deviation of the absolute female and male
# bias, restricted to the experiments listed in experiment_names.
agg_bias = results_bias[results_bias['exp_name'].isin(experiment_names)
                ].groupby('exp_name').agg({'female_bias': [lambda c: c.abs().mean(),
                                                               lambda c: c.abs().std()],
                                           'male_bias': [lambda c: c.abs().mean(),
                                                             lambda c: c.abs().std()]})
# Map the second column level (the auto-generated lambda names) to 'mean' and
# 'std'. This relies on the level order matching the order of the lambdas.
d = dict(zip(agg_bias.columns.levels[1], ['mean','std']))
agg_bias.rename(columns=d,level=1,inplace=True)
agg_bias

### Figure 5: Accuracy scores for males and females

In [None]:
# Average domain_mcc over every column except the last two of results_mcc,
# then unstack the innermost grouping level into columns.
group_columns = results_mcc.columns.to_list()[:-2]
mean_domain_mcc = results_mcc.groupby(group_columns)['domain_mcc'].mean()
results_mcc_unstacked = mean_domain_mcc.unstack().reset_index()

# 'exp_cat' is "<model_arch> <resample_rate>" as a single string.
arch_and_rate = results_mcc_unstacked[['model_arch', 'resample_rate']]
results_mcc_unstacked['exp_cat'] = arch_and_rate.apply(
    lambda row: ' '.join(row.astype(str)), axis=1)

# Move the freshly appended last column ('exp_cat') to the front.
all_columns = results_mcc_unstacked.columns.to_list()
cols = all_columns[-1:] + all_columns[:-1]
results_mcc_unstacked = results_mcc_unstacked[cols]

In [None]:
# Figure 5: scatter of male vs. female MCC, one facet per 'exp_cat' value,
# coloured and styled by dataset.
sns.set_context('paper', font_scale=2.5)
df = results_mcc_unstacked[results_mcc_unstacked['exp_name'].isin(experiment_names)
                ].replace({'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['exp_cat','dataset_name'])
# ndarray.sort() returns None, so `.unique().sort()` left hue_order unset;
# sorted() produces the intended explicit order.
hue_order = sorted(df["dataset_name"].unique())

g = sns.relplot(
    data=df, x="male_mcc", y="female_mcc", hue="dataset_name", 
    col="exp_cat", aspect=1.1, style="dataset_name",
    palette="Set1", hue_order=hue_order, s=10,
    kind="scatter"
)

for ax in g.fig.axes:
    # Dashed y = x reference line.
    ax.axline((0.89,0.89), (0.9,0.9), c="black", ls='--', linewidth=2)
    ax.set_xlabel('male MCC (accuracy)', labelpad=10)
    # The facet value is "<model_arch> <resample_rate>"; retitle it as
    # "<rate without its last three characters>kHz <arch with spaces>".
    facet_parts = ax.title.get_text().split(' = ')[-1].split(' ')
    ax.set_title(facet_parts[-1][:-3]+'kHz '+facet_parts[0].replace('_',' '))
#     ax.xaxis.grid(True, which='major', linestyle='-')
#     ax.set_xticks(np.linspace(0.1, 0.9, 9))
# g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
# g.fig.tight_layout()
# g.add_legend()
g.fig.axes[0].set_ylabel('female \nMCC (accuracy)', labelpad=10)
sns.despine(bottom=True, left=True)
display(g)
plt.savefig('figures/subgroup_performance_accuracy.png', bbox_inches='tight')

In [None]:
# sns.set_context('paper', font_scale=2.5)
# df = results_mcc[results_mcc['exp_name'].isin(experiment_names)
#                 ].replace({'domain': {'male_mcc': 'male', 'female_mcc': 'female'}, 
#                            'dataset_name': {'speech_commands_gender':'google_sc'}}
#                          ).sort_values(by=['exp_name', 'model_arch','domain'])
# df.rename(columns={'domain_fairness':'domain_bias'}, inplace=True)
# hue_order=df["domain"].unique().sort()

# # .sort_values(by=['exp_name', 'model_arch'])
# g = sns.FacetGrid(data=df, height=8, aspect=1.8, row="model_arch", col="resample_rate", #col_order=[16000,8000],
#                   sharey=True, sharex=False)
# g.map(sns.boxplot, "dataset_name", "domain_bias", "domain", palette=['green','fuchsia'], showfliers=False,
#       hue_order=hue_order, 
#       order=df.dataset_name.unique(), 
#       showmeans=False, meanline=True, meanprops=dict(color="black"),
#       saturation=0.7,boxprops=dict(alpha=.3))

# g.map(sns.stripplot, "dataset_name", "domain_bias", "domain", palette=['green','fuchsia'], dodge=True,
#       hue_order=hue_order, 
#       order=df.dataset_name.unique(), size=1.5)

# g.fig.tight_layout()
# g.add_legend()
# for ax in g.fig.axes:
#     ax.xaxis.grid(True, which='major', linestyle='-')
#     ax.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5])
# g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
# display(g)

## Impact of Pre-processing Parameters

### Unused figures

In [None]:
# Degrees of freedom and critical F value for the full set of pre-processing
# parameters. NOTE(review): the 0.01 argument is presumably the significance
# level; confirm against results_analysis.fcrit.
pre_processing_params = ['mel_bins','frame_step','frame_length','mfccs','input_features','window_fn']
dof_preprocessing, fcrit_preprocessing = results_analysis.fcrit(results, pre_processing_params, 0.01)
print('dof:', dof_preprocessing, '\nfcrit:', fcrit_preprocessing)

In [None]:
# Same computation for the reduced parameter set (without 'mel_bins' and
# 'mfccs').
pre_processing_params2 = ['frame_step','frame_length','input_features','window_fn']
dof_preprocessing2, fcrit_preprocessing2 = results_analysis.fcrit(results, pre_processing_params2, 0.01)
print('dof:', dof_preprocessing2, '\nfcrit:', fcrit_preprocessing2)

In [None]:
# Parameter-importance plot for the 'all_mcc' and 'model_bias' metrics on the
# speech_commands_gender dataset, using fcrit_preprocessing as the threshold
# argument. The figure is not saved (save_fig=False).
sns.set_theme(style="whitegrid", font_scale=1.8)
results_plot.plot_param_importance(df=results_bias,
                                   dataset="speech_commands_gender",
                                   metrics={'all_mcc':'MCC','model_bias':'bias'}, 
                                   parameters="preprocessing", 
                                   select_tables=['8000_','16000_'],
                                   fcrit=fcrit_preprocessing,
                                   save_fig=False,
                                   plot_title = "Pre-processing parameter importance for reliability bias and accuracy (MCC)")
# plt.savefig('figures/metric_param_importance.png',bbox_inches='tight')

In [None]:
# Same plot for the per-subgroup accuracy columns ('male_mcc', 'female_mcc'),
# with an explicit two-colour palette.
results_plot.plot_param_importance(df=results_bias,
                                   metrics={'male_mcc':'male','female_mcc':'female'}, 
                                   dataset="speech_commands_gender",
                                   parameters="preprocessing", 
                                   select_tables=['8000_','16000_'],
                                   fcrit=fcrit_preprocessing,
                                   save_fig=False,
                                   palette=['fuchsia', 'green'],
                                   plot_title = "Pre-processing parameter importance for subgroup accuracy (MCC)")

In [None]:
# Same plot for the per-subgroup bias columns ('male_bias', 'female_bias').
# These column names only exist after the '*_fairness' -> '*_bias' rename
# applied to results_bias in an earlier cell.
results_plot.plot_param_importance(df=results_bias,
                                   metrics={'male_bias':'male','female_bias':'female'}, 
                                   dataset="speech_commands_gender",
                                   parameters="preprocessing", 
                                   select_tables=['8000_','16000_'],
                                   fcrit=fcrit_preprocessing,
                                   save_fig=False,
                                   palette=['fuchsia', 'green'],
                                   plot_title = "Pre-processing parameter importance for bias across groups")

In [None]:
# F-score importance of the pre-processing parameters for the '16000_' table
# of the 'cnn' architecture, one bar-chart facet per dataset.
sns.set_context('paper', font_scale=3)

# Collect one importance table per (metric, dataset) pair and concatenate
# once, instead of growing a DataFrame inside the loop.
importance_tables = []
for m in ['all_mcc','model_bias']:
    for d in ['speech_commands_gender','mswc_de','mswc_fr','mswc_en','mswc_rw']:
        imp_tab = results_analysis.generate_importance_tables(results_bias, d, m, parameters="preprocessing", 
                                                              model_arch='cnn')['16000_']
        imp_tab['dataset']=d
        imp_tab['metric']=m
        importance_tables.append(imp_tab)
data = pd.concat(importance_tables, axis=0)

# The table index becomes the 'parameters' column used on the x axis.
data = data.replace({'dataset': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['dataset']).reset_index().rename(columns={'index': 'parameters'})        
cat_order = np.sort(data['parameters'].unique())
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='F Score', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.3,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    ax.set_yscale('log')
    xlabels = [x.get_text() for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    # Dashed horizontal line at the critical F value.
    ax.hlines(y=fcrit_preprocessing, xmin=-0.5, xmax=0.9*len(cat_order), color="black", ls='--', linewidth=2)
    ax.set_xlabel('')
    # Raw string: '\ ' is a mathtext space, not a Python escape sequence.
    ax.set_ylabel(r'$F\ Score$ (log scale)')
    # Facet title reduced to the value after ' = '.
    ax.set_title(ax.get_title().split(' = ')[-1])
    # Grey out bars that fall below the critical F value.
    for bar in ax.patches:
        if bar.get_height() < fcrit_preprocessing:
            bar.set_color('lightgrey') 
            
plt.suptitle("Pre-processing parameter importance for 16k CNN models trained on 5 datasets ", va='top', y=1.1);

In [None]:
# F-score importance of the pre-processing parameters for the '8000_' table
# of the 'low_latency_cnn' architecture, one bar-chart facet per dataset.

# Collect one importance table per (metric, dataset) pair and concatenate
# once, instead of growing a DataFrame inside the loop.
importance_tables = []
for m in ['all_mcc','model_bias']:
    for d in ['speech_commands_gender','mswc_de','mswc_fr','mswc_en','mswc_rw']:
        imp_tab = results_analysis.generate_importance_tables(results_bias, d, m, parameters="preprocessing", 
                                                              model_arch='low_latency_cnn')['8000_']
        imp_tab['dataset']=d
        imp_tab['metric']=m
        importance_tables.append(imp_tab)
data = pd.concat(importance_tables, axis=0)

# The table index becomes the 'parameters' column used on the x axis.
data = data.replace({'dataset': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['dataset']).reset_index().rename(columns={'index': 'parameters'})
cat_order = np.sort(data['parameters'].unique())
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='F Score', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.3,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    ax.set_yscale('log')
    xlabels = [x.get_text() for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    # Dashed horizontal line at the critical F value.
    ax.hlines(y=fcrit_preprocessing, xmin=-0.5, xmax=0.9*len(cat_order), color="black", ls='--', linewidth=2)
    ax.set_xlabel('')
    # Raw string: '\ ' is a mathtext space, not a Python escape sequence.
    ax.set_ylabel(r'$F\ Score$ (log scale)')
    # Facet title reduced to the value after ' = '.
    ax.set_title(ax.get_title().split(' = ')[-1])
    # Grey out bars that fall below the critical F value.
    for bar in ax.patches:
        if bar.get_height() < fcrit_preprocessing:
            bar.set_color('lightgrey') 
            
plt.suptitle("Pre-processing parameter importance for 8k low latency CNN models trained on 5 datasets ", va='top', y=1.1);

In [None]:
# Ridge-coefficient importance of the pre-processing parameters, drawn from
# the `data` / `cat_order` built by the preceding cell.
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='Ridge coef', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.2,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    xlabels = [x.get_text() for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    ax.set_xlabel('')
    ax.set_ylabel('Ridge coefficient')
    # Facet title reduced to the value after ' = '.
    ax.set_title(ax.get_title().split(' = ')[-1])
    # Grey out bars whose height lies strictly between -0.01 and 0.01.
    for bar in ax.patches:
        if -0.01 < bar.get_height() < 0.01:
            bar.set_color('lightgrey')
            
# Title corrected from "16k" to "8k": when the notebook runs top to bottom,
# `data` holds the '8000_' low_latency_cnn tables at this point.
plt.suptitle("Pre-processing parameter importance for 8k low latency CNN models trained on 5 datasets ", fontsize='x-large', va='top', y=1.1);

In [None]:
# Mutual-information importance of the pre-processing parameters, drawn from
# the `data` / `cat_order` built by an earlier cell.
g = sns.catplot(data=data.sort_values(by=['dataset','parameters']), 
                x='parameters', y='MI', hue='metric', col='dataset', kind='bar', 
                order= cat_order, aspect=1.2,
                palette = ['orange', 'blue'])

for ax in g.axes.flat:
    xlabels = [x.get_text() for x in ax.get_xticklabels()]
    ax.set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
    ax.set_xlabel('')
    ax.set_ylabel('Mutual Information')
    # Facet title reduced to the value after ' = '.
    ax.set_title(ax.get_title().split(' = ')[-1])
    # Grey out bars whose height lies strictly between -0.01 and 0.01.
    for bar in ax.patches:
        if -0.01 < bar.get_height() < 0.01:
            bar.set_color('lightgrey') 
            
# Title corrected from "16k" to "8k": when the notebook runs top to bottom,
# `data` holds the '8000_' low_latency_cnn tables at this point.
plt.suptitle("Pre-processing parameter importance for 8k low latency CNN models trained on 5 datasets ", fontsize='x-large', va='top', y=1.1);

### Figure 8

In [None]:
# Figure 8: MCC per dataset for 8 kHz low-latency CNN models, one facet per
# speaker group ('domain'), with the feature type as hue.
sns.set_context('paper', font_scale=2.5)
df = results_mcc[(results_mcc['exp_name'].isin(experiment_names)) &(results_mcc['resample_rate']==8000)&(results_mcc['model_arch']=="low_latency_cnn")
                ].replace({'domain': {'male_mcc': 'MALE', 'female_mcc': 'FEMALE'}, 
                           'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['dataset_name', 'mfccs','exp_name'])
# The hue variable is 'input_features', so the hue order has to come from that
# column. The previous `df["domain"].unique().sort()` evaluated to None
# (ndarray.sort() works in place), leaving the hue order to each facet.
hue_order = sorted(df["input_features"].unique())
dataset_order = df.dataset_name.unique()

# .sort_values(by=['exp_name', 'model_arch'])
g = sns.FacetGrid(data=df, height=6, aspect=1.8, col="domain", #col_order=[16000,8000],
                  sharey=True, sharex=False)
g.map(sns.boxplot, "dataset_name", "domain_mcc", "input_features", palette='tab10',#['green','fuchsia'], 
      showfliers=False,
      hue_order=hue_order, 
      order=dataset_order, 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3)
     )

g.map(sns.stripplot, "dataset_name", "domain_mcc", "input_features", palette='tab10',#['green','fuchsia'], 
      dodge=True,
      hue_order=hue_order, 
      size=1.5,
      order=dataset_order)

for ax in g.fig.axes:
#     ax.xaxis.grid(True, which='major', linestyle='-')
#     ax.set_xticks([0.5, 1.5, 2.5, 3.5, 4.5])
    ax.set_xticks([0, 1, 2, 3, 4])
    ax.set_xlabel('dataset name', labelpad=10)
    ax.set_title(ax.title.get_text().split('=')[-1])
    sns.despine(left=True, bottom=True)
g.fig.axes[0].set_ylabel('MCC (accuracy)', labelpad=10)
g.set_xticklabels(rotation=0, fontdict={'ha': 'center'})

g.fig.tight_layout()
# Display names for the hue levels, paired positionally with the legend
# handles. NOTE(review): assumes exactly two feature types whose sorted order
# is log-Mel spectrogram first, then MFCC; confirm against the data.
hue_labels=['log Mel spec','MFCC']
g.add_legend(legend_data=dict(zip(hue_labels, g._legend_data.values())))
g._legend.set_title('Feature type')

plt.suptitle("Effect of feature type on accuracy for male and female speakers", va='top', y=1.05)
display(g)
plt.savefig('figures/feature_type_subgroups.png', bbox_inches='tight')

In [None]:
# Scatter of male vs. female MCC for 8 kHz low-latency CNN models, one facet
# per feature type, coloured and styled by dataset.
sns.set_context('paper', font_scale=2.5)
df = results_mcc_unstacked[results_mcc_unstacked['exp_name'].isin(experiment_names)&(results_mcc_unstacked['resample_rate']==8000)&(results_mcc_unstacked['model_arch']=="low_latency_cnn")
                ].replace({'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['exp_cat','dataset_name'])
# ndarray.sort() returns None, so `.unique().sort()` left hue_order unset;
# sorted() produces the intended explicit order.
hue_order = sorted(df["dataset_name"].unique())


g = sns.relplot(
    data=df, x="male_mcc", y="female_mcc", hue="dataset_name", col="input_features", aspect=1.5, style="dataset_name",
    palette="Set1", hue_order=hue_order, s=10,
    kind="scatter"
)

for ax in g.fig.axes:
    # Dashed y = x reference line.
    ax.axline((0.89,0.89), (0.9,0.9), c="black", ls='--', linewidth=2)
    ax.set_xlabel('male MCC (accuracy)', labelpad=10)
    ax.set_title(ax.title.get_text().replace('_',' '))
#     ax.xaxis.grid(True, which='major', linestyle='-')
#     ax.set_xticks(np.linspace(0.1, 0.9, 9))
# g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
# g.fig.tight_layout()
# g.add_legend()
g.fig.axes[0].set_ylabel('female \nMCC (accuracy)', labelpad=10)
sns.despine(bottom=True, left=True)

plt.suptitle("Effect of feature type on accuracy for males and females for 8kHz low latency CNN models", va='top', y=1.05)
display(g)
plt.savefig('figures/feature_type_subgroups_llcnn8.png', bbox_inches='tight')

In [None]:
sns.set_context('paper', font_scale=2.5)
# Long format: one row per model and metric, so each metric gets a facet row.
df = fig_data[(fig_data.model_arch=='low_latency_cnn')].melt(id_vars=fig_data.columns.to_list()[:-2],
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
df['mel_bins'] = df['mel_bins'].astype(int)

# The hue variable is mel_bins, so its level order has to come from mel_bins.
# The previous `df["exp_cat"].unique().sort()` evaluated to None (ndarray.sort
# works in place), which left every facet to derive its own hue levels.
dataset_order = df.dataset_name.unique()
mel_bin_order = sorted(df['mel_bins'].unique())

g = sns.FacetGrid(df, height=4, aspect=3, row='metric', col='input_features', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "mel_bins", palette="tab10", showfliers=False,
      hue_order=mel_bin_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "mel_bins", palette="tab10", dodge=True,
      hue_order=mel_bin_order,
      order=dataset_order, size=1.5)

# despine() without arguments acts on every axes of the current figure,
# so one call is enough.
sns.despine(left=True, bottom=True)

# Axes 0-1 are addressed as the first (MCC) row and keep the feature name as
# title; axes 2-3 are the second (bias) row and get no title.
for idx in (0, 1):
    g.fig.axes[idx].set_yticks(np.linspace(0.2, 0.9, 6))
for idx in (2, 3):
    g.fig.axes[idx].set_yticks(np.linspace(0, 0.7, 6))

for idx in (0, 1):
    top_ax = g.fig.axes[idx]
    top_ax.set_title(top_ax.title.get_text().split('|')[-1].replace('_',' '), pad=15)
for idx in (2, 3):
    g.fig.axes[idx].set_title('', pad=15)

plt.suptitle('Impact of Mel filter bank dimension on MFCC and log Mel spectrogram input features for low latency CNN architectures')

g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[2].set_ylabel('reliability bias \n(lower is better)')
g.fig.axes[2].set_xlabel('dataset name', labelpad=10)
g.fig.axes[3].set_xlabel('dataset name', labelpad=10)
g.fig.tight_layout()
g.add_legend()
g._legend.set_title('# Mel \nfbanks')

display(g)
plt.savefig('figures/dataset_arch_melbins_feature_type_llcnn.png', bbox_inches='tight')

In [None]:
sns.set_context('paper', font_scale=2)
# Long format: one row per 16 kHz CNN model and metric (one facet row each).
df = fig_data[(fig_data.model_arch=='cnn') & (fig_data.resample_rate==16000)].melt(id_vars=fig_data.columns.to_list()[:-2],
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
df['mel_bins'] = df['mel_bins'].astype(int)

# Hue is mel_bins, so order its levels explicitly. The former
# `df["exp_cat"].unique().sort()` was always None (in-place ndarray.sort).
dataset_order = df.dataset_name.unique()
mel_bin_order = sorted(df['mel_bins'].unique())

g = sns.FacetGrid(df, height=4, aspect=4, row='metric', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "mel_bins", palette="tab10", showfliers=False,
      hue_order=mel_bin_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "mel_bins", palette="tab10", dodge=True,
      hue_order=mel_bin_order,
      order=dataset_order, size=1.5)

# The metric is conveyed by the y labels, so drop the facet titles.
for ax in g.fig.axes:
    ax.set_title('')
sns.despine(left=True, bottom=True)

g.fig.axes[0].set_yticks(np.linspace(0.2, 0.9, 6))
g.fig.axes[1].set_yticks(np.linspace(0, 0.6, 7))

plt.suptitle('16kHz CNN architecture')
# plt.xlabel targets the current axes only.
plt.xlabel('dataset name', labelpad=10)
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[1].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend()
g._legend.set_title('# Mel bins')
display(g)
plt.savefig('figures/dataset_arch_melbins_16.png', bbox_inches='tight')

In [None]:
sns.set_context('paper', font_scale=2)
# Long format: one row per 16 kHz CNN model and metric (one facet row each).
df = fig_data[(fig_data['model_arch']=='cnn')  & (fig_data.resample_rate==16000)].melt(id_vars=fig_data.columns.to_list()[:-2], value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
df['mel_bins'] = df['mel_bins'].astype(int)
df['mfccs'] = df['mfccs'].astype(int)

# `df["mfccs"].unique().sort()` returned None (ndarray.sort is in place), so
# no hue order was enforced. The legend relabelling below pairs labels with
# legend entries by position and therefore needs a fixed, sorted order.
dataset_order = df.dataset_name.unique()
mfcc_order = sorted(df['mfccs'].unique())

g = sns.FacetGrid(df, height=4, aspect=4, row='metric', col='model_arch', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "mfccs", palette="tab10",
      showfliers=True, flierprops=dict(marker='.', markersize=1.5),
      hue_order=mfcc_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "mfccs", palette="tab10", dodge=True,
      hue_order=mfcc_order,
      order=dataset_order, size=1.5)

# The metric is conveyed by the y labels, so drop the facet titles.
for ax in g.fig.axes:
    ax.set_title('')
sns.despine(left=True, bottom=True)

g.fig.axes[0].set_yticks(np.linspace(0.2, 0.9, 6))
g.fig.axes[1].set_yticks(np.linspace(0, 0.6, 7))

plt.suptitle('16kHz CNN architecture')
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[1].set_ylabel('reliability bias \n(lower is better)')
g.fig.axes[1].set_xlabel('dataset name', labelpad=15)
g.fig.tight_layout()
# Replace the raw hue values in the legend with display labels, matched by
# position. NOTE(review): assumes six sorted mfccs levels, the smallest
# standing for "no MFCCs" -- confirm against the data.
hue_labels=['None',10,11,12,13,14]
g.add_legend(legend_data={
    key: value for key, value in zip(hue_labels, g._legend_data.values())
})
g._legend.set_title('# MFCCs')
display(g)
plt.savefig('figures/dataset_arch_mfccs_16.png', bbox_inches='tight')

In [None]:
sns.set_context('paper', font_scale=1.6)
# Long format: one row per model and metric (one facet row each).
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2],
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')

# Hue is frame_step, so order its levels explicitly. The former
# `df["exp_cat"].unique().sort()` was always None (in-place ndarray.sort)
# and referred to a column that is not the hue variable.
dataset_order = df.dataset_name.unique()
frame_step_order = sorted(df['frame_step'].unique())

g = sns.FacetGrid(df, height=4, aspect=3, row='metric', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "frame_step", palette="tab10", showfliers=False,
      hue_order=frame_step_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "frame_step", palette="tab10", dodge=True,
      hue_order=frame_step_order,
      order=dataset_order, size=1.5)

plt.suptitle('Effect of frame step on MCC (accuracy) and reliability bias')
# plt.xlabel targets the current axes only.
plt.xlabel('dataset name', labelpad=10)
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[1].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend(title='frame step')
display(g)

In [None]:
sns.set_context('paper', font_scale=1.6)
# Long format: one row per model and metric (one facet row each).
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2],
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')

# Hue is frame_length, so order its levels explicitly. The former
# `df["exp_cat"].unique().sort()` was always None (in-place ndarray.sort)
# and referred to a column that is not the hue variable.
dataset_order = df.dataset_name.unique()
frame_length_order = sorted(df['frame_length'].unique())

g = sns.FacetGrid(df, height=4, aspect=3, row='metric', sharey=False, sharex=False)
g.map(sns.boxplot, "dataset_name", "value", "frame_length", palette="tab10", showfliers=False,
      hue_order=frame_length_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "frame_length", palette="tab10", dodge=True,
      hue_order=frame_length_order,
      order=dataset_order, size=1.5)

plt.suptitle('Effect of frame length on MCC (accuracy) and reliability bias')
# plt.xlabel targets the current axes only.
plt.xlabel('dataset name', labelpad=10)
g.fig.axes[0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.fig.axes[1].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend(title='frame length')
display(g)

In [None]:
sns.set_context('paper', font_scale=1.6)
# Long format: one row per model and metric (one facet row each).
df = fig_data.melt(id_vars=fig_data.columns.to_list()[:-2],
                            value_vars=['MCC (accuracy)','reliability bias'], var_name='metric')
df['mel_bins'] = df['mel_bins'].astype(int)
df['mfccs'] = df['mfccs'].astype(int)

frame_length_order = sorted(df.frame_length.unique())
frame_step_order = sorted(df["frame_step"].unique())

# Grid of metric (rows) x dataset (columns); x is the frame length and the
# hue is the frame step.
g = sns.FacetGrid(df, height=4, aspect=1.2, col='dataset_name', row='metric', sharey=True, sharex=True)
g.map(sns.boxplot, "frame_length", "value", "frame_step", palette="tab10", showfliers=False,
      hue_order=frame_step_order,
      order=frame_length_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual models on top of the translucent boxes.
g.map(sns.stripplot, "frame_length", "value", "frame_step", palette="tab10", dodge=True,
      hue_order=frame_step_order,
      order=frame_length_order,
      size=1.5)

# Keep only the "dataset_name = ..." part of each facet title.
for ax in g.fig.axes:
    ax.set_title(ax.title.get_text().split('|')[-1])
# The x axis shows frame_length, not the dataset. The previous
# plt.xlabel('dataset name') also only touched the last axes.
g.set_xlabels('frame length', labelpad=10)
# Label the first facet of each metric row through the (row, col) grid
# instead of a flat index that is only right for a six-column grid.
g.axes[0, 0].set_ylabel('MCC (accuracy)\n(higher is better)')
g.axes[1, 0].set_ylabel('reliability bias \n(lower is better)')
g.fig.tight_layout()
g.add_legend()
display(g)

#### Input to Table 4

In [None]:
results_bias.head()

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 
                                            dataset='speech_commands_gender', metric='all_mcc', parameters="preprocessing", 
                                            model_arch='cnn')['16000_']

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 
                                            dataset='mswc_de', metric='model_bias', parameters="preprocessing", 
                                            model_arch='cnn')['16000_']

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 
                                            dataset='mswc_de', metric='model_bias', 
                                            parameters="preprocessing", model_arch='low_latency_cnn')['16000_']

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 
                                            dataset='mswc_de', metric='model_bias', 
                                            parameters="preprocessing", model_arch='cnn')['8000_']

In [None]:
results_analysis.generate_importance_tables(results_bias[results_bias.equal_weighted==True], 
                                            dataset='mswc_de', metric='model_bias', 
                                            parameters="preprocessing", model_arch='low_latency_cnn')['8000_']

#### Tables in Appendix

In [None]:
results_bias[results_bias['exp_name'].isin(['sc16_cnn','sc16_llcnn']) 
                 & (results_bias['input_features']=='log_mel_spectrogram')
                ].groupby(['exp_name','mel_bins'])[['all_mcc','model_bias']
                                                  ].agg(['mean','std']).T#.style.format('{:.1e}')

In [None]:
results_bias[results_bias['exp_name'].isin(['sc8_cnn','sc8_llcnn']) 
                 & (results_bias['input_features']=='log_mel_spectrogram')
                ].groupby(['exp_name','mel_bins'])[['all_mcc','model_bias']
                                                  ].agg(['mean','std']).T#.style.format('{:.1e}')

In [None]:
results_bias[results_bias['exp_name'].isin(['sc16_cnn','sc16_llcnn']) 
                 & (results_bias['input_features']=='mfcc')
                ].groupby(['exp_name','mfccs'])[['all_mcc','model_bias']
                                                  ].agg(['mean','std']).T#.style.format('{:.1e}')

In [None]:
results_bias[results_bias['exp_name'].isin(['sc8_cnn','sc8_llcnn']) 
                 & (results_bias['input_features']=='mfcc')
                ].groupby(['exp_name','mfccs'])[['all_mcc','model_bias']
                                                  ].agg(['mean','std']).T#.style.format('{:.1e}')

In [None]:
# sc16_cnn: mfcc / log Mel spec | fairest for each
0.016908/0.006664

In [None]:
# sc16_llcnn: mfcc / log Mel spec | fairest for each
0.019317/0.013510

In [None]:
# sc8_cnn: mfcc / log Mel spec | fairest for each
0.028245/0.010909

In [None]:
# sc8_llcnn: mfcc / log Mel spec | fairest for each
0.043740/0.017558

In [None]:
results_mcc.head()

## Impact of Pruning Hyperparameters

In [None]:
# Columns of the compression results needed for the sparsity analysis.
sparsity_columns = ['equal_weighted','exp_name','pruning_final_sparsity',
                    'pruning_learning_rate','pruning_schedule','pruning_frequency',
                    'trained_model_path','model_selected_because','all_mcc','model_fairness']
# Runs are compared within one weighting scheme, experiment and final sparsity.
sparsity_group_keys = ['equal_weighted','exp_name','pruning_final_sparsity']

results_sparsity = results_compress.loc[:, sparsity_columns]
runs_per_sparsity = results_sparsity.groupby(sparsity_group_keys)
# transform() broadcasts the per-group extreme back onto every row of the group.
group_max_mcc = runs_per_sparsity['all_mcc'].transform('max')
group_min_fairness = runs_per_sparsity['model_fairness'].transform('min')
results_sparsity.loc[:,'sparsity_max_mcc'] = group_max_mcc
results_sparsity.loc[:,'sparsity_min_fairness'] = group_min_fairness

In [None]:
results_compress_compare.columns

In [None]:
sns.set_context('paper', font_scale=2.5)
# Shorten the Speech Commands dataset label for display.
df = results_compress_compare.replace({'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['dataset_name'])
# ndarray.sort() sorts in place and returns None, so `.unique().sort()`
# always produced hue_order=None. sorted() returns the ordered list.
hue_order = sorted(df["dataset_name"].unique())


# One scatter panel per experiment category: change in male vs. female MCC.
g = sns.relplot(
    data=df, x="delta_male_mcc", y="delta_female_mcc", hue="dataset_name", col="exp_cat", aspect=1,
    palette="tab10", hue_order=hue_order, s=10,
    kind="scatter"
)

for ax in g.fig.axes:
    # Dashed diagonal: points on it have equal male and female delta MCC.
    ax.axline((0.0,0.0), (0.1,0.1), c="black", ls='--', linewidth=2)
    ax.set_xlabel('delta male MCC', labelpad=10)
    ax.set_ylabel('delta female MCC', labelpad=10)
    # Keep only the facet value, without the "exp_cat = " prefix.
    ax.set_title(ax.title.get_text().split('=')[-1].replace('_',' '))
    ax.set_xticks(np.linspace(-0.75, 0, 4))
sns.despine(bottom=True, left=True)

# NOTE(review): the figure title reads unfinished ("Effect of ") -- confirm
# the intended wording.
plt.suptitle("Effect of ", va='top', y=1.05)
display(g)

In [None]:
sns.set_context('paper', font_scale=2.5)
# Shorten the Speech Commands dataset label for display.
df = results_compress_compare.replace({'dataset_name': {'speech_commands_gender':'google_sc'}}
                         ).sort_values(by=['dataset_name'])
# ndarray.sort() sorts in place and returns None, so `.unique().sort()`
# always produced hue_order=None. sorted() returns the ordered list.
hue_order = sorted(df["dataset_name"].unique())


# One scatter panel per experiment category: male vs. female MCC.
g = sns.relplot(
    data=df, x="male_mcc", y="female_mcc", hue="dataset_name", col="exp_cat", aspect=1,
    palette="tab10", hue_order=hue_order, s=10,
    kind="scatter"
)

for ax in g.fig.axes:
    # Dashed diagonal: points on it have equal male and female MCC.
    ax.axline((0.0,0.0), (0.1,0.1), c="black", ls='--', linewidth=2)
    ax.set_xlabel('male MCC', labelpad=10)
    ax.set_ylabel('female MCC', labelpad=10)
    # Keep only the facet value, without the "exp_cat = " prefix.
    ax.set_title(ax.title.get_text().split('=')[-1].replace('_',' '))
sns.despine(bottom=True, left=True)

# NOTE(review): the figure title reads unfinished ("Effect of ") -- confirm
# the intended wording.
plt.suptitle("Effect of ", va='top', y=1.05)
display(g)

In [None]:
# Figure: change in MCC and in reliability bias after pruning, per final
# sparsity, split by pruning schedule.
sns.set_context('paper', font_scale=3)
# Long format: the two delta columns become rows labelled by 'delta_metric'.
# Identifier columns are all but the last six, plus 'model_selected_because'.
df = results_compress_compare.melt(id_vars=results_compress_compare.columns.to_list()[:-6]+['model_selected_because'], 
                            value_vars=['delta_all_mcc','delta_model_bias'], var_name='delta_metric')

# Rename the metrics and the Speech Commands dataset for display.
df = df.replace({'delta_metric': {'delta_all_mcc': 'delta_mcc', 'delta_model_bias': 'delta_reliability_bias'}, 
                 'dataset_name': {'speech_commands_gender':'google_sc'}}
               ).sort_values(by=['dataset_name', 'model_arch','delta_metric'])

# One facet per delta metric, each with its own axis ranges.
g = sns.FacetGrid(data=df, height=8, aspect=1.8, col="delta_metric", #col_order=[16000,8000],
                  sharey=False, sharex=False)
g.map(sns.boxplot, "pruning_final_sparsity", "value", "pruning_schedule", palette='tab10', showfliers=False,
      hue_order=sorted(df["pruning_schedule"].unique()), 
      order=sorted(df.pruning_final_sparsity.unique()), 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual runs on top of the translucent boxes.
g.map(sns.stripplot, "pruning_final_sparsity", "value", "pruning_schedule", palette='tab10', dodge=True,
      hue_order=sorted(df["pruning_schedule"].unique()), 
      order=sorted(df.pruning_final_sparsity.unique()),
      size=1.5)

for ax in g.fig.axes:
    ax.set_xlabel('final sparsity', labelpad=10)
    # Keep only the facet value, without the "delta_metric = " prefix.
    ax.set_title(ax.title.get_text().split('=')[-1].replace('_',' '))
    sns.despine(bottom=True, left=True)
# Fixed y ranges; axes 0 and 1 are labelled as the MCC and bias facets below.
g.fig.axes[0].set(ylim=(-0.9, 0.3))
g.fig.axes[1].set(ylim=(-0.5, 1.2))
g.fig.axes[0].set_ylabel('delta MCC \n(higher is better)', labelpad=10)
g.fig.axes[1].set_ylabel('delta reliability bias \n(lower is better)', labelpad=10)
g.set_xticklabels(rotation=0, fontdict={'ha': 'right'})
g.fig.tight_layout()
g.add_legend()
g._legend.set_title('pruning schedule')
# Anchor the legend's top centre at the bottom edge of the figure.
sns.move_legend(g, loc='upper center', frameon=False, ncol=3, bbox_to_anchor=(0.48, 0.0) )
display(g)
plt.savefig('figures/pruninghps_sparsity-schedule.png', bbox_inches='tight')

In [None]:
# Figure: change in MCC and in reliability bias after pruning, per final
# sparsity, split by pruning learning rate.
sns.set_context('paper', font_scale=3)
# Long format: the two delta columns become rows labelled by 'delta_metric'.
# Identifier columns are all but the last six, plus 'model_selected_because'.
df = results_compress_compare.melt(id_vars=results_compress_compare.columns.to_list()[:-6]+['model_selected_because'], 
                            value_vars=['delta_all_mcc','delta_model_bias'], var_name='delta_metric')

# Rename the metrics and the Speech Commands dataset for display.
df = df.replace({'delta_metric': {'delta_all_mcc': 'delta_mcc', 'delta_model_bias': 'delta_reliability_bias'}, 
                 'dataset_name': {'speech_commands_gender':'google_sc'}}
               ).sort_values(by=['dataset_name', 'model_arch','delta_metric'])

# One facet per delta metric, each with its own axis ranges.
g = sns.FacetGrid(data=df, height=8, aspect=1.8, col="delta_metric", #row='dataset_name', #col_order=[16000,8000],
                  sharey=False, sharex=False)
# Learning rates are ordered from largest to smallest (reverse=True).
g.map(sns.boxplot, "pruning_final_sparsity", "value", "pruning_learning_rate", palette='tab10', showfliers=False,
      hue_order=sorted(df["pruning_learning_rate"].unique(), reverse=True), 
      order=sorted(df.pruning_final_sparsity.unique()), 
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual runs on top of the translucent boxes.
g.map(sns.stripplot, "pruning_final_sparsity", "value", "pruning_learning_rate", palette='tab10', dodge=True,
      hue_order=sorted(df["pruning_learning_rate"].unique(), reverse=True), 
      order=sorted(df.pruning_final_sparsity.unique()), 
      size=1.5)

for ax in g.fig.axes:
    # Read the facet title before it is overwritten: it decides which
    # y label and y range the axes gets.
    title_list = ax.title.get_text().split(' | ')
    ax.set_title(ax.title.get_text().split('=')[-1].replace('_',' '))
    if 'delta_mcc' in title_list[-1]:
        ax.set_ylabel('delta MCC \n(higher is better)', labelpad=10)
        ax.set(ylim=(-0.9, 0.3))
    else:
        ax.set_ylabel('delta reliability bias \n(lower is better)', labelpad=10)
        ax.set(ylim=(-0.5, 1.2))
        
    ax.set_xlabel('final sparsity', labelpad=10)
    sns.despine(bottom=True, left=True)

g.fig.tight_layout()
# Legend entries are relabelled by position.
# NOTE(review): assumes exactly these three learning rates, in the
# descending order used for hue_order -- confirm against the data.
hue_labels=['0.001', '0.0001', '0.00001']
g.add_legend(legend_data={
    key: value for key, value in zip(hue_labels, g._legend_data.values())
})
g._legend.set_title('learning rate')
# Anchor the legend's top centre at the bottom edge of the figure.
sns.move_legend(g, loc='upper center', frameon=False, ncol=3, bbox_to_anchor=(0.48, 0.0) )
display(g)
plt.savefig('figures/pruninghps_sparsity-lr.png', bbox_inches='tight')

In [None]:
def percentile(n):
    """Build a one-argument function that returns the ``n``-th percentile.

    The returned callable forwards its input to ``np.percentile`` and is
    named ``percentile_<n>``, so functions built for different ``n`` can be
    told apart by name (e.g. when several are used as aggregators).
    """
    def percentile_(values):
        return np.percentile(values, n)

    percentile_.__name__ = f'percentile_{n}'
    return percentile_

df.groupby(['delta_metric','pruning_final_sparsity','pruning_learning_rate'])['value'].agg(['mean', 'median', 'quantile'])#percentile(25), percentile(50), percentile(75)])

In [None]:
sns.set_context('paper', font_scale=3.2)
# Long format: the two delta columns become rows labelled by 'delta_metric'.
df = results_compress_compare.melt(id_vars=results_compress_compare.columns.to_list()[:-6]+['model_selected_because','exp_cat'],
                            value_vars=['delta_all_mcc','delta_model_bias'], var_name='delta_metric')

# Rename the metrics and the Speech Commands dataset for display.
df = df.replace({'delta_metric': {'delta_all_mcc': 'delta_mcc', 'delta_model_bias': 'delta_reliability_bias'},
                 'dataset_name': {'speech_commands_gender':'google_sc'}}
               ).sort_values(by=['dataset_name', 'model_arch','delta_metric'])

# Metric shown in this figure and the y label / y range that go with it.
# The facets are split by exp_cat only, so the metric never appears in a
# facet title. The former title-based test for 'delta mcc' could therefore
# never match, and the choice is now made explicitly.
plotted_metric = 'delta_reliability_bias'
metric_axis = {
    'delta_mcc': ('delta MCC \n(higher is better)', (-0.9, 0.2)),
    'delta_reliability_bias': ('delta reliability bias \n(lower is better)', (-0.3, 0.9)),
}
y_label, y_limits = metric_axis[plotted_metric]

# Learning rates are ordered from largest to smallest.
learning_rate_order = sorted(df["pruning_learning_rate"].unique(), reverse=True)
dataset_order = sorted(df.dataset_name.unique(), reverse=False)

g = sns.FacetGrid(data=df[df.delta_metric==plotted_metric], height=8, aspect=1, col="exp_cat",
                  sharey=True, sharex=True)
g.map(sns.boxplot, "dataset_name", "value", "pruning_learning_rate", palette='tab10', showfliers=False,
      hue_order=learning_rate_order,
      order=dataset_order,
      showmeans=False, meanline=True, meanprops=dict(color="black"),
      saturation=0.7,boxprops=dict(alpha=.3))

# Overlay the individual runs on top of the translucent boxes.
g.map(sns.stripplot, "dataset_name", "value", "pruning_learning_rate", palette='tab10', dodge=True,
      hue_order=learning_rate_order,
      order=dataset_order,
      size=1.5)


for ax in g.fig.axes:
    # Keep only the facet value, without the "exp cat = " prefix.
    title_list = ax.title.get_text().replace('_',' ').split(' | ')
    ax.set_title(title_list[0].split(' = ')[-1])
    ax.set_ylabel(y_label, labelpad=10)
    ax.set(ylim=y_limits)
    ax.set_xlabel('', labelpad=10)
sns.despine(bottom=True, left=True)

g.set_xticklabels(rotation=30, fontdict={'ha': 'center'})
g.fig.tight_layout()
# Legend entries are relabelled by position.
# NOTE(review): assumes exactly these three learning rates, in the
# descending order used for hue_order -- confirm against the data.
hue_labels=['0.001', '0.0001', '0.00001']
g.add_legend(legend_data={
    key: value for key, value in zip(hue_labels, g._legend_data.values())
})
g._legend.set_title('learning rate')
# Anchor the legend's top centre at the bottom edge of the figure.
sns.move_legend(g, loc='upper center', frameon=False, ncol=3, bbox_to_anchor=(0.48, 0.0) )
display(g)
plt.savefig('figures/pruninghps_dataset-lr.png', bbox_inches='tight')

In [None]:
# Figure: density of the change in MCC and in model bias after pruning,
# split by the reason a model was selected.
sns.set_context('paper', font_scale=3)
# Long format: the two delta columns become rows labelled by 'delta_metric'.
df = results_compress_compare.melt(id_vars=results_compress_compare.columns.to_list()[:-6]+['model_selected_because','exp_cat'], 
                            value_vars=['delta_all_mcc','delta_model_bias'], var_name='delta_metric')

# KDE grid: one row per delta metric, one column per experiment category.
# The legend is kept inside the grid (legend_out=False) and moved further down.
g = sns.displot(
    data=df, x="value", hue="model_selected_because", row="delta_metric", col='exp_cat',
    palette="tab10", hue_order=sorted(results_compress_compare['model_selected_because'].unique(), reverse=True),
    facet_kws={'sharex':False, 'sharey':False, 'legend_out':False}, height=5, aspect=1.2,
    kind='kde'
)


for ax in g.fig.axes:
    # Vertical dashed reference line at axvline's default position.
    ax.axvline(c="black", ls='--', linewidth=1)
    # Facet titles have the form "<row var> = <value> | <col var> = <value>".
    title_list = ax.title.get_text().replace('_',' ').split(' | ')
    if 'mcc' in title_list[0]:
        # MCC row: show the experiment category as the title.
        ax.set_title(title_list[1].split(' = ')[-1], pad=25)
        ax.set(ylim=(0, 0.28))
        ax.set(xlim=(-0.8, 0.3))
        ax.set_xticks([-0.6, -0.3, 0, 0.3])
        ax.set_xlabel('delta MCC', labelpad=10)
    else:
        # Bias row: no title.
        ax.set_title('')
        ax.set(ylim=(0, 0.44))
        ax.set(xlim=(-0.3, 0.3))
        ax.set_xlabel('delta reliability bias', labelpad=10)
    sns.despine(bottom=True, left=True)

g.fig.tight_layout()
# Anchor the legend's top centre at the bottom edge of the figure.
sns.move_legend(g, loc='upper center', frameon=False, ncol=3, bbox_to_anchor=(0.5, 0.0) )
g._legend.set_title(title='selection strategy')
# Legend texts are overwritten by position.
# NOTE(review): assumes exactly three 'model_selected_because' values whose
# reverse-sorted order matches these labels -- confirm against the data.
g._legend.get_texts()[0].set_text('low bias')
g._legend.get_texts()[1].set_text('high accuracy')
g._legend.get_texts()[2].set_text('low bias + high accuracy')

display(g)
plt.savefig('figures/pruninghps_selection_strategy.png', bbox_inches='tight')

## Section 5.4 Model Selection

### Tables

In [None]:
cnn_16_selection = results_analysis.model_selection(results_bias, 'mswc35_de16_cnn', 0.99)[
    ['input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']]
cnn_16_selection

In [None]:
results_analysis.model_selection(results_bias, 'mswc35_de16_llcnn', 0.99)[
    ['input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']]

In [None]:
cnn_8_selection = results_analysis.model_selection(results_bias, 'mswc35_de8_cnn', 0.99)[
    ['input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']]
cnn_8_selection

In [None]:
results_analysis.model_selection(results_bias, 'mswc35_de8_llcnn', 0.99)[
    ['input_features','frame_length','frame_step','mel_bins','mfccs','window_fn','all_mcc','model_bias']]

In [None]:
def fairest_pruned_model_in_mcc_range(df, exp_name, pruning_final_sparsity, min_percentage_of_mcc=0.99):
    """Return the run with the lowest ``model_fairness`` among near-best runs.

    Only rows of ``df`` matching ``exp_name`` and ``pruning_final_sparsity``
    are considered. Of those, rows whose ``all_mcc`` is at least
    ``min_percentage_of_mcc`` times the best ``all_mcc`` are kept, and the
    one with the smallest ``model_fairness`` is returned (the first one on
    ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``exp_name``, ``pruning_final_sparsity``,
        ``all_mcc`` and ``model_fairness``.
    exp_name :
        Value of ``exp_name`` to select.
    pruning_final_sparsity :
        Value of ``pruning_final_sparsity`` to select (compared with ``==``).
    min_percentage_of_mcc : float, default 0.99
        Fraction of the best ``all_mcc`` a run must reach to be eligible.

    Returns
    -------
    pandas.Series
        The selected row. Its name is the row's position among the eligible
        rows, because their index is reset before the lookup.

    Raises
    ------
    IndexError
        If no row with a non-missing ``all_mcc`` matches the experiment and
        sparsity.

    Notes
    -----
    The threshold is a plain product. If the best ``all_mcc`` is negative
    and ``min_percentage_of_mcc`` is below 1, the threshold lies above the
    best value and no row is eligible.
    """
    # Evaluate the experiment/sparsity filter once and reuse it.
    in_scope = (df.exp_name == exp_name) & (df.pruning_final_sparsity == pruning_final_sparsity)
    candidates = df[in_scope]

    best_mcc = candidates['all_mcc'].max()
    if pd.isna(best_mcc):
        # Same exception type as the former `.values[0]` lookup on an empty
        # selection, but with a message that names the failing combination.
        raise IndexError(
            'no run with a valid all_mcc for exp_name=%r, pruning_final_sparsity=%r'
            % (exp_name, pruning_final_sparsity))
    min_mcc = best_mcc * min_percentage_of_mcc

    selected_models = candidates[candidates.all_mcc >= min_mcc
                                ].dropna(how='all').reset_index(drop=True)

    # idxmin() returns an index label, so look the row up by label.
    return selected_models.loc[selected_models['model_fairness'].idxmin()]

In [None]:
def pruned_model_selection(df, min_percentage_of_mcc):
    """Select the fairest near-best pruned model per architecture and sparsity.

    For each of the four Speech Commands compression experiments
    (``sc8_cnn``, ``sc8_llcnn``, ``sc16_cnn``, ``sc16_llcnn``, each with the
    ``-compress_ew`` suffix) and each final sparsity level, the
    equal-weighted rows of ``df`` are passed to
    ``fairest_pruned_model_in_mcc_range`` and the selected rows are stacked.

    Parameters
    ----------
    df : pandas.DataFrame
        Pruning results; must contain ``equal_weighted`` plus the columns
        required by ``fairest_pruned_model_in_mcc_range``.
    min_percentage_of_mcc : float
        Forwarded unchanged to ``fairest_pruned_model_in_mcc_range``.

    Returns
    -------
    pandas.DataFrame
        One row per (architecture, sparsity) combination, indexed by the
        name of each selected row.
    """
    # The equal-weighted filter does not depend on the loop variables.
    equal_weighted_runs = df[df.equal_weighted==True]

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call, so collect the rows and build the frame once.
    selected_rows = []
    for a in ['sc8_', 'sc8_ll','sc16_','sc16_ll']:
        arch = a+'cnn-compress_ew'
        for s in [0.2, 0.5, 0.75, 0.8, 0.85, 0.9]:
            selected_rows.append(
                fairest_pruned_model_in_mcc_range(equal_weighted_runs, arch, s, min_percentage_of_mcc))

    return pd.DataFrame(selected_rows)

In [None]:
pruned_model_selection(results_sparsity, 1).groupby('exp_name').agg({'all_mcc':['mean','std'],'model_fairness':['mean','std']})

In [None]:
pruned_model_selection(results_sparsity, 0.995).groupby('exp_name').agg({'all_mcc':['mean','std'],'model_fairness':['mean','std']})

In [None]:
# pd.set_option('display.float_format', '{:.1e}'.format)
pruned_model_selection(results_sparsity, 0.99).groupby('exp_name').agg({'all_mcc':['mean','std'],'model_fairness':['mean','std']})

In [None]:
pd.set_option('display.float_format', '{:.1e}'.format)
pruned_model_selection(results_sparsity, 0).groupby('exp_name').agg({'all_mcc':['mean','std'],'model_fairness':['mean','std']})

In [None]:
pd.set_option('display.float_format', '{:.4}'.format)

acc_df = pruned_model_selection(results_sparsity, 0.99).sort_values(['exp_name','pruning_final_sparsity'])
acc_df

In [None]:
acc_df.groupby(['exp_name','pruning_learning_rate'])['all_mcc'].count()

In [None]:
acc_df.groupby(['exp_name','pruning_frequency'])['all_mcc'].count()

In [None]:
acc_df.groupby(['exp_name','pruning_schedule'])['all_mcc'].count()

In [None]:
acc_df.groupby(['exp_name','trained_model_path'])['all_mcc'].count()

In [None]:
acc_df.groupby(['model_selected_because'])['all_mcc'].count()