# Evaluation

## Imports

In [None]:
import collections
import numpy as np
import pandas as pd
import csv
import altair as alt

## Load Data

In [None]:
# Root directory of the experiment output files.
# The trailing slash is required: the loading cells build every path by plain
# string concatenation (data_dir + 'heart/...'), so without it the directory
# name and the first path component would be fused together.
data_dir = '/path/to/output/files/'

In [None]:
# Read the performance CSVs of the baseline and dp runs for both datasets.
# The files are read in the order listed and bound to the names on the left.
(
    baseline_heart_data,
    dp_heart_data,
    baseline_diabetes_data,
    dp_diabetes_data,
) = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in (
        'heart/baseline/performance_complete.csv',
        'heart/dp/performance.csv',
        'diabetes/baseline/performance_complete.csv',
        'diabetes/dp/performance.csv',
    )
)

In [None]:
# Read the per-group fairness CSVs of the heart dataset
# (groups: sex_f, age_<40, age_>70), first for the baseline run ...
(
    baseline_heart_sex_data,
    baseline_heart_age_data_1,
    baseline_heart_age_data_2,
) = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in (
        'heart/baseline/sex_f.csv',
        'heart/baseline/age_<40.csv',
        'heart/baseline/age_>70.csv',
    )
)

# ... then for the dp run.
(
    heart_sex_data,
    heart_age_data_1,
    heart_age_data_2,
) = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in (
        'heart/dp/sex_f.csv',
        'heart/dp/age_<40.csv',
        'heart/dp/age_>70.csv',
    )
)

# Per-group fairness CSVs of the diabetes dataset, baseline run.
(
    baseline_diabetes_preg_data_1,
    baseline_diabetes_preg_data_2,
    baseline_diabetes_bmi_data_1,
    baseline_diabetes_bmi_data_2,
    baseline_diabetes_bmi_data_3,
    baseline_diabetes_age_data_1,
    baseline_diabetes_age_data_2,
) = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in (
        'diabetes/baseline/preg_=0.csv',
        'diabetes/baseline/preg_>=9.csv',
        'diabetes/baseline/bmi_0-18.5.csv',
        'diabetes/baseline/bmi_18.5-25.csv',
        'diabetes/baseline/bmi_>40.csv',
        'diabetes/baseline/age_40-50.csv',
        'diabetes/baseline/age_>50.csv',
    )
)

# Per-group fairness CSVs of the diabetes dataset, dp run.
(
    diabetes_preg_data_1,
    diabetes_preg_data_2,
    diabetes_bmi_data_1,
    diabetes_bmi_data_2,
    diabetes_bmi_data_3,
    diabetes_age_data_1,
    diabetes_age_data_2,
) = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in (
        'diabetes/dp/preg_=0.csv',
        'diabetes/dp/preg_>=9.csv',
        'diabetes/dp/bmi_0-18.5.csv',
        'diabetes/dp/bmi_18.5-25.csv',
        'diabetes/dp/bmi_>40.csv',
        'diabetes/dp/age_40-50.csv',
        'diabetes/dp/age_>50.csv',
    )
)

## Average results


In [None]:
def average_baseline_performance(data):
  """Average the performance metrics over the rows of the last round.

  Args:
    data: DataFrame with a 'round' column and the metric columns
      'binary_accuracy', 'precision', 'recall' and 'f1_score'.

  Returns:
    A single-row DataFrame whose columns are the four metrics and whose
    values are their means over all rows where 'round' equals its maximum.
  """
  metric_columns = ['binary_accuracy', 'precision', 'recall', 'f1_score']

  # Keep only the rows that belong to the highest round number.
  in_final_round = data['round'] == data['round'].max()
  final_round_means = data.loc[in_final_round, metric_columns].mean()

  # The mean is a Series indexed by metric; flip it into one row of columns.
  return pd.DataFrame(final_round_means).T

In [None]:
def average_dp_performance(data):
  """Average performance metrics and epsilon per noise multiplier.

  Args:
    data: DataFrame with a 'noise_multiplier' column plus the columns
      'binary_accuracy', 'precision', 'recall', 'f1_score' and 'epsilon'.

  Returns:
    A DataFrame with one row per distinct noise multiplier, holding
    'noise_multiplier' followed by the mean of each of the five columns.
  """
  averaged_columns = ['binary_accuracy', 'precision', 'recall', 'f1_score', 'epsilon']
  aggregation = {column: 'mean' for column in averaged_columns}

  per_noise_level = data.groupby('noise_multiplier').agg(aggregation)

  # Turn the group key back into a regular column.
  return per_noise_level.reset_index()

In [None]:
def average_fairness_performance(data):
  """Average the fairness measures per noise multiplier.

  Args:
    data: DataFrame with a 'noise_multiplier' column plus the columns
      'DI_degree', 'EOP_difference', 'EODD_difference' and 'SP_difference'.

  Returns:
    A DataFrame with one row per distinct noise multiplier, holding
    'noise_multiplier' followed by the mean of each of the four measures.
  """
  fairness_columns = ['DI_degree', 'EOP_difference', 'EODD_difference', 'SP_difference']
  aggregation = {column: 'mean' for column in fairness_columns}

  per_noise_level = data.groupby('noise_multiplier').agg(aggregation)

  # Turn the group key back into a regular column.
  return per_noise_level.reset_index()

In [None]:
# Last-round metric averages of the baseline runs, one frame per dataset.
baseline_heart_performance, baseline_diabetes_performance = map(
    average_baseline_performance,
    (baseline_heart_data, baseline_diabetes_data),
)

In [None]:
# Per-noise-multiplier averages of the dp runs, one frame per dataset.
dp_heart_performance, dp_diabetes_performance = map(
    average_dp_performance,
    (dp_heart_data, dp_diabetes_data),
)

In [None]:
# Replace each raw heart fairness frame by its per-noise-multiplier average.
# The right-hand tuples are built from the raw frames before any name is rebound.
(
    baseline_heart_sex_data,
    baseline_heart_age_data_1,
    baseline_heart_age_data_2,
) = map(
    average_fairness_performance,
    (
        baseline_heart_sex_data,
        baseline_heart_age_data_1,
        baseline_heart_age_data_2,
    ),
)

(
    heart_sex_data,
    heart_age_data_1,
    heart_age_data_2,
) = map(
    average_fairness_performance,
    (
        heart_sex_data,
        heart_age_data_1,
        heart_age_data_2,
    ),
)

# Replace each raw diabetes fairness frame by its per-noise-multiplier average.
# The right-hand tuples are built from the raw frames before any name is rebound.
(
    baseline_diabetes_preg_data_1,
    baseline_diabetes_preg_data_2,
    baseline_diabetes_bmi_data_1,
    baseline_diabetes_bmi_data_2,
    baseline_diabetes_bmi_data_3,
    baseline_diabetes_age_data_1,
    baseline_diabetes_age_data_2,
) = map(
    average_fairness_performance,
    (
        baseline_diabetes_preg_data_1,
        baseline_diabetes_preg_data_2,
        baseline_diabetes_bmi_data_1,
        baseline_diabetes_bmi_data_2,
        baseline_diabetes_bmi_data_3,
        baseline_diabetes_age_data_1,
        baseline_diabetes_age_data_2,
    ),
)

(
    diabetes_preg_data_1,
    diabetes_preg_data_2,
    diabetes_bmi_data_1,
    diabetes_bmi_data_2,
    diabetes_bmi_data_3,
    diabetes_age_data_1,
    diabetes_age_data_2,
) = map(
    average_fairness_performance,
    (
        diabetes_preg_data_1,
        diabetes_preg_data_2,
        diabetes_bmi_data_1,
        diabetes_bmi_data_2,
        diabetes_bmi_data_3,
        diabetes_age_data_1,
        diabetes_age_data_2,
    ),
)

## Privacy budget

In [None]:
def plot_privacy_budget(data, title):
  """Build a line chart of the privacy budget over the noise multiplier.

  Args:
    data: Table with 'noise_multiplier' and 'epsilon' columns.
    title: Currently unused; the chart is returned without a title.

  Returns:
    An Altair line chart with axis grid lines enabled and tickCount=20.
  """
  x_axis = alt.X('noise_multiplier', title='Noise Multiplier')
  y_axis = alt.Y('epsilon', title='Privacy Budget (ε)')

  budget_line = alt.Chart(data).mark_line().encode(x=x_axis, y=y_axis)

  # Axis styling is applied through the chart-level configuration.
  return budget_line.configure_axis(grid=True, tickCount=20)

In [None]:
# Keep only noise multipliers >= 5 for the privacy budget plots.
# To plot all noise multipliers, drop this filtering and pass the unfiltered
# dp_*_performance frames to plot_privacy_budget instead.
filtered_heart_data = dp_heart_performance.loc[dp_heart_performance['noise_multiplier'] >= 5]
filtered_diabetes_data = dp_diabetes_performance.loc[dp_diabetes_performance['noise_multiplier'] >= 5]

chart_heart, chart_diabetes = (
    plot_privacy_budget(frame, chart_title)
    for frame, chart_title in (
        (filtered_heart_data, 'Privacy budget heart dataset'),
        (filtered_diabetes_data, 'Privacy budget diabetes dataset'),
    )
)

chart_heart.show()
chart_diabetes.show()

## Performance evaluation

In [None]:
# Idea: split into accuracy + F1 (overall performance) and precision + recall (sensitivity analysis).

# TODO: add shaded regions or error bars to show variability?
def plot_performance_metrics(data, title):
  """Build one line chart showing all four performance metrics.

  Args:
    data: Table with a 'noise_multiplier' column and the metric columns
      'binary_accuracy', 'precision', 'recall' and 'f1_score'.
    title: Currently unused; the chart is returned without a title.

  Returns:
    An Altair line chart with the noise multiplier on the x axis, the metric
    value on a fixed [0, 1] y axis, and one coloured line per metric.
  """
  display_names = {
      'binary_accuracy': 'Accuracy',
      'precision': 'Precision',
      'recall': 'Recall',
      'f1_score': 'F1 Score',
  }

  # Long format: one row per (noise multiplier, metric) pair.
  long_form = pd.melt(
      data,
      id_vars='noise_multiplier',
      value_vars=list(display_names),
      var_name='Performance Metric',
      value_name='value',
  )
  # Show readable labels instead of the raw column names in the legend.
  long_form['Performance Metric'] = long_form['Performance Metric'].map(display_names)

  x_axis = alt.X('noise_multiplier', title='Noise Multiplier')
  y_axis = alt.Y('value', title='Performance', scale=alt.Scale(domain=[0,1]))

  return alt.Chart(long_form).mark_line().encode(
      x=x_axis,
      y=y_axis,
      color='Performance Metric',
  )

In [None]:
# Performance-over-noise charts for both datasets.
chart_heart, chart_diabetes = (
    plot_performance_metrics(frame, chart_title)
    for frame, chart_title in (
        (dp_heart_performance, 'Performance heart dataset'),
        (dp_diabetes_performance, 'Performance diabetes dataset'),
    )
)

chart_heart.show()
chart_diabetes.show()


In [None]:
def plot_baseline_comparison(baseline_data, data, title):
  """Build a 2x2 grid comparing each metric against its baseline value.

  Args:
    baseline_data: Table whose first row supplies the baseline value of
      'binary_accuracy', 'precision', 'recall' and 'f1_score'.
    data: Table with a 'noise_multiplier' column and the same metric columns.
    title: Currently unused; the chart is returned without a title.

  Returns:
    An Altair chart of two rows with two panels each. Every panel layers the
    metric's line over the noise multiplier with a red horizontal rule at the
    baseline value.
  """
  display_names = {
      'binary_accuracy': 'Accuracy',
      'precision': 'Precision',
      'recall': 'Recall',
      'f1_score': 'F1 Score',
  }

  panels = []
  for column, label in display_names.items():
    metric_line = alt.Chart(data).mark_line().encode(
        x=alt.X('noise_multiplier', title='Noise Multiplier'),
        y=alt.Y(column, title=label, scale=alt.Scale(domain=[0, 1])),
        tooltip=[column, 'noise_multiplier'],
    )

    # Horizontal reference line at the baseline's value for this metric.
    reference_value = baseline_data[column].values[0]
    reference_rule = alt.Chart(
        pd.DataFrame({'y': [reference_value]})
    ).mark_rule(color='red').encode(y='y')

    panels.append(metric_line + reference_rule)

  # First two metrics form the top row, the remaining ones the bottom row.
  top_row = alt.hconcat(*panels[:2])
  bottom_row = alt.hconcat(*panels[2:])

  return alt.vconcat(top_row, bottom_row)

In [None]:
# Baseline-vs-dp comparison grids for both datasets.
chart_heart, chart_diabetes = (
    plot_baseline_comparison(baseline_frame, dp_frame, chart_title)
    for baseline_frame, dp_frame, chart_title in (
        (baseline_heart_performance, dp_heart_performance, 'Heart dataset'),
        (baseline_diabetes_performance, dp_diabetes_performance, 'Diabetes dataset'),
    )
)
# TODO: write the baseline value next to the plot?
chart_heart.show()
chart_diabetes.show()

## Fairness evaluation

In [None]:
def plot_fairnes_metrics(data, title):
  """Build a faceted bar chart of the fairness measures per noise multiplier.

  Args:
    data: Table with a 'noise_multiplier' column; every other column is
      treated as a fairness measure and plotted.
    title: Currently unused; the chart is returned without a title.

  Returns:
    An Altair bar chart with one 200x200 facet column per fairness measure,
    the noise multiplier on the x axis and the value on a fixed [0, 1] y axis.
  """
  display_names = {
      'DI_degree': 'DI degree',
      'EODD_difference': 'ΔEODD',
      'EOP_difference': 'ΔEOP',
      'SP_difference': 'ΔSP',
  }

  # Long format: one row per (noise multiplier, fairness measure) pair.
  long_form = pd.melt(
      data,
      id_vars=['noise_multiplier'],
      var_name='Fairness Measure',
      value_name='Value',
  )
  long_form['Fairness Measure'] = long_form['Fairness Measure'].replace(display_names)

  # Facet headers are hidden; the colour legend identifies each measure.
  facet_header = alt.Header(title=None, labelOrient='bottom', labels=False)

  bars = alt.Chart(long_form).mark_bar().encode(
      x=alt.X('noise_multiplier:N', title='Noise Multiplier', axis=alt.Axis(labelAngle=0)),
      y=alt.Y('Value:Q', title='Fairness', scale=alt.Scale(domain=[0,1])),
      color='Fairness Measure:N',
      column=alt.Column('Fairness Measure:N', header=facet_header),
  ).properties(width=200, height=200)

  return bars.configure_axisY(grid=True, tickCount=10)

In [None]:
# Fairness charts for the heart dataset groups.
chart_sex, chart_age_1, chart_age_2 = (
    plot_fairnes_metrics(group_frame, chart_title)
    for group_frame, chart_title in (
        (heart_sex_data, 'Heart dataset - sex:f'),
        (heart_age_data_1, 'Heart dataset - age:<40'),
        (heart_age_data_2, 'Heart dataset - age:>70'),
    )
)

chart_sex.show()
chart_age_1.show()
chart_age_2.show()

In [None]:
# Fairness charts for the diabetes dataset groups.
(
    chart_preg_1,
    chart_preg_2,
    chart_bmi_1,
    chart_bmi_2,
    chart_bmi_3,
    chart_age_1,
    chart_age_2,
) = (
    plot_fairnes_metrics(group_frame, chart_title)
    for group_frame, chart_title in (
        (diabetes_preg_data_1, 'Diabetes dataset - preg=0'),
        (diabetes_preg_data_2, 'Diabetes dataset - preg>=9'),
        (diabetes_bmi_data_1, 'Diabetes dataset - bmi<18.5'),
        (diabetes_bmi_data_2, 'Diabetes dataset - bmi:18.5-25'),
        (diabetes_bmi_data_3, 'Diabetes dataset - bmi>40'),
        (diabetes_age_data_1, 'Diabetes dataset - age:40-50'),
        (diabetes_age_data_2, 'Diabetes dataset - age:>50'),
    )
)

# chart_bmi_1 is built above but intentionally not displayed.
chart_preg_1.show()
chart_preg_2.show()
#chart_bmi_1.show()
chart_bmi_2.show()
chart_bmi_3.show()
chart_age_1.show()
chart_age_2.show()

In [None]:
def create_fairness_comparison_plot(heart_sex_data, baseline_heart_sex_data,
                                    noise_multipliers=(5, 20, 50)):
    """Build a grouped bar chart comparing fairness measures against the baseline.

    Despite the parameter names, the function only relies on the column layout
    and is used in this file for every group of both datasets.

    Args:
        heart_sex_data: Table with a 'noise_multiplier' column and the columns
            'DI_degree', 'EOP_difference', 'EODD_difference', 'SP_difference'.
        baseline_heart_sex_data: Table with the same four measure columns; the
            first row supplies the baseline value of each measure.
        noise_multipliers: Noise multipliers to show next to the baseline.
            Defaults to (5, 20, 50), the values that used to be hard-coded.

    Returns:
        An Altair bar chart with one bar group per fairness measure and, inside
        each group, one bar per selected noise multiplier plus one 'Baseline' bar.

    Raises:
        IndexError: If a requested noise multiplier has no row in
            `heart_sex_data`, or the baseline table is empty.
    """
    metrics = ['DI_degree', 'EOP_difference', 'EODD_difference', 'SP_difference']
    plot_data = []

    for metric in metrics:
        for noise_multiplier in noise_multipliers:
            # First row matching this noise multiplier provides the bar height.
            matching_rows = heart_sex_data[heart_sex_data['noise_multiplier'] == noise_multiplier]
            plot_data.append({
                'metric': metric,
                'value': matching_rows[metric].values[0],
                'group': noise_multiplier
            })
        # The baseline bar is appended after the noise-multiplier bars of the metric.
        plot_data.append({
            'metric': metric,
            'value': baseline_heart_sex_data[metric].values[0],
            'group': 'Baseline'
        })

    plot_df = pd.DataFrame(plot_data)

    # Show readable labels instead of the raw column names on the x axis.
    rename_dict = {
        'DI_degree': 'DI degree',
        'EODD_difference': 'ΔEODD',
        'EOP_difference': 'ΔEOP',
        'SP_difference': 'ΔSP'
    }
    plot_df['metric'] = plot_df['metric'].replace(rename_dict)

    chart = alt.Chart(plot_df).mark_bar().encode(
        x=alt.X('metric:N', title=None, axis=alt.Axis(labelAngle=0)),
        y=alt.Y('value:Q',
                title='Fairness',
                scale=alt.Scale(domain=[0, 1])),
        color=alt.Color('group:N', title='Noise'),
        # xOffset places the bars of one metric side by side instead of stacking them.
        xOffset='group:N',
        tooltip=['metric', 'group', 'value']
    ).properties(
        width=500,
        height=300,
    )

    return chart

In [None]:
# Baseline-vs-dp fairness comparison for the heart dataset groups.
chart_sex, chart_age_1, chart_age_2 = (
    create_fairness_comparison_plot(dp_frame, baseline_frame)
    for dp_frame, baseline_frame in (
        (heart_sex_data, baseline_heart_sex_data),
        (heart_age_data_1, baseline_heart_age_data_1),
        (heart_age_data_2, baseline_heart_age_data_2),
    )
)

chart_sex.show()
chart_age_1.show()
chart_age_2.show()

In [None]:
# Baseline-vs-dp fairness comparison for the diabetes dataset groups.
chart_preg_1 = create_fairness_comparison_plot(diabetes_preg_data_1, baseline_diabetes_preg_data_1)
chart_preg_2 = create_fairness_comparison_plot(diabetes_preg_data_2, baseline_diabetes_preg_data_2)
chart_bmi_1 = create_fairness_comparison_plot(diabetes_bmi_data_1, baseline_diabetes_bmi_data_1)
chart_bmi_2 = create_fairness_comparison_plot(diabetes_bmi_data_2, baseline_diabetes_bmi_data_2)
chart_bmi_3 = create_fairness_comparison_plot(diabetes_bmi_data_3, baseline_diabetes_bmi_data_3)
# Fixed: this chart was previously built from the bmi_3 frames (copy-paste
# slip), so the first age chart silently repeated the last BMI chart.
chart_age_1 = create_fairness_comparison_plot(diabetes_age_data_1, baseline_diabetes_age_data_1)
chart_age_2 = create_fairness_comparison_plot(diabetes_age_data_2, baseline_diabetes_age_data_2)

# chart_bmi_1 is built above but intentionally not displayed.
chart_preg_1.show()
chart_preg_2.show()
#chart_bmi_1.show()
chart_bmi_2.show()
chart_bmi_3.show()
chart_age_1.show()
chart_age_2.show()