In [1]:
import numpy as np
import pandas as pd
from collections import deque
from kmodes.kmodes import KModes

import utils
import os
import torch

from fairness_metrics.Predicted_outcomes.Error_rate_metrics import Error_rate_metrics
from fairness_metrics.Predicted_outcomes.Predictive_value_metrics import Predictive_value_metrics
from fairness_metrics.Predicted_outcomes.statistical_parity import statistical_parity
from fairness_metrics.predicted_probs.balance_in_pos_neg import balance_in_pos_neg
from fairness_metrics.predicted_probs.well_callibrated import well_calibration
from fairness_metrics.similarity_based.similarity_based import LipschitzFairness

loaded utils
loaded error
loaded predictive
loaded statisctical
loaded balance
loaded calibration


In [10]:
# Load the scored data set (model outputs plus ground truth).
df = pd.read_csv("data/altered_data/data_pred_ground_altered_pred.csv")

# Binarise the prediction scores: strictly above 0.7 -> 1, otherwise 0.
df['bin_predictions'] = (df['predictions'] > 0.7).astype('int64')

# One-hot encode every object/category column, dropping the first level of each.
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Address-related columns used as the (categorical) clustering input.
features = [
    'adres_aantal_brp_adres',
    'adres_aantal_verschillende_wijken',
    'adres_aantal_verzendadres',
    'adres_aantal_woonadres_handmatig',
    'adres_dagen_op_adres',
    'adres_recentst_onderdeel_rdam',
]
X = df[features].astype(str).to_numpy()
y = df['bin_predictions'].to_numpy()
max_iter = 10

# Low-cardinality columns (fewer than 10 distinct values), excluding the
# prediction, label and cluster bookkeeping columns.
candidate_attrs = [
    col
    for col in df.columns
    if df[col].nunique() < 10
    and col not in ('bin_predictions', 'ground_truth', 'cluster', 'predictions')
]


class Dataset:
    """Float-tensor view of a DataFrame together with column lookups.

    Attributes set by ``__init__``:
        data:    ``torch.float`` tensor holding the frame's values.
        columns: list of the frame's column labels, in order.
        i2c:     index -> column label lookup (the same list object as
                 ``columns``).
        c2i:     column label -> positional index dictionary.
    """

    def __init__(self, df):
        """Build the tensor and the lookups from ``df``.

        Args:
            df: DataFrame whose columns can all be converted to float
                (numeric and bool columns).
        """
        # Convert through an explicit float array. A frame that mixes bool
        # columns (e.g. produced by ``pd.get_dummies``) with numeric ones has
        # an object-dtype ``.values``, which ``torch.tensor`` cannot convert.
        self.data = torch.tensor(df.to_numpy(dtype=float), dtype=torch.float)
        self.columns = df.columns.tolist()
        self.i2c = self.columns
        self.c2i = {name: i for i, name in enumerate(self.columns)}


class ClusterNode:
    """A node of the binary cluster tree.

    Holds the row positions that belong to the cluster, the cluster's id, and
    references to the two child nodes.
    """

    def __init__(self, indices, cluster_id):
        """Create a childless node for ``indices`` labelled ``cluster_id``."""
        self.indices = indices
        # Both children start unset; callers attach them later.
        self.left = self.right = None
        self.cluster_id = cluster_id


def compute_spd(sub_df, params):
    """Return the largest per-attribute spread of ``group_probs`` in ``sub_df``.

    For each attribute name in ``params`` the ``statistical_parity`` metric is
    evaluated with that single column flagged as protected, and the difference
    between the largest and smallest entry of the reported ``group_probs`` is
    taken. The maximum of those differences over all attributes is returned.

    Args:
        sub_df: DataFrame slice to evaluate; it must contain the
            ``bin_predictions`` and ``ground_truth`` columns and every
            attribute named in ``params``.
        params: iterable of column names to treat, one at a time, as the
            protected attribute. Previously this argument was overwritten
            before being read and the module-level ``candidate_attrs`` was
            used instead; every call site in this file passes that same list,
            so results there are unchanged.

    Returns:
        The maximum spread found, or the int ``0`` when no attribute reports
        at least two groups. NOTE(review): the spread has whatever numeric
        type ``group_probs`` holds (tensors, judging by the notebook output).
    """
    wrapped = Dataset(sub_df)

    max_diff = 0
    for attr in params:
        # Separate name for the metric configuration so the attribute list
        # received as ``params`` is not clobbered while iterating over it.
        metric_params = {
            'prediction_column': 'bin_predictions',
            'ground_truth_column': 'ground_truth',
            # Boolean mask over the columns with only ``attr`` switched on.
            'protected_values': torch.tensor([col == attr for col in sub_df.columns])
        }

        metric = statistical_parity(wrapped, metric_params)
        results = metric.show(raw_results=True)

        probs = list(results[attr]['group_probs'].values())
        # A spread is only defined when at least two groups are present.
        if len(probs) >= 2:
            max_diff = max(max_diff, abs(max(probs) - min(probs)))
    return max_diff


def hbca_tree(X, y, df, max_iter, min_size=50):
    """Split the data into clusters breadth-first, guided by ``compute_spd``.

    Starting from one root cluster containing every row, clusters are taken
    from a FIFO queue and split in two with ``KModes``. A split is kept only
    when both children have at least ``min_size`` rows and the larger of the
    two children's ``compute_spd`` values exceeds the parent's. Kept children
    receive fresh cluster ids, written to ``df['cluster']``, and are queued
    for further splitting.

    Args:
        X: array of clustering features, indexable by an integer position
            array and row-aligned with ``df``.
        y: sequence aligned with ``X``; only its length is used.
        df: DataFrame to label. It is modified in place (``cluster`` column).
        max_iter: maximum number of clusters taken from the queue.
        min_size: minimum number of rows each child of a kept split must have.

    Returns:
        ``df`` itself, with the ``cluster`` column filled in.
    """
    # Every row starts in the root cluster (id 0), so the column exists even
    # when no split is ever accepted.
    if 'cluster' not in df.columns:
        df['cluster'] = 0

    cluster_counter = 1
    root = ClusterNode(np.arange(len(y)), cluster_id=0)
    queue = deque([root])

    for _ in range(max_iter):
        if not queue:
            # Nothing left to split; remaining iterations would be no-ops.
            break
        node = queue.popleft()

        # A node must be able to yield two children of at least min_size rows.
        if len(node.indices) < 2 * min_size:
            continue

        labels = KModes(n_clusters=2).fit_predict(X[node.indices])
        left_indices = node.indices[labels == 0]
        right_indices = node.indices[labels == 1]

        # Both children must reach min_size. The earlier ``or`` let a split
        # through with one tiny (or even empty) child. Checking before the
        # fairness evaluation also avoids scoring splits that are rejected
        # on size anyway.
        if len(left_indices) < min_size or len(right_indices) < min_size:
            continue

        spd_parent = compute_spd(df.iloc[node.indices], candidate_attrs)
        spd_left = compute_spd(df.iloc[left_indices], candidate_attrs)
        spd_right = compute_spd(df.iloc[right_indices], candidate_attrs)
        print(spd_parent, spd_left, spd_right)

        # Keep the split only if a child scores higher than the parent.
        improves = max(spd_left, spd_right) > spd_parent
        print(improves)
        if not improves:
            continue

        left_id = cluster_counter
        right_id = cluster_counter + 1
        cluster_counter += 2

        # The index arrays are positional, so map them to labels for ``.loc``.
        df.loc[df.index[left_indices], 'cluster'] = left_id
        df.loc[df.index[right_indices], 'cluster'] = right_id

        node.left = ClusterNode(left_indices, cluster_id=left_id)
        node.right = ClusterNode(right_indices, cluster_id=right_id)
        queue.extend([node.left, node.right])

    return df


# Cluster the rows; the labels are written to df['cluster'].
df = hbca_tree(X, y, df, max_iter, min_size=50)

wrapped_dataset = Dataset(df)

# Evaluate statistical parity with the cluster label as the protected column:
# the mask is True only at the position of the 'cluster' column.
params = dict(
    prediction_column='bin_predictions',
    ground_truth_column='ground_truth',
    protected_values=torch.tensor([name == 'cluster' for name in df.columns]),
)

metric = statistical_parity(wrapped_dataset, params)

# Display each figure returned by the metric.
figs = metric.show()
for fig in figs:
    fig.show()

tensor(0.8783) tensor(0.9060) tensor(1.)
tensor(True)
tensor(0.9060) tensor(0.8904) tensor(0.9412)
tensor(True)
tensor(1.) tensor(1.) tensor(1.)
tensor(False)
tensor(0.8904) tensor(0.8995) tensor(1.)
tensor(True)
tensor(0.9412) tensor(0.9412) tensor(1.)
tensor(True)
tensor(0.8995) tensor(0.9035) tensor(1.)
tensor(True)
tensor(1.) tensor(1.) tensor(1.)
tensor(False)
tensor(0.9412) tensor(0.9412) tensor(1.)
tensor(True)
tensor(0.9035) tensor(0.9040) tensor(0.9412)
tensor(True)
