This file was created to add artificial ground truths to our test data. Both the data and the predictions were extracted from [suspicion_machine](https://github.com/Lighthouse-Reports/suspicion_machine.git). Running this file will create a series of files to test the fairness metrics.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Combine the synthetic dataset with its model predictions and store the
# result as a single CSV file.

preds = pd.read_csv('predictions.csv')
# Attach the predictions as a new 'predictions' column of the dataset.
df = pd.read_csv('synth_data.csv').assign(predictions=preds)
df.to_csv('synth_data_preds.csv')


In [None]:
# Ground truth derived directly from the predictions: rows whose prediction
# lies strictly above the 90th percentile (the top 10 percent) get
# actual_outcome = 1, all other rows get 0.

preds = pd.read_csv('predictions.csv')
df = pd.read_csv('synth_data.csv').assign(predictions=preds)

scores = df['predictions']
cutoff = scores.quantile(0.9)
df['actual_outcome'] = (scores > cutoff).astype(int)
df.to_csv('groundtruth_top10.csv')

In [None]:
# Ground truth based on the predictions with added randomness: every
# prediction p is replaced by one draw from Beta(p * scale, (1 - p) * scale),
# a distribution with mean p, and the rows whose noisy score lies strictly
# above the 90th percentile get actual_outcome = 1 (all others 0).

df = pd.read_csv('synth_data.csv')
preds = pd.read_csv('predictions.csv')
df['predictions'] = preds

# Dedicated, seeded generator: re-running this cell always produces the same
# fixture, independent of which other cells were executed before it.
rng = np.random.default_rng(1)

# Keep p strictly inside (0, 1) so both Beta parameters stay positive.
clipped = df['predictions'].clip(0.001, 0.999)

scale = 20  # Higher = less randomness.
alpha = clipped * scale
beta = (1 - clipped) * scale

# Sample one noisy score per row; reuse df's index so the labels line up
# with the rows of df when they are assigned below.
noisy = pd.Series(rng.beta(alpha, beta), index=df.index)

cutoff = noisy.quantile(0.9)
df['actual_outcome'] = (noisy > cutoff).astype(int)
df.to_csv('groundtruth_top10_noisy.csv')

In [None]:
# Noisy top-10-percent ground truth (predictions resampled from a Beta
# distribution), with the Beta parameters additionally shifted by gender and
# age so that the resulting labels are biased with respect to both.

df = pd.read_csv('synth_data.csv')
preds = pd.read_csv('predictions.csv')
df['predictions'] = preds

# Dedicated, seeded generator: re-running this cell always produces the same
# fixture, independent of which other cells were executed before it.
rng = np.random.default_rng(2)

# Keep p strictly inside (0, 1) before turning it into Beta parameters.
clipped = df['predictions'].clip(0.001, 0.999)

scale = 10  # Higher = less randomness.
gender_bias = .5  # Positive = upward bias for women, negative = downward
age_bias_strength = -0.1  # Each year adds this much bias
age_center = 50  # Age at which the age effect is zero

alpha = clipped * scale
beta = (1 - clipped) * scale

# Add gender bias: the same amount is added to alpha and removed from beta,
# which raises the expected noisy score when the effect is positive.
# Presumably 'persoon_geslacht_vrouw' is a 0/1 indicator for women - confirm.
# NOTE(review): the effect is not centred here, whereas the cell writing
# 'well_calibrated_bias.csv' subtracts .5 first - confirm which is intended.
gender_effect = df['persoon_geslacht_vrouw'] * gender_bias
alpha += gender_effect
beta -= gender_effect

# Add age bias, measured in years relative to age_center.
age_effect = (df['persoon_leeftijd_bij_onderzoek'] - age_center) * age_bias_strength
alpha += age_effect
beta -= age_effect

# The shifts can push a parameter to zero or below; Beta needs both > 0.
alpha = np.clip(alpha, 0.001, None)
beta = np.clip(beta, 0.001, None)

# Sample one noisy score per row; reuse df's index so the labels line up
# with the rows of df when they are assigned below.
noisy = pd.Series(rng.beta(alpha, beta), index=df.index)

cutoff = noisy.quantile(0.9)
df['actual_outcome'] = (noisy > cutoff).astype(int)
df.to_csv('groundtruth_bias_age_gender.csv')

In [None]:
## Random but well-calibrated predictions: each prediction is uniform on
## [0, 1) and the outcome is 1 with probability equal to that prediction.

df = pd.read_csv('synth_data.csv')

# Dedicated, seeded generator: re-running this cell always produces the same
# fixture, independent of which other cells were executed before it.
rng = np.random.default_rng(3)

preds = rng.random(len(df))
# Bernoulli draw per row, vectorized: a fresh uniform number falls below the
# prediction with probability equal to the prediction.
outcomes = (preds > rng.random(len(preds))).astype(int)
df['predictions'] = preds
df['actual_outcome'] = outcomes
df.to_csv('well_calibrated.csv')


In [None]:
## Random predictions whose outcomes are biased by gender and age: the stored
## predictions are uniform on [0, 1), but the outcome probability of each row
## is a Beta-resampled version of its prediction, shifted by gender and age.

df = pd.read_csv('synth_data.csv')

# Dedicated, seeded generator: re-running this cell always produces the same
# fixture, independent of which other cells were executed before it.
rng = np.random.default_rng(4)

preds = rng.random(len(df))

# Keep p strictly inside (0, 1) before turning it into Beta parameters.
temp = preds.clip(0.001, 0.999)

scale = 10  # Higher = less randomness.
gender_bias = .5  # Positive = upward bias for women, negative = downward
age_bias_strength = -0.1  # Each year adds this much bias
age_center = 50  # Age at which the age effect is zero

alpha = temp * scale
beta = (1 - temp) * scale

# Add gender bias, centred around .5 so it can shift in either direction.
# Presumably 'persoon_geslacht_vrouw' is a 0/1 indicator for women - confirm.
# Plain numpy arrays are used so all arithmetic below is positional.
gender_effect = (df['persoon_geslacht_vrouw'].to_numpy() - .5) * gender_bias
alpha = alpha + gender_effect
beta = beta - gender_effect

# Add age bias, measured in years relative to age_center.
age_effect = (df['persoon_leeftijd_bij_onderzoek'].to_numpy() - age_center) * age_bias_strength
alpha = alpha + age_effect
beta = beta - age_effect

# The shifts can push a parameter to zero or below; Beta needs both > 0.
alpha = np.clip(alpha, 0.001, None)
beta = np.clip(beta, 0.001, None)

# Sample the biased outcome probability for each row.
temp = rng.beta(alpha, beta)

# Bernoulli draw per row, vectorized: outcome is 1 with probability temp.
outcomes = (temp > rng.random(len(temp))).astype(int)
df['predictions'] = preds
df['actual_outcome'] = outcomes
df.to_csv('well_calibrated_bias.csv')
