In [162]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import dalex as dx

In [149]:
# Read the source CSV into the working frame. Judging by the file name this is
# ACS income data for New York, 2018 -- confirm against the data source.
income_csv = 'data/ACSIncome_NY_2018.csv'
df = pd.read_csv(income_csv)

### Prepare Data

In [150]:
# Columns retained for modelling: six predictors followed by the label.
# The label stays last because the X / y split below is positional.
kept_columns = ["AGEP", "COW", "SCHL", "MAR", "WKHP", "SEX", "INCOME"]

# Give the label a readable name, then narrow the frame to the kept columns.
df = df.rename(columns={'TARGET': 'INCOME'})[kept_columns]

In [151]:
# Positional split: every column except the last is a feature, the last
# column (INCOME) is the label.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

In [152]:
# Hold out 20% of the rows with a fixed seed. SEX is passed as a third array
# so the group labels are split alongside the features and the label.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    X,
    y,
    X['SEX'],
    test_size=0.2,
    random_state=42,
)

### Train the model

In [153]:
# Standardise the features, then fit a logistic regression with default
# settings on the training split.
scaler = StandardScaler()
classifier = LogisticRegression()
model = make_pipeline(scaler, classifier)
model.fit(X_train, y_train)

In [154]:
# Wrap the fitted pipeline in a dalex explainer.
# NOTE(review): the explainer is built on the full X / y, which includes the
# rows the model was trained on, so the performance and fairness numbers below
# are not purely out-of-sample. Switching to X_test / y_test would also require
# the `protected` vector passed to model_fairness to cover the same rows.
exp = dx.Explainer(model, data=X, y=y)

Preparation of a new explainer is initiated

  -> data              : 103021 rows 6 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 103021 values
  -> model_class       : sklearn.linear_model._logistic.LogisticRegression (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x28ee6c040> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 5.22e-05, mean = 0.415, max = 0.999
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.997, mean = -0.000104, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!



X does not have valid feature names, but StandardScaler was fitted with feature names



In [155]:
# Compute the explainer's performance summary and display its result table.
performance = exp.model_performance()
performance.result

Unnamed: 0,recall,precision,f1,accuracy,auc
LogisticRegression,0.669366,0.714304,0.691105,0.751837,0.827946


### Create the privileged and protected groups

In [156]:
# Recode the numeric SEX column into readable group labels: 2.0 -> "Female",
# everything else -> "Male" (unchanged from the original mapping).
# The extra comparison against "Female" makes the cell idempotent. Previously,
# re-running it compared the already-recoded strings against 2.0, which is
# False for every row, and silently relabelled the whole population as "Male".
df['SEX'] = np.where(
    (df['SEX'] == 2.0) | (df['SEX'] == "Female"),
    "Female",
    "Male",
)

# Group label per row for the fairness check, and the reference group that the
# other group's metrics are expressed relative to.
protected = df['SEX']
privileged = "Male"

### Check for fairness

In [157]:
# Build the group-fairness object from the SEX labels, using the privileged
# group as the reference.
fobject = exp.model_fairness(protected=protected, privileged=privileged)

# Run the check with epsilon = 0.8 (the default epsilon).
fobject.fairness_check(epsilon=0.8)

Bias detected in 2 metrics: FPR, STP

Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.

Ratios of metrics, based on 'Male'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
            TPR       ACC       PPV       FPR       STP
Female  0.81044  1.021505  0.951923  0.596708  0.645435


This model cannot be called fair! Generally, each metric should lie between epsilon and 1/epsilon. Metrics are calculated for each subgroup, and each subgroup's scores are then divided by the scores of the privileged subgroup, which is why the privileged subgroup is omitted from the table.

In [158]:
# Metric values per group, scaled by the privileged group's values.
scaled_metrics = fobject.result
scaled_metrics

Unnamed: 0,TPR,TNR,PPV,NPV,FNR,FPR,FDR,FOR,ACC,STP
Female,0.81044,1.129458,0.951923,1.042216,1.507353,0.596708,1.128676,0.867769,1.021505,0.645435
Male,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [160]:
# Draw the fairness object's default plot.
fobject.plot()

In [161]:
# Plot the fairness object again, this time requesting the "radar" plot type.
fobject.plot(type="radar")

### Adding noise to the sensitive attributes

In [132]:
# Check the initial distribution of the SEX labels.
# The counts are computed once and reused. The original recomputed
# value_counts() for each lookup and also began with a bare value_counts()
# expression whose result was discarded (it was not the cell's last expression).
sex_counts = df['SEX'].value_counts()

male = sex_counts['Male']
female = sex_counts['Female']

# Share of each group among all labelled rows, rounded to three decimals.
tot = male + female
print(f'Proportion female {round(female/tot, 3)}')
print(f'Proportion male {round(male/tot, 3)}')

Proportion female 0.494
Proportion male 0.506


Initially, roughly 50-50 distribution!

In [136]:
# oversample the male population
males = df[df['SEX'] == 'Male']
females = df[df['SEX'] == 'Female']