In [None]:
import joblib
from src.preprocessing import get_preprocessed_data
from sklearn.model_selection import train_test_split

In [None]:
# Load the persisted black-box model (presumably an XGBoost model, judging by
# the file name). NOTE: joblib.load unpickles the file, so only load artifacts
# that come from a trusted source.
model = joblib.load(r"../models/black_box_xgboost.pkl")

# get_preprocessed_data() yields five values; the 2nd and 3rd are not needed here.
df, _, _, true_labels, pct_afroamericans = get_preprocessed_data()

# Hold out 20% of the rows, with a fixed seed and stratification on the labels
# so that the split is reproducible.
_split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": true_labels,
}
X_train, X_test, y_train, y_test = train_test_split(df, true_labels, **_split_options)


## Fairness

### Discussions

The only variable we have: `Pct_afro_american`.

What we want: a binary variable indicating whether or not the person is Afro-American.

Four possible ways to classify people as Afro-American (AA):
- They live in an area with over 50% AAs
- They live in an area with more AAs than average
- They are among the people living in the most densely populated areas
- Randomly, based on probabilities

In [None]:
from src.utils import quantile_binary, random_binary, over_pct_binary

In [None]:
# Build the four candidate binary "is Afro-American" indicators from the
# per-individual pct_afroamericans values.
avg_pct_afroamericans = pct_afroamericans.mean()

is_afroamerican_50_pct = over_pct_binary(pct_afroamericans, .5)
is_afroamerican_avg = over_pct_binary(pct_afroamericans, avg_pct_afroamericans)
# NOTE(review): quantile_binary receives the mean as its second argument;
# confirm against src.utils that this is the intended parameter.
is_afroamerican_qb = quantile_binary(pct_afroamericans, avg_pct_afroamericans)
is_afroamerican_rb = random_binary(pct_afroamericans, seed=2025)

# Display names, in the same order as the indicators above.
names = ["Over 50%", "Over Average", "Quantile-based", "Random-based"]

# Mean of each indicator, aligned with `names`.
means = [
    indicator.mean()
    for indicator in (
        is_afroamerican_50_pct,
        is_afroamerican_avg,
        is_afroamerican_qb,
        is_afroamerican_rb,
    )
]

### Plot

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of the mean of each indicator; the lists are reversed so
# the first definition ends up at the top of the chart.
fig, ax = plt.subplots()
ax.barh(list(reversed(names)), list(reversed(means)))
ax.set_xlabel("Proportion of individuals classified as Afro-American")
ax.set_title("Different definitions of Afro-American individuals")
plt.show()

## Fairness tests and plots

In [None]:
from src.fairness import fairness_test_statistic, fairness_partial_dependance_plots
import constants.column_names as cst
import numpy as np

### Fairness tests

In [None]:
def _fairness_on_test_set(is_afroamerican):
    """Run fairness_test_statistic on the test split for one group indicator.

    The indicator is restricted to the rows of X_test (via its index) before
    being passed to the test.
    """
    return fairness_test_statistic(X_test, model, is_afroamerican.loc[X_test.index])


# One test statistic per group definition, in the same order as `names`.
fairness_50_pct = _fairness_on_test_set(is_afroamerican_50_pct)
fairness_avg = _fairness_on_test_set(is_afroamerican_avg)
fairness_qb = _fairness_on_test_set(is_afroamerican_qb)
fairness_rb = _fairness_on_test_set(is_afroamerican_rb)

fairnesses = [fairness_50_pct, fairness_avg, fairness_qb, fairness_rb]

In [None]:
# Base-10 log of each test statistic, reversed so the first definition is
# drawn at the top of the chart.
log10_fairnesses = np.log10(fairnesses[::-1])

fig, ax = plt.subplots()
ax.barh(names[::-1], log10_fairnesses)
ax.set_xlabel("Log of Fairness Test Statistic (p-value)")
ax.set_title("Fairness Test Statistics by Group")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns

# Contingency table of the random-based group indicator against the model's
# predictions on the test split.
is_afroamerican_rb_test = is_afroamerican_rb.loc[X_test.index]
crosstab = pd.crosstab(
    1*is_afroamerican_rb_test,  # multiply by 1 so the row labels are numeric
    model.predict(X_test),
    rownames=['Afro-American'],
    colnames=['Predicted Default'],
)
# Row-normalised version: each row sums to 1.
pct_crosstab = crosstab.div(crosstab.sum(axis=1), axis=0)

# Raw counts. The title is set BEFORE saving so the saved PNG includes it
# (previously the file was written without its title).
fig_counts, ax_counts = plt.subplots()
sns.heatmap(crosstab, annot=True, ax=ax_counts)
ax_counts.set_title("Count of predicted default by Afro-American status")
fig_counts.savefig("../results/fairness/contingency_table.png")
plt.show()

# Row proportions, saved the same way.
fig_pct, ax_pct = plt.subplots()
sns.heatmap(pct_crosstab, annot=True, ax=ax_pct)
ax_pct.set_title("Proportion of predicted default by Afro-American status")
fig_pct.savefig("../results/fairness/contingency_table_pct.png")
plt.show()

### Analysis of the influence of randomness on the results

In [None]:
# Repeat the random-based group assignment many times and re-run the fairness
# test each time, to see how much the result depends on the random draw.
N_RANDOM_DRAWS = 100

res_fairness_random = []
for draw_seed in range(N_RANDOM_DRAWS):
    # A distinct fixed seed per draw: the draws still differ from one another,
    # but the whole analysis is reproducible on re-run (seed=None was not).
    is_afroamerican_rb_randomness_test = random_binary(pct_afroamericans, seed=draw_seed)
    res_fairness_random.append(
        fairness_test_statistic(
            X_test, model, is_afroamerican_rb_randomness_test.loc[X_test.index]
        )
    )

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print("mean:", np.mean(res_fairness_random), "std:", np.std(res_fairness_random))

# Distribution of the test statistic across draws, on the raw scale.
# The dashed line is a constant reference at 0.05, not a computed statistic,
# so it is labelled as such and the legend is drawn to make the label visible.
fig, ax = plt.subplots()
ax.boxplot(res_fairness_random)
ax.axhline(y=0.05, color='r', linestyle='--', label='p-value = 0.05')
ax.set_ylabel("Fairness test statistic (p-value)")
ax.set_title("Fairness test statistic over random group assignments")
ax.legend()
plt.show()

# Same distribution on the natural-log scale.
fig, ax = plt.subplots()
ax.boxplot(np.log(res_fairness_random))
ax.axhline(y=np.log(0.05), color='r', linestyle='--', label='log(p-value = 0.05)')
ax.set_ylabel("Natural log of fairness test statistic (p-value)")
ax.set_title("Log fairness test statistic over random group assignments")
ax.legend()
plt.show()

### FPDPs

In [None]:
# Restrict every group indicator to the rows of the test split.
is_afroamerican_50_pct_test = is_afroamerican_50_pct.loc[X_test.index]
is_afroamerican_avg_test = is_afroamerican_avg.loc[X_test.index]
is_afroamerican_qb_test = is_afroamerican_qb.loc[X_test.index]
is_afroamerican_rb_test = is_afroamerican_rb.loc[X_test.index]

# Produce the fairness partial dependence plots once per group definition,
# each with its own file_dir.
for group_indicator, output_dir in (
    (is_afroamerican_50_pct_test, "../results/fpdp/50pct"),
    (is_afroamerican_avg_test, "../results/fpdp/avg"),
    (is_afroamerican_qb_test, "../results/fpdp/qb"),
    (is_afroamerican_rb_test, "../results/fpdp/rb"),
):
    fairness_partial_dependance_plots(
        X_test,
        model,
        group_indicator,
        n_points=10,
        file_dir=output_dir,
        categorical_features=cst.CATEGORICAL_COLUMNS,
    )