In [1]:
import numpy as np
from folktables import ACSDataSource, ACSMobility
from matplotlib import pyplot as plt
import torch
from scipy import stats
from scipy.sparse.linalg import lobpcg
from scipy.linalg import eigh, eig
import pandas as pd
from inFairness.distances import MahalanobisDistances, SquaredEuclideanDistance, LogisticRegSensitiveSubspace
from inFairness.fairalgo import SenSeI
from inFairness.auditor import SenSeIAuditor, SenSRAuditor
from tqdm.auto import tqdm
from utils import *

  warn_deprecated('vmap', 'torch.vmap')


In [2]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import data

In [3]:
class TrainDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.labels[idx]
        return data, label
    
    def __len__(self):
        return len(self.labels)

In [4]:
def relu(z):
    return z * (z > 0)

def sigmoid(z):
    return 1/(1 + np.exp(-z))

def softmax(z):
    exponentials = np.exp(z - np.max(z))
    return exponentials / exponentials.sum()

In [5]:
r = 3
p = 30
n = 75

In [6]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["TX"], download=True)
acs_data_adult = acs_data[acs_data["AGEP"] >= 18]


In [7]:
acs_data_cleaned = acs_data_adult.select_dtypes(include=["float64", "int64"])
acs_data_cleaned = acs_data_cleaned.loc[:, ~(acs_data_cleaned.isna().any())]
acs_data_cleaned = acs_data_cleaned.loc[:, (acs_data_cleaned.var(axis=0) > 0)]
acs_data_cleaned = acs_data_cleaned.sample(frac=1, axis=0)
print(np.shape(np.array(acs_data_cleaned)))


(206826, 219)


In [8]:
synthetic_data = pd.DataFrame(np.random.normal(0, 1, size=(2 * n, p)))

In [9]:
print("Generating synthetic data...")
X = clean_data(2 * n, p, synthetic_data)
X_train = X[:n]
X_test = X[n:]

Generating synthetic data...


In [10]:
M, S, y, Astar, Kstar = generate_synthetic_data(n, r, p, X_train)

print(np.shape(X_train), np.shape(X_test))

(75, 30) (75, 30)


In [11]:
A0 = initialization(n, r, p, S, X_train, y)

In [12]:
np.linalg.norm(A0 @ A0.T - Kstar)

0.5241586420597083

In [13]:
A_iterates = []
A = torch.tensor(A0, requires_grad=True, device="cpu")
dists = []

y_tensor = torch.tensor(y, device="cpu")
M_tensor = torch.tensor(M, device="cpu")
for iterate in range(100):
    loss = L(A, y_tensor, M_tensor)
    loss.backward()
    with torch.no_grad():
        A -= A.grad * 0.5
        A_iterates.append(A.detach().cpu().numpy())
        dists.append(np.linalg.norm(A.detach().cpu().numpy() @ A.detach().cpu().numpy().T - Kstar))
        A.grad.zero_()
    if iterate % 10 == 9:
        print(iterate + 1, loss, np.linalg.norm(A.detach().cpu().numpy() @ A.detach().cpu().numpy().T - Kstar))

10 tensor(0.5076, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.18603579772551634
20 tensor(0.5040, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.1021843662016137
30 tensor(0.5033, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.06592304936157556
40 tensor(0.5031, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.04770017738727625
50 tensor(0.5031, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.03818291918469853



KeyboardInterrupt



In [15]:
Ahat = A_iterates[-1]

In [16]:
np.linalg.norm(Ahat @ Ahat.T - Kstar)

0.03483869761803121

In [17]:
# lin1star = np.random.normal(0, 1, size=(p, 20))
# lin2star = np.random.normal(0, 1, size=(20, 20))
# lin3star = np.random.normal(0, 1, size=20)
# epsilon = np.random.normal(0, 0.01, size=2 * n)

# Y_probs = relu(np.einsum("ij,jk->ik", X, lin1star))
# Y_probs = relu(np.einsum("ij,jk->ik", Y_probs, lin2star))
# Y_probs = sigmoid(np.einsum("ij,j->i", Y_probs, lin3star))

Y_probs = np.sign(X[:, 0]) / 10 + 0.5

Y = (np.random.random(size=2 * n) < Y_probs).astype(int)

Y_train = Y[:n]
Y_test = Y[n:]

In [26]:
X_train_t = torch.Tensor(X_train)
Y_train_t = torch.Tensor(Y_train)

X_test_t = torch.Tensor(X_test)
Y_test_t = torch.Tensor(Y_test)

train_dataset = TrainDataset(X_train_t, Y_train_t)
test_dataset = TrainDataset(X_test_t, Y_test_t)

train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=8)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=8)

In [27]:
class NeuralNet(torch.nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.lin1 = torch.nn.Linear(p, 20, bias=False)
        self.lin2 = torch.nn.Linear(20, 20, bias=False)
        self.lin3 = torch.nn.Linear(20, 1, bias=False)

    def forward(self, x):
        x = torch.nn.functional.relu(self.lin1(x))
        x = torch.nn.functional.relu(self.lin2(x))
        return torch.nn.functional.sigmoid(self.lin3(x))

In [28]:
network_standard = NeuralNet()
optimizer = torch.optim.Adam(network_standard.parameters(), lr=1e-3)
loss_fn = torch.nn.functional.binary_cross_entropy

network_standard.train()

for epoch in range(1000):

    for x, y in train_dl:
        optimizer.zero_grad()
        y_pred = network_standard(x).squeeze()
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()

In [29]:
loss

tensor(3.9736e-07, grad_fn=<BinaryCrossEntropyBackward0>)

In [89]:
input_metric = MahalanobisDistances()
input_metric.fit(torch.Tensor(Ahat @ Ahat.T))

input_metric_true = MahalanobisDistances()
input_metric_true.fit(torch.Tensor(Astar @ Astar.T))

In [90]:
output_metric = SquaredEuclideanDistance()
output_metric.fit(num_dims=1)

In [91]:
network = NeuralNet()

In [92]:
rho = 5.0
eps = 0.1
auditor_nsteps = 100
auditor_lr = 0.001

alg = SenSeI(network, input_metric, output_metric, loss_fn, rho, eps, auditor_nsteps, auditor_lr)

In [93]:
optimizer = torch.optim.Adam(network.parameters(), lr=0.001)

In [94]:

alg.train()

for epoch in range(100):
    for x, y in train_dl:
        optimizer.zero_grad()
        result = alg(x, torch.reshape(y, (-1, 1)))
        result.loss.backward()
        optimizer.step()

In [95]:
result.loss

tensor(0.0019, grad_fn=<MeanBackward0>)

In [96]:

# auditor = SenSRAuditor(torch.nn.L1Loss, output_metric, nsteps, lr)

auditor = SenSeIAuditor(input_metric, output_metric, auditor_nsteps, auditor_lr)
auditor_true = SenSeIAuditor(input_metric_true, output_metric, auditor_nsteps, auditor_lr)

In [97]:
auditor.audit(network, X_test_t, Y_test_t, torch.nn.functional.l1_loss)

AuditorResponse(lossratio_mean=1.0001038, lossratio_std=0.18289803, lower_bound=0.9960926247913363, threshold=None, pval=None, confidence=None, is_model_fair=None)

In [98]:
auditor_true.audit(network, X_test_t, Y_test_t, torch.nn.functional.l1_loss)

AuditorResponse(lossratio_mean=1.0173771, lossratio_std=0.22246106, lower_bound=1.012498259726007, threshold=None, pval=None, confidence=None, is_model_fair=None)

In [99]:
examples = auditor.generate_worst_case_examples(network, X_test_t, torch.Tensor([0.1]))

In [100]:
ratios = []
for X_1 in X_test_t:
    for X_2 in X_test_t:
        ratios.append((output_metric(network(X_1), network(X_2)) / input_metric(X_1, X_2)).detach().numpy())
ratios = np.array(ratios)
np.max(ratios[~np.isnan(ratios)])

103.48351

In [101]:
ratios = []
for X_1 in X_test_t:
    for X_2 in X_test_t:
        ratios.append((output_metric(network(X_1), network(X_2)) / input_metric_true(X_1, X_2)).detach().numpy())
ratios = np.array(ratios)
np.max(ratios[~np.isnan(ratios)])

79.06766