In [20]:
import numpy as np
from folktables import ACSDataSource, ACSMobility, ACSIncome
from matplotlib import pyplot as plt
import torch
from scipy import stats
from scipy.sparse.linalg import lobpcg
from scipy.linalg import eigh, eig
import pandas as pd
from inFairness.distances import MahalanobisDistances, SquaredEuclideanDistance, LogisticRegSensitiveSubspace
from inFairness.fairalgo import SenSeI
from inFairness.auditor import SenSeIAuditor, SenSRAuditor
from tqdm.auto import tqdm
from utils import *

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import data

In [21]:
# Experiment configuration.
# r, p and n are passed to generate_synthetic_data(n, r, p, ...) and set the
# shape (p, r) of the factor A that is optimised further down; n is also the
# size of the train and test splits.
r = 3
p = 20
n = 200

# Seed the global RNGs so that a fresh "Restart & Run All" reproduces the same
# random initialisation (the notebook draws from np.random further down).
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

In [22]:
# Load the 2018 1-Year ACS person survey for Texas (downloaded on demand) and
# convert it to the ACSMobility feature / label frames. The third value
# returned by df_to_pandas is not used here.
data_source = ACSDataSource(
    survey_year="2018",
    horizon="1-Year",
    survey="person",
)
acs_data = data_source.get_data(states=["TX"], download=True)
features, labels, _ = ACSMobility.df_to_pandas(acs_data)

# Disabled alternative: replace the ACS features with 2n synthetic zero-mean
# Gaussian rows whose diagonal covariance is chi on every coordinate except a
# random half of them, which get 1 / chi.
# chi = 1
# synthetic_covariance_diag = chi * np.ones(p)
# synthetic_covariance_indices = np.random.choice(range(p), size=int(p / 2), replace=False)
# synthetic_covariance_diag[synthetic_covariance_indices] = 1 / chi
# features = pd.DataFrame(np.random.multivariate_normal(np.zeros(p), np.diag(synthetic_covariance_diag), size=n * 2))


In [23]:
# Empty table that will collect one row of metrics per recorded iterate.
result_columns = [
    "Iterate",
    "AA^T to Kstar",
    "Standard Train Loss",
    "Fair Train Loss",
    "Audit Mean",
    "Audit Std",
    "Audit Lower Bound",
    "True Audit Mean",
    "True Audit Std",
    "True Audit Lower Bound",
    "Worst Ratio",
    "Worst Ratio True",
]
results = pd.DataFrame(columns=result_columns)


In [24]:
# Build the design matrix with clean_data and split it into two halves:
# the first n entries for training, the remainder for testing.
# NOTE(review): presumably clean_data returns 2 * n rows in the same order as
# `features`, so that X stays aligned with Y below - confirm in utils.
n_total = 2 * n
X = clean_data(n_total, len(features.columns), features, cut_columns=False)
X_train, X_test = X[:n], X[n:]

# Labels for the same leading 2 * n rows, split the same way.
Y = labels.head(n_total)
Y_train, Y_test = Y[:n], Y[n:]

# NOTE(review): p is the value set in the config cell, not X's column count
# (`p = np.shape(X)[-1]` is intentionally left disabled here).

# Synthetic problem instance built from the training half only.
M, S, y, Astar, Kstar = generate_synthetic_data(n, r, p, X_train)


In [25]:
# Persist the generated arrays so the instance can be reloaded later.
# M is not part of this archive.
saved_arrays = {"S": S, "y": y, "Astar": Astar, "Kstar": Kstar}
with open('numpy_saves.npz', 'wb') as npz_file:
    np.savez(npz_file, **saved_arrays)

In [36]:
# Random (p, r) starting point: i.i.d. standard normal entries divided by
# sqrt(p) * sqrt(r). Drawn from numpy's global RNG.
# Disabled alternative: A0 = initialization(n, r, p, S, X_train, y)
gaussian_entries = np.random.normal(0, 1, size=(p, r))
init_scale = np.sqrt(p) * np.sqrt(r)
A0 = gaussian_entries / init_scale


In [37]:
# History of the factor A after each gradient step.
A_iterates = []

# Optimisation variable. The dtype is requested at construction time so that A
# is always a float64 *leaf* tensor: chaining `.to(torch.float64)` after
# `requires_grad=True` only returns the leaf when no cast is needed; whenever a
# cast happens the result is a non-leaf tensor whose `.grad` stays None.
A = torch.tensor(A0, dtype=torch.float64, device="cpu", requires_grad=True)

# History of ||A A^T - Kstar|| after each gradient step.
dists = []


In [None]:
# Plain gradient descent on L(A, y, M) with a fixed step size, recording every
# iterate and its distance ||A A^T - Kstar|| (np.linalg.norm default norm).
# NOTE(review): L is not defined in this notebook; presumably it comes from
# `from utils import *` - confirm.
y_tensor = torch.tensor(y, device="cpu")
M_tensor = torch.tensor(M, device="cpu")

step_size = 0.1
n_steps = 20 * n
# Re-writing the full history on every step costs quadratic I/O, so the file is
# refreshed periodically and once more when the loop exits.
checkpoint_every = 100


def _save_iterates():
    """Write the iterate history collected so far to 'A_iterates.npy'."""
    with open('A_iterates.npy', 'wb') as f:
        np.save(f, np.array(A_iterates))


try:
    for iterate in range(n_steps):
        loss = L(A, y_tensor, M_tensor)
        loss.backward()
        with torch.no_grad():
            A -= A.grad * step_size
            A.grad.zero_()

        # .numpy() shares memory with A, which is updated in place above.
        # Without the copy every stored iterate would alias the same buffer
        # and the whole history would silently equal the latest A.
        A_np = A.detach().cpu().numpy().copy()
        A_iterates.append(A_np)
        dists.append(np.linalg.norm(A_np @ A_np.T - Kstar))

        if (iterate + 1) % checkpoint_every == 0:
            _save_iterates()
        print(iterate, dists[-1])
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so the file on disk always reflects the iterates computed so far.
    _save_iterates()
    

0 0.9584487359013317
1 0.9219353846310423
2 0.8985945169466559
3 0.8820211994501779
4 0.8693105227988324
5 0.8589578225480563
6 0.8501100226836692
7 0.8422552262977473
8 0.8350750322159736
9 0.8283659280130596
10 0.8219939374793528
11 0.815867209185477
12 0.8099192843965385
13 0.8040990045255633
14 0.7983646202851109
15 0.7926804720063612
16 0.7870152205815203
17 0.781341021440125
18 0.7756332143474214
19 0.7698702247414193
20 0.7640334974417616
21 0.7581073643956046
22 0.7520788148979543
23 0.7459371977279678
24 0.739673925548801
25 0.7332822646285482
26 0.7267572775629143
27 0.7200959492608009
28 0.7132974768679792
29 0.7063636551202171
30 0.6992992526584577
31 0.692112262062657
32 0.684813922523197
33 0.6774184585142515
34 0.6699425401366765
35 0.6624045322761175
36 0.6548236411682614
37 0.6472190774627629
38 0.6396093360534972
39 0.6320116555033441
40 0.6244416766486535
41 0.6169132826338829
42 0.6094385799628855
43 0.6020279741920646
44 0.5946902998862685
45 0.5874329753973476
46 

In [None]:
# Number of distance values recorded so far (one is appended per gradient step).
len(dists)

In [None]:
# Convergence curve: distance between A A^T and Kstar after each gradient step.
fig, ax = plt.subplots()
ax.plot(dists)
ax.set(
    xlabel="Gradient step",
    ylabel=r"$\|AA^\top - K^\star\|$",
    title="Distance of $AA^\\top$ to $K^\\star$ during gradient descent",
)
plt.show()