In [134]:
import numpy as np
from scipy.stats import ortho_group
from sklearn.cross_decomposition import CCA
from scipy.stats import multivariate_normal
from scipy import linalg
import pandas as pd
import os

In [135]:
# Working directory; the relative CSV paths written later in this notebook
# resolve against it.
path = '...'
os.chdir(path)

In [136]:
# Seed the global NumPy RNG so every random draw below is reproducible.
np.random.seed(0)

# Four-column bases for the X-view (55 features) and the Y-view (60 features),
# taken as the leading columns of random orthogonal matrices. The generator is
# consumed left to right, so the 55-dim draw still happens before the 60-dim one.
V, W = (ortho_group.rvs(dim=n_features)[:, :4] for n_features in (55, 60))

# One correlation value per column of V / W.
rho = np.asarray([0.8, 0.6, 0.3, 0.5])
def simcca(V, W, rho, n, noisex, noisey):
    """Sample two Gaussian views with prescribed canonical directions.

    A joint zero-mean covariance ``C = [[Cxx, Cxy], [Cxy.T, Cyy]]`` is built
    such that, for full-column-rank ``V`` and ``W``, ``V.T @ Cxx @ V`` and
    ``W.T @ Cyy @ W`` are identity matrices and ``V.T @ Cxy @ W`` equals
    ``diag(rho)``. ``n`` samples are then drawn from ``N(0, C)``.

    Parameters
    ----------
    V : ndarray, shape (px, d)
        Directions for the X view.
    W : ndarray, shape (py, d)
        Directions for the Y view.
    rho : array_like, d entries
        Correlation assigned to each pair of directions.
    n : int
        Number of samples to draw.
    noisex, noisey : float
        Variance placed on the orthogonal complement of ``span(V)`` /
        ``span(W)`` inside each view.

    Returns
    -------
    X : ndarray, shape (n, px)
    Y : ndarray, shape (n, py)
    C : ndarray, shape (px + py, px + py)
        The joint covariance the samples were drawn from.
    """
    px = V.shape[0]
    py = W.shape[0]

    # Economy-size QR is required here: with the default full mode Q is a
    # square orthogonal matrix, so I - Q @ Q.T is zero and the noise terms
    # below would silently vanish regardless of noisex / noisey.
    Qx, Rx = linalg.qr(V, mode='economic')
    Qy, Ry = linalg.qr(W, mode='economic')

    # Factors satisfying V.T @ Fx = I and W.T @ Fy = I (for full column rank).
    Fx = Qx @ linalg.pinv(Rx.T)
    Fy = Qy @ linalg.pinv(Ry.T)

    Cxy = Fx @ np.diag(np.ravel(rho)) @ Fy.T
    # Signal part plus isotropic noise restricted to the complement of the
    # signal subspace.
    Cxx = Fx @ Fx.T + noisex * (np.identity(px) - Qx @ Qx.T)
    Cyy = Fy @ Fy.T + noisey * (np.identity(py) - Qy @ Qy.T)
    C = np.block([[Cxx, Cxy], [Cxy.T, Cyy]])

    mu = np.zeros(px + py)
    # allow_singular=True keeps zero-noise (rank-deficient) covariances usable;
    # previously `n` was passed positionally into this slot by accident.
    samples = multivariate_normal(mean=mu, cov=C, allow_singular=True).rvs(n)
    # rvs squeezes the sample axis when n == 1; restore the 2-D layout.
    samples = np.reshape(samples, (n, px + py))

    X = samples[:, :px]
    Y = samples[:, px:]

    return X, Y, C


# Draw 500 paired samples and keep the covariance they were generated from.
X, Y, C = simcca(V, W, rho, n=500, noisex=1, noisey=1e-4)
(X.shape, Y.shape)

((500, 55), (500, 60))

In [137]:
def generate_sensitive_attribute_from_XY(X, Y):
    """Derive a binary attribute from alternating columns of X and Y.

    Per row, the score is ``exp(sum of X columns 0, 2, 4, ...)`` plus
    ``exp(sum of Y columns 1, 3, 5, ...)``. A row gets 1 when its score is
    strictly above the mean score over all rows, otherwise 0.

    Returns a 1-D integer array with one entry per row.
    """
    x_term = np.exp(X[:, 0::2].sum(axis=1))
    y_term = np.exp(Y[:, 1::2].sum(axis=1))
    score = x_term + y_term

    # Split at the sample mean of the score.
    return (score > score.mean()).astype(int)

In [138]:
# Shift the 0/1 attribute to 1/2 and store it as a column vector.
z = (generate_sensitive_attribute_from_XY(X, Y) + 1).reshape(-1, 1)

In [139]:
# Project each view onto its four directions.
X_c = np.matmul(X, V)
Y_c = np.matmul(Y, W)

In [140]:
# Rescale every column to unit Euclidean length.
X_c = X_c / np.sqrt(np.square(X_c).sum(axis=0))
Y_c = Y_c / np.sqrt(np.square(Y_c).sum(axis=0))

In [141]:
def generate_label(X, Y, z, x_stop=27, y_start=30):
    """Create a binary label from a block of X, a block of Y and ``z``.

    Per row, the score is the sum of ``X[:, :x_stop]`` plus the sum of
    ``Y[:, y_start:]`` plus ``exp(z)``. A row is labelled 1 when its score is
    strictly above the mean score over all rows, otherwise 0.

    Parameters
    ----------
    X, Y : ndarray, 2-D
        The two feature views, with one row per sample.
    z : ndarray
        One value per sample; it is flattened after exponentiation, so a
        column vector of shape (n, 1) works.
    x_stop : int, optional
        Exclusive end of the leading X columns that contribute (default 27).
    y_start : int, optional
        First of the trailing Y columns that contribute (default 30).

    Returns
    -------
    ndarray of int, shape (n,)
    """
    x_part = np.sum(X[:, :x_stop], axis=1)
    y_part = np.sum(Y[:, y_start:], axis=1)

    # ravel() so a column-vector z adds element-wise instead of broadcasting
    # to an (n, n) matrix.
    score = x_part + y_part + np.exp(z).ravel()

    threshold = np.mean(score)

    return (score > threshold).astype(int)

In [142]:
# Sensitive attribute in its 0/1 form, and the label, which is driven by the
# 1/2-coded column vector z.
sensitive = generate_sensitive_attribute_from_XY(X, Y)
label = generate_label(X, Y, z)

In [143]:
# Wrap both vectors in single-column DataFrames ahead of CSV export.
label = pd.DataFrame(data=label)
sensitive = pd.DataFrame(data=sensitive)

In [144]:
# Convert the two views to DataFrames, then write all four tables to the
# current working directory (default to_csv settings, index column included).
X = pd.DataFrame(data=X)
Y = pd.DataFrame(data=Y)
for frame, filename in (
    (label, "label.csv"),
    (sensitive, "sensitive.csv"),
    (X, "synthetic_data_X.csv"),
    (Y, "synthetic_data_Y.csv"),
):
    frame.to_csv(filename)