### Imports

In [None]:
import math
import torch
import random
import xxhash
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, log_loss, accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder

from sys import maxsize

from lightgbm import LGBMClassifier

from folktables import ACSDataSource, ACSIncome, ACSPublicCoverage, ACSEmployment

from copy import deepcopy
warnings.filterwarnings("ignore")

### Datasets

In [None]:
# Handle on the ACS survey release used throughout this notebook.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)

# Raw records for California; fetched over the network because download=True.
ca_data = data_source.get_data(
    states=["CA"],
    download=True,
)

In [None]:
def get_ACSPubCov():
    """Build a binarized train/test split of the ACSPublicCoverage task.

    Reads the module-level ``ca_data`` frame, joins features and labels,
    removes duplicate rows and a fixed list of columns, and turns every
    remaining column into a 0/1 integer:

    * ``AGEP`` and ``SCHL`` become 1 where the value is at or above the
      column mean, else 0.
    * ``SEX``, ``RAC1P``, ``DIS``, ``NATIVITY`` and ``PUBCOV`` become 1 where
      the raw value equals 1, else 0.

    Returns:
        ``(dfTrain, dfTest)``: a random 80% sample of the rows (drawn without
        replacement, unseeded) and the remaining rows. Both keep the row
        index of the de-duplicated frame.
    """
    features, labels, _ = ACSPublicCoverage.df_to_pandas(ca_data)

    dropped_columns = [
        'DEAR', 'DEYE', 'DREM', 'PINCP', 'ST', 'MAR', 'ESP',
        'MIG', 'CIT', 'ESR', 'FER', 'MIL', 'ANC',
    ]
    df = pd.concat([features, labels], axis=1)
    df = df.drop_duplicates(keep='first', ignore_index=True)
    df = df.drop(dropped_columns, axis=1)

    # Threshold the two numeric columns at their own mean.
    for mean_split_col in ('AGEP', 'SCHL'):
        df[mean_split_col] = np.where(
            df[mean_split_col] >= df[mean_split_col].mean(), 1, 0
        )

    # Collapse the coded columns to "equals 1" indicators.
    indicator_cols = ['SEX', 'RAC1P', 'DIS', 'NATIVITY', 'PUBCOV']
    df[indicator_cols] = np.where(df[indicator_cols] == 1, 1, 0)

    df = df.astype('int')

    train_size = int(len(df) * .8)
    dfTrain = df.sample(n=train_size, replace=False, axis=0, ignore_index=False)

    # Everything that was not sampled for training is held out for testing.
    dfTest = df.drop(index=dfTrain.index)

    return dfTrain, dfTest


### Randomized Response

In [None]:
def GRR_Client(input_data, p):
    """Report one binary value through randomized response.

    Draws a single Bernoulli(p) sample. On success the value is returned
    unchanged; otherwise its complement ``1 - input_data`` is returned.

    Args:
        input_data: The value to report (0 or 1 in this notebook).
        p: Probability of reporting the value unchanged.

    Returns:
        ``input_data`` or ``1 - input_data``.
    """
    keep_value = np.random.binomial(1, p) == 1
    return input_data if keep_value else 1 - input_data

In [None]:
def gen_keys(num_feat):
    """List every binary string of length ``num_feat + 1`` in ascending order.

    Entry ``i`` of the result is the zero-padded binary numeral of ``i``,
    most significant bit first, so the list runs from ``'00…0'`` to
    ``'11…1'`` and holds ``2 ** (num_feat + 1)`` strings.

    Args:
        num_feat: Number of feature bits (expected to be >= 0); one extra
            bit is added to every key.

    Returns:
        A list of ``str`` keys.
    """
    key_width = num_feat + 1
    return [format(code, f'0{key_width}b') for code in range(2 ** key_width)]

### Main

In [None]:
def _randomize_training_set(X_train, y_train, p, which_set, sensitive_att):
    """Apply randomized response to the training data selected by which_set.

    Feature columns keep the order they have in ``X_train``; the label is
    appended last as 'PUBCOV'. The returned frame has a fresh 0..n-1 index.
    """
    perturb_features = which_set in ('feat', 'feat-lab')
    perturb_label = which_set in ('lab', 'feat-lab')

    columns = {}
    # Iterate the columns in their real order. Going through a set() here
    # would shuffle them and the reconstructed bits would later be attached
    # to the wrong feature names.
    for col in X_train.columns:
        if perturb_features and col in sensitive_att:
            columns[col] = [int(GRR_Client(val, p)) for val in X_train[col]]
        else:
            columns[col] = [int(val) for val in X_train[col]]

    if perturb_label:
        columns['PUBCOV'] = [int(GRR_Client(val, p)) for val in y_train]
    else:
        columns['PUBCOV'] = [int(val) for val in y_train]

    return pd.DataFrame(columns)


def _reconstruct_training_set(joint_train, p, selected_n):
    """Rebuild a training set from the noise-corrected joint distribution.

    ``selected_n`` holds the column positions of ``joint_train`` that went
    through randomized response. The empirical distribution over all 0/1
    row patterns is multiplied by the inverse of the perturbation matrix,
    negative entries are clipped to zero, the result is renormalized, and
    each pattern is emitted ``round(probability * n_rows)`` times.
    """
    n_rows = len(joint_train)
    num_repeat = joint_train.shape[1] - 1  # feature count; the label sits at index num_repeat

    # Sorted keys line up with the row/column order of the Kronecker product.
    keys = sorted(gen_keys(num_repeat))
    lambda_dict = dict.fromkeys(keys, 0)
    for row in joint_train.itertuples(index=False, name=None):
        lambda_dict[''.join(str(x) for x in row)] += 1

    # P^-1 is the Kronecker product of one 2x2 factor per column, leftmost
    # column first: the inverted flip matrix where noise was applied and the
    # identity everywhere else.
    flip_inv = np.linalg.inv([[p, 1 - p], [1 - p, p]])
    identity = np.eye(2)
    p_inv = np.ones((1, 1))
    for position in range(num_repeat + 1):
        p_inv = np.kron(p_inv, flip_inv if position in selected_n else identity)

    lambda_hat = np.array([lambda_dict[k] / n_rows for k in keys])
    pi_tilde = np.matmul(p_inv, lambda_hat)
    pi_tilde[pi_tilde < 0] = 0  # the inverse can yield small negative mass
    pi_tilde_scaled = np.true_divide(pi_tilde, np.sum(pi_tilde))
    counts = [int(round(float(pi) * n_rows)) for pi in pi_tilde_scaled]

    key_bits = np.array([[int(ch) for ch in key] for key in keys], dtype=int)
    rows = np.repeat(key_bits, counts, axis=0)

    # Label the bits with the column order they were actually counted in.
    return pd.DataFrame(rows, columns=joint_train.columns)


def _group_accuracies(model, X_train, y_train, X_test, y_true, dis_mask):
    """Fit ``model`` and return (accuracy on dis_mask rows, accuracy on the rest)."""
    model.fit(X_train, y_train)
    predictions = np.asarray(model.predict(X_test))
    dis_acc = accuracy_score(y_true[dis_mask], predictions[dis_mask])
    ndis_acc = accuracy_score(y_true[~dis_mask], predictions[~dis_mask])
    return dis_acc, ndis_acc


def main(epsilon, which_set):
    """Run the randomized-response experiment for one privacy budget.

    Draws one train/test split with ``get_ACSPubCov`` and then, 100 times:
    optionally perturbs the training data with randomized response,
    reconstructs a training set from the noise-corrected joint distribution,
    trains logistic regression, Gaussian naive Bayes and LightGBM, and
    records test accuracy separately for rows with ``RAC1P == 1`` and for all
    other rows. The per-group averages are printed at the end.

    Args:
        epsilon: Privacy budget. The keep-probability is
            ``exp(epsilon) / (exp(epsilon) + 1)``. A falsy value (``None`` or
            0) skips randomization and reconstruction entirely.
        which_set: What to perturb when ``epsilon`` is set: ``'feat'`` (the
            sensitive features DIS, SEX, RAC1P), ``'lab'`` (the PUBCOV
            label), or ``'feat-lab'`` (both). Ignored when ``epsilon`` is
            falsy.

    Raises:
        ValueError: If ``epsilon`` is set and ``which_set`` is not one of the
            three supported values.
    """
    num_iterations = 100
    sensitive_att = ['DIS', 'SEX', 'RAC1P']
    separator = "======================================================="

    if epsilon and which_set not in ('feat', 'lab', 'feat-lab'):
        raise ValueError(
            f"which_set must be 'feat', 'lab' or 'feat-lab', got {which_set!r}"
        )

    print(separator)
    print("EPSILON: " + str(epsilon))
    print(separator)
    dfTrain_main, dfTest_main = get_ACSPubCov()

    # The test split is identical in every iteration, so derive it once.
    X_test = dfTest_main.loc[:, dfTest_main.columns != 'PUBCOV']
    y_true = dfTest_main.loc[:, 'PUBCOV'].to_numpy()
    # NOTE(review): the report below labels this group "Disabled", but the
    # split is made on RAC1P == 1; confirm which attribute is intended.
    dis_mask = (X_test['RAC1P'] == 1).to_numpy()

    train_feature_mask = dfTrain_main.columns != 'PUBCOV'

    # model tag -> (title printed in the report, dis accuracies, non-dis accuracies)
    results = {
        'lr': ("Logisitc Regression Model:", [], []),
        'nb': ("Naive Bayes Model:", [], []),
        'lgbm': ("LGBM Model:", [], []),
    }

    for itr in range(num_iterations):
        if itr % 10 == 0 and itr != 0:
            print(f"{itr}/{num_iterations}")

        X_train = dfTrain_main.loc[:, train_feature_mask]
        y_train = dfTrain_main.loc[:, 'PUBCOV']

        # Do randomized response, then reconstruct a de-noised training set.
        if epsilon:
            p = np.exp(epsilon) / (np.exp(epsilon) + 1)

            joint_train = _randomize_training_set(
                X_train, y_train, p, which_set, sensitive_att
            )

            # Column positions whose values were perturbed.
            selected_n = set()
            if which_set in ('feat', 'feat-lab'):
                selected_n.update(
                    joint_train.columns.get_loc(feat) for feat in sensitive_att
                )
            if which_set in ('lab', 'feat-lab'):
                selected_n.add(joint_train.shape[1] - 1)

            recon_train = _reconstruct_training_set(joint_train, p, selected_n)
            recon_train = recon_train[list(dfTrain_main.columns)]
            recon_train = recon_train.sample(frac=1).reset_index(drop=True)

            X_train = recon_train.loc[:, recon_train.columns != 'PUBCOV']
            y_train = recon_train.loc[:, 'PUBCOV']

        models = {
            'lr': LogisticRegression(max_iter=500, fit_intercept=True),
            'nb': GaussianNB(),
            'lgbm': LGBMClassifier(verbose=-1),
        }
        for tag, model in models.items():
            dis_acc, ndis_acc = _group_accuracies(
                model, X_train, y_train, X_test, y_true, dis_mask
            )
            _, dis_scores, ndis_scores = results[tag]
            dis_scores.append(dis_acc)
            ndis_scores.append(ndis_acc)

    print(separator)
    print("AVERAGE")
    print(separator)
    for title, dis_scores, ndis_scores in results.values():
        print(title)
        print("-- Disabled Accuracy: " + str(round(sum(dis_scores) / num_iterations, 3)))
        print("-- Non-disabled Accuracy: " + str(round(sum(ndis_scores) / num_iterations, 3)))
        print(separator)

In [None]:
# Privacy budgets to sweep; None is the baseline run without randomized response.
epsilons = [None, .001, .01, .1, .25, .5, 1, 2, 5]

# Part of the training data that gets perturbed (here: the label only).
which_set = 'lab'

for e in epsilons:
    main(epsilon=e, which_set=which_set)