### Imports

In [1]:
import math
import torch
import random
import xxhash
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, log_loss, accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelEncoder

from sys import maxsize

from lightgbm import LGBMClassifier

from folktables import ACSDataSource, ACSIncome, ACSPublicCoverage, ACSEmployment

from copy import deepcopy
warnings.filterwarnings("ignore")

### Datasets

In [2]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)

In [3]:
def get_ACSIncome():
    df, labels, _ = ACSIncome.df_to_pandas(ca_data)
    df = pd.concat([df, labels], axis=1)
    df= df.drop_duplicates(keep='first', ignore_index=True)
    df = df.drop(['OCCP','POBP', 'RELP', 'MAR', 'COW'], axis=1)
    df = df.dropna(how='any', axis=0)
  
    df['AGEP'] = np.where(df['AGEP'] >= df['AGEP'].mean(), 1, 0)
    df['SCHL'] = np.where(df['SCHL'] >= df['SCHL'].mean(), 1, 0)
    df['WKHP'] = np.where(df['WKHP'] >= df['WKHP'].mean(), 1, 0)
    
    df['SEX'] = df['SEX'].replace([1, 2], [1, 0]).astype('int')
    df['RAC1P'] = np.where(df['RAC1P'] == 1, 1.0, 0.0).astype('int')
    df['PINCP'] = df['PINCP'].replace([True, False], [1, 0])

    num_train = int(len(df) * .8)
    dfTrain = df.sample(n=num_train, replace=False, axis=0, ignore_index=False)
    dfTest = df.drop(dfTrain.index, axis=0)
    
    return dfTrain, dfTest

### Randomized Response

In [4]:
def GRR_Client(input_data, p):
    
    if np.random.binomial(1, p) == 1:
        return input_data

    else:
        return 1 - input_data

In [5]:
def gen_keys(num_feat):
    total = 2 ** (num_feat+1)
    possible_keys = ['0' for _ in range(int(total/2))]
    possible_keys.extend(['1' for _ in range(int(total/2))])
    
    rounds = [i+1 for i in range(num_feat)]
    
    for r in rounds[::-1]:
        count = 0
        for i, k in enumerate(possible_keys):
            if count < 2**(r-1):
                possible_keys[i] = k + '0'
            else:
                possible_keys[i] = k +'1'
            count += 1
            if count == 2**r:
                count = 0
                
    return possible_keys

### Main

In [6]:
def main(epsilon, which_set):

    m_all_acc_lr, f_all_acc_lr = [], []
    m_all_acc_nb, f_all_acc_nb = [], []
    m_all_acc_lgbm, f_all_acc_lgbm = [], []

    print("=======================================================")
    print(f"EPSILON: " + str(epsilon))
    print("=======================================================")
        
    for itr in range(100):
        if itr % 10 == 0 and itr != 0:
            print(str(itr)+'/100')
        dfTrain_main, dfTest_main = get_ACSIncome()
        
        dfTrain = deepcopy(dfTrain_main)
        dfTest = deepcopy(dfTest_main)

        X_test = dfTest.loc[:, dfTest.columns != 'PINCP']
        y_test = dfTest.loc[:, 'PINCP']

        m_loc, m_true = [], []
        f_true = []

        for i, (index, r) in enumerate(X_test.iterrows()):
            if r['SEX'] == 1:
                m_loc.append(i)
                m_true.append(y_test.loc[index])
            else:
                f_true.append(y_test.loc[index])

        X_train = dfTrain.loc[:, dfTrain.columns != 'PINCP']
        y_train = dfTrain.loc[:, 'PINCP']
        
        # Do randomized response
        if epsilon:
            p = np.exp(epsilon) / (np.exp(epsilon) + 1)

            lst_df_train = []
            lst_df_test = []
            sensitive_att = ['AGEP', 'SEX', 'RAC1P']

            if which_set in ['feat-lab', 'feat']:
                for col in list(set(X_train.columns)):
                    if col in sensitive_att:
                        df_new_col = pd.DataFrame([int(GRR_Client(val, p)) for val in X_train[col]], columns=[col])
                        lst_df_train.append(df_new_col)
                    else:
                        lst_df_train.append(pd.DataFrame([int(val) for val in X_train[col]], columns=[col]))

                X_train = pd.concat(lst_df_train, axis=1)

                if which_set == 'feat-lab':
                    y_train = pd.DataFrame([int(GRR_Client(val, p)) for val in y_train], columns=['PINCP'])
                else:
                    y_train = pd.DataFrame([int(val) for val in y_train], columns=['PINCP'])
                    
            elif which_set == 'lab':
                for col in list(set(X_train.columns)):
                    lst_df_train.append(pd.DataFrame([int(val) for val in X_train[col]], columns=[col]))
                X_train = pd.concat(lst_df_train, axis=1)
                
                y_train = pd.DataFrame([int(GRR_Client(val, p)) for val in y_train], columns=['PINCP'])
            
            # perform reconstruction
            num_repeat = len(X_train.columns)

            possible_keys =  gen_keys(num_repeat)
            lambda_dict = {}

            for key in possible_keys:
                lambda_dict[key] = 0

            joint_train = pd.concat([X_train, y_train], axis=1)
         
            for index, row in joint_train.iterrows():
  
                key = ''.join(str(x) for x in row)
                lambda_dict[key] += 1
            
            if which_set == 'feat':
                selected_n = [joint_train.columns.get_loc(sele_feat) for sele_feat in sensitive_att]
            elif which_set == 'feat-lab':
                selected_n = [joint_train.columns.get_loc(sele_feat) for sele_feat in sensitive_att]
                selected_n.append(num_repeat)
            else:
                selected_n = [num_repeat]
        
            if 0 in selected_n:
                p_ = np.linalg.inv([[p, 1-p],[1-p, p]])
            else:
                p_ = np.linalg.inv([[1, 0],[0, 1]])

            # get P^-1
            for n in range(num_repeat+1):
                if n == 0:
                    continue
                if n in selected_n:
                    b = np.linalg.inv([[p, 1-p], [1-p, p]])
                    p_ = np.kron(p_, b)
                else:
                    b  = np.linalg.inv([[1,0], [0,1]])
                    p_ = np.kron(p_, b)
       
            # construct big lambda in order
            keys = list(lambda_dict.keys())
            keys.sort()
            sorted_lambda_dict = {i: lambda_dict[i] for i in keys}
           
      
            lambda_list = [lambda_dict[k]/len(X_train) for k in keys] #lambda hat 
            
            pi_tilde = np.matmul(p_, lambda_list)
            
            for i, pi in enumerate(pi_tilde):
                if pi < 0:
                    pi_tilde[i] = 0

            pi_tilde_scaled = np.true_divide(pi_tilde, np.sum(pi_tilde))

            pi_tilde_list = [round(pi*len(X_train)) for pi in pi_tilde_scaled]

            recon_train = []

            for i, counts in enumerate(pi_tilde_list):
                for j in range(counts):
                    recon_train.append([int(elem) for elem in keys[i]])

            recon_train = pd.DataFrame(recon_train, columns=dfTrain.columns)
            recon_train = recon_train.sample(frac=1).reset_index(drop=True)
            
            X_train, y_train = recon_train.loc[:, dfTrain.columns != 'PINCP'], recon_train.loc[:, 'PINCP']
            
        #######################
        # Logistic Regression #
        #######################
        LR = LogisticRegression(max_iter=500, fit_intercept=True)
        LR.fit(X_train, y_train)
        predictions1 = LR.predict(X_test)
        m_pred1, f_pred1 = [], []

        for i, p in enumerate(predictions1):
            if i in m_loc:
                m_pred1.append(p)
            else:
                f_pred1.append(p)

        m_acc1 = accuracy_score(m_true, m_pred1)
        f_acc1 = accuracy_score(f_true, f_pred1)

        m_all_acc_lr.append(m_acc1)
        f_all_acc_lr.append(f_acc1)

        ###############
        # Naive Bayes #
        ###############
        NB = GaussianNB()
        NB.fit(X_train, y_train)
        predictions2 = NB.predict(X_test)
        m_pred2, f_pred2 = [], []

        for i, p in enumerate(predictions2):
            if i in m_loc:
                m_pred2.append(p)
            else:
                f_pred2.append(p)

        m_acc2 = accuracy_score(m_true, m_pred2)
        f_acc2 = accuracy_score(f_true, f_pred2)

        m_all_acc_nb.append(m_acc2)
        f_all_acc_nb.append(f_acc2)

        ########
        # LGBM #
        ########
        LGBM = LGBMClassifier(verbose=-1)
        LGBM.fit(X_train, y_train)
        predictions3 = LGBM.predict(X_test)
        m_pred3, f_pred3 = [], []

        for i, p in enumerate(predictions3):
            if i in m_loc:
                m_pred3.append(p)
            else:
                f_pred3.append(p)

        m_acc3 = accuracy_score(m_true, m_pred3)
        f_acc3 = accuracy_score(f_true, f_pred3)

        m_all_acc_lgbm.append(m_acc3)
        f_all_acc_lgbm.append(f_acc3)

    print("=======================================================")
    print(f"AVERAGE")
    print("=======================================================")
    print(f"Logisitc Regression Model:")
    print("-- Male Accuracy: " + str(round(sum(m_all_acc_lr)/100,3)))
    print("-- Female Accuracy: " + str(round(sum(f_all_acc_lr)/100,3)))
    print("=======================================================")
    print(f"Naive Bayes Model:")
    print("-- Male Accuracy: " + str(round(sum(m_all_acc_nb)/100,3)))
    print("-- Female Accuracy: " + str(round(sum(f_all_acc_nb)/100,3)))
    print("=======================================================")
    print(f"LGBM Model:")
    print("-- Male Accuracy: " + str(round(sum(m_all_acc_lgbm)/100,3)))
    print("-- Female Accuracy: " + str(round(sum(f_all_acc_lgbm)/100,3)))
    print("=======================================================")

In [9]:
epsilons = [None, .001, .01, .1, .25, .5, 1, 2, 5] # None gives original accuracy on non-rr data
which_set = 'lab' # 'feat-lab', 'feat', 'lab'

for e in epsilons:
    main(e, which_set)

EPSILON: 0.001
10
20
30
40
50
60
70
80
90
AVERAGE
Logisitc Regression Model:
-- Male Accuracy: 0.506
-- Female Accuracy: 0.512
Naive Bayes Model:
-- Male Accuracy: 0.503
-- Female Accuracy: 0.51
LGBM Model:
-- Male Accuracy: 0.507
-- Female Accuracy: 0.501
EPSILON: 0.01
10
20
30
40
50
60
70
80
90
AVERAGE
Logisitc Regression Model:
-- Male Accuracy: 0.552
-- Female Accuracy: 0.575
Naive Bayes Model:
-- Male Accuracy: 0.56
-- Female Accuracy: 0.581
LGBM Model:
-- Male Accuracy: 0.545
-- Female Accuracy: 0.524
EPSILON: 0.1
10
20
30
40
50
60
70
80
90
AVERAGE
Logisitc Regression Model:
-- Male Accuracy: 0.673
-- Female Accuracy: 0.661
Naive Bayes Model:
-- Male Accuracy: 0.674
-- Female Accuracy: 0.664
LGBM Model:
-- Male Accuracy: 0.657
-- Female Accuracy: 0.655
EPSILON: 0.25
10
20
30
40
50
60
70
80
90
AVERAGE
Logisitc Regression Model:
-- Male Accuracy: 0.684
-- Female Accuracy: 0.651
Naive Bayes Model:
-- Male Accuracy: 0.672
-- Female Accuracy: 0.651
LGBM Model:
-- Male Accuracy: 0.667
