In [1]:
import sys
sys.path.append('/path/AL')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
import matplotlib.pyplot as plt
from AAE_model import AdversarialAutoEncoder

In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.'
train_url = url + 'data'
test_url = url + 'test'

#labels in the dataset
names = ["age","workclass","fnlwgt","education","education-num",
         "marital-status","occupation","relationship","race","sex",
         "capital-gain","capital-loss", "hours-per-week",
         "native-country","income>50k"]

# download data
train_df = pd.read_csv(train_url, names = names, index_col = False, comment = '|')
test_df = pd.read_csv(test_url, names = names, index_col = False, comment = '|')

In [3]:
# Cleaning data and getting rid of symbols and 'None' values
train_df[train_df == '?'] = np.nan
train_df[train_df == ' ?'] = np.nan
train_df.dropna(inplace = True)

test_df[test_df == '?'] = np.nan
test_df[test_df == ' ?'] = np.nan
test_df.dropna(inplace = True)

In [4]:
# 'fnlwgt' is a counter for the number of repeated entries in the census, it has 0 correlation to 
# income so it can be discarded
# education-num is a numerical representation of education with a 1.0 correlation to education 
# so it is discarded as well

train_inputs = pd.get_dummies(train_df.drop(['income>50k', 'education-num', 'fnlwgt'], axis=1))
test_inputs = pd.get_dummies(test_df.drop(['income>50k', 'education-num', 'fnlwgt'], axis=1))

# Ensuring the dimensions are the same for the train and test sets
extra_columns = list(set(train_inputs.columns) - set(test_inputs.columns))
for c in extra_columns:
    test_inputs[c] = 0



# Creating labels from the dataset
train_labels, _ = train_df['income>50k'].factorize()
train_protected, _ = train_df['sex'].factorize() # male = 0, female = 1
test_labels, _ = test_df['income>50k'].factorize()
test_protected, _ = test_df['sex'].factorize() # male = 0, female = 1



# Convert input samples and target labels to tensors
train_inputs = F.normalize(torch.Tensor(train_inputs.values))
train_labels = torch.Tensor(train_labels)
train_protected = torch.Tensor(train_protected)


test_inputs = F.normalize(torch.Tensor(test_inputs.values))
test_labels = torch.Tensor(test_labels)
test_protected = torch.Tensor(test_protected)


# Create a dataset object that pairs the input samples and target labels
train_dataset = TensorDataset(train_inputs, train_labels, train_protected)
test_dataset = TensorDataset(test_inputs, test_labels, test_protected)


# Dataloaders for training and testing
dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [5]:
# training settings
# input shape for Auto Encoder
input_dim = train_inputs.shape[1]
# layer size for AE
hidden_dim = 100
# Decode down size
encoded_dim = 10
# Check if cuda processing is available
cuda = torch.cuda.is_available()

NUM_PARAM_TRIALS = 80
alpha = 0.05
beta = np.random.randint(1,50,NUM_PARAM_TRIALS)
gamma = np.random.randint(1,10,NUM_PARAM_TRIALS)
parameters = [(alpha, beta[i], gamma[i]) for i in range(len(beta))]

epochs = 100

In [None]:


# Log to store results for each set of parameters
log = np.zeros((len(parameters),6))
# Counter to update which log entry
i = 0 
for params in parameters:
    model = AdversarialAutoEncoder(input_dim, hidden_dim, encoded_dim,
                                   params[0],params[1],params[2])

    model.train_model(epochs, dataloader)
    results = model.evaluate_model(testloader)
    log[i] = [results[0],results[1],results[2], params[0],params[1],params[2]]
    print(f'''Results for iteration {i} using model with hyperparameters:
    alpha = {log[i][3]}, beta = {log[i][4]}, gamma = {log[i][5]}
    
y_t_delta       = {log[i][0]},
discrimination  = {log[i][1]},
accuracy        = {log[i][2]}
''')
    i+=1

Results for iteration 0 using model with hyperparameters:
    alpha = 0.05, beta = 40.0, gamma = 5.0
    
y_t_delta       = 0.5296041369438171,
discrimination  = 0.30598682165145874,
accuracy        = 0.8355909585952759

Results for iteration 1 using model with hyperparameters:
    alpha = 0.05, beta = 40.0, gamma = 8.0
    
y_t_delta       = 0.5370133519172668,
discrimination  = 0.30455368757247925,
accuracy        = 0.8415670394897461

Results for iteration 2 using model with hyperparameters:
    alpha = 0.05, beta = 31.0, gamma = 1.0
    
y_t_delta       = 0.8078251481056213,
discrimination  = 0.02185612916946411,
accuracy        = 0.8296812772750854

Results for iteration 3 using model with hyperparameters:
    alpha = 0.05, beta = 44.0, gamma = 1.0
    
y_t_delta       = 0.5943068265914917,
discrimination  = 0.2368353009223938,
accuracy        = 0.8311421275138855

Results for iteration 4 using model with hyperparameters:
    alpha = 0.05, beta = 30.0, gamma = 3.0
    
y_t_delta  

In [None]:
# Store results
np.savetxt('results_log', log, fmt = '%d', delimiter = ','