In [None]:
# Adjustable Privacy - train_on_original.ipynb
# - Trains a classifier (the weak adversary) on the original UCI-Adult dataset to infer one chosen attribute.
# - Uses the categorical UCI-Adult dataset (private attribute: gender, utility attribute: income).
# - Saves the model after every epoch (to Google Drive and locally).
# - Training can be stopped and resumed later.
# - Draws loss and accuracy plots and saves them (to Google Drive).
# - Can also load saved models and draw their plots (from Google Drive).
# - Notebook parameters are set in the parser cell.

In [None]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms, models
import numpy as np
from collections import OrderedDict
import time
from torch.utils.data import random_split
from math import floor
import torchvision.utils as vutils
import torch.nn.parallel
import torch.backends.cudnn as cudnn
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import itertools
import random
import shutil
from zipfile import ZipFile
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import argparse

In [None]:
# Parser
# Command-line style configuration for the notebook. The arguments are parsed
# from `command_string` below rather than from sys.argv, so editing that
# string is how the notebook is configured.
parser = argparse.ArgumentParser(
    description=(
        'Adjustable Privacy - Train a machine (weak adversary) on original (UCI-Adult) dataset to infer a specific feature. '
        'Uses Categorical dataset UCI-Adult (private attr: gender, utility attr: income). '
        'Saves models after each epoch number (to google drive and locally). '
        'It can stop and resume training. '
        'Draws loss and accuracy plots and saves them (to google drive and locally). '
        'Also it can load models and draw plots (from google drive).'
    )
)

# --resume stays a *string* flag. Its default must therefore be the string
# 'False' (not the boolean False): the value is later compared against the
# literal 'False' to decide whether this is a first run.
parser.add_argument('--resume', type=str, default='False', help='Accepts "True" or "False". ')
parser.add_argument('--last_epoch', type=int, default=0, help='In case of resumming training use last saved epoch number and in case of loading a model, set to model number.')
# Only 1 and 2 are meaningful: the value indexes the (data, private label,
# utility label) tuple produced by the dataset, where index 0 is the input.
parser.add_argument('--target_index', type=int, required=True, choices=(1, 2), help='gender(1), income(2)')
parser.add_argument('--save_path', type=str, required=True, help='Full path on your google drive to save model and plots. And also load from it. Like "drive/MyDrive/adjustable-privacy/Models/categorical-Gender/"')
parser.add_argument('--epoch_numbers', type=int, default=20, help='Number of epochs to train model. (when you want load a model, it should set to that model number)')
parser.add_argument('--dataset_path', type=str, default="", help='Full path on your google drive to adult.csv. Like "drive/MyDrive/adjustable-privacy/Datasets/"')

command_string = "--resume False" \
" --last_epoch 0" \
" --target_index 1" \
" --save_path drive/MyDrive/adjustable-privacy/Models/categorical-Gender/" \
" --epoch_numbers 50" \
" --dataset_path drive/MyDrive/adjustable-privacy/Datasets/"

args = parser.parse_args(command_string.split())

In [None]:
# Hyper parameters:
# A run counts as a "first run" only when --resume is 'False'. str() makes the
# comparison also hold if the flag arrives as the boolean False instead of the
# string 'False'.
isFirstRun = str(args.resume) == 'False'
lastRunEpochNumber = args.last_epoch
manual_seed = 20
# Apply the seed to every RNG this notebook imports, so that weight
# initialisation and batch shuffling are repeatable across kernel restarts.
random.seed(manual_seed)
np.random.seed(manual_seed)
torch.manual_seed(manual_seed)
learning_rate = 0.001 #0.2
batch_size = 64
# When True, the dataset CSV is copied into the local data_dir first
files_not_ready = True
dataset_folder_path = args.dataset_path
data_dir = 'adult'
saving_path = args.save_path
# Number of workers for dataloader
workers = 2
# Beta1 hyperparam for Adam optimizers
beta1 = 0.5
# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1
# Number of training epochs
num_epochs = args.epoch_numbers
# Position of the target label inside each dataset sample (see --target_index)
data_index = args.target_index

In [None]:
# Ask torch once whether a CUDA device is usable and report where training will run
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

In [None]:
# Mount Google Drive (Colab only).
# The google.colab package exists only inside a Colab runtime. Anywhere else
# the import raises ImportError, so mounting is skipped and the dataset/save
# paths are used exactly as configured.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available - skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

In [None]:
# Copy the dataset CSV into the local data folder and make sure the output folder exists

if files_not_ready:
    # os.path.join keeps an empty dataset_folder_path relative (plain
    # 'adult.csv'); string concatenation turned it into '/adult.csv'.
    dataset_csv_path = os.path.join(dataset_folder_path, 'adult.csv')

    # exist_ok=True keeps this cell re-runnable; genuine failures (e.g.
    # permissions) now raise here instead of being printed and ignored.
    data_dir_existed = os.path.isdir(data_dir)
    os.makedirs(data_dir, exist_ok=True)
    if not data_dir_existed:
        print("data folder created successfully")

    shutil.copyfile(dataset_csv_path, os.path.join(data_dir, 'adult.csv'))

# makedirs (unlike mkdir) also creates missing parent folders, so a nested
# save path works on first use; the cell now stops here on a real failure
# instead of printing it and carrying on.
if saving_path and not os.path.isdir(saving_path):
    os.makedirs(saving_path, exist_ok=True)
    print("saving_path directory created successfully")

In [None]:
# Pre-process dataset: clean, one-hot encode, split, and standardise the features
df = pd.read_csv(os.path.join(data_dir, 'adult.csv'))
# Cells holding '?' are treated as missing, and rows with any missing cell are dropped
df = df.replace({'?': np.nan})
df = df.dropna()
# Pin the indicator dtype: pandas changed the get_dummies default from uint8
# to bool in 2.0, which would change the dtype of the label tensors built from
# these columns.
df1 = pd.get_dummies(df, dtype=np.uint8)
train, test = train_test_split(df1, test_size=0.2, random_state=42)

# One-hot column pairs used as labels; they are removed from the feature matrix
utility_columns = ['income_<=50K', 'income_>50K']
private_columns = ['gender_Male', 'gender_Female']
label_columns = utility_columns + private_columns

utility_train_true_labels = np.array(train[utility_columns])
utility_test_true_labels = np.array(test[utility_columns])
private_train_true_labels = np.array(train[private_columns])
private_test_true_labels = np.array(test[private_columns])
x_train = train.drop(label_columns, axis='columns')
x_test = test.drop(label_columns, axis='columns')

# Scaler statistics come from the training split only and are reused for the test split
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(x_train)
x_train = standard_scaler.transform(x_train)
x_test = standard_scaler.transform(x_test)

In [None]:
from torch.utils.data import Dataset

class UciAdultDataset(Dataset):
    """Index-aligned view over a feature array and two label arrays.

    Each item is a ``(data, private_label, utility_label)`` tuple of tensors
    created with ``torch.from_numpy`` from row ``idx`` of ``X``, ``Y_p`` and
    ``Y_u`` respectively.
    """

    def __init__(self, X, Y_p, Y_u):
        self.X, self.Y_p, self.Y_u = X, Y_p, Y_u

    def __len__(self):
        # The private-label array defines the dataset size
        return len(self.Y_p)

    def __getitem__(self, idx):
        # Pick row idx from every array first, then convert each row to a tensor
        rows = (self.X[idx], self.Y_p[idx], self.Y_u[idx])
        features, private_label, utility_label = (torch.from_numpy(row) for row in rows)
        return features, private_label, utility_label

In [None]:
# Wrap the pre-processed arrays in Dataset objects
train_set = UciAdultDataset(X=x_train, Y_p=private_train_true_labels, Y_u=utility_train_true_labels)
test_set = UciAdultDataset(X=x_test, Y_p=private_test_true_labels, Y_u=utility_test_true_labels)

# Batch iterators: both drop the final incomplete batch, only the training one shuffles
shared_loader_options = dict(batch_size=batch_size, num_workers=workers, drop_last=True)
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, **shared_loader_options)
test_loader = torch.utils.data.DataLoader(test_set, **shared_loader_options)

In [None]:
# Select the compute device: CUDA when torch reports one, otherwise the CPU
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

In [None]:
# custom weights initialization
def weights_init(m):
  """Initialise a module in place; intended for use with ``Module.apply``.

  Modules whose class name contains ``'Linear'`` get weights drawn from a
  normal distribution (mean 0.0, std 0.02) and, when they have one, a bias
  filled with zeros. Every other module is left untouched.
  """
  classname = m.__class__.__name__
  if 'Linear' not in classname:
    return
  nn.init.normal_(m.weight, 0.0, 0.02)
  # A layer built with bias=False exposes `bias` as None; skip it instead of
  # failing on the missing tensor.
  if getattr(m, 'bias', None) is not None:
    nn.init.constant_(m.bias, 0)

In [None]:
# Model
class UtlAdvModel(nn.Module):
    """Fully connected classifier that outputs log-probabilities.

    Architecture: Linear(in_features, 256) -> ReLU -> Dropout(0.2) ->
    Linear(256, 256) -> ReLU -> Dropout(0.3) -> Linear(256, 128) -> ReLU ->
    Dropout(0.4) -> Linear(128, num_classes) -> LogSoftmax(dim=1).

    Args:
        ngpu: Stored on the instance as ``self.ngpu``; not used by ``forward``.
        in_features: Width of each input row. Defaults to 102, the value this
            model was originally hard-coded to.
        num_classes: Number of output classes. Defaults to 2.

    The layer attribute names (``fllc1`` ... ``fllc4``) are part of the
    checkpoint format via ``state_dict`` and must not be renamed.
    """

    def __init__(self, ngpu, in_features=102, num_classes=2):
        super().__init__()
        self.ngpu = ngpu

        # classifier
        self.fllc1 = nn.Linear(in_features, 256)
        self.actv1 = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout(p=0.2)
        self.fllc2 = nn.Linear(256, 256)
        self.actv2 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(p=0.3)
        self.fllc3 = nn.Linear(256, 128)
        self.actv3 = nn.ReLU(inplace=True)
        self.dropout3 = nn.Dropout(p=0.4)
        self.fllc4 = nn.Linear(128, num_classes)
        self.actv4 = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a 2-D batch ``x`` (one row per sample)."""
        # Three hidden stages, each: linear -> ReLU -> dropout
        hidden_stages = (
            (self.fllc1, self.actv1, self.dropout1),
            (self.fllc2, self.actv2, self.dropout2),
            (self.fllc3, self.actv3, self.dropout3),
        )
        y1 = x
        for linear, activation, dropout in hidden_stages:
            y1 = dropout(activation(linear(y1)))
        # Output head: class scores normalised to log-probabilities along dim 1
        return self.actv4(self.fllc4(y1))


In [None]:
# Build the classifier and place it on the selected device
utladversaryModel = UtlAdvModel(ngpu)
utladversaryModel = utladversaryModel.to(device)
# Wrap in DataParallel only when on CUDA with more than one GPU requested
wants_multi_gpu = device.type == 'cuda' and ngpu > 1
if wants_multi_gpu:
    gpu_ids = list(range(ngpu))
    utladversaryModel = nn.DataParallel(utladversaryModel, gpu_ids)

# Run weights_init over every sub-module; apply() returns the module, which is the cell output
utladversaryModel.apply(weights_init)

In [None]:
# Count the scalar entries across all parameter tensors of the model
parameter_sizes = [tensor.numel() for tensor in utladversaryModel.parameters()]
total_params = sum(parameter_sizes)
print(f"{total_params:,} total parameters.")

In [None]:
# Negative log-likelihood loss for the classifier
utladversaryCriterion = nn.NLLLoss()
# Adam over all model parameters, with the configured learning rate and beta1
adam_betas = (beta1, 0.999)
utladversaryOptimizer = optim.Adam(
    params=utladversaryModel.parameters(),
    lr=learning_rate,
    betas=adam_betas,
)

In [None]:
# Function - Save:
def save_model(name, number, model, res):
  """Write a checkpoint file and return True.

  The file holds a dict with ``res`` (passed through unchanged) and the
  model's ``state_dict``. It is written to
  ``saving_path + 'checkpoint-<name>-<number>.pth'``; ``saving_path`` is the
  module-level setting and is concatenated as-is, without a separator.
  """
  payload = dict(res=res, state_dict=model.state_dict())
  target_file = ''.join((saving_path, 'checkpoint-', name, '-', str(number), '.pth'))
  torch.save(payload, target_file)
  return True

In [None]:
# Function - Load:
def load_model(name, number, model, device):
  """Load a checkpoint into ``model`` and return it with the stored results.

  Reads ``saving_path + 'checkpoint-<name>-<number>.pth'`` (``saving_path``
  is the module-level setting), mapping stored tensors onto ``device``.
  The checkpoint's ``state_dict`` is loaded into ``model`` in place.

  Returns:
      dict with keys ``'model'`` (the same model object) and ``'res'``
      (the ``res`` entry stored in the checkpoint).
  """
  source_file = ''.join((saving_path, 'checkpoint-', name, '-', str(number), '.pth'))
  checkpoint = torch.load(source_file, map_location=device)
  # Read the stored results before touching the model's weights
  stored_results = checkpoint['res']
  model.load_state_dict(checkpoint['state_dict'])
  return dict(model=model, res=stored_results)

In [None]:
# On a first run, write checkpoint number 0 holding the current weights and an empty history
if isFirstRun:
  utladv_res = dict(
      train_losses=[],
      valid_losses=[],
      test_y1_acc=[],
      epoch_number=0,
  )
  save_model('ins', 0, utladversaryModel, utladv_res)

In [None]:
# Restore checkpoint number lastRunEpochNumber into the model and unpack its stored history
utladv_load = load_model('ins', lastRunEpochNumber, utladversaryModel, device)

saved_history = utladv_load['res']
train_losses, valid_losses, test_y1_acc = (
    saved_history[field] for field in ('train_losses', 'valid_losses', 'test_y1_acc')
)
last_epoch = saved_history['epoch_number']

In [None]:
def extract_class_index(labels):
  """Turn a batch of two-column one-hot labels into integer class indices.

  Column 0 of ``labels`` is returned as the class index, so a row whose first
  column is set maps to class 1 and any other row maps to class 0.

  The result is always int64: ``nn.NLLLoss`` expects integer class-index
  targets, while the incoming one-hot columns may be uint8 or bool depending
  on how they were encoded.

  Args:
      labels: 2-D tensor (or array-like convertible with ``torch.as_tensor``)
          with one row per sample.

  Returns:
      1-D int64 tensor with one class index per sample.
  """
  return torch.as_tensor(labels)[:, 0].long()

In [None]:
# Function - training function
def fit(model, train_loader, optimizer, criterion):
    """Train ``model`` for one pass over ``train_loader``.

    Relies on the module-level ``data_index`` (which element of each batch
    holds the target labels), ``device`` and ``extract_class_index``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable of batches whose element 0 is the input and
            whose element ``data_index`` is the one-hot label batch.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Mean of the per-batch loss values (float).

    Raises:
        ZeroDivisionError: if ``train_loader`` has length 0.
    """
    print('Training')
    model.train()

    train_loss = 0.0
    for data in tqdm(train_loader, total=len(train_loader)):
        inputs, labels = data[0], data[data_index]
        # Class-index targets must be an integer tensor for NLLLoss; the cast
        # keeps that true whatever dtype the one-hot labels were stored in.
        labels = extract_class_index(labels).long()
        inputs = inputs.to(torch.float32)
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        # Call the module itself rather than .forward() so registered hooks
        # run, matching how calcAccuracyTest invokes the model.
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss / len(train_loader)

In [None]:
# Function - validation function
def validate(model, valid_loader, criterion):
    """Compute the mean per-batch loss of ``model`` over ``valid_loader``.

    Runs in evaluation mode with gradients disabled. Relies on the
    module-level ``data_index``, ``device`` and ``extract_class_index``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        valid_loader: Iterable of batches whose element 0 is the input and
            whose element ``data_index`` is the one-hot label batch.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Mean of the per-batch loss values (float).

    Raises:
        ZeroDivisionError: if ``valid_loader`` has length 0.
    """
    print('Validating')
    model.eval()
    valid_loss = 0.0

    with torch.no_grad():
        for data in tqdm(valid_loader, total=len(valid_loader)):
            inputs, labels = data[0], data[data_index]
            # Class-index targets must be an integer tensor for NLLLoss
            labels = extract_class_index(labels).long()
            inputs = inputs.to(torch.float32)
            inputs, labels = inputs.to(device), labels.to(device)
            # Call the module itself rather than .forward() so registered
            # hooks run, matching how calcAccuracyTest invokes the model.
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
    return valid_loss / len(valid_loader)

In [None]:
# Calc Accuracy
def calcAccuracyTest(model, test_loader):
    """Return the fraction of samples in ``test_loader`` that ``model`` classifies correctly.

    Accuracy is computed as total correct predictions divided by total
    samples seen. Averaging per-batch accuracies instead would over-weight a
    smaller final batch; for equally sized batches both give the same value.

    Relies on the module-level ``data_index``, ``device`` and
    ``extract_class_index``. The model is moved to ``device`` and switched to
    evaluation mode.

    Raises:
        ZeroDivisionError: if ``test_loader`` yields no samples.
    """
    print('Testing')
    model.to(device)
    print("Calculating Accuracy...")
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for data in tqdm(test_loader, total=len(test_loader)):
            inputs, labels = data[0], data[data_index]
            labels = extract_class_index(labels)
            inputs = inputs.to(torch.float32)
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)
            # The highest-scoring class is the same before and after exp(),
            # so take it straight from the model output.
            predicted = output.argmax(dim=1)
            correct += (predicted == labels.view_as(predicted)).sum().item()
            seen += predicted.numel()
    return correct / seen

In [None]:
# Training loop: continues after the restored epoch and checkpoints as it goes
utladversaryModel.to(device)
save_every_epoch = 1

start = time.time()
print("Starting Training Loop...")

for epoch in range(last_epoch + 1, num_epochs + 1):
    print(f"Epoch {epoch}/{num_epochs}: ")
    # One training pass, then loss and accuracy measured on the test loader
    train_loss = fit(utladversaryModel, train_loader, utladversaryOptimizer, utladversaryCriterion)
    valid_loss = validate(utladversaryModel, test_loader, utladversaryCriterion)
    y1_accuracy = calcAccuracyTest(utladversaryModel, test_loader)

    # Extend the running histories with this epoch's numbers
    for history, value in (
        (train_losses, train_loss),
        (valid_losses, valid_loss),
        (test_y1_acc, y1_accuracy),
    ):
        history.append(value)

    utladv_res = dict(
        train_losses=train_losses,
        valid_losses=valid_losses,
        test_y1_acc=test_y1_acc,
        epoch_number=epoch,
    )
    is_save_epoch = (epoch % save_every_epoch == 0)
    if is_save_epoch:
        save_model('ins', epoch, utladversaryModel, utladv_res)

    print(f"Train Loss: {train_loss:.6f}")
    print(f"Valid Loss: {valid_loss:.6f}")
    print(f"Accuracy on Testset: {y1_accuracy:.6f}")

end = time.time()
elapsed_minutes = (end - start) / 60
print(f"Training time: {elapsed_minutes:.3f} minutes")

print('TRAINING COMPLETE')

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print('Loss plot...')

# Training and validation loss histories on one axes; the image is written under saving_path
loss_fig, loss_ax = plt.subplots(figsize=(10, 7))
loss_ax.set_title("Train-Valid Loss Trend")
loss_ax.plot(train_losses, color='green', label='Training Loss')
loss_ax.plot(valid_losses, color='blue', label='Validation Loss')
loss_ax.legend(frameon=False)
loss_ax.set_xlabel("epochs")
loss_ax.set_ylabel("Loss")
loss_fig.savefig(saving_path + "loss_plot.png")
plt.show()

In [None]:
# Test-set accuracy history; the image is written under saving_path
acc_fig, acc_ax = plt.subplots(figsize=(10, 7))
acc_ax.set_title("Accuracy Trend")
acc_ax.plot(test_y1_acc, color='green', label='Test set Accuracy')
acc_ax.legend(frameon=False)
acc_ax.set_xlabel("epochs")
acc_ax.set_ylabel("Accuracy")
acc_fig.savefig(saving_path + "accuracy_test_plot.png")
plt.show()