In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import random

In [None]:
# Load the Adult data set from a comma-separated file in the working directory.
adult_data = pd.read_csv('adult_data.csv', sep=',')

In [None]:
# Drop rows containing missing values, renumber the index, and remove the
# 'fnlwgt' and 'education' columns, which are not used as model features.
adult_data = (
    adult_data
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['fnlwgt', 'education'])
)

# Column names and dtypes, followed by the first ten rows.
adult_data.info()
adult_data.head(10)

In [None]:
# Collapse the detailed marital-status values into 'Married' / 'Unmarried'.
# The result is assigned back to the column: calling
# `adult_data[col].replace(..., inplace=True)` operates on a temporary Series
# (chained assignment) and is not guaranteed to modify `adult_data`.
marital_status_groups = {
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
    'Divorced': 'Unmarried',
    'Never-married': 'Unmarried',
    'Separated': 'Unmarried',
    'Widowed': 'Unmarried',
}
adult_data['marital-status'] = adult_data['marital-status'].replace(marital_status_groups)

In [None]:
# Convert every object-dtype column to pandas' categorical dtype so that the
# integer category codes can be used as the encoding later on.
obj_columns = adult_data.select_dtypes(['object']).columns
adult_data[obj_columns] = adult_data[obj_columns].astype('category')

In [None]:
# Convert int64 columns to floats
num_columns = adult_data.select_dtypes(['int64']).columns
adult_data[num_columns] = adult_data[num_columns].astype('float64')

# Encode each categorical column as its integer category code and keep a
# {code: label} dictionary per column so the codes can be decoded for display.
categorical_columns = ['income', 'marital-status', 'occupation', 'relationship',
                       'race', 'gender', 'native-country', 'workclass']
label_maps = {}
for column in categorical_columns:
    label_maps[column] = dict(enumerate(adult_data[column].cat.categories))
    adult_data[column] = adult_data[column].cat.codes

# Per-column names used by the printing cells further down. The income mapping
# previously ended up in `marital_status` and was overwritten straight away.
income = label_maps['income']
marital_status = label_maps['marital-status']
occupation = label_maps['occupation']
relationship = label_maps['relationship']
race = label_maps['race']
gender = label_maps['gender']
native_country = label_maps['native-country']
workclass = label_maps['workclass']

# The category codes are int8; convert them to floats like the other features.
num_columns = adult_data.select_dtypes(['int8']).columns
adult_data[num_columns] = adult_data[num_columns].astype('float64')

# info() prints its report itself and returns None, so it is not passed to display().
adult_data.info()
display(adult_data.head(10))

In [None]:
# Convert the data set from a pandas DataFrame to a NumPy array.
# NOTE(review): this rebinds `adult_data`, so the cell cannot be re-run without
# re-running the cells above that build the DataFrame.
adult_data = adult_data.to_numpy()

In [None]:
# Split the data into training (80%) and test (20%) sets with a fixed seed.
# All columns but the last are features; the last column is the label.
# NOTE(review): presumably 'income' is the last column of the CSV - confirm.
X_train, X_test, y_train, y_test = train_test_split(adult_data[:,:-1],adult_data[:,-1], test_size=0.2, random_state=92)


In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split; it is reused later to undo the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Feature names, used when printing records further down.
# NOTE(review): assumed to follow the column order of the feature matrix - confirm.
names = ['age','workclass','educational-num','marital-status','occupation','relationship','race','gender','capital-gain','capital-loss','hours-per-week','native-country']

In [None]:
## train data
class TrainData(Dataset):
    """Dataset pairing each feature row with its label."""

    def __init__(self, X_data, y_data):
        # Features and labels are stored as given and indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of records, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label


# Wrap the training features and labels as float tensors in a Dataset.
train_data = TrainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train))
## test data
class TestData(Dataset):
    """Dataset exposing feature rows only (no labels)."""

    def __init__(self, X_data):
        # Features are stored as given.
        self.X_data = X_data

    def __len__(self):
        """Number of records in the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        row = self.X_data[index]
        return row
    

# Wrap the test features as a float tensor; labels stay in `y_test`.
test_data = TestData(torch.FloatTensor(X_test))

In [None]:
# Hyperparameters of the original model.
# BATCH_SIZE is used by the training DataLoader and LEARNING_RATE by the Adam
# optimizers below; EPOCHS is not referenced in the cells shown here.
EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [None]:
# Data loaders for the provider model.
# The test loader yields one record at a time and is not shuffled: the
# evaluation functions below call .item() on each prediction and look up the
# label by the record's position in the loader.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
# models

class provider_model(nn.Module):
    """Provider-side binary classifier: 12 inputs -> 100 -> 100 -> 1 logit.

    ``forward`` returns raw logits; apply ``torch.sigmoid`` to obtain
    probabilities.
    """

    def __init__(self):
        # The previous ``super(BinaryClassification, self)`` referenced a class
        # that is not defined in this notebook and raised NameError.
        super().__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(12, 100)
        self.layer_2 = nn.Linear(100, 100)
        self.layer_out = nn.Linear(100, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(100)
        self.batchnorm2 = nn.BatchNorm1d(100)

    def forward(self, inputs):
        """Return one logit per input row.

        Each hidden layer is Linear -> ReLU -> BatchNorm; dropout is applied
        once, before the output layer.
        """
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x
class user_model(nn.Module):
    """User-side surrogate binary classifier: 12 inputs -> 64 -> 64 -> 1 logit.

    ``forward`` returns raw logits; apply ``torch.sigmoid`` to obtain
    probabilities.
    """

    def __init__(self):
        # The previous ``super(BinaryClassification, self)`` referenced a class
        # that is not defined in this notebook and raised NameError.
        super().__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(12, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        """Return one logit per input row.

        Each hidden layer is Linear -> ReLU -> BatchNorm; dropout is applied
        once, before the output layer.
        """
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Build the provider model and restore its trained weights.
model = provider_model()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load('original_model.pth', map_location=device))
# Inference mode: dropout is disabled and batch norm uses its running statistics.
model.eval()

In [None]:
# Build the user's surrogate model and restore its trained weights.
local_model = user_model()
local_model.to(device)
print(local_model)
# Note: `criterion` and `optimizer` replace the objects created for the provider model.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(local_model.parameters(), lr=LEARNING_RATE)
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
# NOTE(review): the checkpoint path is an absolute Google Drive path and only
# resolves in an environment where that drive is mounted.
local_model.load_state_dict(
    torch.load('/content/gdrive/MyDrive/work/adult/local_model.pth', map_location=device)
)
# Inference mode: dropout is disabled and batch norm uses its running statistics.
local_model.eval()

In [None]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of logits ``y_pred`` against ``y_test`` in percent.

    The logits are passed through a sigmoid and rounded to 0/1, compared
    element-wise with ``y_test``, and the share of matches (relative to
    ``y_test.shape[0]``) is returned as a tensor rounded to a whole percent.
    """
    predicted = torch.sigmoid(y_pred).round()
    n_correct = predicted.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [None]:
# check for the wrongly classified records by the provider model
def test(original_model, device, local_test_loader, targets):

    # Accuracy counter
    correct = 0
    wrong_examples = []
    true_labels = []
    wrong_labels = []
    counter = 0
    bad_answer = 0

    # Loop over all examples in test set
    for temp in local_test_loader:
        counter += 1
        data = temp
        target = targets[counter-1]

        # Send the data and label to the device
        data = data.to(device) 

        # Forward pass the data through the model
        model.eval()
        with torch.no_grad():
            output = model(data)
            output = torch.sigmoid(output)
            final_pred = torch.round(output)
        if final_pred.item() == target:
            # print('ture')
            correct += 1
        else:
            wrong_examples.append(data)
            true_labels.append(target)
            wrong_labels.append(final_pred.item())
            bad_answer  = bad_answer+1

    
    # Calculate final accuracy for this epsilon
    final_acc = correct/float(len(local_test_loader))

    # Return the accuracy and an adversarial example
    return final_acc, wrong_examples, true_labels, wrong_labels

In [None]:
# Evaluate the provider model on the test set and keep the misclassified
# records together with their true labels and the (wrong) predictions.
final_acc, wrong_examples, true_labels, wrong_labels = test(model, device, test_loader, y_test)

In [None]:
# Number of test records the provider model misclassified.
len(wrong_examples)

In [None]:
# Provider prediction for the misclassified record at index 499.
# NOTE(review): raises IndexError when fewer than 500 records were misclassified.
wrong_labels[499]

In [None]:
# Create counterfactual examples for the records the provider model
# misclassified: step each record against the surrogate model's loss gradient
# (computed towards the true label) with a growing step size until the
# provider model predicts the true label.
perturbed_examples = []
original_wrong_examples = []
new_labels = []
eps_all = []
counter = 0  # total number of provider-model evaluations
# the list of features the user selects to be changed
list_of_features = [0,2,10]

local_model.eval()
model.eval()

for i in range(len(wrong_examples)):
    x, true_label = wrong_examples[i], float(true_labels[i])
    y = torch.tensor([[true_label]]).to(device)

    # The gradient is always taken at the unperturbed record, so it does not
    # depend on eps and is computed once per record.
    probe = x.clone()
    probe.requires_grad = True
    # BCEWithLogitsLoss applies the sigmoid itself, so it receives raw logits.
    loss = criterion(local_model(probe), y)
    loss.backward()
    img_grad = probe.grad.data

    # Restrict the step to the features the user allows to change; an empty
    # list means every feature may change.
    if list_of_features:
        selected = torch.zeros_like(img_grad, dtype=torch.bool)
        for h in range(img_grad.shape[1]):
            if h in list_of_features:
                selected[0][h] = True
        step_direction = torch.where(selected, img_grad, torch.zeros_like(img_grad))
    else:
        step_direction = img_grad

    eps = 0.0
    while True:
        counter = counter + 1
        with torch.no_grad():
            perturbed_image = x - eps * step_direction
            output = torch.round(torch.sigmoid(model(perturbed_image)))
        new_label = output.item()
        if new_label == true_label:
            perturbed_examples.append(perturbed_image.squeeze().detach().cpu().numpy())
            original_wrong_examples.append(x.squeeze().detach().cpu().numpy())
            new_labels.append(new_label)
            eps_all.append(eps)
            print("Image {} has been modified with epsilon {}".format(i, eps))
            break
        eps += 0.5
        # Give up on this record once the step size exceeds 100.
        if eps > 100:
            print('there are not example')
            break

In [None]:
# Undo the standardization so the records are back in the original feature units.
# NOTE(review): presumably fails when no counterfactual was found (empty lists) - confirm.
inversed_perturbed_examples = scaler.inverse_transform(perturbed_examples)
inversed_original_wrong_examples = scaler.inverse_transform(original_wrong_examples)

In [None]:
# Print the first counterfactual examples, decoding categorical codes to labels.
temp = inversed_perturbed_examples
# Column index -> {code: label} lookup for the categorical features; every
# other column is printed as a rounded number.
decoders = {1: workclass, 3: marital_status, 4: occupation, 5: relationship,
            6: race, 7: gender, 11: native_country}
# Show at most 20 records; fewer when fewer are available.
for j in range(min(20, len(temp))):
    print('the sample number ', j)
    print('----------------------------------')
    for i, name in enumerate(names):
        value = round(temp[j][i])
        if i in decoders:
            # Fall back to the raw code if it has no label.
            print(name, decoders[i].get(value, value))
        else:
            print(name, value)
    print('----------------------------------')

In [None]:
# Print the first original (misclassified) records, decoding categorical codes to labels.
temp = inversed_original_wrong_examples
# Column index -> {code: label} lookup for the categorical features; every
# other column is printed as a rounded number.
decoders = {1: workclass, 3: marital_status, 4: occupation, 5: relationship,
            6: race, 7: gender, 11: native_country}
# Show at most 20 records; fewer when fewer are available.
for j in range(min(20, len(temp))):
    print('the sample number ', j)
    print('----------------------------------')
    for i, name in enumerate(names):
        value = round(temp[j][i])
        if i in decoders:
            # Fall back to the raw code if it has no label.
            print(name, decoders[i].get(value, value))
        else:
            print(name, value)
    print('----------------------------------')

In [None]:
# Save the counterfactual examples as comma-separated text.
# Note: the standardized values (`perturbed_examples`) are written, not the
# inverse-transformed ones.
np.savetxt("explanation.csv", perturbed_examples, delimiter=",")

In [None]:
# check for the correctly classified records by the provider model
def test_images_true_classified(original_model, device, local_test_loader, targets):
    correct_examples = []
    true_labels = []
    wrong_labels = []
    counter = 0


  # Loop over all examples in test set
    for temp in local_test_loader:
        counter += 1
        data = temp
        target = targets[counter-1]
  # Send the data and label to the device
        data = data.to(device) 
    # Forward pass the data through the model
        original_model.eval()
        with torch.no_grad():
            output = original_model(data)
            output = torch.sigmoid(output)
            final_pred = torch.round(output)
            if final_pred.item() == target:
                correct_examples.append(data)
                true_labels.append(final_pred.item())
            if final_pred.item() == 0:
                wrong_labels.append(1)
            if final_pred.item() == 1:
                wrong_labels.append(0)

    # Return the accuracy and an adversarial example
    return  correct_examples, true_labels, wrong_labels




In [None]:
# Collect the test records the provider model classifies correctly, its
# predictions for them, and the opposite labels used as counterfactual targets.
correct_created_examples, true_predictions, wrong_target = test_images_true_classified(model, device, test_loader, y_test)

In [None]:
# Number of test records the provider model classified correctly.
len(correct_created_examples)

In [None]:
# Create counterfactual examples for the correctly classified records: step
# each record against the surrogate model's loss gradient (computed towards
# the opposite class) with a growing step size until the provider model's
# prediction flips.
new_labels = []
eps_all = []
original_correct_examples = []
counter = 0  # total number of provider-model evaluations
adversarial_example_correct_examples=[]
# the list of features the user selects to be changed
list_of_features = [0,2,10]

local_model.eval()
model.eval()

for i in range(len(correct_created_examples)):
    x, prediction = correct_created_examples[i], true_predictions[i]
    # The counterfactual target is the class opposite to the provider's
    # prediction. It is derived from `true_predictions`, which is built
    # together with `correct_created_examples`, so it always refers to record i.
    target_value = 1.0 - float(prediction)
    target = torch.tensor([[target_value]]).to(device)

    # The gradient is always taken at the unperturbed record, so it does not
    # depend on eps and is computed once per record.
    probe = x.clone()
    probe.requires_grad = True
    # BCEWithLogitsLoss applies the sigmoid itself, so it receives raw logits.
    loss = criterion(local_model(probe), target)
    loss.backward()
    img_grad = probe.grad.data

    # Restrict the step to the features the user allows to change; an empty
    # list means every feature may change.
    if list_of_features:
        selected = torch.zeros_like(img_grad, dtype=torch.bool)
        for h in range(img_grad.shape[1]):
            if h in list_of_features:
                selected[0][h] = True
        step_direction = torch.where(selected, img_grad, torch.zeros_like(img_grad))
    else:
        step_direction = img_grad

    eps = 0.0
    while True:
        counter = counter + 1
        with torch.no_grad():
            perturbed_record = x - eps * step_direction
            output = torch.round(torch.sigmoid(model(perturbed_record)))
        new_label = output.item()
        if new_label == target_value:
            adversarial_example_correct_examples.append(perturbed_record.squeeze().detach().cpu().numpy())
            original_correct_examples.append(x.squeeze().detach().cpu().numpy())
            new_labels.append(new_label)
            eps_all.append(eps)
            print("Image {} has been modified with epsilon {}".format(i, eps))
            break
        eps += 20
        # Give up on this record once the step size exceeds 500.
        if eps > 500:
            print('record number {} there are not example'.format(i))
            break

In [None]:
# Undo the standardization so the records are back in the original feature units.
# NOTE(review): presumably fails when no counterfactual was found (empty lists) - confirm.
inversed_perturbed_record = scaler.inverse_transform(adversarial_example_correct_examples)
inversed_original_correct_examples = scaler.inverse_transform(original_correct_examples)

In [None]:
# Print the first counterfactual examples, decoding categorical codes to labels.
temp = inversed_perturbed_record
# Column index -> {code: label} lookup for the categorical features; every
# other column is printed as a rounded number.
decoders = {1: workclass, 3: marital_status, 4: occupation, 5: relationship,
            6: race, 7: gender, 11: native_country}
# Show at most 20 records; fewer when fewer are available.
for j in range(min(20, len(temp))):
    print('the sample number ', j)
    print('----------------------------------')
    for i, name in enumerate(names):
        value = round(temp[j][i])
        if i in decoders:
            # Fall back to the raw code if it has no label.
            print(name, decoders[i].get(value, value))
        else:
            print(name, value)
    print('----------------------------------')

In [None]:
# Print the first original (correctly classified) records, decoding categorical codes to labels.
temp = inversed_original_correct_examples
# Column index -> {code: label} lookup for the categorical features; every
# other column is printed as a rounded number.
decoders = {1: workclass, 3: marital_status, 4: occupation, 5: relationship,
            6: race, 7: gender, 11: native_country}
# Show at most 20 records; fewer when fewer are available.
for j in range(min(20, len(temp))):
    print('the sample number ', j)
    print('----------------------------------')
    for i, name in enumerate(names):
        value = round(temp[j][i])
        if i in decoders:
            # Fall back to the raw code if it has no label.
            print(name, decoders[i].get(value, value))
        else:
            print(name, value)
    print('----------------------------------')

In [None]:
# Save the counterfactual examples as comma-separated text.
# Note: the standardized values are written, not the inverse-transformed ones.
np.savetxt("explanation_correct.csv", adversarial_example_correct_examples, delimiter=",")