In [1]:
# The basics
import numpy as np
import pandas as pd

# Import Sklearn model libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


# Import Utility libraries
import shap
from collections import Counter
import pickle
from tqdm import tqdm_notebook
import warnings
import ipyparallel as ipp
import time
from termcolor import colored
import os
# Attack Utilities
# from create_models import create_models
# from attack import attack
# from attack_utils import baseline, load_data
# from attack_data import attack_dataset, run_engines

# Pre-run necessities
# Suppress every warning emitted through the `warnings` module from this point on.
warnings.filterwarnings('ignore')
# Pool of integer seeds. The driver cell passes rnds[0] to `attack`, which uses it
# as the `random_state` of its GradientBoostingClassifier.
# NOTE(review): the list contains repeated values (626 and 983 each appear twice) —
# confirm whether the seeds were meant to be unique.
rnds = [60, 452, 774, 802, 961, 626, 726, 211, 375, 448, 883, 684, 724, 333, 64, 646, 116, 714, 483, 73, 562, 703, 276, 394, 190, 675, 314, 604, 297, 266, 456, 845, 822, 529, 420, 605, 935, 733, 167, 603, 401, 948, 241, 734, 550, 65, 429, 470, 633, 627, 223, 713, 958, 40, 200, 641, 357, 778, 781, 498, 202, 349, 983, 125, 548, 331, 206, 556, 356, 805, 246, 626, 358, 393, 307, 792, 777, 169, 595, 279, 719, 902, 124, 197, 983, 499, 368, 864, 896, 887, 879, 224, 220, 926, 565, 173, 919, 3, 908, 941]

In [2]:
def read_file(run,dataset_name,influence_type,parent_dir=None):
    """Load the pickled attack arrays for one run / dataset / influence type.

    The file read is
    ``<parent_dir>/run<run>/<dataset_name>/<influence_type>/data.txt``.

    Parameters
    ----------
    run : str or int
        Run identifier; it is appended to the literal prefix ``'run'``.
    dataset_name : str
        Name of the dataset sub-directory.
    influence_type : str
        Name of the influence-type sub-directory.
    parent_dir : str, optional
        Root results directory. When omitted, the ``modules_ref_tuning``
        results folder that this notebook has always used is assumed.
        (An earlier variant of this function read from the sibling
        ``modules/`` folder instead.)

    Returns
    -------
    tuple
        ``(tar_val, tar_lab, ref_val, ref_lab, true_labels)`` in the order in
        which they are stored in the pickle.

    Raises
    ------
    FileNotFoundError
        If the data file does not exist.
    ValueError, TypeError
        If the unpickled object cannot be unpacked into exactly five items.
    """
    if parent_dir is None:
        parent_dir = '/home/amanmoha/influence_attack/results/actual/RF/modules_ref_tuning/'

    path = os.path.join(parent_dir, 'run' + str(run), dataset_name, influence_type, 'data.txt')

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at result files you produced yourself.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(path,'rb') as f:
        data = pickle.load(f)

    # Unpack explicitly so a malformed file fails here rather than downstream.
    tar_val_,tar_lab_,ref_val_,ref_lab_,true_labels_ = data

    return tar_val_,tar_lab_,ref_val_,ref_lab_,true_labels_

In [4]:
class attack():
    """Attack model: a GradientBoostingClassifier with optional grid-search tuning.

    Parameters
    ----------
    r
        Value forwarded as ``random_state`` to the classifier.
    want_tune
        When truthy, ``fit`` first grid-searches ``n_estimators`` and
        ``max_depth`` and applies the best combination before fitting.
    tune_scoring
        Passed unchanged as the ``scoring`` argument of ``GridSearchCV``.
    """

    def __init__(self,r,want_tune,tune_scoring):
        self.rs, self.want_tune, self.tune_scoring = r, want_tune, tune_scoring

    def set_parameters(self,model,x,y):
        """Grid-search ``model`` on ``(x, y)`` and return the best parameter dict."""
        search_space = {
            'n_estimators': [20, 50, 100, 150, 200],
            'max_depth': [3, 4, 5, 6, 7],
        }
        tuner = GridSearchCV(model, search_space, scoring=self.tune_scoring, n_jobs=-1)
        tuner.fit(x, y)
        return tuner.best_params_

    def fit(self,x,y):
        """Create the classifier, tune it if requested, then fit it on ``(x, y)``."""
        booster = GradientBoostingClassifier(random_state=self.rs)
        # Stored before tuning so the attribute exists even if tuning fails.
        self.attack_model = booster
        if self.want_tune:
            chosen = self.set_parameters(booster, x, y)
            booster.set_params(
                n_estimators=chosen['n_estimators'],
                max_depth=chosen['max_depth'],
            )
        booster.fit(x, y)

    def predict(self,x):
        """Return the fitted classifier's predictions for ``x``."""
        return self.attack_model.predict(x)

In [11]:
###################
# DRIVER
###################
# Option reference:
#   DATASET NAMES: adult_income, bcw, german_credit, pima_diabetes, hepatitis, heart_disease
#   KNOWLEDGE: disjoint, shared
#   INFLUENCE TYPE: SHAP, DP_impurity, DP_threshold, DP_importance, DP_nodesamples
#   DOUBLE INFLUENCE: False, True

## INITIALIZATIONS
dataset_names = ['german_credit']
attack_params = {'tuning':True,
                'scoring':'f1'}
run = '4'
influence_type = 'SHAP'
for dataset_name in tqdm_notebook(dataset_names,desc="Datasets"):
    tar_val,tar_lab,ref_val,ref_lab,true_labels = read_file(run,dataset_name,influence_type)
    # Running sums of the attack metrics on the target data; averaged when reported.
    ac,ps,rs,fs=0,0,0,0
    # Running sums of the same metrics on the reference data the attack model is fitted on.
    ac_ref,ps_ref,rs_ref,fs_ref=0,0,0,0
    # Target-model accuracies are never computed in this notebook. The names stay
    # defined (always 0) only in case another cell still refers to them; they are
    # deliberately no longer printed as if they were measurements.
    tr_ac,te_ac=0,0
    #################################
    # print information to screen
    #################################
    print(colored("###################################################",'blue'))
    print(colored("Dataset used: ",'blue'), dataset_name)
    print(colored("###################################################\n",'blue'))
    n_states = tar_val.shape[0]
    for i in tqdm_notebook(range(n_states),desc='Different random states'):
        # Squeeze each array once per state instead of at every use.
        x_ref, y_ref = np.squeeze(ref_val[i]), np.squeeze(ref_lab[i])
        x_tar, y_tar = np.squeeze(tar_val[i]), np.squeeze(tar_lab[i])

        # NOTE(review): every state uses the same seed rnds[0] — confirm this is intended
        # rather than a per-state seed.
        obj = attack(rnds[0],want_tune=attack_params['tuning'],tune_scoring=attack_params['scoring'])
        obj.fit(x_ref,y_ref)
        preds = obj.predict(x_tar)
        preds_ref = obj.predict(x_ref)
        ## METRICS RECORD
        ac+=accuracy_score(y_tar,preds)
        ps+=precision_score(y_tar,preds)
        rs+=recall_score(y_tar,preds)
        fs+=f1_score(y_tar,preds)

        ac_ref+=accuracy_score(y_ref,preds_ref)
        ps_ref+=precision_score(y_ref,preds_ref)
        rs_ref+=recall_score(y_ref,preds_ref)
        fs_ref+=f1_score(y_ref,preds_ref)

    print(colored("################# RESULTS #################\n",'blue'))
    print("Attack accuracy:\t {c}\nAttack precision:\t {d}\nAttack recall:\t\t {e}\nAttack f1:\t\t {f}\n".format(c=ac/n_states,d=ps/n_states,e=rs/n_states,f=fs/n_states))

HBox(children=(IntProgress(value=0, description='Datasets', max=1, style=ProgressStyle(description_width='init…

[34m###################################################[0m
[34mDataset used: [0m german_credit
[34m###################################################
[0m


HBox(children=(IntProgress(value=0, description='Different random states', max=30, style=ProgressStyle(descrip…

[34m################# RESULTS #################
[0m
Target train accuracy:	 0.0
Target test accuracy:	 0.0
Attack accuracy:	 0.6008666666666668
Attack precision:	 0.59177290944961
Attack recall:		 0.6577333333333334
Attack f1:		 0.6136675024697701




In [12]:
# Mean accuracy, precision, recall and F1 of the attack model on the reference data it
# was fitted on, averaged over the n_states iterations of the driver loop. The values
# belong to the last dataset the driver processed.
ac_ref/n_states,ps_ref/n_states,rs_ref/n_states,fs_ref/n_states

(0.9147200000000001,
 0.9249598963732808,
 0.9002666666666665,
 0.9121698382298988)

In [6]:
# Interactive documentation lookup; prints the estimator's full help text as cell output.
# NOTE(review): looks like leftover exploration — consider removing it from the final notebook.
help(GradientBoostingClassifier)

Help on class GradientBoostingClassifier in module sklearn.ensemble._gb:

class GradientBoostingClassifier(sklearn.base.ClassifierMixin, BaseGradientBoosting)
 |  Gradient Boosting for classification.
 |  
 |  GB builds an additive model in a
 |  forward stage-wise fashion; it allows for the optimization of
 |  arbitrary differentiable loss functions. In each stage ``n_classes_``
 |  regression trees are fit on the negative gradient of the
 |  binomial or multinomial deviance loss function. Binary classification
 |  is a special case where only a single regression tree is induced.
 |  
 |  Read more in the :ref:`User Guide <gradient_boosting>`.
 |  
 |  Parameters
 |  ----------
 |  loss : {'deviance', 'exponential'}, default='deviance'
 |      loss function to be optimized. 'deviance' refers to
 |      deviance (= logistic regression) for classification
 |      with probabilistic outputs. For loss 'exponential' gradient
 |      boosting recovers the AdaBoost algorithm.
 |  
 |  learni