In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from IPython import display
import random
import scipy.stats
from sklearn import metrics
from sklearn.metrics import roc_curve
import os

## Attack Metrics Tables

This notebook generates tables that summarize the AUC and the TPR at FPR = 0.01 and FPR = 0.001 for a given attack setting. Each table compares the attack with a baseline: either Shokri et al.'s explanation variance thresholding or Carlini et al.'s loss LiRA.

For display purposes, we show the following attack setting:
* Dataset: CIFAR10
* Explanation type: Input * Gradient (IXG)
* Model: vit_small_patch16_224
* Attack type: L1-LRT

### Comparing L1-LRT with Shokri et al.'s Thresholding

In [13]:
# --- Attack setting ---------------------------------------------------------
# Notebook-level names; most of them are interpolated into the attack_data/
# input paths and the attack_metrics/ output path below.
num_experiments=33  # not read in this cell
epochs_nondp = [30]
model='vit_small_patch16_224' # 'vit_relpos_small_patch16_224.sw_in1k'
clipping_mode_nondp='nonDP'
num_samples=20000  # not read in this cell
data='CIFAR10'
exp_types=['ixg']
attack_type = 'L1 Norm'

# Column labels of the summary table, in the order the row values are produced.
# NOTE(review): the labels are spelled inconsistently ('tpr_01_mean' vs
# 'tpr01_sd') and do not match the keys of `row_tpr` below; values are matched
# to columns purely by position. The labels are kept as they are so that the
# header of the CSV written at the end of this cell does not change.
thres_columns = [
    'explanation_type', 'epochs',
    'tpr_01_mean', 'tpr01_sd',
    'tpr_01_thres_mean', 'tpr01_thres_sd',
    'tpr_001_mean', 'tpr_001_sd',
    'tpr_001_thres_mean', 'tpr_001_thres_sd',
    'auc_mean', 'auc_sd',
    'auc_thres_mean', 'auc_thres_sd']

# One list of values per (explanation type, epochs) setting, in column order.
thres_rows = []

for exp_type in tqdm(exp_types):
    for ep in epochs_nondp:
        # All six metric files of one setting share this file name; only the
        # directory differs.
        run_name = f'model={model}_mode={clipping_mode_nondp}_type={exp_type}_epochs={ep}.txt'

        # TPR files of the attack.
        tpr01_file = f'attack_data/tpr01_{data}_{attack_type}/{run_name}'
        tpr01_list = np.loadtxt(tpr01_file)
        tpr001_file = f'attack_data/tpr001_{data}_{attack_type}/{run_name}'
        tpr001_list = np.loadtxt(tpr001_file)

        # TPR files of the thresholding baseline ('thres' directories).
        tpr01_thres_file = f'attack_data/tpr01thres_{data}_{attack_type}/{run_name}'
        tpr01_thres_list = np.loadtxt(tpr01_thres_file)
        tpr001_thres_file = f'attack_data/tpr001thres_{data}_{attack_type}/{run_name}'
        tpr001_thres_list = np.loadtxt(tpr001_thres_file)

        # AUC files of the attack and of the thresholding baseline.
        auc_file = f'attack_data/auc_{data}_{attack_type}/{run_name}'
        auc_list = np.loadtxt(auc_file)
        auc_thres_file = f'attack_data/aucthres_{data}_{attack_type}/{run_name}'
        auc_thres_list = np.loadtxt(auc_thres_file)

        # Mean and standard deviation over the values stored in each file.
        # np.std is called with its default ddof=0 (population standard deviation).
        # The insertion order of the keys must follow `thres_columns`.
        row_tpr = {
            'explanation_type': exp_type,
            'epochs': ep,
            'tpr01_mean': np.mean(tpr01_list),
            'tpr01_sd': np.std(tpr01_list),
            'tpr01_thres_mean': np.mean(tpr01_thres_list),
            'tpr01_thres_sd': np.std(tpr01_thres_list),
            'tpr001_mean': np.mean(tpr001_list),
            'tpr001_sd': np.std(tpr001_list),
            'tpr001_thres_mean': np.mean(tpr001_thres_list),
            'tpr001_thres_sd': np.std(tpr001_thres_list),
            'auc_mean': np.mean(auc_list),
            'auc_sd': np.std(auc_list),
            'auc_thres_mean': np.mean(auc_thres_list),
            'auc_thres_sd': np.std(auc_thres_list)
        }
        print(row_tpr)
        thres_rows.append(list(row_tpr.values()))

# Build the frame once from all collected rows instead of enlarging it row by
# row; pandas raises if a row length does not match the number of columns.
tpr_new_vs_thres_df = pd.DataFrame(thres_rows, columns=thres_columns)

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs('attack_metrics', exist_ok=True)
tpr_new_vs_thres_df.to_csv(f'attack_metrics/tprs_nondp_liravsthres_{data}_{attack_type}.csv')
tpr_new_vs_thres_df

  0%|          | 0/1 [00:00<?, ?it/s]

{'explanation_type': 'ixg', 'epochs': 30, 'tpr01_mean': 0.15621818181818184, 'tpr01_sd': 0.015731987382312674, 'tpr01_thres_mean': 0.011154545454545453, 'tpr01_thres_sd': 0.0027355228749635844, 'tpr001_mean': 0.0921939393939394, 'tpr001_sd': 0.018633660311446795, 'tpr001_thres_mean': 0.0015515151515151514, 'tpr001_thres_sd': 0.0008464260737095001, 'auc_mean': 0.6380133166666668, 'auc_sd': 0.007334452739926561, 'auc_thres_mean': 0.5590237734848484, 'auc_thres_sd': 0.0057550229672714585}


Unnamed: 0,explanation_type,epochs,tpr_01_mean,tpr01_sd,tpr_01_thres_mean,tpr01_thres_sd,tpr_001_mean,tpr_001_sd,tpr_001_thres_mean,tpr_001_thres_sd,auc_mean,auc_sd,auc_thres_mean,auc_thres_sd
0,ixg,30,0.156218,0.015732,0.011155,0.002736,0.092194,0.018634,0.001552,0.000846,0.638013,0.007334,0.559024,0.005755


### Comparing L1-LRT with Carlini et al.'s Loss LiRA (CIFAR10, non-DP models only, 33 runs)

In [3]:
# --- Attack setting ---------------------------------------------------------
# Notebook-level names; most of them are interpolated into the attack_data/
# input paths and the attack_metrics/ output path below.
num_experiments=33  # not read in this cell
epochs_nondp = [30]
model='vit_small_patch16_224' # 'vit_relpos_small_patch16_224.sw_in1k'
clipping_mode_nondp='nonDP'
num_samples=20000  # not read in this cell
data='CIFAR10'
exp_types=['ixg']
attack_type = 'L1 Norm'

# Column labels of the summary table, in the order the row values are produced.
# NOTE(review): the labels are spelled inconsistently ('tpr_01_mean' vs
# 'tpr01_sd') and do not match the keys of `row_tpr` below; values are matched
# to columns purely by position. The labels are kept as they are so that the
# header of the CSV written at the end of this cell does not change.
loss_columns = [
    'explanation_type', 'epochs',
    'tpr_01_mean', 'tpr01_sd',
    'tpr_01_loss_mean', 'tpr01_loss_sd',
    'tpr_001_mean', 'tpr_001_sd',
    'tpr_001_loss_mean', 'tpr_001_loss_sd',
    'auc_mean', 'auc_sd',
    'auc_loss_mean', 'auc_loss_sd']
# nonDP

# One list of values per (explanation type, epochs) setting, in column order.
loss_rows = []

for exp_type in tqdm(exp_types):
    for ep in epochs_nondp:
        # File name shared by the metric files of the attack. The loss-baseline
        # files use a name without the `_type=...` part.
        attack_run_name = f'model={model}_mode={clipping_mode_nondp}_type={exp_type}_epochs={ep}.txt'
        loss_run_name = f'model={model}_mode={clipping_mode_nondp}_epochs={ep}.txt'

        # TPR files of the attack.
        tpr01_file = f'attack_data/tpr01_{data}_{attack_type}/{attack_run_name}'
        tpr01_list = np.loadtxt(tpr01_file)
        tpr001_file = f'attack_data/tpr001_{data}_{attack_type}/{attack_run_name}'
        tpr001_list = np.loadtxt(tpr001_file)

        # TPR files of the loss baseline ('Losses' directories).
        tpr01_loss_file = f'attack_data/tpr01_{data}_Losses/{loss_run_name}'
        tpr01_loss_list = np.loadtxt(tpr01_loss_file)
        tpr001_loss_file = f'attack_data/tpr001_{data}_Losses/{loss_run_name}'
        tpr001_loss_list = np.loadtxt(tpr001_loss_file)

        # AUC files of the attack and of the loss baseline.
        auc_file = f'attack_data/auc_{data}_{attack_type}/{attack_run_name}'
        auc_list = np.loadtxt(auc_file)
        auc_loss_file = f'attack_data/auc_{data}_Losses/{loss_run_name}'
        auc_loss_list = np.loadtxt(auc_loss_file)

        # Mean and standard deviation over the values stored in each file.
        # np.std is called with its default ddof=0 (population standard deviation).
        # The insertion order of the keys must follow `loss_columns`.
        row_tpr = {
            'explanation_type': exp_type,
            'epochs': ep,
            'tpr01_mean': np.mean(tpr01_list),
            'tpr01_sd': np.std(tpr01_list),
            'tpr01_loss_mean': np.mean(tpr01_loss_list),
            'tpr01_loss_sd': np.std(tpr01_loss_list),
            'tpr001_mean': np.mean(tpr001_list),
            'tpr001_sd': np.std(tpr001_list),
            'tpr001_loss_mean': np.mean(tpr001_loss_list),
            'tpr001_loss_sd': np.std(tpr001_loss_list),
            'auc_mean': np.mean(auc_list),
            'auc_sd': np.std(auc_list),
            'auc_loss_mean': np.mean(auc_loss_list),
            'auc_loss_sd': np.std(auc_loss_list)
        }
        print(row_tpr)
        loss_rows.append(list(row_tpr.values()))

# Build the frame once from all collected rows instead of enlarging it row by
# row; pandas raises if a row length does not match the number of columns.
tpr_new_vs_loss_df = pd.DataFrame(loss_rows, columns=loss_columns)

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs('attack_metrics', exist_ok=True)
tpr_new_vs_loss_df.to_csv(f'attack_metrics/tprs_nondp_liravsloss_{data}_{attack_type}.csv')
tpr_new_vs_loss_df

  0%|          | 0/1 [00:00<?, ?it/s]

{'explanation_type': 'ixg', 'epochs': 30, 'tpr01_mean': 0.15621818181818184, 'tpr01_sd': 0.015731987382312674, 'tpr01_loss_mean': 0.0946212121212121, 'tpr01_loss_sd': 0.011575830556647619, 'tpr001_mean': 0.0921939393939394, 'tpr001_sd': 0.018633660311446795, 'tpr001_loss_mean': 0.05428484848484849, 'tpr001_loss_sd': 0.0071114555988643375, 'auc_mean': 0.6380133166666668, 'auc_sd': 0.007334452739926561, 'auc_loss_mean': 0.5696043016666666, 'auc_loss_sd': 0.009400881991053203}


Unnamed: 0,explanation_type,epochs,tpr_01_mean,tpr01_sd,tpr_01_loss_mean,tpr01_loss_sd,tpr_001_mean,tpr_001_sd,tpr_001_loss_mean,tpr_001_loss_sd,auc_mean,auc_sd,auc_loss_mean,auc_loss_sd
0,ixg,30,0.156218,0.015732,0.094621,0.011576,0.092194,0.018634,0.054285,0.007111,0.638013,0.007334,0.569604,0.009401
