In [1]:
import random
import time
import copy
from collections import Counter
import csv
import scipy
import scipy.stats as stats

from utils import *
from datasets import *
from mdav import *
from train import *
from models import *
from attacks import *

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset, Subset



from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics

%matplotlib inline

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

# Silence noisy warnings for the rest of the notebook: non-convergence,
# failed fits, and generic user warnings are all ignored.
for _ignored_category in (ConvergenceWarning, FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Shared one-hot encoder for label arrays (dense output, categories inferred on fit).
# Assuming y_test and y_forget are arrays of class indices
encoder = OneHotEncoder(categories="auto", sparse_output=False)


In [3]:
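# NOTE: seeding is currently disabled, so the random train/test split and the
# random forget/MIA sampling in the cells below differ from run to run.
# def seed_everything(seed=7):
#     np.random.seed(seed)
#     np.random.seed(seed)
#     random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
    
# seed_everything(seed=7)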

In [None]:
# Step 1: Get dataset
#
# Loads the heart dataset, makes a random ~80/20 train/test split, standardises
# the features, cuts the training data into `num_shards` contiguous shards and,
# inside every shard, randomly marks `forget_ratio` of the rows as the forget
# set (the remainder is the retain set). Finally builds the base XGBoost model
# that later cells deep-copy for each shard.

df = pd.read_csv('data/heart/cardio_train.csv', sep=';')
df.drop(columns=['id'], inplace=True)
df.dropna(inplace=True)

split_ratio = 0.8  # 80% for the first DataFrame, 20% for the second
# Perform the random split (row-wise Bernoulli draw, so the sizes are approximate)
mask = np.random.rand(len(df)) < split_ratio
# reset_index returns new frames, which avoids mutating a slice of `df` in place
# (the in-place variant can trigger pandas' SettingWithCopyWarning).
trainset = df[mask].reset_index(drop=True)
testset = df[~mask].reset_index(drop=True)

# The last column is used as the label, everything before it as features.
X_train = trainset.iloc[:, :-1].values
y_train = trainset.iloc[:, -1].values
X_test = testset.iloc[:, :-1].values
y_test = testset.iloc[:, -1].values

# Fit the scaler on the training split only, then apply it to the test split.
SC = StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

# Sharding data
forget_ratio = 0.05
# Divide X_train into 5 equal shards
num_shards = 5
shard_size = len(y_train) // num_shards
X_shards = []
y_shards = []
retain_sets_X = []
retain_sets_y = []
forget_sets_X = []
forget_sets_y = []

for i in range(num_shards):
    # Calculate indices for slicing; the last shard absorbs the remainder rows
    start_idx = i * shard_size
    end_idx = start_idx + shard_size if i < num_shards - 1 else len(y_train)

    # Slice the data to create shards
    X_shard = X_train[start_idx:end_idx]
    y_shard = y_train[start_idx:end_idx]

    X_shards.append(X_shard)
    y_shards.append(y_shard)

    # Shuffle indices for random sampling
    idxs = np.arange(len(y_shard))
    random.shuffle(idxs)
    m = int(len(y_shard) * forget_ratio)  # number of rows to forget in this shard

    # Split indices for retain and forget sets
    retain_idxs = idxs[m:]
    forget_idxs = idxs[:m]

    # Create retain and forget sets for the shard
    X_retain = X_shard[retain_idxs]
    y_retain = y_shard[retain_idxs]
    X_forget = X_shard[forget_idxs]
    y_forget = y_shard[forget_idxs]

    retain_sets_X.append(X_retain)
    retain_sets_y.append(y_retain)
    forget_sets_X.append(X_forget)
    forget_sets_y.append(y_forget)

# Report the class balance of the training split.
counter = Counter(y_train)
for k, v in counter.items():
    per = v / len(y_train) * 100
    print('Class=%s, Count=%d, Percentage=%.2f%%' % (k, v, per))

num_features = X_train.shape[-1]
num_classes = len(set(y_train))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `num_classes` is not an XGBoost parameter (the native name is `num_class`, and
# the scikit-learn wrapper sets it from the labels seen in fit), so it is no
# longer passed. XGBoost documents `device` as a string such as "cpu" or "cuda",
# so the torch.device's type string is passed rather than the torch object.
initial_model = XGBClassifier(reg_lambda=5, learning_rate=0.5, max_depth=7,
                              n_estimators=200, device=device.type)
n_repeat = 3

In [None]:
# Step 2: Define and train M on D
#
# For each of `n_repeat` repetitions: train one model per shard on the full
# shard, evaluate the majority-vote ensemble on the train and test sets, and
# run a membership-inference attack (MIA) that tries to tell forget samples
# apart from an equally sized random subset of the test set.
train_accs = []
test_accs = []
mia_aucs = []
mia_advs = []
runtimes = []
y_forget_ = np.concatenate(forget_sets_y)
for r in range(n_repeat):
    # Train a fresh copy of the base model on every shard and time the whole pass.
    models = []
    t0 = time.time()
    for i in range(num_shards):
        model = copy.deepcopy(initial_model)
        model.fit(X_shards[i], y_shards[i])
        models.append(model)
    t1 = time.time()
    rt = t1-t0
    runtimes.append(rt)

    # Evaluate the model accuracy, and MIA
    # Accuracy: every shard model votes, the most frequent label wins.
    all_train_preds = []
    all_test_preds = []
    for model in models:
        all_train_preds.append(model.predict(X_train))
        all_test_preds.append(model.predict(X_test))

    all_train_preds = np.array(all_train_preds)
    all_test_preds = np.array(all_test_preds)
    preds_train, _ = stats.mode(all_train_preds, axis = 0, keepdims = True)
    preds_test, _ = stats.mode(all_test_preds, axis = 0, keepdims = True)
    train_acc = metrics.accuracy_score(y_train, preds_train.squeeze())
    test_acc = metrics.accuracy_score(y_test, preds_test.squeeze())
    train_accs.append(100.0*train_acc)
    test_accs.append(100.0*test_acc)

    #MIA
    # Test samples are scored by the ensemble average; each shard's forget
    # samples are scored only by the model trained on that shard.
    all_test_preds = []
    all_forget_preds = []
    for i, model in enumerate(models):
        all_test_preds.append(model.predict_proba(X_test))
        all_forget_preds.append(model.predict_proba(forget_sets_X[i]))

    all_test_preds = np.array(all_test_preds)
    test_preds = np.mean(all_test_preds, axis=0)
    forget_preds = np.concatenate(all_forget_preds, axis=0)

    # Convert class indices to one-hot encoding
    y_test_one_hot = encoder.fit_transform(y_test.reshape(-1, 1))
    y_forget_one_hot = encoder.transform(y_forget_.reshape(-1, 1))

    # Per-sample cross-entropy, i.e. -log(probability assigned to the true class).
    # Vectorised instead of one sklearn log_loss call per row: that per-row call
    # only matched this quantity for two classes and was slow. Probabilities are
    # clipped away from 0 so the log stays finite.
    # Assumes the one-hot columns follow the same class order as predict_proba.
    eps = np.finfo(test_preds.dtype).eps
    loss_test = -np.sum(y_test_one_hot * np.log(np.clip(test_preds, eps, 1.0)), axis=1)
    loss_forget = -np.sum(y_forget_one_hot * np.log(np.clip(forget_preds, eps, 1.0)), axis=1)

    # Draw as many random test samples as there are forget samples so the
    # attack sees balanced member / non-member sets.
    idxs = np.arange(len(y_test))
    random.shuffle(idxs)
    m = len(forget_preds)
    rand_idxs = idxs[:m]
    attack_result = tf_attack(logits_train = forget_preds, logits_test = test_preds[rand_idxs], 
                              loss_train = loss_forget, loss_test = loss_test[rand_idxs], 
                              train_labels = y_forget_, test_labels = y_test[rand_idxs])

    auc = attack_result.get_result_with_max_auc().get_auc()
    adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
    mia_aucs.append(100.0*auc)
    mia_advs.append(100.0*adv)

# Aggregate every metric over the repetitions.
mean_runtime = np.mean(runtimes)
std_runtime = np.std(runtimes)
mean_train_acc = np.mean(train_accs)
std_train_acc = np.std(train_accs)
mean_test_acc = np.mean(test_accs)
std_test_acc = np.std(test_accs)
mean_mia_auc = np.mean(mia_aucs)
std_mia_auc = np.std(mia_aucs)
mean_mia_adv = np.mean(mia_advs)
std_mia_adv = np.std(mia_advs)

# Print the results
print('Training M on D time:{:0.2f}(±{:0.2f}) seconds'.format(mean_runtime, std_runtime))
print('Train accuracy:{:0.2f}(±{:0.2f})%'.format(mean_train_acc, std_train_acc))
print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_test_acc, std_test_acc))
print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_mia_auc, std_mia_auc))
print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_mia_adv, std_mia_adv))

# Save to CSV
csv_file_path = 'results/SISA/heart/xgb_shards={}_fr={}_base.csv'.format(num_shards, forget_ratio)
# Create the output directory if needed so a fresh checkout does not fail with
# FileNotFoundError after all the training work has been done.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
    writer.writerow(['Training Time', mean_runtime, std_runtime])
    writer.writerow(['Train Accuracy', mean_train_acc, std_train_acc])
    writer.writerow(['Test Accuracy', mean_test_acc, std_test_acc])
    writer.writerow(['MIA AUC', mean_mia_auc, std_mia_auc])
    writer.writerow(['MIA Advantage', mean_mia_adv, std_mia_adv])


In [None]:
# Step 3: Retrain the shard models on the retain sets only
#
# Same protocol as the base experiment, but every shard model is fitted on its
# retain set (shard minus forget samples). Accuracy is reported on the
# concatenated retain sets and on the test set; the MIA compares ensemble
# predictions on the forget samples with those on a random test subset.
retain_accs = []
test_accs = []
mia_aucs = []
mia_advs = []
runtimes = []
X_retain_ = np.concatenate(retain_sets_X)
y_retain_ = np.concatenate(retain_sets_y)
X_forget_ = np.concatenate(forget_sets_X)
# Defined here as well so this cell does not depend on the base-training cell
# having been run first.
y_forget_ = np.concatenate(forget_sets_y)
for r in range(n_repeat):
    # Retrain a fresh copy of the base model on every shard's retain set.
    models = []
    t0 = time.time()
    for i in range(num_shards):
        model = copy.deepcopy(initial_model)
        model.fit(retain_sets_X[i], retain_sets_y[i])
        models.append(model)
    t1 = time.time()
    rt = t1-t0
    runtimes.append(rt)

    # Evaluate the model accuracy, and MIA
    # Accuracy: every shard model votes, the most frequent label wins.
    all_retain_preds = []
    all_test_preds = []
    for model in models:
        all_retain_preds.append(model.predict(X_retain_))
        all_test_preds.append(model.predict(X_test))

    all_retain_preds = np.array(all_retain_preds)
    all_test_preds = np.array(all_test_preds)
    preds_retain, _ = stats.mode(all_retain_preds, axis = 0, keepdims = True)
    preds_test, _ = stats.mode(all_test_preds, axis = 0, keepdims = True)
    retain_acc = metrics.accuracy_score(y_retain_, preds_retain.squeeze())
    test_acc = metrics.accuracy_score(y_test, preds_test.squeeze())
    # Append to this cell's own list: appending to `train_accs` would mix in the
    # base-model accuracies left over from the previous cell.
    retain_accs.append(100.0*retain_acc)
    test_accs.append(100.0*test_acc)

    #MIA
    # Both test and forget samples are scored by the ensemble average here.
    all_test_preds = []
    all_forget_preds = []
    for model in models:
        all_test_preds.append(model.predict_proba(X_test))
        all_forget_preds.append(model.predict_proba(X_forget_))

    all_test_preds = np.array(all_test_preds)
    all_forget_preds = np.array(all_forget_preds)
    test_preds = np.mean(all_test_preds, axis=0)
    forget_preds = np.mean(all_forget_preds, axis=0)

    # Convert class indices to one-hot encoding
    y_test_one_hot = encoder.fit_transform(y_test.reshape(-1, 1))
    y_forget_one_hot = encoder.transform(y_forget_.reshape(-1, 1))

    # Per-sample cross-entropy, i.e. -log(probability assigned to the true class).
    # Vectorised instead of one sklearn log_loss call per row: that per-row call
    # only matched this quantity for two classes and was slow. Probabilities are
    # clipped away from 0 so the log stays finite.
    # Assumes the one-hot columns follow the same class order as predict_proba.
    eps = np.finfo(test_preds.dtype).eps
    loss_test = -np.sum(y_test_one_hot * np.log(np.clip(test_preds, eps, 1.0)), axis=1)
    loss_forget = -np.sum(y_forget_one_hot * np.log(np.clip(forget_preds, eps, 1.0)), axis=1)

    # Draw a fresh random test subset of the same size as the forget set instead
    # of reusing whatever `rand_idxs` another cell left behind.
    idxs = np.arange(len(y_test))
    random.shuffle(idxs)
    m = len(forget_preds)
    rand_idxs = idxs[:m]
    attack_result = tf_attack(logits_train = forget_preds, logits_test = test_preds[rand_idxs], 
                              loss_train = loss_forget, loss_test = loss_test[rand_idxs], 
                              train_labels = y_forget_, test_labels = y_test[rand_idxs])

    auc = attack_result.get_result_with_max_auc().get_auc()
    adv = attack_result.get_result_with_max_attacker_advantage().get_attacker_advantage()
    mia_aucs.append(100.0*auc)
    mia_advs.append(100.0*adv)

# Aggregate every metric over the repetitions. The `*_train_acc` names are kept
# for compatibility, but they now summarise the retain accuracies only.
mean_runtime = np.mean(runtimes)
std_runtime = np.std(runtimes)
mean_train_acc = np.mean(retain_accs)
std_train_acc = np.std(retain_accs)
mean_test_acc = np.mean(test_accs)
std_test_acc = np.std(test_accs)
mean_mia_auc = np.mean(mia_aucs)
std_mia_auc = np.std(mia_aucs)
mean_mia_adv = np.mean(mia_advs)
std_mia_adv = np.std(mia_advs)

# Print the results
print('Retraining time:{:0.2f}(±{:0.2f}) seconds'.format(mean_runtime, std_runtime))
print('Retain accuracy:{:0.2f}(±{:0.2f})%'.format(mean_train_acc, std_train_acc))
print('Test accuracy:{:0.2f}(±{:0.2f})%'.format(mean_test_acc, std_test_acc))
print('MIA AUC:{:0.2f}(±{:0.2f})%'.format(mean_mia_auc, std_mia_auc))
print('MIA Advantage:{:0.2f}(±{:0.2f})%'.format(mean_mia_adv, std_mia_adv))

# Save to CSV
csv_file_path = 'results/SISA/heart/xgb_shards={}_fr={}_retrain.csv'.format(num_shards, forget_ratio)
# Create the output directory if needed so a fresh checkout does not fail with
# FileNotFoundError after all the training work has been done.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Metric', 'Mean', 'Standard Deviation'])
    writer.writerow(['Retraining Time', mean_runtime, std_runtime])
    writer.writerow(['Retain Accuracy', mean_train_acc, std_train_acc])
    writer.writerow(['Test Accuracy', mean_test_acc, std_test_acc])
    writer.writerow(['MIA AUC', mean_mia_auc, std_mia_auc])
    writer.writerow(['MIA Advantage', mean_mia_adv, std_mia_adv])
