In [8]:
import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so the "ignore" action applies to all of them).
# NOTE(review): this presumably also hides any privacy-related warnings the
# DP library may emit during training -- confirm that this is intended.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from diffprivlib.models import RandomForestClassifier as DPRandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import time
import math
from ortools.sat.python import cp_model
from scipy.optimize import linear_sum_assignment

def load_data(file_path, n_samples=100):
    """Load a CSV dataset, one-hot encode its features and split it.

    Args:
        file_path: Path of the CSV file. It must contain a ``class`` column
            whose values can be cast to ``int``.
        n_samples: Maximum number of training rows (and labels) to return.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test, encoder, feature_names)``.
        The training arrays are the first ``n_samples`` rows of the 80 %
        training split, the test arrays are the full 20 % split, ``encoder``
        is the fitted ``OneHotEncoder`` and ``feature_names`` are the names of
        the encoded columns.
    """
    df = pd.read_csv(file_path)
    df['class'] = df['class'].astype(int)

    # Pick the features by name instead of by position: with a positional
    # slice the label would be one-hot encoded into X (and a real feature
    # dropped) whenever 'class' is not the last column of the file.
    categorical_cols = df.columns.drop('class')

    # Every feature column is treated as categorical; the encoder is fitted on
    # the whole dataset before the train/test split.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(df[categorical_cols])
    feature_names = encoder.get_feature_names_out(categorical_cols)

    y = df['class'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42
    )

    return X_train[:n_samples], y_train[:n_samples], X_test, y_test, encoder, feature_names

def train_dp_rf(X, y, n_estimators=10, max_depth=5, epsilon=1.0):
    """Fit a differentially private random forest and report the wall time.

    Args:
        X: Training feature matrix passed to ``fit``.
        y: Training labels passed to ``fit``.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        epsilon: Privacy budget handed to the classifier.

    Returns:
        The fitted ``DPRandomForestClassifier`` (seeded with
        ``random_state=42``).
    """
    # The clock starts before construction so the printed duration covers both
    # building and fitting the classifier.
    started_at = time.time()
    forest = DPRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        epsilon=epsilon,
        random_state=42,
    )
    forest.fit(X, y)
    elapsed = time.time() - started_at
    print(f"Training time: {elapsed:.2f} seconds")
    return forest

def extract_tree_info(dp_rf, n_classes):
    """Collect the per-leaf value vectors of every tree in a fitted forest.

    Args:
        dp_rf: Fitted ensemble exposing ``estimators_``; each estimator must
            provide a ``tree_`` with ``feature`` and ``value`` arrays.
        n_classes: Accepted for interface compatibility; not used here.

    Returns:
        One dict per estimator with the keys ``'tree'`` (the estimator's
        ``tree_``) and ``'noisy_counts'`` (a list holding ``value[leaf][0]``
        for each leaf, in increasing node-index order).
    """
    def _leaf_values(fitted_tree):
        # Nodes whose split feature is -2 are treated as leaves.
        leaf_nodes = np.flatnonzero(fitted_tree.feature == -2)
        return [fitted_tree.value[node][0] for node in leaf_nodes]

    return [
        {'tree': estimator.tree_, 'noisy_counts': _leaf_values(estimator.tree_)}
        for estimator in dp_rf.estimators_
    ]

class ReconstructionAttack:
    """CP-SAT based attempt to rebuild a training set from a DP random forest.

    NOTE(review): as written, the solver model only forces every sample to
    carry exactly one class label. ``trees_info`` and ``delta`` are stored but
    never turned into constraints, so the solution found does not depend on
    the forest or on epsilon.
    """

    def __init__(self, dp_rf, X_shape, n_classes, epsilon):
        """Store the attack parameters and pre-extract the forest's leaf data.

        Args:
            dp_rf: Fitted forest to attack.
            X_shape: ``(n_samples, n_features)`` of the dataset to rebuild.
            n_classes: Number of distinct labels.
            epsilon: Privacy budget the forest was trained with; must be
                non-zero because ``delta`` is derived as ``int(12 / epsilon)``.
        """
        self.dp_rf = dp_rf
        self.n_samples, self.n_features = X_shape
        self.n_classes = n_classes
        self.epsilon = epsilon
        # NOTE(review): presumably a bound on the leaf-count noise -- confirm.
        self.delta = int(12 / epsilon)
        self.trees_info = extract_tree_info(dp_rf, n_classes)

    def reconstruct(self, time_limit_seconds=7200):
        """Solve the CP-SAT model and read back a candidate dataset.

        Args:
            time_limit_seconds: Wall-clock limit given to the solver.

        Returns:
            ``(X_reconstructed, y_reconstructed)`` as NumPy arrays when the
            solver reports OPTIMAL or FEASIBLE, otherwise ``(None, None)``.
        """
        print("Starting reconstruction attack...")
        sample_ids = range(self.n_samples)
        feature_ids = range(self.n_features)
        class_ids = range(self.n_classes)

        model = cp_model.CpModel()

        # One boolean per (sample, feature) cell of the dataset.
        X_vars = {}
        for k in sample_ids:
            for i in feature_ids:
                X_vars[k, i] = model.NewBoolVar(f'X_{k}_{i}')

        # One boolean per (sample, class) pair: one-hot label indicator.
        Z_vars = {}
        for k in sample_ids:
            for c in class_ids:
                Z_vars[k, c] = model.NewBoolVar(f'Z_{k}_{c}')

        # Each sample gets exactly one label.
        for k in sample_ids:
            model.Add(sum(Z_vars[k, c] for c in class_ids) == 1)

        solver = cp_model.CpSolver()
        solver.parameters.max_time_in_seconds = time_limit_seconds
        print(f"Solving model with time limit: {time_limit_seconds} seconds...")
        status = solver.Solve(model)

        if status not in (cp_model.OPTIMAL, cp_model.FEASIBLE):
            print(f"Failed to reconstruct. Status: {solver.StatusName(status)}")
            return None, None

        print("Reconstruction completed successfully!")
        feature_rows = [
            [solver.Value(X_vars[k, i]) for i in feature_ids]
            for k in sample_ids
        ]
        labels = []
        for k in sample_ids:
            # The label is the first class whose indicator is set.
            labels.append(next(c for c in class_ids if solver.Value(Z_vars[k, c])))

        X_reconstructed = np.array(feature_rows)
        y_reconstructed = np.array(labels)
        return X_reconstructed, y_reconstructed

def evaluate_reconstruction(X_original, y_original, X_reconstructed, y_reconstructed):
    """Score a reconstruction against the original data, ignoring row order.

    Original and reconstructed rows are paired by a minimum-cost assignment
    on their Manhattan (L1) distance, so any permutation of the reconstructed
    rows yields the same score. The two sets may differ in row count; in that
    case only ``min(n_original, n_reconstructed)`` pairs are matched and
    scored.

    Args:
        X_original: 2-D array of original feature rows.
        y_original: 1-D array of original labels, aligned with ``X_original``.
        X_reconstructed: 2-D array of reconstructed rows with the same number
            of columns as ``X_original``.
        y_reconstructed: 1-D array of reconstructed labels, aligned with
            ``X_reconstructed``.

    Returns:
        A dict with ``'feature_error'`` (mean matched L1 distance divided by
        the number of features), ``'feature_accuracy'`` (``1 - feature_error``),
        ``'label_error'`` (fraction of matched pairs whose labels differ),
        ``'label_accuracy'`` (``1 - label_error``) and ``'matched_indices'``
        (the ``(row_ind, col_ind)`` arrays of the assignment).
    """
    # Local import: the file only imports scipy.optimize at the top.
    from scipy.spatial.distance import cdist

    # Pairwise L1 distances computed in native code instead of an
    # n_original x n_reconstructed Python loop.
    distance_matrix = cdist(X_original, X_reconstructed, metric='cityblock')

    row_ind, col_ind = linear_sum_assignment(distance_matrix)
    feature_error = np.mean(distance_matrix[row_ind, col_ind]) / X_original.shape[1]
    label_error = np.mean(y_original[row_ind] != y_reconstructed[col_ind])

    return {
        'feature_error': feature_error,
        'feature_accuracy': 1 - feature_error,
        'label_error': label_error,
        'label_accuracy': 1 - label_error,
        'matched_indices': (row_ind, col_ind)
    }

def main():
    """Run the reconstruction experiment for a fixed list of epsilon values.

    Loads ``credit_customers.csv`` (at most 100 training rows), then for each
    epsilon trains a DP random forest, runs the reconstruction attack and
    scores the result against the training data.

    Returns:
        A dict keyed by epsilon. Each entry holds ``'feature_accuracy'``,
        ``'label_accuracy'``, ``'X_reconstructed'``, ``'y_reconstructed'`` and
        ``'matched_indices'``. Epsilons whose attack failed have no entry.
    """
    print("Loading dataset...")
    # Only the training split is used below; the rest of the tuple is ignored.
    X_train, y_train, _X_test, _y_test, _encoder, _feature_names = load_data(
        'credit_customers.csv', n_samples=100
    )

    n_estimators = 10
    max_depth = 5
    time_limit = 1800
    n_classes = len(np.unique(y_train))

    results = {}
    for eps in [0.1, 1.0, 5.0, 10.0, 20.0]:
        print(f"\n{'='*50}")
        print(f"Training DP-RF with epsilon = {eps}")
        print(f"{'='*50}")

        dp_rf = train_dp_rf(
            X_train, y_train,
            n_estimators=n_estimators, max_depth=max_depth, epsilon=eps,
        )
        attack = ReconstructionAttack(dp_rf, X_train.shape, n_classes, eps)
        X_reconstructed, y_reconstructed = attack.reconstruct(time_limit_seconds=time_limit)

        if X_reconstructed is None:
            print("Attack failed.")
            continue

        metrics = evaluate_reconstruction(X_train, y_train, X_reconstructed, y_reconstructed)
        entry = {name: metrics[name] for name in ('feature_accuracy', 'label_accuracy')}
        entry['X_reconstructed'] = X_reconstructed
        entry['y_reconstructed'] = y_reconstructed
        entry['matched_indices'] = metrics['matched_indices']
        results[eps] = entry

        print(f"Feature Accuracy: {metrics['feature_accuracy']:.4f}, Label Accuracy: {metrics['label_accuracy']:.4f}")

    return results

# Script entry point: run the whole experiment and keep the per-epsilon results
# in a module-level name so they stay available after the run.
if __name__ == "__main__":
    reconstructed_results = main()


Loading dataset...

Training DP-RF with epsilon = 0.1
Training time: 0.63 seconds
Starting reconstruction attack...
Solving model with time limit: 1800 seconds...
Reconstruction completed successfully!
Feature Accuracy: 0.9814, Label Accuracy: 0.6800

Training DP-RF with epsilon = 1.0
Training time: 0.61 seconds
Starting reconstruction attack...
Solving model with time limit: 1800 seconds...
Reconstruction completed successfully!
Feature Accuracy: 0.9814, Label Accuracy: 0.6800

Training DP-RF with epsilon = 5.0
Training time: 0.61 seconds
Starting reconstruction attack...
Solving model with time limit: 1800 seconds...
Reconstruction completed successfully!
Feature Accuracy: 0.9814, Label Accuracy: 0.6800

Training DP-RF with epsilon = 10.0
Training time: 0.61 seconds
Starting reconstruction attack...
Solving model with time limit: 1800 seconds...
Reconstruction completed successfully!
Feature Accuracy: 0.9814, Label Accuracy: 0.6800

Training DP-RF with epsilon = 20.0
