In [None]:
# ==================== INSTALL DEPENDENCIES ====================
# Install opacus (differentially private training for PyTorch) into the
# environment of the running kernel.
import subprocess
import sys

# pip is invoked through the kernel's own interpreter so the package lands in
# the environment this notebook imports from. Running it via subprocess (rather
# than a shell escape) exposes the exit status, so a failed install is reported
# as a failure instead of being followed by a success message.
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "--no-cache-dir", "opacus"],
    capture_output=True,
    text=True,
)
print(pip_result.stdout)

if pip_result.returncode != 0:
    # Show pip's own diagnostics before stopping the notebook.
    print(pip_result.stderr, file=sys.stderr)
    raise RuntimeError(
        f"pip install opacus failed with exit code {pip_result.returncode}"
    )

print("✓ Dependencies installed successfully!")

In [None]:
# ==================== IMPORTS ====================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import kagglehub
import os
import json
from datetime import datetime
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from opacus import PrivacyEngine
import glob

print("✓ All libraries imported successfully!")

# ---- Experiment configuration -------------------------------------------
# Cross-validation protocol: one StratifiedKFold split per seed.
RANDOM_SEEDS = [42, 123, 456, 789, 1011]
N_SPLITS = 5

# Training budget per fold.
DP_EPOCHS = 50
BATCH_SIZE = 64

# Privacy budgets that are swept, plus the DP-SGD hyper-parameters.
EPSILON_VALUES = [0.5, 1.0, 3.0, 5.0, 10.0]
DP_NOISE_MULTIPLIER = 1.0
DP_MAX_GRAD_NORM = 1.0
DP_DELTA = 1e-5

# Echo the protocol so the run log records what was executed.
_evaluations_per_config = len(RANDOM_SEEDS) * N_SPLITS
print("\n".join([
    "\nConfiguration:",
    f"  Random seeds: {RANDOM_SEEDS}",
    f"  K-fold splits: {N_SPLITS}",
    f"  Privacy budgets (ε): {EPSILON_VALUES}",
    f"  Total evaluations per config: {len(RANDOM_SEEDS)} × {N_SPLITS} = {_evaluations_per_config}",
]))

In [None]:
# ==================== LOAD DATASETS ====================
print("\n" + "="*80)
print("LOADING DATASETS FROM KAGGLE")
print("="*80)

# On Kaggle, attached datasets are mounted under /kaggle/input/<dataset>/.
# The matches are sorted so the same file is chosen on every run when more
# than one attached dataset contains a file with the expected name.
diabetes_paths = sorted(glob.glob('/kaggle/input/*/diabetes_binary_health_indicators_BRFSS2015.csv'))
adult_paths = sorted(glob.glob('/kaggle/input/*/adult.csv'))

if diabetes_paths and adult_paths:
    # Both files are mounted: read them in place.
    diabetes_csv = diabetes_paths[0]
    adult_csv = adult_paths[0]
    print("✓ Using Kaggle native dataset paths")
else:
    # At least one file is missing (e.g. local execution): download both
    # datasets through kagglehub instead.
    print("✓ Using kagglehub for dataset download")

    diabetes_path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
    diabetes_csv = os.path.join(diabetes_path, "diabetes_binary_health_indicators_BRFSS2015.csv")

    adult_path = kagglehub.dataset_download("uciml/adult-census-income")
    adult_csv = os.path.join(adult_path, "adult.csv")

# Load datasets
df_diabetes = pd.read_csv(diabetes_csv)
print(f"✓ Diabetes dataset loaded: {df_diabetes.shape}")

df_adult = pd.read_csv(adult_csv)
print(f"✓ Adult dataset loaded: {df_adult.shape}")

In [None]:
# ==================== PREPROCESS DATA ====================
print("\n" + "="*80)
print("PREPROCESSING DATA")
print("="*80)

# Diabetes: every column except the target is a feature. The target is cast to
# int so both datasets expose integer class labels to the training code.
X_diabetes = df_diabetes.drop(columns=['Diabetes_binary']).values
y_diabetes = df_diabetes['Diabetes_binary'].astype(int).values
print(f"✓ Diabetes - Features: {X_diabetes.shape}, Target: {y_diabetes.shape}")

# Adult: binary target, 1 when income is '>50K'.
X_adult_df = df_adult.drop(columns=['income'])

# Normalise the label text before comparing: surrounding whitespace or a
# trailing period would otherwise make the comparison fail for every row and
# silently produce an all-zero target.
income_labels = df_adult['income'].astype(str).str.strip().str.rstrip('.')
y_adult = (income_labels == '>50K').astype(int).values

if np.unique(y_adult).size < 2:
    raise ValueError(
        "Adult target contains a single class after encoding; "
        f"observed income labels: {sorted(income_labels.unique())}"
    )

# Encode categorical features: every non-numeric column is mapped to integer
# codes, one LabelEncoder per column.
categorical_cols = X_adult_df.select_dtypes(exclude=['number']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X_adult_df[col] = le.fit_transform(X_adult_df[col].astype(str))

# Convert to numpy array
X_adult = X_adult_df.values
print(f"✓ Adult - Features: {X_adult.shape}, Target: {y_adult.shape}")

# Registries iterated by the training, analysis and plotting cells.
DATASETS = {'diabetes': (X_diabetes, y_diabetes), 'adult': (X_adult, y_adult)}
MODEL_TYPES = ['LR', 'FNN']

In [None]:
# ==================== MODEL ARCHITECTURES ====================

class LogisticRegressionModel(nn.Module):
    """Logistic regression expressed as a single linear layer.

    The forward pass returns the raw output of the linear layer (one score per
    class); no softmax or sigmoid is applied inside the module.

    Args:
        input_size: Number of input features.
        output_size: Number of output scores (defaults to 2).
    """

    def __init__(self, input_size, output_size=2):
        nn.Module.__init__(self)
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the linear layer's output for the batch ``x``."""
        logits = self.linear(x)
        return logits

class FeedforwardNN(nn.Module):
    """Fully connected classifier built from Linear -> ReLU -> Dropout stages.

    One stage is created per entry of ``hidden_sizes``; a final linear layer
    maps the last hidden width to ``output_size``. The forward pass returns the
    raw output of that final layer.

    Args:
        input_size: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer. The default
            is an immutable tuple so it cannot be altered between
            instantiations; lists are still accepted. An empty iterable yields
            a single linear layer.
        output_size: Number of output scores (defaults to 2).
        dropout_rate: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_size, hidden_sizes=(128, 64), output_size=2, dropout_rate=0.3):
        super().__init__()
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_size = hidden_size

        # Output head: no activation, the caller's loss consumes raw scores.
        layers.append(nn.Linear(prev_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run the batch ``x`` through the stacked layers."""
        return self.network(x)

def evaluate_model(model, X, y):
    """Score ``model`` on the feature matrix ``X`` against the labels ``y``.

    The model is moved to CUDA when it is available (CPU otherwise) and put in
    eval mode. Predictions are the index of the largest output per row.

    Returns:
        A ``(accuracy, f1)`` tuple, where ``f1`` is the weighted-average F1
        score computed with ``zero_division=0``.
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(target_device)
    model.eval()

    features = torch.FloatTensor(X).to(target_device)

    # Inference only: gradients are not needed for scoring.
    with torch.no_grad():
        scores = model(features)
        best = torch.max(scores, 1)

    # Bring the predicted class indices back to the host once and reuse them.
    predicted_labels = best.indices.cpu().numpy()

    accuracy = accuracy_score(y, predicted_labels)
    f1 = f1_score(y, predicted_labels, average='weighted', zero_division=0)
    return accuracy, f1

print("✓ Model architectures defined")

In [None]:
# ==================== DIFFERENTIAL PRIVACY TRAINING ====================
# For every dataset x model x target ε, train one DP model per CV fold and per
# seed, then aggregate accuracy / F1 / spent ε over all folds and seeds.
#
# The noise level is calibrated by Opacus from the target ε
# (make_private_with_epsilon), so the privacy spent after DP_EPOCHS epochs
# matches the budget the results are filed under. Training with a fixed noise
# multiplier and stopping once ε >= target would overshoot small budgets and
# never reach large ones. DP_NOISE_MULTIPLIER is therefore not used here.
print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY WITH CROSS-VALIDATION")
print("="*80)


def _resolve_models_dir(subdir="models_research_dp"):
    """Create and return the directory that receives this notebook's outputs.

    The first existing base directory wins: the Kaggle working directory, then
    the local backend folder used by the original setup. If neither exists the
    current working directory is used, so nothing is created at an unrelated
    location.
    """
    candidate_bases = (
        "/kaggle/working",
        r"c:\Users\almir\ai-privacy\backend",
    )
    base_dir = next((base for base in candidate_bases if os.path.isdir(base)), os.getcwd())
    resolved = os.path.join(base_dir, subdir)
    os.makedirs(resolved, exist_ok=True)
    return resolved


def _train_and_evaluate_dp_fold(X_train, y_train, X_val, y_val, model_type, target_epsilon, torch_seed):
    """Train one DP model on a single fold and score it on the held-out split.

    Args:
        X_train, y_train: Training features / labels of the fold.
        X_val, y_val: Held-out features / labels of the fold.
        model_type: 'LR' for LogisticRegressionModel, anything else for FeedforwardNN.
        target_epsilon: Privacy budget the noise is calibrated for.
        torch_seed: Seed for torch's global RNG (weight init, batch sampling, DP noise).

    Returns:
        ``(accuracy, f1, spent_epsilon)`` where ``spent_epsilon`` is the value
        reported by the privacy accountant at ``DP_DELTA`` after training.
    """
    # The CV seed only fixes the fold assignment; torch must be seeded too,
    # otherwise the run is not reproducible.
    torch.manual_seed(torch_seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Scale features using statistics of the training split only.
    # NOTE: these statistics are computed without noise, so they are not
    # covered by the (ε, δ) accounting below.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    train_dataset = TensorDataset(
        torch.FloatTensor(X_train_scaled),
        torch.LongTensor(y_train)
    )
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    input_size = X_train_scaled.shape[1]
    if model_type == 'LR':
        model = LogisticRegressionModel(input_size, output_size=2)
    else:
        model = FeedforwardNN(input_size, hidden_sizes=[128, 64], output_size=2)

    # Move the model before building the optimizer so the optimizer tracks the
    # parameters on the device that is used for training.
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()

    # Attach DP: Opacus picks the noise multiplier such that DP_EPOCHS epochs
    # spend (approximately, and not more than) target_epsilon at DP_DELTA.
    privacy_engine = PrivacyEngine()
    dp_model, optimizer, train_loader_dp = privacy_engine.make_private_with_epsilon(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        target_epsilon=target_epsilon,
        target_delta=DP_DELTA,
        epochs=DP_EPOCHS,
        max_grad_norm=DP_MAX_GRAD_NORM,
    )

    for _ in range(DP_EPOCHS):
        dp_model.train()
        for batch_x, batch_y in train_loader_dp:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            loss = loss_fn(dp_model(batch_x), batch_y)
            loss.backward()
            optimizer.step()

    spent_epsilon = privacy_engine.get_epsilon(DP_DELTA)
    accuracy, f1 = evaluate_model(dp_model, X_val_scaled, y_val)
    return float(accuracy), float(f1), float(spent_epsilon)


# Create models directory
models_dir = _resolve_models_dir()

print(f"\nResults will be saved to: {models_dir}")
print(f"Total configurations: {len(DATASETS)} datasets × {len(MODEL_TYPES)} models × {len(EPSILON_VALUES)} ε values = {len(DATASETS) * len(MODEL_TYPES) * len(EPSILON_VALUES)}")

dp_results = {}

for dataset_name, (X_data, y_data) in DATASETS.items():
    print("\n" + "-"*80)
    print(f"Dataset: {dataset_name.upper()}")
    print("-"*80)

    for model_type in MODEL_TYPES:
        print(f"\n  Model: {model_type}")

        for target_epsilon in EPSILON_VALUES:
            print(f"\n    Target ε: {target_epsilon}")

            all_accuracies = []
            all_f1s = []
            all_epsilons = []

            for run_idx, seed in enumerate(RANDOM_SEEDS):
                skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

                for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_data, y_data)):
                    accuracy, f1, current_epsilon = _train_and_evaluate_dp_fold(
                        X_data[train_idx], y_data[train_idx],
                        X_data[val_idx], y_data[val_idx],
                        model_type,
                        target_epsilon,
                        # Distinct per (seed, fold) because fold_idx < N_SPLITS.
                        torch_seed=seed * N_SPLITS + fold_idx,
                    )
                    all_accuracies.append(accuracy)
                    all_f1s.append(f1)
                    all_epsilons.append(current_epsilon)

                    # Only the last fold of each run is logged to keep output short.
                    if fold_idx == N_SPLITS - 1:
                        print(f"      Run {run_idx + 1}, Fold {fold_idx + 1}: Acc={accuracy:.4f}, ε={current_epsilon:.3f}")

            # Statistics over all folds of all runs (sample std, ddof=1).
            acc_mean = float(np.mean(all_accuracies))
            acc_std = float(np.std(all_accuracies, ddof=1))
            acc_min = float(np.min(all_accuracies))
            acc_max = float(np.max(all_accuracies))
            f1_mean = float(np.mean(all_f1s))
            f1_std = float(np.std(all_f1s, ddof=1))
            eps_mean = float(np.mean(all_epsilons))

            config_key = f"{dataset_name}_{model_type}_DP_eps{target_epsilon}"
            dp_results[config_key] = {
                'dataset': dataset_name,
                'model': model_type,
                'target_epsilon': target_epsilon,
                'actual_epsilon': eps_mean,
                'accuracy': {'mean': acc_mean, 'std': acc_std, 'min': acc_min, 'max': acc_max},
                'f1': {'mean': f1_mean, 'std': f1_std},
                'all_accuracies': all_accuracies,
                'all_f1s': all_f1s
            }

            print(f"\n    ✓ DP ε={target_epsilon} Results:")
            print(f"      Accuracy: {acc_mean*100:.2f}% ± {acc_std*100:.2f}% (range: {acc_min*100:.2f}% - {acc_max*100:.2f}%)")
            print(f"      F1-Score: {f1_mean*100:.2f}% ± {f1_std*100:.2f}%")
            print(f"      Actual ε: {eps_mean:.3f}")

print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY PHASE COMPLETE")
print("="*80)

In [None]:
# ==================== STATISTICAL ANALYSIS & COMPARISON ====================
print("\n" + "="*80)
print("STATISTICAL ANALYSIS")
print("="*80)

# p-value threshold used for the 'Significant' column.
SIGNIFICANCE_LEVEL = 0.05

# Load baseline results from the first location that exists. Checking for the
# file explicitly (instead of a catch-all except) means a corrupt JSON file or
# an interrupt is reported as such rather than triggering the fallback path.
baseline_candidates = [
    "/kaggle/input/ai-privacy-baseline-results/research_results.json",
    os.path.join(r"c:\Users\almir\ai-privacy\backend", "models_research", "research_results.json"),
]
baseline_path = next((path for path in baseline_candidates if os.path.isfile(path)), None)
if baseline_path is None:
    raise FileNotFoundError(
        "Baseline results not found. Looked in:\n  " + "\n  ".join(baseline_candidates)
    )

with open(baseline_path, 'r') as f:
    baseline_data = json.load(f)

# Flatten the nested baseline JSON to '<dataset>_<model>' keys, covering the
# same datasets and models that the DP experiment iterates over.
baseline_results = {}
for dataset in DATASETS:
    for model in MODEL_TYPES:
        baseline_entry = baseline_data['baseline_results'][dataset][model]
        baseline_results[f"{dataset}_{model}"] = {
            'accuracy': baseline_entry['accuracy']['mean'],
            'all_accuracies': baseline_entry['all_accuracies']
        }

print("\n✓ Baseline results loaded")

# DP vs Baseline comparisons
print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY vs BASELINE - Statistical Tests")
print("="*80)

dp_comparison = []
for config_key, dp_data in dp_results.items():
    baseline_key = f"{dp_data['dataset']}_{dp_data['model']}"
    baseline_acc = baseline_results[baseline_key]['accuracy']
    baseline_all = baseline_results[baseline_key]['all_accuracies']

    dp_acc = dp_data['accuracy']['mean']
    dp_all = dp_data['all_accuracies']
    accuracy_loss = baseline_acc - dp_acc

    # Welch's t-test: the DP and baseline accuracy samples are not assumed to
    # share a variance, which the default (pooled-variance) test would require.
    # The p-values are not corrected for the number of configurations tested.
    t_stat, p_value = stats.ttest_ind(baseline_all, dp_all, equal_var=False)

    dp_comparison.append({
        'Dataset': dp_data['dataset'],
        'Model': dp_data['model'],
        'Epsilon': dp_data['target_epsilon'],
        'DP_Accuracy': dp_acc * 100,
        'DP_Std': dp_data['accuracy']['std'] * 100,
        'Baseline': baseline_acc * 100,
        'Accuracy_Loss': accuracy_loss * 100,
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'Significant': 'Yes' if p_value < SIGNIFICANCE_LEVEL else 'No'
    })

dp_comparison_df = pd.DataFrame(dp_comparison)
print("\n" + dp_comparison_df.to_string(index=False))

In [None]:
# ==================== SAVE RESULTS ====================
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)


def _json_default(obj):
    """Convert NumPy values that ``json`` cannot encode natively.

    NumPy scalars become the matching Python scalar and arrays become lists.
    Anything else raises ``TypeError``, as the ``json`` module expects from a
    ``default`` hook. Returning the object unchanged instead would surface as
    a misleading "Circular reference detected" error.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save comprehensive JSON: run metadata, per-configuration DP results and the
# baseline numbers they were compared against.
results_json = {
    'metadata': {
        'timestamp': datetime.now().isoformat(),
        'random_seeds': RANDOM_SEEDS,
        'n_splits': N_SPLITS,
        'total_evaluations': len(RANDOM_SEEDS) * N_SPLITS,
        'epsilon_values': EPSILON_VALUES,
        'dp_epochs': DP_EPOCHS,
        # Recorded so the privacy guarantee can be reconstructed from the file.
        'dp_delta': DP_DELTA,
        'dp_max_grad_norm': DP_MAX_GRAD_NORM,
        'batch_size': BATCH_SIZE
    },
    'differential_privacy': dp_results,
    'baseline_reference': baseline_results
}

json_path = os.path.join(models_dir, 'dp_research_results.json')
with open(json_path, 'w') as f:
    json.dump(results_json, f, indent=2, default=_json_default)
print("✓ Saved: dp_research_results.json")

# Save comparison CSV
dp_csv_path = os.path.join(models_dir, 'dp_vs_baseline.csv')
dp_comparison_df.to_csv(dp_csv_path, index=False)
print("✓ Saved: dp_vs_baseline.csv")

print("\n" + "="*80)
print("ALL RESULTS SAVED")
print("="*80)

In [None]:
# ==================== VISUALIZATIONS ====================
print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# DP Privacy-Accuracy Tradeoff: one panel per dataset, one curve per model.
# Datasets, models and the protocol named in the title come from the shared
# configuration so the figure cannot drift from what was actually run.
dataset_names = list(DATASETS)

# squeeze=False keeps `axes` two-dimensional even with a single dataset.
fig, axes = plt.subplots(1, len(dataset_names), figsize=(8 * len(dataset_names), 6), squeeze=False)
fig.suptitle(
    f'Differential Privacy: Privacy-Accuracy Tradeoff - {N_SPLITS}-Fold CV × {len(RANDOM_SEEDS)} Runs',
    fontsize=16, fontweight='bold'
)

for ax, dataset in zip(axes[0], dataset_names):
    curve_colors = {}

    for model in MODEL_TYPES:
        subset = dp_comparison_df[
            (dp_comparison_df['Dataset'] == dataset) & (dp_comparison_df['Model'] == model)
        ].sort_values('Epsilon')  # connect the points in order of increasing ε

        curve = ax.errorbar(subset['Epsilon'], subset['DP_Accuracy'], yerr=subset['DP_Std'],
                            marker='o', capsize=5, label=model, linewidth=2, markersize=8)
        # Remember the curve colour so the matching baseline can reuse it.
        curve_colors[model] = curve.lines[0].get_color()

    # Baseline lines, drawn after the curves so the legend lists curves first.
    # Each baseline shares the colour of its model's DP curve.
    for model in MODEL_TYPES:
        baseline_pct = baseline_results[f"{dataset}_{model}"]['accuracy'] * 100
        ax.axhline(y=baseline_pct, color=curve_colors[model], linestyle='--', alpha=0.5,
                   label=f'{model} Baseline')

    ax.set_xlabel('Privacy Budget (ε)', fontsize=12)
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title(f'{dataset.upper()}', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xscale('log')

plt.tight_layout()
dp_viz_path = os.path.join(models_dir, 'dp_privacy_accuracy_tradeoff.png')
plt.savefig(dp_viz_path, dpi=300, bbox_inches='tight')
print("✓ Saved: dp_privacy_accuracy_tradeoff.png")
plt.show()

print("\n" + "="*80)
print("VISUALIZATIONS COMPLETE")
print("="*80)

## Summary

This notebook has completed:

1. **Differential Privacy**: 5-fold CV × 5 runs for 5 epsilon values × 2 datasets × 2 models = 20 configurations (500 total evaluations)
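2. **Statistical Analysis**: Two-sample t-tests comparing the per-fold accuracies of each DP configuration against the matching non-private baseline, reported with t-statistics and p-values (significance threshold: p < 0.05)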
3. **Results Export**: JSON and CSV files with comprehensive statistics
4. **Visualizations**: Privacy-accuracy tradeoff charts showing mean ± std

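All results are reported as mean ± standard deviation over 25 evaluations per configuration and are intended to be publication-ready. Interpret significance with care: the p-values are not corrected for the 20 comparisons, and accuracies from overlapping cross-validation folds are not fully independent.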