In [None]:
# Install Opacus into the running kernel's environment; the imports cell below pulls PrivacyEngine from it.
# NOTE(review): the version is unpinned, so results may differ between runs as Opacus releases change — consider pinning once a known-good version is confirmed.
%pip install opacus

In [None]:
# ==================== IMPORTS ====================
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import kagglehub
import os
import json
from datetime import datetime
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from opacus import PrivacyEngine
import glob
import warnings

# Silence UserWarnings raised from the opacus package only; warnings from other
# modules are left visible.
warnings.filterwarnings('ignore', category=UserWarning, module='opacus')

print("✓ All libraries imported successfully!")

# ---- Experiment configuration: later cells read these module-level names ----
RANDOM_SEEDS = [42, 123, 456, 789, 1011]     # one shuffled StratifiedKFold per seed
N_SPLITS = 5                                 # folds per seed
DP_EPOCHS = 50                               # upper bound on training epochs per fold
BATCH_SIZE = 64
EPSILON_VALUES = [0.5, 1.0, 3.0, 5.0, 10.0]  # target privacy budgets
DP_NOISE_MULTIPLIER = 1.0
DP_MAX_GRAD_NORM = 1.0                       # per-sample gradient clipping norm
DP_DELTA = 1e-5

# Derive the printed totals from the constants above so the banner cannot drift
# out of sync when a list is edited. The model count (LR and FNN) is written out
# because the model list itself is declared in the training cell.
evals_per_config = len(RANDOM_SEEDS) * N_SPLITS
n_epsilons = len(EPSILON_VALUES)

print("\nConfiguration:")
print("  Dataset: ADULT CENSUS INCOME")
print(f"  Random seeds: {RANDOM_SEEDS}")
print(f"  Total evaluations per config: {len(RANDOM_SEEDS)} × {N_SPLITS} = {evals_per_config}")
print(f"  K-fold splits: {N_SPLITS}")
print(f"  Privacy budgets (ε): {EPSILON_VALUES}")
print("  Models: LR, FNN")
print(f"  Total configs: 2 models × {n_epsilons} ε values = {2 * n_epsilons} configurations")

In [None]:
# ==================== LOAD ADULT DATASET ====================
print("\n" + "="*80)
print("LOADING ADULT CENSUS INCOME DATASET FROM KAGGLE")
print("="*80)

# In Kaggle notebooks attached datasets are mounted under /kaggle/input/<slug>/.
# glob returns matches in arbitrary order, so sort them: if more than one attached
# dataset ships an adult.csv, the same file is chosen on every run.
adult_paths = sorted(glob.glob('/kaggle/input/*/adult.csv'))

if adult_paths:
    adult_csv = adult_paths[0]
    print("✓ Using Kaggle native dataset path")
else:
    # No mounted copy found (e.g. local execution): download through kagglehub.
    print("✓ Using kagglehub for dataset download")
    adult_path = kagglehub.dataset_download("uciml/adult-census-income")
    adult_csv = os.path.join(adult_path, "adult.csv")

# Load dataset
df_adult = pd.read_csv(adult_csv)
print(f"✓ Adult dataset loaded: {df_adult.shape}")

In [None]:
# ==================== PREPROCESS DATA ====================
print("\n" + "="*80)
print("PREPROCESSING DATA")
print("="*80)

# Split features from the target column.
X_adult_df = df_adult.drop(columns=['income'])

# Normalise the label text before comparing: some copies of the Adult data carry
# padded (" >50K") or dot-terminated (">50K.") labels, which an exact match would
# silently map to class 0.
income_labels = df_adult['income'].astype(str).str.strip().str.rstrip('.')
y_adult = (income_labels == '>50K').astype(int).values

# Fail early instead of training on a degenerate single-class target.
if set(np.unique(y_adult)) != {0, 1}:
    raise ValueError(
        "Expected both income classes after encoding, got label values: "
        f"{sorted(income_labels.unique())}"
    )

# Integer-encode every non-numeric feature column (object, string or category dtype).
categorical_cols = X_adult_df.select_dtypes(exclude=['number']).columns
for col in categorical_cols:
    le = LabelEncoder()
    X_adult_df[col] = le.fit_transform(X_adult_df[col].astype(str))

# Convert to numpy array
X_adult = X_adult_df.values
print(f"✓ Adult - Features: {X_adult.shape}, Target: {y_adult.shape}")
print(f"✓ Class distribution: {np.bincount(y_adult)}")

In [None]:
# ==================== MODEL ARCHITECTURES ====================

class LogisticRegressionModel(nn.Module):
    """Logistic regression expressed as a single fully connected layer.

    The module maps ``input_size`` features to ``output_size`` scores and
    returns them as-is; no activation is applied inside the model.
    """

    def __init__(self, input_size, output_size=2):
        """Create the single ``nn.Linear`` layer, stored as ``self.linear``."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        logits = self.linear(x)
        return logits

class FeedforwardNN(nn.Module):
    """Fully connected network: repeated Linear -> ReLU -> Dropout, then a Linear head.

    Args:
        input_size: Number of input features.
        hidden_sizes: Width of each hidden layer, in order. Any iterable of ints
            works; the default is an immutable tuple so it cannot be shared and
            mutated across instances. An empty iterable yields a single Linear layer.
        output_size: Number of output scores.
        dropout_rate: Dropout probability applied after every hidden activation.
    """

    def __init__(self, input_size, hidden_sizes=(128, 64), output_size=2, dropout_rate=0.3):
        super().__init__()
        layers = []
        prev_size = input_size

        # Each hidden layer contributes three modules, so the Linear layers sit at
        # indices 0, 3, 6, ... of the Sequential (this fixes the state_dict keys).
        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            prev_size = hidden_size

        # Output head: raw scores, no activation.
        layers.append(nn.Linear(prev_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the output scores."""
        return self.network(x)

def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` and return ``(accuracy, weighted_f1)``.

    The model is moved to CUDA when available (otherwise CPU), switched to eval
    mode, and run on all of ``X`` in a single forward pass without gradient
    tracking. The predicted class is the index of the largest output per row.
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(target_device)
    model.eval()

    features = torch.FloatTensor(X).to(target_device)

    with torch.no_grad():
        scores = model(features)
        predicted = torch.max(scores, 1)[1]

    # Bring the predictions back to host memory once and reuse them for both metrics.
    y_pred = predicted.cpu().numpy()

    accuracy = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)
    return accuracy, f1

# Visible confirmation that the definitions in this cell were executed
# (the check mark was previously emitted as mis-encoded bytes).
print("✓ Model architectures defined")

In [None]:
# ==================== DIFFERENTIAL PRIVACY TRAINING - ADULT DATASET ====================
print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY WITH CROSS-VALIDATION - ADULT DATASET")
print("="*80)

# Pick the output directory by detecting Kaggle's working directory rather than by
# attempting makedirs and catching a failure: on Windows "/kaggle/working/..." is a
# valid drive-relative path, so a makedirs attempt succeeds there and the local
# path would never be used.
KAGGLE_WORKING_DIR = "/kaggle/working"
LOCAL_BACKEND_DIR = r"c:\Users\almir\ai-privacy\backend"
if os.path.isdir(KAGGLE_WORKING_DIR):
    models_dir = os.path.join(KAGGLE_WORKING_DIR, "models_dp_adult")
else:
    models_dir = os.path.join(LOCAL_BACKEND_DIR, "models_dp_adult")
os.makedirs(models_dir, exist_ok=True)

print(f"\nResults will be saved to: {models_dir}")

MODEL_TYPES = ['LR', 'FNN']
dp_results = {}

# Totals are derived from the configuration so the banner stays correct when it changes.
n_configs = len(MODEL_TYPES) * len(EPSILON_VALUES)
evals_per_config = len(RANDOM_SEEDS) * N_SPLITS

print("\n✓ Running all Adult DP configurations:")
print(f"  - Models: {MODEL_TYPES}")
print(f"  - Epsilon values: {EPSILON_VALUES}")
print(f"  - Total: {len(MODEL_TYPES)} models × {len(EPSILON_VALUES)} ε = {n_configs} configurations")
print(f"  - Total evaluations: {n_configs * evals_per_config} ({n_configs} configs × {evals_per_config} evals)")

checkpoint_path = os.path.join(models_dir, 'dp_adult_checkpoint.json')

for model_type in MODEL_TYPES:
    print("\n" + "-"*80)
    print(f"Model: {model_type}")
    print("-"*80)

    for target_epsilon in EPSILON_VALUES:
        config_key = f"adult_{model_type}_DP_eps{target_epsilon}"

        print(f"\n  Target ε: {target_epsilon}")

        # One entry per (seed, fold) evaluation of this configuration.
        all_accuracies = []
        all_f1s = []
        all_epsilons = []

        for run_idx, seed in enumerate(RANDOM_SEEDS):
            skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

            for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_adult, y_adult)):
                # Seed torch per (seed, fold) so weight initialisation, batch sampling
                # and the DP noise are repeatable. This assumes Opacus draws from
                # torch's default generator (its non-secure mode) — TODO confirm.
                torch.manual_seed(seed * N_SPLITS + fold_idx)

                X_train, X_val = X_adult[train_idx], X_adult[val_idx]
                y_train, y_val = y_adult[train_idx], y_adult[val_idx]

                # Fit the scaler on the training fold only to avoid leaking validation statistics.
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_val_scaled = scaler.transform(X_val)

                # Create DataLoader
                train_dataset = TensorDataset(
                    torch.FloatTensor(X_train_scaled),
                    torch.LongTensor(y_train)
                )
                train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

                # Initialize model
                input_size = X_train_scaled.shape[1]
                if model_type == 'LR':
                    model = LogisticRegressionModel(input_size, output_size=2)
                else:
                    model = FeedforwardNN(input_size, hidden_sizes=[128, 64], output_size=2)

                optimizer = optim.Adam(model.parameters(), lr=0.01)
                loss_fn = nn.CrossEntropyLoss()

                # Attach DP. The noise level is calibrated by Opacus so that training
                # for exactly DP_EPOCHS epochs spends approximately target_epsilon.
                # With a fixed noise multiplier and an after-the-epoch check, small
                # targets are overshot and large targets may never be reached within
                # DP_EPOCHS, which makes those configurations indistinguishable.
                privacy_engine = PrivacyEngine()
                dp_model, optimizer, train_loader_dp = privacy_engine.make_private_with_epsilon(
                    module=model,
                    optimizer=optimizer,
                    data_loader=train_loader,
                    target_epsilon=target_epsilon,
                    target_delta=DP_DELTA,
                    epochs=DP_EPOCHS,
                    max_grad_norm=DP_MAX_GRAD_NORM,
                )

                # Training: run the full epoch budget the noise was calibrated for.
                for _ in range(DP_EPOCHS):
                    dp_model.train()
                    for batch_x, batch_y in train_loader_dp:
                        optimizer.zero_grad()
                        outputs = dp_model(batch_x)
                        loss = loss_fn(outputs, batch_y)
                        loss.backward()
                        optimizer.step()

                # Privacy actually spent, as reported by the accountant.
                current_epsilon = float(privacy_engine.get_epsilon(DP_DELTA))

                # Evaluate; store builtin floats so the results serialise to JSON as-is.
                accuracy, f1 = evaluate_model(dp_model, X_val_scaled, y_val)
                all_accuracies.append(float(accuracy))
                all_f1s.append(float(f1))
                all_epsilons.append(current_epsilon)

                # Log only the last fold of each run to keep the output short.
                if fold_idx == N_SPLITS - 1:
                    print(f"    Run {run_idx + 1}, Fold {fold_idx + 1}: Acc={accuracy:.4f}, ε={current_epsilon:.3f}")

        # Statistics over all runs and folds (sample standard deviation, ddof=1).
        acc_mean = float(np.mean(all_accuracies))
        acc_std = float(np.std(all_accuracies, ddof=1))
        acc_min = float(np.min(all_accuracies))
        acc_max = float(np.max(all_accuracies))
        f1_mean = float(np.mean(all_f1s))
        f1_std = float(np.std(all_f1s, ddof=1))
        eps_mean = float(np.mean(all_epsilons))

        dp_results[config_key] = {
            'dataset': 'adult',
            'model': model_type,
            'target_epsilon': target_epsilon,
            'actual_epsilon': eps_mean,
            'accuracy': {'mean': acc_mean, 'std': acc_std, 'min': acc_min, 'max': acc_max},
            'f1': {'mean': f1_mean, 'std': f1_std},
            'all_accuracies': all_accuracies,
            'all_f1s': all_f1s
        }

        print(f"\n  ✓ DP ε={target_epsilon} Results:")
        print(f"    Accuracy: {acc_mean*100:.2f}% ± {acc_std*100:.2f}% (range: {acc_min*100:.2f}% - {acc_max*100:.2f}%)")
        print(f"    F1-Score: {f1_mean*100:.2f}% ± {f1_std*100:.2f}%")
        print(f"    Actual ε: {eps_mean:.3f}")

        # Rewrite the checkpoint after each configuration. Everything stored above is
        # a builtin type, so no custom JSON fallback is needed.
        checkpoint_data = {
            'dp_results': dp_results,
            'timestamp': datetime.now().isoformat(),
            'last_completed': config_key
        }
        with open(checkpoint_path, 'w') as f:
            json.dump(checkpoint_data, f, indent=2)
        print("    💾 Checkpoint saved")

print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY PHASE COMPLETE - ALL ADULT CONFIGS DONE")
print("="*80)

In [None]:
# ==================== STATISTICAL ANALYSIS & COMPARISON ====================
print("\n" + "="*80)
print("STATISTICAL ANALYSIS")
print("="*80)

# Load baseline results: prefer the Kaggle-mounted file, otherwise the local copy.
# The location is chosen by an explicit existence check instead of a bare except,
# so a malformed JSON file is reported as such rather than triggering the fallback.
kaggle_baseline_path = "/kaggle/input/ai-privacy-baseline-results/research_results.json"
local_baseline_path = os.path.join(r"c:\Users\almir\ai-privacy\backend", "models_research", "research_results.json")
baseline_path = kaggle_baseline_path if os.path.isfile(kaggle_baseline_path) else local_baseline_path
with open(baseline_path, 'r') as f:
    baseline_data = json.load(f)

# Keep only what the comparison needs: the mean accuracy and the per-evaluation
# accuracies of each baseline model on the Adult dataset.
adult_baselines = baseline_data['baseline_results']['adult']
baseline_results = {}
for model_name in ['LR', 'FNN']:  # named model_name so the trained network `model` is not overwritten
    baseline_results[f"adult_{model_name}"] = {
        'accuracy': adult_baselines[model_name]['accuracy']['mean'],
        'all_accuracies': adult_baselines[model_name]['all_accuracies']
    }

print("\n✓ Baseline results loaded")

# DP vs Baseline comparisons
print("\n" + "="*80)
print("DIFFERENTIAL PRIVACY vs BASELINE - Statistical Tests")
print("="*80)

dp_comparison = []
for config_key, dp_data in dp_results.items():
    baseline_key = f"adult_{dp_data['model']}"
    baseline_acc = baseline_results[baseline_key]['accuracy']
    baseline_all = baseline_results[baseline_key]['all_accuracies']

    dp_acc = dp_data['accuracy']['mean']
    dp_all = dp_data['all_accuracies']
    accuracy_loss = baseline_acc - dp_acc  # positive when DP training costs accuracy

    # Welch's t-test: does not assume the baseline and DP accuracies share a variance.
    t_stat, p_value = stats.ttest_ind(baseline_all, dp_all, equal_var=False)

    # Accuracies are stored as fractions; report them as percentages.
    dp_comparison.append({
        'Model': dp_data['model'],
        'Epsilon': dp_data['target_epsilon'],
        'DP_Accuracy': dp_acc * 100,
        'DP_Std': dp_data['accuracy']['std'] * 100,
        'Baseline': baseline_acc * 100,
        'Accuracy_Loss': accuracy_loss * 100,
        't_statistic': t_stat,
        'p_value': p_value,
        'Significant': 'Yes' if p_value < 0.05 else 'No'
    })

dp_comparison_df = pd.DataFrame(dp_comparison)
print("\n" + dp_comparison_df.to_string(index=False))

In [None]:
# ==================== SAVE RESULTS ====================
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)


def _json_default(obj):
    """Fallback for ``json.dump``: convert NumPy values, reject everything else.

    json calls this only for objects it cannot encode natively. Returning such an
    object unchanged makes json fail with a misleading "Circular reference
    detected"; raising TypeError names the offending type instead.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save comprehensive JSON: run metadata, all DP results and the baseline they were compared to.
results_json = {
    'metadata': {
        'timestamp': datetime.now().isoformat(),
        'dataset': 'adult',
        'random_seeds': RANDOM_SEEDS,
        'n_splits': N_SPLITS,
        'total_evaluations': len(RANDOM_SEEDS) * N_SPLITS,
        'epsilon_values': EPSILON_VALUES,
        'dp_epochs': DP_EPOCHS,
        'models': MODEL_TYPES,
        'total_configs': len(MODEL_TYPES) * len(EPSILON_VALUES)
    },
    'differential_privacy': dp_results,
    'baseline_reference': baseline_results
}

json_path = os.path.join(models_dir, 'dp_adult_results.json')
with open(json_path, 'w') as f:
    json.dump(results_json, f, indent=2, default=_json_default)
print("✓ Saved: dp_adult_results.json")

# Save comparison CSV
dp_csv_path = os.path.join(models_dir, 'dp_adult_vs_baseline.csv')
dp_comparison_df.to_csv(dp_csv_path, index=False)
print("✓ Saved: dp_adult_vs_baseline.csv")

print("\n" + "="*80)
print("ALL RESULTS SAVED")
print("="*80)

In [None]:
# ==================== VISUALIZATIONS ====================
print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# One colour per model, shared by its DP curve and its dashed baseline so the two
# can be matched by eye. Dict order also fixes the legend order.
MODEL_COLORS = {'LR': 'tab:blue', 'FNN': 'tab:orange'}

# DP Privacy-Accuracy Tradeoff for Adult Dataset
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
fig.suptitle(
    'Adult Dataset: Differential Privacy Privacy-Accuracy Tradeoff\n'
    f'{N_SPLITS}-Fold CV × {len(RANDOM_SEEDS)} Runs',  # derived from the config, not hard-coded
    fontsize=16, fontweight='bold')

# DP curves: mean accuracy per target ε, error bars show the standard deviation.
for model_name, color in MODEL_COLORS.items():
    subset = dp_comparison_df[dp_comparison_df['Model'] == model_name]

    ax.errorbar(subset['Epsilon'], subset['DP_Accuracy'], yerr=subset['DP_Std'],
                marker='o', capsize=5, label=model_name, color=color,
                linewidth=2.5, markersize=10)

# Baseline lines (baseline accuracies are stored as fractions; plot percentages).
for model_name, color in MODEL_COLORS.items():
    baseline_pct = baseline_results[f'adult_{model_name}']['accuracy'] * 100
    ax.axhline(y=baseline_pct, color=color, linestyle='--', alpha=0.5, linewidth=2,
               label=f'{model_name} Baseline')

ax.set_xlabel('Privacy Budget (ε)', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=14, fontweight='bold')
ax.set_title('Adult Census Income Dataset', fontsize=14)
ax.legend(fontsize=12, loc='best')
ax.grid(True, alpha=0.3)
ax.set_xscale('log')

plt.tight_layout()
dp_viz_path = os.path.join(models_dir, 'dp_adult_privacy_accuracy_tradeoff.png')
plt.savefig(dp_viz_path, dpi=300, bbox_inches='tight')
print("✓ Saved: dp_adult_privacy_accuracy_tradeoff.png")
plt.show()

# Accuracy Loss Heatmap: one row per model, one column per target ε.
print("\n✓ Generating accuracy loss heatmap...")
pivot_data = dp_comparison_df.pivot(index='Model', columns='Epsilon', values='Accuracy_Loss')

fig, ax = plt.subplots(1, 1, figsize=(10, 4))
sns.heatmap(pivot_data, annot=True, fmt='.2f', cmap='YlOrRd',
            cbar_kws={'label': 'Accuracy Loss (%)'}, ax=ax)
ax.set_title('Adult Dataset: Accuracy Loss vs Baseline (%)', fontsize=14, fontweight='bold')
ax.set_xlabel('Privacy Budget (ε)', fontsize=12)
ax.set_ylabel('Model', fontsize=12)

heatmap_path = os.path.join(models_dir, 'dp_adult_accuracy_loss_heatmap.png')
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
print("✓ Saved: dp_adult_accuracy_loss_heatmap.png")
plt.show()

print("\n" + "="*80)
print("VISUALIZATIONS COMPLETE")
print("="*80)

## Summary - Adult Dataset Differential Privacy

This notebook completed **ALL Adult DP configurations**:

### **Configurations**
- **Models:** Logistic Regression (LR), Feedforward Neural Network (FNN)
- **Privacy budgets (ε):** [0.5, 1.0, 3.0, 5.0, 10.0]
- **Total configs:** 2 models × 5 ε values = **10 configurations**
- **Total evaluations:** 10 configs × 25 evals = **250 evaluations**

### **Results saved to:**
- `/kaggle/working/models_dp_adult/` (Kaggle)
- `c:\Users\almir\ai-privacy\backend\models_dp_adult\` (Local)

### **Output files:**
1. `dp_adult_results.json` - Complete results with all metrics
2. `dp_adult_vs_baseline.csv` - Statistical comparison
3. `dp_adult_privacy_accuracy_tradeoff.png` - Visualization
4. `dp_adult_accuracy_loss_heatmap.png` - Heatmap
5. `dp_adult_checkpoint.json` - Checkpoint rewritten after each completed configuration (for crash recovery; the notebook does not reload it automatically)

### **Next steps:**
- Merge with other DP results (diabetes)
- Upload to Kaggle for comprehensive analysis
- Include in final research paper