In [1]:
# Install the notebook's extra dependencies into the running environment.
# NOTE(review): neither install is version-pinned, and no cell visible in this
# export imports PyDP (pydp) -- confirm it is still needed before keeping it.
!pip install --no-cache-dir git+https://github.com/OpenMined/PyDP.git
!pip install opacus

Collecting git+https://github.com/OpenMined/PyDP.git
  Cloning https://github.com/OpenMined/PyDP.git to /tmp/pip-req-build-qd7qc5kc
  Running command git clone --filter=blob:none --quiet https://github.com/OpenMined/PyDP.git /tmp/pip-req-build-qd7qc5kc

  Resolved https://github.com/OpenMined/PyDP.git to commit 01620d5553cc2e6f54b47bc748498f3eb156543c
  Running command git submodule update --init --recursive -q
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: python-dp
  Building wheel for python-dp (pyproject.toml) ... [?25ldone
[?25h  Created wheel for python-dp: filename=python_dp-1.1.5rc4-cp310-cp310-linux_x86_64.whl size=38795 sha256=1ab357d9a1ace9d1f33fafd73a2ae639bcbf19afffcd3f9de91babd0cebefe69
  Stored in directory: /tmp/pip-ephem-wheel-cache-4one74_h/wheels/c9/b9/e9/cb36261017c51e1d4497438ec8ff121a52d75047f91f7f4c6

In [None]:
#Diabetes Dataset

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from opacus import PrivacyEngine

In [10]:
# Builds a CUDA device handle when CUDA is available, otherwise a CPU one.
# The value is only displayed as the cell output; it is not bound to a name.
# NOTE(review): no cell visible in this notebook moves the model or batches to a device.
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cuda')

In [11]:
###############################################################################
# 1) LOAD AND PREPROCESS DATA
###############################################################################
def load_and_preprocess_data(csv_path):
    """Read the diabetes CSV, impute implausible zeros and standardize features.

    In five measurement columns a value of 0 is treated as missing and is
    replaced by the median of that column's remaining values. Every feature
    column is then standardized with a StandardScaler fitted on the whole
    table (i.e. before any train/test split happens downstream).

    Args:
        csv_path: Path of a CSV containing the five measurement columns
            listed below and an 'Outcome' label column.

    Returns:
        Tuple ``(X, y)``: float32 feature matrix and int64 label vector.
    """
    frame = pd.read_csv(csv_path)

    # Columns where a literal 0 cannot be a real measurement.
    zero_means_missing = ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')
    for name in zero_means_missing:
        with_gaps = frame[name].replace(0, np.nan)
        # median() skips NaN, so it is taken over the non-zero entries only.
        frame[name] = with_gaps.fillna(with_gaps.median())

    labels = frame['Outcome'].values
    features = frame.drop(columns=['Outcome']).values

    standardized = StandardScaler().fit_transform(features)
    return standardized.astype(np.float32), labels.astype(np.int64)

In [12]:
###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Tensor-backed dataset pairing each feature row with its label.

    Features are stored as float32 and labels as int64 regardless of the
    dtype of the inputs, so arrays coming straight from pandas/sklearn
    (typically float64 / platform int) can be fed to the model and to
    CrossEntropyLoss without a cast at every call site.

    Args:
        X: Array-like of features, one row per sample.
        y: Array-like of integer class labels, one per sample.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # as_tensor avoids a copy when the input already has the target dtype.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [13]:
###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: input -> 512 -> 256 -> 128 -> 64 -> num_classes.

    Each hidden Linear layer is followed by an 8-group GroupNorm, ReLU and a
    shared Dropout(0.3); the final Linear layer returns raw logits.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        widths = (input_dim, 512, 256, 128, 64)
        # Register fc1/gn1 ... fc4/gn4 in the same order as a hand-written list,
        # so submodule names and parameter initialisation order stay the same.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{depth}", nn.Linear(fan_in, fan_out))
            setattr(self, f"gn{depth}", nn.GroupNorm(8, fan_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

In [14]:
###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=128, lr=0.01, max_grad_norm=1.0, noise_multiplier=1.1, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are shuffled, split 80/20 into train/test, and the model is
    trained with momentum SGD wrapped by an Opacus PrivacyEngine (RDP
    accountant). Per-epoch loss, accuracy and the privacy budget spent so far
    are printed, followed by the final epsilon and the test accuracy.

    Args:
        X: 2-D NumPy feature array (rows are samples).
        y: 1-D NumPy array of integer class labels.
        epochs: Number of passes over the training loader.
        batch_size: Batch size handed to the DataLoaders.
        lr: Initial SGD learning rate (halved by ReduceLROnPlateau after the
            training loss fails to improve for more than 2 epochs).
        max_grad_norm: Per-sample gradient clipping norm passed to Opacus.
        noise_multiplier: Gaussian noise multiplier passed to Opacus.
        delta: Target delta used when querying epsilon.
        seed: Optional int. When given, NumPy's and torch's global RNGs are
            seeded so the split and weight initialisation are repeatable.
            NOTE(review): this presumably also fixes the DP noise draws, so
            use it for debugging/reproducibility only -- confirm for releases.

    Returns:
        The trained model as returned by ``make_private`` (left in eval mode).
    """
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Shuffle once, then take the first 80% for training.
    N = len(X)
    indices = np.arange(N)
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    split = int(0.8*N)
    X_tr, X_te = X[:split], X[split:]
    y_tr, y_te = y[:split], y[split:]
    train_ds = MedicalDataset(X_tr, y_tr)
    test_ds = MedicalDataset(X_te, y_te)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    input_dim = X.shape[1]
    num_classes = len(np.unique(y))
    model = DeeperMLP(input_dim, num_classes)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()

    # Wrap model/optimizer/loader for DP training with an RDP accountant.
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The privately wrapped loader may use Poisson sampling and
                # yield an empty batch. The step above is still taken, but the
                # mean loss over zero samples is NaN and must stay out of the
                # running metrics (NaN * 0 is NaN).
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += batch_n

        if total:
            train_loss = running_loss / total
            train_acc = correct / total
            scheduler.step(train_loss)
        else:
            # No sample was drawn this epoch: nothing to average or schedule on.
            train_loss = train_acc = float("nan")
        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs}, Loss={train_loss:.4f}, Acc={train_acc:.4f}, Eps={epsilon:.2f}")

    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, dim=1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)
    # An empty test split yields NaN instead of a ZeroDivisionError.
    test_acc = correct_test / total_test if total_test else float("nan")
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"Final Eps={epsilon:.2f} (delta={delta}), Test Acc={test_acc:.4f}")
    return model

In [26]:
###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="diabetes (3).csv"):
    """Load the diabetes CSV, train the DP-SGD classifier and return the model.

    Args:
        csv_path: Path of the CSV passed to ``load_and_preprocess_data``.

    Returns:
        The trained model returned by ``train_dp_sgd``.
    """
    X, y = load_and_preprocess_data(csv_path)
    model = train_dp_sgd(
        X, y,
        epochs=10,
        batch_size=64,         # half of train_dp_sgd's default of 128
        lr=0.001,              # ten times smaller than train_dp_sgd's default of 0.01
        max_grad_norm=1.0,     # same as train_dp_sgd's default
        noise_multiplier=1.3,  # more noise than train_dp_sgd's default of 1.1
        delta=1e-5,
    )
    return model

In [27]:
# Runs the diabetes experiment; the returned model is shown as the cell output.
# NOTE(review): the captured output below uses an "Epoch i/n - Loss: ..., ε: ..." layout and a
# "Final Privacy Budget" line, which is not what the train_dp_sgd defined in the cell above prints
# ("Loss=..., Eps=..."). It appears to come from a different definition -- re-run on a fresh kernel.
main()



Epoch 1/10 - Loss: 0.7635, Acc: 0.6206, ε: 2.04
Epoch 2/10 - Loss: 0.7835, Acc: 0.6074, ε: 2.54
Epoch 3/10 - Loss: 0.7432, Acc: 0.6260, ε: 2.95
Epoch 4/10 - Loss: 0.6967, Acc: 0.6738, ε: 3.30
Epoch 5/10 - Loss: 0.7590, Acc: 0.6398, ε: 3.62
Epoch 6/10 - Loss: 0.7560, Acc: 0.6364, ε: 3.92
Epoch 7/10 - Loss: 0.7903, Acc: 0.6224, ε: 4.19
Epoch 8/10 - Loss: 0.7352, Acc: 0.6588, ε: 4.46
Epoch 9/10 - Loss: 0.7369, Acc: 0.6536, ε: 4.70
Epoch 10/10 - Loss: 0.7471, Acc: 0.6550, ε: 4.94

Final Privacy Budget: ε=4.94, δ=1e-05
Test Accuracy: 0.6948


GradSampleModule(DeeperMLP(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (gn1): GroupNorm(8, 512, eps=1e-05, affine=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (gn2): GroupNorm(8, 256, eps=1e-05, affine=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (gn3): GroupNorm(8, 128, eps=1e-05, affine=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (gn4): GroupNorm(8, 64, eps=1e-05, affine=True)
  (fc5): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
))

In [None]:
#ACTGAN Diabetes Dataset

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from opacus import PrivacyEngine

###############################################################################
# 1) LOAD AND PREPROCESS DATA
###############################################################################
def load_and_preprocess_data(csv_path):
    """Read the synthetic diabetes CSV and impute implausible zeros.

    In feature columns 1-5 (positional, after dropping 'Outcome') a value of
    0 is replaced by the mean of that column's non-zero entries. No scaling
    is applied here.

    Args:
        csv_path: Path of a CSV with an 'Outcome' label column and at least
            six numeric feature columns.

    Returns:
        Tuple ``(X, y)``: float32 feature matrix and int64 label vector.
    """
    df = pd.read_csv(csv_path)

    # Cast to float up front: an all-integer CSV would otherwise produce an
    # integer matrix and the imputed means would be silently truncated.
    X = df.drop('Outcome', axis=1).to_numpy(dtype=np.float64)
    y = df['Outcome'].values

    # Positional indices of the columns where zero is not a valid measurement.
    # NOTE(review): presumably Glucose, BloodPressure, SkinThickness, Insulin,
    # BMI -- this relies on the CSV's column order; confirm against the file.
    invalid_zero_cols = [1, 2, 3, 4, 5]
    for col in invalid_zero_cols:
        zero_mask = X[:, col] == 0
        if zero_mask.all():
            # No non-zero value to average; leave the column untouched rather
            # than filling it with NaN.
            continue
        X[zero_mask, col] = X[~zero_mask, col].mean()

    return X.astype(np.float32), y.astype(np.int64)

###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Tensor-backed dataset pairing each feature row with its label.

    Features are stored as float32 and labels as int64 regardless of the
    dtype of the inputs, so arrays coming straight from pandas/sklearn can be
    fed to the model and to CrossEntropyLoss without per-batch casts.

    Args:
        X: Array-like of features, one row per sample.
        y: Array-like of integer class labels, one per sample.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # as_tensor avoids a copy when the input already has the target dtype.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: input -> 512 -> 256 -> 128 -> 64 -> num_classes.

    Each hidden Linear layer is followed by an 8-group GroupNorm, ReLU and a
    shared Dropout(0.3); the final Linear layer returns raw logits.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits (2 for binary classification).
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        widths = (input_dim, 512, 256, 128, 64)
        # Register fc1/gn1 ... fc4/gn4 in the same order as a hand-written list,
        # so submodule names and parameter initialisation order stay the same.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{depth}", nn.Linear(fan_in, fan_out))
            setattr(self, f"gn{depth}", nn.GroupNorm(8, fan_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=64, lr=0.001, max_grad_norm=1.0, noise_multiplier=1.3, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are split 80/20 by a random permutation, features are
    standardized with a scaler fitted on the training portion only, and the
    model is trained with momentum SGD wrapped by an Opacus PrivacyEngine
    (RDP accountant). Per-epoch loss, accuracy and epsilon are printed,
    followed by the final privacy budget and the test accuracy.

    Args:
        X: 2-D NumPy feature array (rows are samples).
        y: 1-D NumPy array of integer class labels.
        epochs: Number of passes over the training loader.
        batch_size: Batch size handed to the DataLoaders.
        lr: Initial SGD learning rate (halved by ReduceLROnPlateau after the
            training loss fails to improve for more than 2 epochs).
        max_grad_norm: Per-sample gradient clipping norm passed to Opacus.
        noise_multiplier: Gaussian noise multiplier passed to Opacus.
        delta: Target delta used when querying epsilon.
        seed: Optional int. When given, NumPy's and torch's global RNGs are
            seeded so the split and weight initialisation are repeatable.
            NOTE(review): this presumably also fixes the DP noise draws, so
            use it for debugging/reproducibility only -- confirm for releases.

    Returns:
        The trained model as returned by ``make_private`` (left in eval mode).
    """
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Split into training and test sets
    N = len(X)
    indices = np.arange(N)
    np.random.shuffle(indices)
    split = int(0.8 * N)
    X_train, X_test = X[indices[:split]], X[indices[split:]]
    y_train, y_test = y[indices[:split]], y[indices[split:]]

    # Scale features: statistics come from the training rows only, so the
    # test rows do not influence the transform.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Create datasets and dataloaders
    train_ds = MedicalDataset(X_train, y_train)
    test_ds = MedicalDataset(X_test, y_test)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # Initialize model
    input_dim = X.shape[1]
    model = DeeperMLP(input_dim)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()

    # Attach PrivacyEngine
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    # Training loop
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch.float())
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The privately wrapped loader may use Poisson sampling and
                # yield an empty batch. The step above is still taken, but the
                # mean loss over zero samples is NaN and must stay out of the
                # running metrics (NaN * 0 is NaN).
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, 1)
            correct += (preds == y_batch).sum().item()
            total += batch_n

        if total:
            epoch_loss = running_loss / total
            epoch_acc = correct / total
            scheduler.step(epoch_loss)
        else:
            # No sample was drawn this epoch: nothing to average or schedule on.
            epoch_loss = epoch_acc = float("nan")

        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f}, ε: {epsilon:.2f}")

    # Evaluation
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch.float())
            _, preds = torch.max(outputs, 1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)

    # An empty test split yields NaN instead of a ZeroDivisionError.
    test_acc = correct_test / total_test if total_test else float("nan")
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"\nFinal Privacy Budget: ε={epsilon:.2f}, δ={delta}")
    print(f"Test Accuracy: {test_acc:.4f}")

    return model

###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="Synthetic_generated_ACTGAN_Daibetes.csv"):
    """Load the synthetic diabetes CSV, train with DP-SGD and return the model.

    Args:
        csv_path: Path of the CSV passed to ``load_and_preprocess_data``.

    Returns:
        The trained model returned by ``train_dp_sgd``.
    """
    X, y = load_and_preprocess_data(csv_path)

    # Train with DP-SGD and hand the model back so callers can reuse it.
    return train_dp_sgd(
        X, y,
        epochs=10,
        batch_size=64,
        lr=0.001,
        max_grad_norm=1.0,
        noise_multiplier=1.3,
        delta=1e-5
    )

if __name__ == "__main__":
    main()



Epoch 1/10 - Loss: 0.6872, Acc: 0.5946, ε: 2.04
Epoch 2/10 - Loss: 0.6925, Acc: 0.5608, ε: 2.54
Epoch 3/10 - Loss: 0.6986, Acc: 0.5538, ε: 2.95
Epoch 4/10 - Loss: 0.6739, Acc: 0.6065, ε: 3.30
Epoch 5/10 - Loss: 0.6888, Acc: 0.5969, ε: 3.62
Epoch 6/10 - Loss: 0.6595, Acc: 0.6355, ε: 3.92
Epoch 7/10 - Loss: 0.6719, Acc: 0.6336, ε: 4.19
Epoch 8/10 - Loss: 0.6708, Acc: 0.6327, ε: 4.46
Epoch 9/10 - Loss: 0.6585, Acc: 0.6413, ε: 4.70
Epoch 10/10 - Loss: 0.7205, Acc: 0.6231, ε: 4.94

Final Privacy Budget: ε=4.94, δ=1e-05
Test Accuracy: 0.6883


In [None]:
#Credit Customers Dataset

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from opacus import PrivacyEngine

###############################################################################
# 1) DATA PREPROCESSING
###############################################################################
def load_and_preprocess_data(csv_path):
    """Read the credit-customers CSV and build a numeric feature matrix.

    The seven numerical columns are standardized with a StandardScaler fitted
    on the whole table; the thirteen categorical columns are one-hot encoded.
    The two groups are concatenated column-wise, numerical first.

    Args:
        csv_path: Path of a CSV containing the columns listed below plus a
            'class' label column whose values can be cast to int.

    Returns:
        Tuple ``(X, y)``: float feature matrix and integer label vector.
    """
    table = pd.read_csv(csv_path)

    numerical_cols = ['duration', 'creditamount', 'installmentcommitment',
                      'residencesince', 'age', 'existingcredits', 'numdependents']
    categorical_cols = ['checkingstatus', 'credithistory', 'purpose', 'savingsstatus',
                        'employment', 'personalstatus', 'otherparties', 'propertymagnitude',
                        'otherpaymentplans', 'housing', 'job', 'owntelephone', 'foreignworker']

    # Pull the label out before touching the features.
    labels = table['class'].values.astype(int)
    features = table.drop(columns=['class'])

    scaled_numeric = StandardScaler().fit_transform(features[numerical_cols].astype(float))
    one_hot = pd.get_dummies(features[categorical_cols], columns=categorical_cols)

    X = np.concatenate([scaled_numeric, one_hot.values.astype(float)], axis=1)
    return X, labels

###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Tensor-backed dataset pairing each feature row with its label.

    NumPy inputs are converted once at construction: features to float32,
    labels to int64.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        labels = torch.from_numpy(y)
        self.X = features.to(torch.float32)
        # int64 class indices are what CrossEntropyLoss expects as targets.
        self.y = labels.to(torch.int64)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: input -> 512 -> 256 -> 128 -> 64 -> num_classes.

    Each hidden Linear layer is followed by an 8-group GroupNorm, ReLU and a
    shared Dropout(0.3); the final Linear layer returns raw logits.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits (2 for binary classification).
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        widths = (input_dim, 512, 256, 128, 64)
        # Register fc1/gn1 ... fc4/gn4 in the same order as a hand-written list,
        # so submodule names and parameter initialisation order stay the same.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{depth}", nn.Linear(fan_in, fan_out))
            setattr(self, f"gn{depth}", nn.GroupNorm(8, fan_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=128, lr=0.01,
                 max_grad_norm=1.0, noise_multiplier=1.1, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are shuffled, split 80/20 into train/test, and the model is
    trained with momentum SGD wrapped by an Opacus PrivacyEngine (RDP
    accountant). Per-epoch loss, accuracy and epsilon are printed, followed
    by the final privacy budget and the test accuracy.

    Args:
        X: 2-D NumPy feature array (rows are samples).
        y: 1-D NumPy array of integer class labels.
        epochs: Number of passes over the training loader.
        batch_size: Batch size handed to the DataLoaders.
        lr: Initial SGD learning rate (halved by ReduceLROnPlateau after the
            training loss fails to improve for more than 2 epochs).
        max_grad_norm: Per-sample gradient clipping norm passed to Opacus.
        noise_multiplier: Gaussian noise multiplier passed to Opacus.
        delta: Target delta used when querying epsilon.
        seed: Optional int. When given, NumPy's and torch's global RNGs are
            seeded so the split and weight initialisation are repeatable.
            NOTE(review): this presumably also fixes the DP noise draws, so
            use it for debugging/reproducibility only -- confirm for releases.

    Returns:
        The trained model as returned by ``make_private`` (left in eval mode).
    """
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Shuffle and split data
    N = len(X)
    indices = np.arange(N)
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    split = int(0.8 * N)
    X_tr, X_te = X[:split], X[split:]
    y_tr, y_te = y[:split], y[split:]

    # Create datasets and loaders
    train_ds = MedicalDataset(X_tr, y_tr)
    test_ds = MedicalDataset(X_te, y_te)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # Initialize model and optimizer
    input_dim = X.shape[1]
    num_classes = len(np.unique(y))
    model = DeeperMLP(input_dim, num_classes)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()

    # Add DP constraints
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    # Training loop
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The privately wrapped loader may use Poisson sampling and
                # yield an empty batch. The step above is still taken, but the
                # mean loss over zero samples is NaN and must stay out of the
                # running metrics (NaN * 0 is NaN).
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += batch_n

        # Metrics and scheduler
        if total:
            train_loss = running_loss / total
            train_acc = correct / total
            scheduler.step(train_loss)
        else:
            # No sample was drawn this epoch: nothing to average or schedule on.
            train_loss = train_acc = float("nan")
        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | ε: {epsilon:.2f}")

    # Evaluation
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, dim=1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)

    # An empty test split yields NaN instead of a ZeroDivisionError.
    test_acc = correct_test / total_test if total_test else float("nan")
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"\nFinal Privacy: ε={epsilon:.2f}, δ={delta} | Test Accuracy: {test_acc:.4f}")
    return model

###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="credit_customers.csv"):
    """Load the credit-customers CSV, train with DP-SGD and return the model.

    Args:
        csv_path: Path of the CSV passed to ``load_and_preprocess_data``.

    Returns:
        The trained model returned by ``train_dp_sgd``.
    """
    X, y = load_and_preprocess_data(csv_path)

    # Train with DP-SGD and hand the model back so callers can reuse it.
    return train_dp_sgd(
        X, y,
        epochs=10,
        batch_size=64,
        lr=0.005,
        max_grad_norm=1.0,
        noise_multiplier=0.7,
        delta=1e-5
    )

if __name__ == "__main__":
    main()



Epoch 1/10 | Loss: 0.7908 | Acc: 0.4147 | ε: 6.59
Epoch 2/10 | Loss: 0.6948 | Acc: 0.5610 | ε: 8.04
Epoch 3/10 | Loss: 0.6172 | Acc: 0.6946 | ε: 9.15
Epoch 4/10 | Loss: 0.6365 | Acc: 0.7118 | ε: 10.12
Epoch 5/10 | Loss: 0.8027 | Acc: 0.6771 | ε: 11.01
Epoch 6/10 | Loss: 0.9114 | Acc: 0.6912 | ε: 11.80
Epoch 7/10 | Loss: 1.0091 | Acc: 0.7000 | ε: 12.57
Epoch 8/10 | Loss: 1.0552 | Acc: 0.7018 | ε: 13.27
Epoch 9/10 | Loss: 1.0148 | Acc: 0.7299 | ε: 13.98
Epoch 10/10 | Loss: 1.1767 | Acc: 0.6968 | ε: 14.62

Final Privacy: ε=14.62, δ=1e-05 | Test Accuracy: 0.6950


In [None]:
#ACTGAN Credit Customers Dataset

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from opacus import PrivacyEngine

###############################################################################
# 1) DATA PREPROCESSING
###############################################################################
def load_and_preprocess_data(csv_path):
    """Read the synthetic credit-customers CSV and build a numeric feature matrix.

    The seven numerical columns are standardized with a StandardScaler fitted
    on the whole table; the thirteen categorical columns are one-hot encoded.
    The two groups are concatenated column-wise, numerical first.

    Args:
        csv_path: Path of a CSV containing the columns listed below plus a
            'class' label column whose values can be cast to int.

    Returns:
        Tuple ``(X, y)``: float feature matrix and integer label vector.
    """
    table = pd.read_csv(csv_path)

    numerical_cols = ['duration', 'creditamount', 'installmentcommitment',
                      'residencesince', 'age', 'existingcredits', 'numdependents']
    categorical_cols = ['checkingstatus', 'credithistory', 'purpose', 'savingsstatus',
                        'employment', 'personalstatus', 'otherparties', 'propertymagnitude',
                        'otherpaymentplans', 'housing', 'job', 'owntelephone', 'foreignworker']

    # Pull the label out before touching the features.
    labels = table['class'].values.astype(int)
    features = table.drop(columns=['class'])

    scaled_numeric = StandardScaler().fit_transform(features[numerical_cols].astype(float))
    one_hot = pd.get_dummies(features[categorical_cols], columns=categorical_cols)

    X = np.concatenate([scaled_numeric, one_hot.values.astype(float)], axis=1)
    return X, labels

###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Tensor-backed dataset pairing each feature row with its label.

    NumPy inputs are converted once at construction: features to float32,
    labels to int64.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        labels = torch.from_numpy(y)
        self.X = features.to(torch.float32)
        # int64 class indices are what CrossEntropyLoss expects as targets.
        self.y = labels.to(torch.int64)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: input -> 512 -> 256 -> 128 -> 64 -> num_classes.

    Each hidden Linear layer is followed by an 8-group GroupNorm, ReLU and a
    shared Dropout(0.3); the final Linear layer returns raw logits.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits (2 for binary classification).
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        widths = (input_dim, 512, 256, 128, 64)
        # Register fc1/gn1 ... fc4/gn4 in the same order as a hand-written list,
        # so submodule names and parameter initialisation order stay the same.
        for depth, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{depth}", nn.Linear(fan_in, fan_out))
            setattr(self, f"gn{depth}", nn.GroupNorm(8, fan_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=128, lr=0.01,
                 max_grad_norm=1.0, noise_multiplier=1.1, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are shuffled, split 80/20 into train/test, and the model is
    trained with momentum SGD wrapped by an Opacus PrivacyEngine (RDP
    accountant). Per-epoch loss, accuracy and epsilon are printed, followed
    by the final privacy budget and the test accuracy.

    Args:
        X: 2-D NumPy feature array (rows are samples).
        y: 1-D NumPy array of integer class labels.
        epochs: Number of passes over the training loader.
        batch_size: Batch size handed to the DataLoaders.
        lr: Initial SGD learning rate (halved by ReduceLROnPlateau after the
            training loss fails to improve for more than 2 epochs).
        max_grad_norm: Per-sample gradient clipping norm passed to Opacus.
        noise_multiplier: Gaussian noise multiplier passed to Opacus.
        delta: Target delta used when querying epsilon.
        seed: Optional int. When given, NumPy's and torch's global RNGs are
            seeded so the split and weight initialisation are repeatable.
            NOTE(review): this presumably also fixes the DP noise draws, so
            use it for debugging/reproducibility only -- confirm for releases.

    Returns:
        The trained model as returned by ``make_private`` (left in eval mode).
    """
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Shuffle and split data
    N = len(X)
    indices = np.arange(N)
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    split = int(0.8 * N)
    X_tr, X_te = X[:split], X[split:]
    y_tr, y_te = y[:split], y[split:]

    # Create datasets and loaders
    train_ds = MedicalDataset(X_tr, y_tr)
    test_ds = MedicalDataset(X_te, y_te)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # Initialize model and optimizer
    input_dim = X.shape[1]
    num_classes = len(np.unique(y))
    model = DeeperMLP(input_dim, num_classes)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()

    # Add DP constraints
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    # Training loop
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The privately wrapped loader may use Poisson sampling and
                # yield an empty batch. The step above is still taken, but the
                # mean loss over zero samples is NaN and must stay out of the
                # running metrics (NaN * 0 is NaN).
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += batch_n

        # Metrics and scheduler
        if total:
            train_loss = running_loss / total
            train_acc = correct / total
            scheduler.step(train_loss)
        else:
            # No sample was drawn this epoch: nothing to average or schedule on.
            train_loss = train_acc = float("nan")
        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | ε: {epsilon:.2f}")

    # Evaluation
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, dim=1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)

    # An empty test split yields NaN instead of a ZeroDivisionError.
    test_acc = correct_test / total_test if total_test else float("nan")
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"\nFinal Privacy: ε={epsilon:.2f}, δ={delta} | Test Accuracy: {test_acc:.4f}")
    return model

###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="ACTGAN_generated.csv"):
    """Load the synthetic credit CSV, train with DP-SGD and return the model.

    Args:
        csv_path: Path of the CSV passed to ``load_and_preprocess_data``.

    Returns:
        The trained model returned by ``train_dp_sgd``.
    """
    X, y = load_and_preprocess_data(csv_path)

    # Train with DP-SGD and hand the model back so callers can reuse it.
    return train_dp_sgd(
        X, y,
        epochs=10,
        batch_size=64,
        lr=0.005,
        max_grad_norm=1.0,
        noise_multiplier=0.7,
        delta=1e-5
    )

if __name__ == "__main__":
    main()



Epoch 1/10 | Loss: 0.6599 | Acc: 0.6341 | ε: 6.59
Epoch 2/10 | Loss: 0.6710 | Acc: 0.6642 | ε: 8.04
Epoch 3/10 | Loss: 0.7742 | Acc: 0.6285 | ε: 9.15
Epoch 4/10 | Loss: 0.7979 | Acc: 0.6626 | ε: 10.12
Epoch 5/10 | Loss: 0.9566 | Acc: 0.6282 | ε: 11.01
Epoch 6/10 | Loss: 0.9606 | Acc: 0.6612 | ε: 11.80
Epoch 7/10 | Loss: 0.9567 | Acc: 0.6749 | ε: 12.57
Epoch 8/10 | Loss: 1.0830 | Acc: 0.6563 | ε: 13.27
Epoch 9/10 | Loss: 1.0575 | Acc: 0.6659 | ε: 13.98
Epoch 10/10 | Loss: 1.1835 | Acc: 0.6358 | ε: 14.62

Final Privacy: ε=14.62, δ=1e-05 | Test Accuracy: 0.6800


In [None]:
#Employee Dataset

In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from opacus import PrivacyEngine
import warnings
# Silences every Python warning for the rest of the kernel session, including
# any deprecation warnings -- NOTE(review): consider narrowing this to specific categories.
warnings.filterwarnings('ignore')

###############################################################################
# 1) LOAD AND PREPROCESS DATA
###############################################################################
def load_and_preprocess_data(csv_path):
    """Read the employee CSV and encode it into a numeric feature matrix.

    Numerical columns are standardized and categorical columns are one-hot
    encoded (first level dropped) through a single ColumnTransformer fitted
    on the whole table. Column names, a sample of rows and the resulting
    shapes are printed along the way.

    Args:
        csv_path: Path of a CSV containing the columns listed below plus a
            'LeaveOrNot' label column.

    Returns:
        Tuple ``(X, y)``: float32 feature matrix and int64 label vector.
    """
    raw = pd.read_csv(csv_path)

    # Quick look at what was loaded.
    print("Columns in the dataset:", raw.columns.tolist())
    print("Sample data:\n", raw.head())

    target = raw['LeaveOrNot']
    features = raw.drop('LeaveOrNot', axis=1)

    categorical_cols = ['Education', 'City', 'Gender', 'EverBenched']
    numerical_cols = ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']

    transformers = [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ]
    encoded = ColumnTransformer(transformers=transformers).fit_transform(features)

    # The transformer may hand back a sparse matrix; densify it if so.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    labels = target.values

    print(f"Processed data shape: {encoded.shape}")
    print(f"Target shape: {labels.shape}")
    print(f"Number of classes: {len(np.unique(labels))}")

    return encoded.astype(np.float32), labels.astype(np.int64)

###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Map-style dataset that serves (feature row, label) pairs from numpy arrays.

    Both arrays are wrapped with ``torch.from_numpy`` and kept on the instance
    as ``X`` and ``y``; the dataset length is the length of ``X``.
    """

    def __init__(self, X, y):
        # Wrap the numpy arrays as tensors once, up front.
        self.X, self.y = torch.from_numpy(X), torch.from_numpy(y)

    def __len__(self):
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: four Linear -> GroupNorm -> ReLU -> Dropout
    stages (512, 256, 128, 64 units) followed by a Linear output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        # Registers fc1/gn1 ... fc4/gn4 in that order, each GroupNorm using
        # 8 groups over the width of the Linear layer it follows.
        widths = (input_dim, 512, 256, 128, 64)
        for stage, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"gn{stage}", nn.GroupNorm(8, n_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return raw class logits for a batch of feature rows."""
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=128, lr=0.01, max_grad_norm=1.0, noise_multiplier=1.1, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are shuffled and split 80/20 into train/test. The training part is
    optimised with SGD + momentum wrapped by Opacus' PrivacyEngine (RDP
    accountant); the learning rate is halved when the training loss plateaus.
    No tensor or module is moved to another device in this function.

    Args:
        X: 2-D numpy feature array, one row per sample (float32 as produced by
            load_and_preprocess_data).
        y: 1-D numpy label array aligned with X (int64 as produced by
            load_and_preprocess_data). Labels are assumed to be 0..K-1 because
            the output width is ``len(np.unique(y))``.
        epochs: Number of passes over the training loader.
        batch_size: Batch size of the loaders handed to Opacus.
        lr: Initial SGD learning rate.
        max_grad_norm: Per-sample gradient clipping norm.
        noise_multiplier: Gaussian noise multiplier for DP-SGD.
        delta: Target delta used when reporting epsilon.
        seed: Optional int. When given, the shuffle/split and torch's global RNG
            are seeded so a run can be repeated. Intended for experiments only:
            a publicly known seed for the DP noise weakens the privacy claim.
            When None the global numpy RNG is used, as before.

    Returns:
        The trained model as returned by ``PrivacyEngine.make_private`` (the
        Opacus wrapper around DeeperMLP), left in eval mode.

    Raises:
        ValueError: If X and y differ in length, or there are too few samples
            for both splits to be non-empty.
    """
    N = len(X)
    if len(y) != N:
        raise ValueError(f"X and y must have the same number of rows, got {N} and {len(y)}")
    split = int(0.8*N)
    if split == 0 or split == N:
        raise ValueError(f"Need at least 2 samples to build non-empty train and test splits, got {N}")

    # Shuffle once, then cut into train / test
    if seed is None:
        indices = np.arange(N)
        np.random.shuffle(indices)
    else:
        torch.manual_seed(seed)
        indices = np.random.default_rng(seed).permutation(N)
    X, y = X[indices], y[indices]
    X_tr, X_te = X[:split], X[split:]
    y_tr, y_te = y[:split], y[split:]
    train_ds = MedicalDataset(X_tr, y_tr)
    test_ds = MedicalDataset(X_te, y_te)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    input_dim = X.shape[1]
    num_classes = len(np.unique(y))
    model = DeeperMLP(input_dim, num_classes)
    # SGD with momentum
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # `verbose` is deprecated in recent PyTorch releases, so LR reductions are
    # reported explicitly in the epoch loop instead.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    criterion = nn.CrossEntropyLoss()

    # Initialize PrivacyEngine with RDP accountant
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The private loader can (rarely) yield an empty batch; its mean
                # loss is NaN and would poison the running statistics. The
                # optimizer step above has already been taken and accounted.
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += batch_n
        if total:
            train_loss = running_loss / total
            train_acc = correct / total
            # The scheduler holds the optimizer it was built with; read the LR
            # from there so the report does not depend on the Opacus wrapper.
            lr_before = scheduler.optimizer.param_groups[0]['lr']
            scheduler.step(train_loss)
            lr_after = scheduler.optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f"Epoch {epoch+1}: reducing learning rate from {lr_before:.6g} to {lr_after:.6g}")
        else:
            # Every batch of the epoch was empty: nothing to average or schedule on
            train_loss = float('nan')
            train_acc = float('nan')
        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs}, Loss={train_loss:.4f}, Acc={train_acc:.4f}, Eps={epsilon:.2f}")

    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, dim=1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)
    # total_test > 0 is guaranteed by the split check at the top
    test_acc = correct_test / total_test
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"Final Eps={epsilon:.2f} (delta={delta}), Test Acc={test_acc:.4f}")
    return model

###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="Employee.csv", model_path="dp_sgd_employee_model.pth"):
    """Run the full pipeline: load the CSV, train with DP-SGD, save the weights.

    Args:
        csv_path: CSV file passed to load_and_preprocess_data.
        model_path: Destination file for the trained network's state_dict.
    """
    X, y = load_and_preprocess_data(csv_path)
    model = train_dp_sgd(X, y, epochs=10, batch_size=64,  # smaller than the function default (128)
                         lr=0.001,  # lower than the function default (0.01)
                         max_grad_norm=1.0,  # per-sample clipping norm
                         noise_multiplier=1.3,  # more noise than the function default (1.1)
                         delta=1e-5)

    # train_dp_sgd returns the Opacus-wrapped model, whose state_dict keys carry
    # a "_module." prefix. Save the wrapped network itself so the file loads
    # straight into a plain DeeperMLP; fall back to the model as-is if it is
    # not wrapped.
    plain_model = getattr(model, "_module", model)
    torch.save(plain_model.state_dict(), model_path)
    print("Model saved successfully!")

# Entry point: run the whole pipeline when this code is executed directly
# (the guard keeps main() from running if the code is imported instead).
if __name__ == "__main__":
    main()

Columns in the dataset: ['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender', 'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot']
Sample data:
    Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  
Processed data shape: (4653, 10)
Target shape: (4653,)
Number of classes: 2
Epoch 1/10, Loss=0.9354, Acc=0.3

In [None]:
#ACTGAN Employee Dataset

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from opacus import PrivacyEngine
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below; narrow the
# filter (category=/module=) if those should stay visible.
warnings.filterwarnings('ignore')

###############################################################################
# 1) LOAD AND PREPROCESS DATA
###############################################################################
def load_and_preprocess_data(csv_path, target_col='LeaveOrNot', categorical_cols=None, numerical_cols=None):
    """Load a tabular CSV and turn it into model-ready numpy arrays.

    Numerical columns are standardised with StandardScaler and categorical
    columns are one-hot encoded (first level dropped). Columns that are neither
    listed as numerical nor categorical are discarded by the ColumnTransformer.

    NOTE(review): the scaler/encoder are fitted on the *whole* file, i.e. before
    train_dp_sgd performs its train/test split, so test rows influence the
    fitted statistics.

    Args:
        csv_path: Path of the CSV file to read.
        target_col: Name of the label column. Defaults to 'LeaveOrNot'.
        categorical_cols: Columns to one-hot encode. Defaults to the Employee
            schema ['Education', 'City', 'Gender', 'EverBenched'].
        numerical_cols: Columns to standardise. Defaults to the Employee schema
            ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain'].

    Returns:
        Tuple (X, y): X is a float32 array of shape (n_rows, n_features) and y
        is an int64 array of shape (n_rows,).

    Raises:
        KeyError: If the target or any listed feature column is absent.
    """
    # Load the data
    df = pd.read_csv(csv_path)

    # Show what was loaded so schema problems are visible in the cell output
    print("Columns in the dataset:", df.columns.tolist())
    print("Sample data:\n", df.head())

    # Fall back to the Employee dataset schema when no columns are given
    if categorical_cols is None:
        categorical_cols = ['Education', 'City', 'Gender', 'EverBenched']
    if numerical_cols is None:
        numerical_cols = ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']
    categorical_cols = list(categorical_cols)
    numerical_cols = list(numerical_cols)

    # Fail early with one clear message instead of an error from deep inside
    # pandas/sklearn when the file does not have the expected schema.
    required_cols = [target_col, *numerical_cols, *categorical_cols]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"{csv_path}: missing expected column(s): {missing_cols}")

    # Extract features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first'), categorical_cols)
        ])

    # Fit and transform the data
    X_processed = preprocessor.fit_transform(X)

    # The transformer may hand back a sparse matrix; densify it for torch
    if hasattr(X_processed, 'toarray'):
        X_processed = X_processed.toarray()
    y_np = y.values

    print(f"Processed data shape: {X_processed.shape}")
    print(f"Target shape: {y_np.shape}")
    print(f"Number of classes: {len(np.unique(y_np))}")

    return X_processed.astype(np.float32), y_np.astype(np.int64)

###############################################################################
# 2) TORCH DATASET
###############################################################################
class MedicalDataset(Dataset):
    """Map-style dataset that serves (feature row, label) pairs from numpy arrays.

    Both arrays are wrapped with ``torch.from_numpy`` and kept on the instance
    as ``X`` and ``y``; the dataset length is the length of ``X``.
    """

    def __init__(self, X, y):
        # Wrap the numpy arrays as tensors once, up front.
        self.X, self.y = torch.from_numpy(X), torch.from_numpy(y)

    def __len__(self):
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

###############################################################################
# 3) BUILD A SIMPLE MLP WITH PYTORCH
###############################################################################
class DeeperMLP(nn.Module):
    """Five-layer MLP classifier: four Linear -> GroupNorm -> ReLU -> Dropout
    stages (512, 256, 128, 64 units) followed by a Linear output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, num_classes=2):
        super().__init__()
        # Registers fc1/gn1 ... fc4/gn4 in that order, each GroupNorm using
        # 8 groups over the width of the Linear layer it follows.
        widths = (input_dim, 512, 256, 128, 64)
        for stage, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"gn{stage}", nn.GroupNorm(8, n_out))
        self.fc5 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return raw class logits for a batch of feature rows."""
        hidden_stages = (
            (self.fc1, self.gn1),
            (self.fc2, self.gn2),
            (self.fc3, self.gn3),
            (self.fc4, self.gn4),
        )
        for linear, norm in hidden_stages:
            x = self.dropout(torch.relu(norm(linear(x))))
        return self.fc5(x)

###############################################################################
# 4) TRAIN WITH DP-SGD (Opacus) AND EVALUATE
###############################################################################
def train_dp_sgd(X, y, epochs=10, batch_size=128, lr=0.01, max_grad_norm=1.0, noise_multiplier=1.1, delta=1e-5, seed=None):
    """Train a DeeperMLP with DP-SGD (Opacus) and report held-out accuracy.

    The rows are shuffled and split 80/20 into train/test. The training part is
    optimised with SGD + momentum wrapped by Opacus' PrivacyEngine (RDP
    accountant); the learning rate is halved when the training loss plateaus.
    No tensor or module is moved to another device in this function.

    Args:
        X: 2-D numpy feature array, one row per sample (float32 as produced by
            load_and_preprocess_data).
        y: 1-D numpy label array aligned with X (int64 as produced by
            load_and_preprocess_data). Labels are assumed to be 0..K-1 because
            the output width is ``len(np.unique(y))``.
        epochs: Number of passes over the training loader.
        batch_size: Batch size of the loaders handed to Opacus.
        lr: Initial SGD learning rate.
        max_grad_norm: Per-sample gradient clipping norm.
        noise_multiplier: Gaussian noise multiplier for DP-SGD.
        delta: Target delta used when reporting epsilon.
        seed: Optional int. When given, the shuffle/split and torch's global RNG
            are seeded so a run can be repeated. Intended for experiments only:
            a publicly known seed for the DP noise weakens the privacy claim.
            When None the global numpy RNG is used, as before.

    Returns:
        The trained model as returned by ``PrivacyEngine.make_private`` (the
        Opacus wrapper around DeeperMLP), left in eval mode.

    Raises:
        ValueError: If X and y differ in length, or there are too few samples
            for both splits to be non-empty.
    """
    N = len(X)
    if len(y) != N:
        raise ValueError(f"X and y must have the same number of rows, got {N} and {len(y)}")
    split = int(0.8*N)
    if split == 0 or split == N:
        raise ValueError(f"Need at least 2 samples to build non-empty train and test splits, got {N}")

    # Shuffle once, then cut into train / test
    if seed is None:
        indices = np.arange(N)
        np.random.shuffle(indices)
    else:
        torch.manual_seed(seed)
        indices = np.random.default_rng(seed).permutation(N)
    X, y = X[indices], y[indices]
    X_tr, X_te = X[:split], X[split:]
    y_tr, y_te = y[:split], y[split:]
    train_ds = MedicalDataset(X_tr, y_tr)
    test_ds = MedicalDataset(X_te, y_te)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    input_dim = X.shape[1]
    num_classes = len(np.unique(y))
    model = DeeperMLP(input_dim, num_classes)
    # SGD with momentum
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # `verbose` is deprecated in recent PyTorch releases, so LR reductions are
    # reported explicitly in the epoch loop instead.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    criterion = nn.CrossEntropyLoss()

    # Initialize PrivacyEngine with RDP accountant
    privacy_engine = PrivacyEngine(accountant="rdp")
    model, optimizer, train_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            batch_n = y_batch.size(0)
            if batch_n == 0:
                # The private loader can (rarely) yield an empty batch; its mean
                # loss is NaN and would poison the running statistics. The
                # optimizer step above has already been taken and accounted.
                continue
            running_loss += loss.item() * batch_n
            _, preds = torch.max(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += batch_n
        if total:
            train_loss = running_loss / total
            train_acc = correct / total
            # The scheduler holds the optimizer it was built with; read the LR
            # from there so the report does not depend on the Opacus wrapper.
            lr_before = scheduler.optimizer.param_groups[0]['lr']
            scheduler.step(train_loss)
            lr_after = scheduler.optimizer.param_groups[0]['lr']
            if lr_after < lr_before:
                print(f"Epoch {epoch+1}: reducing learning rate from {lr_before:.6g} to {lr_after:.6g}")
        else:
            # Every batch of the epoch was empty: nothing to average or schedule on
            train_loss = float('nan')
            train_acc = float('nan')
        epsilon = privacy_engine.get_epsilon(delta)
        print(f"Epoch {epoch+1}/{epochs}, Loss={train_loss:.4f}, Acc={train_acc:.4f}, Eps={epsilon:.2f}")

    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, dim=1)
            correct_test += (preds == y_batch).sum().item()
            total_test += y_batch.size(0)
    # total_test > 0 is guaranteed by the split check at the top
    test_acc = correct_test / total_test
    epsilon = privacy_engine.get_epsilon(delta)
    print(f"Final Eps={epsilon:.2f} (delta={delta}), Test Acc={test_acc:.4f}")
    return model

###############################################################################
# 5) MAIN
###############################################################################
def main(csv_path="tabular-actgan-employee.csv", model_path="dp_sgd_actgan_employee_model.pth"):
    """Run the full pipeline on the ACTGAN employee CSV: load, train with DP-SGD, save.

    Args:
        csv_path: CSV file passed to load_and_preprocess_data.
        model_path: Destination file for the trained network's state_dict. The
            default is specific to this dataset; the previous hard-coded name
            "dp_sgd_employee_model.pth" is also what the Employee.csv run
            writes, so this run used to overwrite that model.
    """
    X, y = load_and_preprocess_data(csv_path)
    model = train_dp_sgd(X, y, epochs=10, batch_size=64,  # smaller than the function default (128)
                         lr=0.001,  # lower than the function default (0.01)
                         max_grad_norm=1.0,  # per-sample clipping norm
                         noise_multiplier=1.3,  # more noise than the function default (1.1)
                         delta=1e-5)

    # train_dp_sgd returns the Opacus-wrapped model, whose state_dict keys carry
    # a "_module." prefix. Save the wrapped network itself so the file loads
    # straight into a plain DeeperMLP; fall back to the model as-is if it is
    # not wrapped.
    plain_model = getattr(model, "_module", model)
    torch.save(plain_model.state_dict(), model_path)
    print("Model saved successfully!")

# Entry point: run the whole pipeline when this code is executed directly
# (the guard keeps main() from running if the code is imported instead).
if __name__ == "__main__":
    main()

Columns in the dataset: ['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender', 'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot']
Sample data:
   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0   Masters         2017  New Delhi            2   38    Male          No   
1   Masters         2012  Bangalore            3   26    Male          No   
2   Masters         2015  New Delhi            2   26  Female          No   
3   Masters         2013  New Delhi            3   37  Female          No   
4   Masters         2017  New Delhi            2   36    Male          No   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          2           0  
1                          4           0  
2                          4           1  
3                          1           1  
4                          2           1  
Processed data shape: (4653, 10)
Target shape: (4653,)
Number of classes: 2
Epoch 1/10, Loss=0.7750, Acc=0.5091, E