In [1]:
import numpy as np
import random
import cv2
import os
from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import email
import string
from bs4 import BeautifulSoup
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve 
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import roc_curve
import os
import warnings
warnings.filterwarnings('ignore')
np.random.seed(49)

In [3]:
import pickle

with open('email_texts.pkl', 'rb') as f:
    email_texts = pickle.load(f)

with open('label_list.pkl', 'rb') as f:
    label_list = pickle.load(f)

print("Data loaded from pickle files.")

X_train, X_test, y_train, y_test = train_test_split(email_texts, label_list, test_size=0.1, random_state=42)

Data loaded from pickle files.


In [4]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)

In [5]:
X_train = vectorizer.transform(X_train).toarray()
y_train = np.array(y_train).reshape(len(y_train), 1)

In [6]:
X_test = vectorizer.transform(X_test).toarray()
y_test = np.array(y_test).reshape(len(y_test), 1)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

class SimpleMLP:
    @staticmethod
    def build(input_shape):
        model = Sequential([
            Dense(64, activation='relu', input_shape=(input_shape,)),
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
        return model

# Function to add Laplace noise for DP
def add_laplace_noise(data, sensitivity, epsilon):
    scale = sensitivity / epsilon
    noise = np.random.laplace(0, scale, data.shape)
    return data + noise

def federated_training(num_clients, X_train, y_train, epsilon, sensitivity):
    client_data_size = X_train.shape[0] // num_clients
    client_models = []

    for i in range(num_clients):
        start = i * client_data_size
        end = start + client_data_size
        X_client, y_client = X_train[start:end], y_train[start:end]

        X_client_noisy = add_laplace_noise(X_client, sensitivity, epsilon)
        sc = StandardScaler()
        X_client_scaled = sc.fit_transform(X_client_noisy)

        # Train model on client data
        model = SimpleMLP.build(X_client_scaled.shape[1])
        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_client_scaled, y_client, epochs=10, verbose=0)
        client_models.append(model)
    
    return client_models

# Aggregate models at server
def aggregate_models(client_models):
    
    server_model = SimpleMLP.build(client_models[0].input_shape[1])
    server_weights = [np.mean([client_model.get_weights()[layer_idx] for client_model in client_models], axis=0)
                      for layer_idx in range(len(client_models[0].get_weights()))]
    server_model.set_weights(server_weights)
    return server_model

def evaluate_utility(server_model, X_test, y_test, sensitivity, epsilon):
    noisy_X_test = add_laplace_noise(X_test, sensitivity, epsilon)
    sc = StandardScaler()
    X_test_scaled = sc.fit_transform(noisy_X_test)

    # Predict and calculate accuracy
    y_pred = (server_model.predict(X_test_scaled) > 0.5).astype(int)
    return accuracy_score(y_test, y_pred)

sensitivity = 1
num_clients = 5
epsilon_values = np.linspace(0.1, 10, 30)
accuracies = []

for epsilon in epsilon_values:
    client_models = federated_training(num_clients, X_train, y_train, epsilon, sensitivity)
    server_model = aggregate_models(client_models)
    accuracy = evaluate_utility(server_model, X_test, y_test, sensitivity, epsilon)
    accuracies.append(accuracy)

# Find optimal epsilon
optimal_index = np.argmax(accuracies)
optimal_epsilon = epsilon_values[optimal_index]

print(f"Optimal Epsilon: {optimal_epsilon}")

# Retrain and evaluate model using optimal epsilon
client_models = federated_training(num_clients, X_train, y_train, optimal_epsilon, sensitivity)
server_model = aggregate_models(client_models)
final_accuracy = evaluate_utility(server_model, X_test, y_test, sensitivity, optimal_epsilon)

print(f"Federated Model Accuracy with Differential Privacy: {final_accuracy:.2f}")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m