In [None]:
pip install git+https://github.com/data61/python-paillier.git

In [None]:
import numpy as np
import pandas as pd
import json
import time
from contextlib import contextmanager
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score


# Seed NumPy's global random number generator with a fixed value.
np.random.seed(99)

def load_data_from_jsonl(file_path):
    """Read a JSONL file of labelled emails.

    Every non-blank line is parsed as one JSON object. Its 'text' value is
    collected as the email body and its 'label_text' value is mapped to +1
    when it equals 'spam' and to -1 for anything else.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A tuple ``(texts, labels)`` of NumPy arrays with one entry per record.

    Raises:
        KeyError: If a record has no 'text' or 'label_text' key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    texts, labels = [], []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. extra newlines at the end of the file).
                continue
            entry = json.loads(line)
            texts.append(entry['text'])  # Adapt based on the JSON schema
            labels.append(1 if entry['label_text'] == 'spam' else -1)  # Convert labels to numerical
    return np.array(texts), np.array(labels)

def process_datasets(train_file, test_file):
    """Load the train/test JSONL files and turn the texts into feature matrices.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so both matrices share the same columns.

    Args:
        train_file: Path of the training JSONL file.
        test_file: Path of the testing JSONL file.

    Returns:
        A tuple ``(X_train, train_labels, X_test, test_labels)``.

    Raises:
        ValueError: If the training labels do not contain both classes.
    """
    print("Reading training dataset.")
    train_texts, train_labels = load_data_from_jsonl(train_file)

    print("Reading testing dataset.")
    test_texts, test_labels = load_data_from_jsonl(test_file)

    # Fail fast: check the labels before vectorizing, so empty or single-class
    # training data produces this message instead of wasted work or an
    # unrelated vectorizer error.
    if len(np.unique(train_labels)) < 2:
        raise ValueError("Training data must have both spam and non-spam labels.")

    # Text vectorization (Bag-of-Words with Tfidf, IDF weighting and normalization disabled)
    vectorizer = TfidfVectorizer(use_idf=False, norm=None, stop_words='english', min_df=0.001)

    # Fit vectorizer on training data and apply to test data
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    print(f'Feature space size: {X_train.shape[1]}')
    print(f"Proportion of spam: {np.mean(train_labels == 1):.2f}, ham: {np.mean(train_labels == -1):.2f}")

    return X_train, train_labels, X_test, test_labels

@contextmanager
def measure_time():
    """Context manager that prints how long its ``with`` body took.

    The elapsed time is measured with ``time.perf_counter`` and printed as
    'Elapsed time: <seconds> seconds' when the body finishes. The report is
    also printed when the body raises; the exception then propagates unchanged.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike, so a failing block
        # still reports how long it ran before failing.
        elapsed = time.perf_counter() - start
        print(f'Elapsed time: {elapsed:.2f} seconds')

class SystemAdmin:
    """Owns the three classifiers and exposes training and prediction helpers."""

    def __init__(self):
        """Create the (unfitted) SVM, logistic-regression and perceptron models."""
        self.svm_model = SVC(kernel='linear')
        self.lr_model = LogisticRegression()
        self.perceptron_model = Perceptron()

    def train_models(self, X, y):
        """Fit all three models on the same data, in SVM, LR, perceptron order.

        Args:
            X: Training feature matrix.
            y: Training labels.
        """
        for estimator in (self.svm_model, self.lr_model, self.perceptron_model):
            estimator.fit(X, y)

    def make_prediction(self, model, X):
        """Return ``model.predict(X)`` for the given model.

        Args:
            model: Any object exposing a ``predict`` method.
            X: Feature matrix to predict on.
        """
        predictions = model.predict(X)
        return predictions


if __name__ == '__main__':
    train_path = 'train.jsonl'
    test_path = 'test.jsonl'

    X_train, y_train, X_test, y_test = process_datasets(train_path, test_path)

    # Admin operations
    admin = SystemAdmin()

    # Single name/estimator table shared by the training and evaluation loops.
    models = [
        ("SVM", admin.svm_model),
        ("Logistic Regression", admin.lr_model),
        ("Perceptron", admin.perceptron_model),
    ]

    # Train models and record training time
    training_times = {}
    print("Training models.")
    for model_name, model in models:
        start_time = time.perf_counter()
        # Fit only the model being timed. admin.train_models() fits all three
        # estimators, so calling it here would train every model three times
        # and record the combined time under each name.
        model.fit(X_train, y_train)
        training_times[model_name] = time.perf_counter() - start_time

    # Evaluate models and store results
    results = []
    for model_name, model in models:
        # Unencrypted evaluation
        start_time = time.perf_counter()
        preds = admin.make_prediction(model, X_test)
        test_time = time.perf_counter() - start_time

        error_rate = np.mean(preds != y_test)
        f1 = f1_score(y_test, preds)
        results.append((model_name, "Unencrypted", error_rate, training_times[model_name], test_time, f1))

    # Display results in a DataFrame
    df_results = pd.DataFrame(results, columns=["Model", "Type", "Test Error", "Training Time", "Test Time", "F1 Score"])
    print("\nResults Comparison:")
    print(df_results)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## SVM for unencrypted dataset

In [None]:
# # Using train dataset for now
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Training
# svm_model = SVC(kernel='linear', random_state=42)
# svm_model.fit(X_train, y_train)


# y_pred = svm_model.predict(X_test)
# print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# print(classification_report(y_test, y_pred))

## Logistic Regression for unencrypted dataset

In [None]:
# # Train Logistic Regression Model
# log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
# log_reg_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = log_reg_model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy * 100:.2f}%")

# # Detailed classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

## Paillier Encryption with PCA

In [None]:
import numpy as np
import pandas as pd
import json
import time
from contextlib import contextmanager
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(99)

def load_data_from_jsonl(file_path):
    """Read a JSONL file of labelled emails.

    Every non-blank line is parsed as one JSON object. Its 'text' value is
    collected as the email body and its 'label_text' value is mapped to +1
    when it equals 'spam' and to -1 for anything else.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A tuple ``(texts, labels)`` of NumPy arrays with one entry per record.

    Raises:
        KeyError: If a record has no 'text' or 'label_text' key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    texts, labels = [], []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. extra newlines at the end of the file).
                continue
            entry = json.loads(line)
            texts.append(entry['text'])  # Adapt based on the JSON schema
            labels.append(1 if entry['label_text'] == 'spam' else -1)  # Convert labels to numerical
    return np.array(texts), np.array(labels)

def process_datasets(train_file, test_file, n_components=100):
    """Load the train/test JSONL files and produce PCA-reduced feature matrices.

    The vectorizer, the scaler and the PCA projection are all fitted on the
    training data only and then applied to the test data.

    Args:
        train_file: Path of the training JSONL file.
        test_file: Path of the testing JSONL file.
        n_components: Value passed to ``PCA(n_components=...)``.

    Returns:
        A tuple ``(X_train_reduced, train_labels, X_test_reduced, test_labels)``.

    Raises:
        ValueError: If the training labels do not contain both classes.
    """
    print("Reading training dataset.")
    train_texts, train_labels = load_data_from_jsonl(train_file)

    print("Reading testing dataset.")
    test_texts, test_labels = load_data_from_jsonl(test_file)

    # Fail fast: check the labels before vectorizing, densifying, scaling and
    # running PCA, so empty or single-class training data produces this
    # message instead of wasted work or an unrelated downstream error.
    if len(np.unique(train_labels)) < 2:
        raise ValueError("Training data must have both spam and non-spam labels.")

    # Text vectorization (Bag-of-Words with Tfidf, IDF weighting and normalization disabled)
    vectorizer = TfidfVectorizer(use_idf=False, norm=None, stop_words='english', min_df=0.001)

    # Fit vectorizer on training data and apply to test data
    X_train = vectorizer.fit_transform(train_texts).toarray()  # Convert to dense array for PCA
    X_test = vectorizer.transform(test_texts).toarray()

    print(f'Original feature space size: {X_train.shape[1]}')

    # Scale the data before applying PCA
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=n_components)
    X_train_reduced = pca.fit_transform(X_train_scaled)
    X_test_reduced = pca.transform(X_test_scaled)

    print(f'Reduced feature space size: {X_train_reduced.shape[1]}')
    print(f"Proportion of spam: {np.mean(train_labels == 1):.2f}, ham: {np.mean(train_labels == -1):.2f}")

    return X_train_reduced, train_labels, X_test_reduced, test_labels

@contextmanager
def measure_time():
    """Context manager that prints how long its ``with`` body took.

    The elapsed time is measured with ``time.perf_counter`` and printed as
    'Elapsed time: <seconds> seconds' when the body finishes. The report is
    also printed when the body raises; the exception then propagates unchanged.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions alike, so a failing block
        # still reports how long it ran before failing.
        elapsed = time.perf_counter() - start
        print(f'Elapsed time: {elapsed:.2f} seconds')

class SystemAdmin:
    """Responsible for model training, Paillier key generation, and
    encryption/decryption of model parameters and scores."""

    def __init__(self):
        """Create the (unfitted) SVM, logistic-regression and perceptron models."""
        self.svm_model = SVC(kernel='linear')
        self.lr_model = LogisticRegression(max_iter=2000)  # Increased max_iter to 2000 for better convergence
        self.perceptron_model = Perceptron()

    def generate_paillier_keys(self, key_length):
        """Generate a Paillier key pair and store it on ``pub_key`` / ``priv_key``.

        Args:
            key_length: Passed as ``n_length`` to ``generate_paillier_keypair``.
        """
        # `paillier` is not imported anywhere else in this notebook, so without
        # this import the call below raises NameError on a fresh kernel. The
        # module comes from the python-paillier package (import name `phe`).
        from phe import paillier

        self.pub_key, self.priv_key = paillier.generate_paillier_keypair(n_length=key_length)

    def train_models(self, X, y):
        """Fit the SVM, logistic-regression and perceptron models on (X, y)."""
        self.svm_model.fit(X, y)
        self.lr_model.fit(X, y)
        self.perceptron_model.fit(X, y)

    def make_prediction(self, model, X):
        """Return ``model.predict(X)`` for the given model."""
        return model.predict(X)

    def encrypt_model_params(self, model):
        """Encrypt a fitted linear model's weights and intercept with the public key.

        Only the first row of ``model.coef_`` and the first element of
        ``model.intercept_`` are used.

        Args:
            model: Fitted estimator exposing ``coef_`` and ``intercept_``.

        Returns:
            A tuple ``(encrypted_weights, encrypted_intercept)`` where
            ``encrypted_weights`` is a list with one ciphertext per coefficient.
        """
        # coef_ may be a sparse matrix (has .toarray()) or a dense 2-D array.
        coefficients = model.coef_.toarray()[0, :] if hasattr(model.coef_, "toarray") else model.coef_[0, :]
        encrypted_weights = [self.pub_key.encrypt(float(w)) for w in coefficients]
        encrypted_intercept = self.pub_key.encrypt(float(model.intercept_[0]))
        return encrypted_weights, encrypted_intercept

    def decrypt_values(self, encrypted_values):
        """Decrypt each value with the private key and return them as a list."""
        return [self.priv_key.decrypt(val) for val in encrypted_values]

class ModelUser:
    """Party that scores plaintext feature vectors against an encrypted linear model."""

    def __init__(self, pub_key):
        """Remember the public key handed over by the model owner."""
        self.pub_key = pub_key

    def initialize_model(self, enc_weights, enc_intercept):
        """Store the encrypted weight list and the encrypted intercept."""
        self.intercept = enc_intercept
        self.weights = enc_weights

    def calculate_encrypted_score(self, feature_vec):
        """Return intercept + sum(feature_vec[i] * weights[i]) over all weights.

        Every weight position is visited (the vector is treated as dense), in
        index order, and each product is accumulated onto the running score.
        """
        score = self.intercept
        for position, weight in enumerate(self.weights):
            score += feature_vec[position] * weight
        return score

    def evaluate_model(self, X):
        """Score each row of X and return the scores as a list, in row order."""
        scores = []
        row_count = X.shape[0]
        for row_idx in range(row_count):
            scores.append(self.calculate_encrypted_score(X[row_idx, :]))
        return scores

if __name__ == '__main__':
    train_path = 'train.jsonl'
    test_path = 'test.jsonl'

    # Process datasets and apply PCA for dimensionality reduction
    X_train, y_train, X_test, y_test = process_datasets(train_path, test_path, n_components=100)

    # Admin operations
    admin = SystemAdmin()
    admin.generate_paillier_keys(key_length=1024)

    # Single name/estimator table shared by the training and evaluation loops.
    models = [
        ("SVM", admin.svm_model),
        ("Logistic Regression", admin.lr_model),
        ("Perceptron", admin.perceptron_model),
    ]

    # Train models and record training time
    training_times = {}
    print("Training models.")
    for model_name, model in models:
        start_time = time.perf_counter()
        # Fit only the model being timed. admin.train_models() fits all three
        # estimators, so calling it here would train every model three times
        # and record the combined time under each name.
        model.fit(X_train, y_train)
        training_times[model_name] = time.perf_counter() - start_time

    # Evaluate models (encrypted) and store results
    results = []
    for model_name, model in models:
        # Encrypt model parameters
        enc_weights, enc_intercept = admin.encrypt_model_params(model)

        # User-side encrypted evaluation
        user = ModelUser(admin.pub_key)
        user.initialize_model(enc_weights, enc_intercept)

        start_time = time.perf_counter()
        enc_scores = user.evaluate_model(X_test)
        encrypted_test_time = time.perf_counter() - start_time

        decrypted_scores = np.asarray(admin.decrypt_values(enc_scores), dtype=float)
        # Threshold at zero instead of using np.sign: np.sign maps a score of
        # exactly 0 to 0, which is not one of the labels (-1 / +1) and makes
        # f1_score see a third class. Non-positive scores are treated as -1.
        enc_preds = np.where(decrypted_scores > 0, 1, -1)
        enc_error_rate = np.mean(enc_preds != y_test)
        enc_f1 = f1_score(y_test, enc_preds)
        results.append((model_name, "Encrypted", enc_error_rate, training_times[model_name], encrypted_test_time, enc_f1))

    # Display results in a DataFrame
    df_results = pd.DataFrame(results, columns=["Model", "Type", "Test Error", "Training Time", "Test Time", "F1 Score"])
    print("\nResults Comparison:")
    print(df_results)
