In [None]:
pip install git+https://github.com/data61/python-paillier.git

In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec

# Load the dataset
def read_jsonl(file_path):
    """Read a JSON Lines file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, built from the parsed records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (typically a trailing newline at the end of the file)
        # are skipped instead of being handed to json.loads, which rejects them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Dataset location: a JSON Lines file with one record per line.
file_path = 'test.jsonl'
df = read_jsonl(file_path)

# Text preprocessing
# Fetch the NLTK stopword corpus, then keep the English list as a set so the
# per-word membership checks in preprocess_text are fast.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise one document for embedding.

    Steps, in order: lowercase the text, delete runs of digits, collapse runs
    of non-word characters into a single space, split on whitespace and drop
    every token found in the module-level ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    normalised = re.sub(r'\W+', ' ', without_digits)
    kept_tokens = filter(lambda token: token not in stop_words, normalised.split())
    return ' '.join(kept_tokens)

# Store the normalised form of every document next to the raw text.
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Using Word2Vec for word embedding
# Word2Vec is trained on token lists, one list per document.
sentences = list(map(str.split, df['cleaned_text']))
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=1,      # keep every token, even those seen only once
    workers=4,
)

# Aggregate word embeddings for each email (average of word embeddings)
def get_sentence_embedding(sentence, model):
    """Embed a whitespace-tokenised sentence as the mean of its word vectors.

    Tokens missing from ``model.wv`` are ignored. When no token is known
    (including the empty sentence) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = [model.wv[token] for token in sentence.split() if token in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged Word2Vec vector per document; the trained model is passed to
# the helper as its second positional argument.
df['embedding'] = df['cleaned_text'].apply(get_sentence_embedding, args=(word2vec_model,))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## SVM for unencrypted dataset

In [None]:
# Build the feature matrix and label vector from the embedded DataFrame so this
# cell runs on a fresh kernel instead of relying on X / y left behind by
# another cell (the Paillier section below rebinds both names).
X = np.vstack(df['embedding'].to_numpy())
# NOTE(review): label column taken from the 'label_text' field that
# load_data_from_jsonl reads from the same file -- confirm it is the intended target.
y = df['label_text'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = svm_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Logistic Regression for unencrypted data

In [None]:
# Fit a logistic-regression classifier on the training split produced by the SVM cell.
log_reg_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report overall accuracy as a percentage.
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Paillier Encryption

In [None]:
##
!pip install phe

In [None]:
import json
import time
import os.path
from contextlib import contextmanager

import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

import phe as paillier

# Seed NumPy's global RNG so the np.random.permutation shuffle performed in
# preprocess_data yields the same train/test split on every run.
np.random.seed(42)

def load_data_from_jsonl(file_path):
    """Load e-mail texts and binary labels from a JSONL file.

    Every non-blank line must be a JSON object with a 'text' field (the
    e-mail body) and a 'label_text' field.

    Parameters
    ----------
    file_path : str or path-like
        Path to the JSONL file.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The texts, and the labels encoded as 1 when 'label_text' equals
        'spam' and -1 for any other value.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks 'text' or 'label_text'.
    """
    emails = []
    labels = []
    # JSON text is UTF-8; do not depend on the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            entry = json.loads(line)
            emails.append(entry['text'])  # Adjust based on your JSON field
            labels.append(1 if entry['label_text'] == 'spam' else -1)  # Adjust label logic as necessary
    return np.array(emails), np.array(labels)

def preprocess_data(file_path, n_train=500):
    """Load a labelled e-mail JSONL file and build a bag-of-words train/test split.

    The texts are vectorised with a CountVectorizer (English stop words
    removed, terms present in fewer than 0.1% of documents dropped), the rows
    are shuffled with NumPy's global RNG, and the last ``n_train`` shuffled
    rows become the training set while the remaining rows form the test set.

    Progress and label statistics are printed to stdout.

    Parameters
    ----------
    file_path : str or path-like
        JSONL file accepted by ``load_data_from_jsonl``.
    n_train : int, optional
        Number of samples used for training (default 500, the value that was
        previously hard-coded).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the feature matrices are the
        sparse matrices produced by CountVectorizer, the labels are arrays of
        1 (spam) / -1 (ham).

    Raises
    ------
    ValueError
        If ``n_train`` does not leave at least one sample on each side of the
        split, or if the training set contains only one class.
    """

    print("Importing dataset from disk...")
    # Load emails from your JSONL file
    emails, labels = load_data_from_jsonl(file_path)

    # Without this check a dataset with n_train rows or fewer would silently
    # produce an empty test set and only fail later, inside the classifier.
    n_samples = len(emails)
    if not 0 < n_train < n_samples:
        raise ValueError(
            "n_train must be between 1 and %d (number of samples - 1), got %d"
            % (n_samples - 1, n_train))

    # Words count, keep only frequent words
    count_vect = CountVectorizer(decode_error='replace', stop_words='english',
                                  min_df=0.001)
    X = count_vect.fit_transform(emails)

    print('Vocabulary size: %d' % X.shape[1])

    # Shuffle (uses the global NumPy RNG, so seed it beforehand for repeatability)
    perm = np.random.permutation(X.shape[0])
    X, y = X[perm, :], labels[perm]

    # Split train and test: the last n_train shuffled rows train the model
    X_train, X_test = X[-n_train:, :], X[:-n_train, :]
    y_train, y_test = y[-n_train:], y[:-n_train]

    # Check unique labels in training set
    unique_labels = np.unique(y_train)
    print("Unique labels in training set:", unique_labels)

    # Ensure both classes are present
    if len(unique_labels) < 2:
        raise ValueError("Training set must contain both spam and ham examples.")

    print("Labels in trainset are {:.2f} spam : {:.2f} ham".format(
        np.mean(y_train == 1), np.mean(y_train == -1)))

    return X_train, y_train, X_test, y_test


@contextmanager
def timer():
    """Context manager that prints the wall-clock duration of its body.

    Yields nothing; the elapsed time is printed once the body has finished
    without raising.
    """
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    print('[elapsed time: %.2f s]' % elapsed)


class Alice:
    """Represents the model owner who trains the SVM model.

    Alice holds the Paillier key pair: she encrypts the trained linear model
    with her public key and is the only party able to decrypt scores.
    """
    def __init__(self):
        self.model = SVC(kernel='linear')  # SVM with linear kernel

    def generate_paillier_keypair(self, n_length):
        """Create a Paillier key pair of ``n_length`` bits and store it on ``self``."""
        self.pubkey, self.privkey = paillier.generate_paillier_keypair(n_length=n_length)

    def fit(self, X, y):
        """Train the linear SVM on features ``X`` and labels ``y``."""
        self.model = self.model.fit(X, y)

    def predict(self, X):
        """Return the plaintext model's predictions for ``X``."""
        return self.model.predict(X)

    def encrypt_weights(self):
        """Encrypt the model's coefficients and intercept with the public key.

        Returns
        -------
        tuple
            ``(encrypted_weights, encrypted_intercept)`` where
            ``encrypted_weights`` is a list with one ciphertext per feature.
        """
        coef = self.model.coef_
        # coef_ is a sparse matrix when the model was fitted on sparse input
        # (as with this notebook's bag-of-words features). Also accept a plain
        # ndarray so a model fitted on dense features does not fail with
        # AttributeError on .toarray().
        if hasattr(coef, 'toarray'):
            coef = coef.toarray()
        coef = np.asarray(coef)[0, :]
        # Encrypt the coefficients (cast to built-in float before encrypting)
        encrypted_weights = [self.pubkey.encrypt(float(weight)) for weight in coef]
        # Encrypt the intercept
        encrypted_intercept = self.pubkey.encrypt(float(self.model.intercept_[0]))
        return encrypted_weights, encrypted_intercept

    def decrypt_scores(self, encrypted_scores):
        """Decrypt each encrypted score with the private key and return them as a list."""
        return [self.privkey.decrypt(s) for s in encrypted_scores]



class Bob:
    """Data owner who scores his own plaintext rows against the encrypted model."""

    def __init__(self, pubkey):
        # Public key handed over by the model owner.
        self.pubkey = pubkey

    def set_weights(self, weights, intercept):
        """Store the per-feature weights and the intercept received from the model owner."""
        self.weights = weights
        self.intercept = intercept

    def encrypted_score(self, x):
        """Return intercept + sum(x[0, j] * weights[j]) over the non-zero columns j of row ``x``.

        ``x`` must be a single-row 2-D matrix whose ``nonzero()`` yields
        (row indices, column indices).
        """
        total = self.intercept
        _rows, active_columns = x.nonzero()
        # Only non-zero features contribute, so zero columns are skipped entirely.
        for column in active_columns:
            total += x[0, column] * self.weights[column]
        return total

    def encrypted_evaluate(self, X):
        """Score every row of ``X`` and return the scores as a list, in row order."""
        scores = []
        for row_index in range(X.shape[0]):
            scores.append(self.encrypted_score(X[row_index, :]))
        return scores


# End-to-end demo: Alice trains and encrypts a linear spam classifier, Bob
# scores his plaintext e-mails against the encrypted model, and Alice decrypts
# the resulting scores. Each stage is wrapped in timer() to print its duration.
if __name__ == '__main__':
    file_path = 'test.jsonl'  # Specify your JSONL file path here
    # NOTE(review): at notebook top level this rebinds X, y, X_test and y_test,
    # overwriting the embedding-based variables used by the earlier SVM / LR
    # cells; re-running those cells afterwards would pick up these values.
    X, y, X_test, y_test = preprocess_data(file_path)

    print("Alice: Generating paillier keypair")
    alice = Alice()
    alice.generate_paillier_keypair(n_length=1024)

    print("Alice: Learning spam classifier")
    # timer() yields nothing, so `t` is always None and is never used.
    with timer() as t:
        alice.fit(X, y)

    # Baseline: error rate of the plaintext model on the test rows.
    print("Classify with model in the clear -- what Alice would get having Bob's data locally")
    with timer() as t:
        error = np.mean(alice.predict(X_test) != y_test)
    print("Error {:.3f}".format(error))

    print("Alice: Encrypting classifier")
    with timer() as t:
        encrypted_weights, encrypted_intercept = alice.encrypt_weights()

    # Bob receives only the public key and the encrypted model parameters.
    print("Bob: Scoring with encrypted classifier")
    bob = Bob(alice.pubkey)
    bob.set_weights(encrypted_weights, encrypted_intercept)
    with timer() as t:
        encrypted_scores = bob.encrypted_evaluate(X_test)

    print("Alice: Decrypting Bob's scores")
    with timer() as t:
        scores = alice.decrypt_scores(encrypted_scores)
    # The sign of each decrypted score is compared with the +1 / -1 labels.
    error = np.mean(np.sign(scores) != y_test)
    print("Error {:.3f} -- this is not known to Alice, who does not possess the ground truth labels".format(error))
