In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec

# Load the dataset
def read_jsonl(file_path):
    """Load a JSON Lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; being explicit keeps the result independent of the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing newline at end of file) carry
            # no record and would make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Path of the JSON Lines file holding the corpus, relative to the notebook's
# working directory.
file_path = 'test.jsonl'
# Parse it into the DataFrame `df` that the rest of this cell works on.
df = read_jsonl(file_path)

# Text preprocessing
# Fetch the NLTK stopword corpus, then keep the English list as a set so the
# per-word membership test in preprocess_text is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text, stopword_set=None):
    """Normalise one document for embedding.

    Steps, in order: lowercase, delete digit runs, collapse every run of
    non-word characters into a single space, then drop stopwords. Note that
    the regex class ``\\W`` does not match ``_``, so underscores are kept.

    Args:
        text: Raw document. Anything that is not a ``str`` (e.g. ``None`` or
            a float NaN coming from a record without a text field) is treated
            as an empty document instead of raising.
        stopword_set: Optional collection of words to remove. Defaults to the
            module-level ``stop_words`` set.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing remains).
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    text = re.sub(r'\d+', '', text)   # digits are deleted, not replaced by a space
    text = re.sub(r'\W+', ' ', text)  # punctuation/whitespace runs -> one space
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Normalise every document once; the cleaned column feeds both the Word2Vec
# training corpus and the per-document embeddings.
df['cleaned_text'] = df['text'].map(preprocess_text)

# Word2Vec expects a list of token lists, one per document.
sentences = df['cleaned_text'].str.split().tolist()

# min_count=1 keeps every token in the vocabulary, including words seen once.
# NOTE(review): per the gensim docs, training with more than one worker thread
# is not bit-for-bit reproducible between runs — confirm whether that matters.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Aggregate word embeddings for each email (average of word embeddings)
def get_sentence_embedding(sentence, model):
    """Embed a whitespace-tokenised sentence as the mean of its word vectors.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by word)
            and ``vector_size``.

    Returns:
        1-D array: the element-wise mean of the vectors of all tokens found
        in ``model.wv``, or a zero vector of length ``model.vector_size``
        when no token is found.
    """
    keyed_vectors = model.wv
    known_tokens = [token for token in sentence.split() if token in keyed_vectors]

    # Nothing to average: fall back to an all-zero embedding.
    if not known_tokens:
        return np.zeros(model.vector_size)

    return np.mean([keyed_vectors[token] for token in known_tokens], axis=0)

# One averaged Word2Vec vector per cleaned document; the trained model is
# forwarded to the helper as its second positional argument.
df['embedding'] = df['cleaned_text'].apply(get_sentence_embedding, args=(word2vec_model,))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## SVM for unencrypted dataset

In [None]:
# Build the feature matrix and target vector. Without these two assignments
# the cell only runs if `X` and `y` happen to be left over in the kernel.
LABEL_COLUMN = 'label'  # NOTE(review): label column name is not shown in this notebook — confirm against the data
if LABEL_COLUMN not in df.columns:
    raise KeyError(
        f"Expected a '{LABEL_COLUMN}' column in df; available columns: {list(df.columns)}"
    )
X = np.vstack(df['embedding'].tolist())  # shape: (n_documents, embedding_size)
y = df[LABEL_COLUMN]

# Hold out 20% of the loaded file for evaluation (fixed seed so the split is
# repeatable). NOTE(review): the original comment said "train dataset", but
# the file loaded at the top of the notebook is test.jsonl — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = svm_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

## Logistic Regression (LR) for unencrypted dataset

In [None]:
# Fit a logistic-regression baseline on the split created in the SVM cell
# (this cell reuses X_train / X_test / y_train / y_test from there).
log_reg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = log_reg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy as a percentage, then the per-class precision/recall table
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

## Paillier Encryption

In [None]:
##
!pip install phe

In [None]:
import json
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from phe import paillier

# Load the JSONL dataset. JSON Lines holds one JSON object per line, so the
# file is parsed line by line; json.load() on the whole file fails with
# "Extra data" as soon as there is a second record.
TEXT_KEY = 'text'
LABEL_KEY = 'label'  # NOTE(review): label field name is not shown in this notebook — confirm against test.jsonl

records = []
with open('test.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # tolerate blank / trailing lines
            records.append(json.loads(line))

data = [record[TEXT_KEY] for record in records]  # raw document strings
y = [record[LABEL_KEY] for record in records]    # targets, aligned with `data`

# Preprocess text data (tokenize, remove stop words, stem/lemmatize)
# ... (implement preprocessing steps as needed)

# Create word embeddings
sentences = [text.split() for text in data]
model = Word2Vec(sentences, min_count=1, vector_size=100)  # Adjust vector size as needed

# Generate Paillier key pair (phe exposes generate_paillier_keypair; there is
# no `generate_keys`). 1024 bits keeps the demo fast; use a larger key for
# anything beyond experimentation.
public_key, private_key = paillier.generate_paillier_keypair(n_length=1024)  # Adjust key size as needed


def _text_to_vector(text):
    """Return the mean Word2Vec vector of the known words in `text`.

    Falls back to a zero vector of length `model.vector_size` when no word of
    `text` is in the vocabulary, so every document yields a usable row.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


# Convert text to numerical vectors and encrypt
def text_to_vector_and_encrypt(text):
    """Embed `text` and encrypt the embedding under `public_key`.

    Returns:
        list of phe EncryptedNumber, one per embedding component. phe encrypts
        scalars only, and expects Python int/float, so each component is
        converted with float() and encrypted on its own.
    """
    return [public_key.encrypt(float(component)) for component in _text_to_vector(text)]


def _encrypted_linear_score(encrypted_vector, weights, bias):
    """Evaluate w.x + b on an encrypted vector without decrypting it.

    Paillier ciphertexts can be added together and multiplied by plaintext
    scalars, which is all a linear decision function needs. `weights` and
    `bias` must be plain Python floats; the result is an EncryptedNumber.
    """
    score = encrypted_vector[0] * weights[0]
    for encrypted_component, weight in zip(encrypted_vector[1:], weights[1:]):
        score = score + encrypted_component * weight
    return score + bias


# Split the raw texts so only the held-out documents have to be encrypted.
texts_train, texts_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42
)

# scikit-learn cannot fit on EncryptedNumber objects, so the model is trained
# on plaintext embeddings; encryption protects the data it is applied to.
X_train = np.vstack([_text_to_vector(text) for text in texts_train])

# Encrypted test features: X_test holds ciphertext only.
# NOTE: this performs vector_size encryptions per document and can be slow.
encrypted_X = [text_to_vector_and_encrypt(text) for text in texts_test]
X_test = encrypted_X

# Train SVM model. A linear kernel is required: its decision function is
# w.x + b, which can be evaluated homomorphically (an RBF kernel cannot).
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

if len(svm_model.classes_) != 2:
    raise ValueError(
        "Encrypted scoring below supports binary labels only; "
        f"found classes {list(svm_model.classes_)}"
    )

weights = [float(w) for w in svm_model.coef_[0]]
bias = float(svm_model.intercept_[0])

# Predict on encrypted test data: score under encryption, then let the
# private-key holder decrypt only the scalar decision score.
encrypted_scores = [_encrypted_linear_score(vector, weights, bias) for vector in X_test]
decrypted_scores = [private_key.decrypt(score) for score in encrypted_scores]
negative_label, positive_label = svm_model.classes_
y_pred = [positive_label if score > 0 else negative_label for score in decrypted_scores]

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)