In [305]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re



In [306]:
data = pd.read_csv('C:\\Users\\layin\\Downloads\\archive (2)\\email_spam.csv')
data.head()

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [307]:
# Function to clean the email text by removing URLs, special characters, and numbers
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and numbers, retain only alphabets
    text = re.sub(r'\@\w+|\#|\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [308]:
# Applying the cleaning function to the 'text' column
data['cleaned_text'] = data['text'].apply(clean_text)


In [309]:
# Display the first few rows of the cleaned data
data[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Hi James,\n\nHave you claim your complimentary...","Hi James, Have you claim your complimentary gi..."
1,"\nalt_text\nCongratulations, you just earned\n...","alt_text Congratulations, you just earned You ..."
2,"Here's your GitHub launch code, @Mortyj420!\n ...","Here's your GitHub launch code, ! an octocat s..."
3,"Hello,\n \nThank you for contacting the Virtua...","Hello, Thank you for contacting the Virtual Re..."
4,"Hey Prachanda Rawal,\n\nToday's newsletter is ...","Hey Prachanda Rawal, Today's newsletter is Jam..."


In [310]:
#Encode the labels (spam: 1, not spam: 0)
le = LabelEncoder()
data['label'] = le.fit_transform(data['type'])


In [311]:
# TF-IDF Vectorization to convert text into numerical features
# Using unigrams and bigrams, and limiting to 2000 features
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 2))
X = vectorizer.fit_transform(data['cleaned_text'])  # Feature matrix
y = data['label']  # Labels


In [312]:
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [313]:
# Displaying the shapes of the datasets
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}, Test labels shape: {y_test.shape}")



Training set shape: (67, 2000), Test set shape: (17, 2000)
Training labels shape: (67,), Test labels shape: (17,)


In [314]:
# This section trains a Logistic Regression model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV



In [315]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)


In [316]:
#Initialize the Logistic Regression model with class weights
logistic_model = LogisticRegression(class_weight='balanced')

In [317]:
#Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.001, 0.005, 0.01, 0.1, 1],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced']
}


In [318]:
# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_resampled, y_train_resampled)



In [319]:
#Train the Logistic Regression model using the best parameters
best_logistic_model = grid_search.best_estimator_
best_logistic_model.fit(X_train_resampled, y_train_resampled)




In [320]:
#Predict on the test set
y_pred_logistic = best_logistic_model.predict(X_test)

In [321]:
#Adjust the decision threshold for better recall (e.g., set to 0.4)
y_pred_logistic_class = [1 if pred > 0.2 else 0 for pred in y_pred_logistic]


In [322]:
#Evaluate the Logistic Regression model (adapted for classification)
logistic_accuracy = accuracy_score(y_test, y_pred_logistic_class)
print(f"Logistic Regression Test Accuracy: {logistic_accuracy}")
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic_class))

Logistic Regression Test Accuracy: 0.8823529411764706
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           1       0.75      1.00      0.86         6

    accuracy                           0.88        17
   macro avg       0.88      0.91      0.88        17
weighted avg       0.91      0.88      0.88        17



In [323]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [324]:
# Apply SMOTE to balance the dataset by oversampling the minority class (spam)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [325]:
# Initialize the Linear Regression model
linear_model = LinearRegression()

In [326]:
# Train the Linear Regression model on the resampled training data
linear_model.fit(X_train_resampled, y_train_resampled)

In [327]:
# Predict on the test set (Linear Regression will give continuous outputs)
y_pred_linear = linear_model.predict(X_test)

In [328]:
# Convert the continuous predictions into binary classifications
# Adjusting the threshold to 0.4 to improve recall for spam emails
y_pred_linear_class = [1 if pred > 0.4 else 0 for pred in y_pred_linear]


In [329]:
# Evaluate the Linear Regression model (adapted for classification)
linear_accuracy = accuracy_score(y_test, y_pred_linear_class)
print(f"Linear Regression Test Accuracy: {linear_accuracy}")
print("Linear Regression Classification Report:")
print(classification_report(y_test, y_pred_linear_class))

Linear Regression Test Accuracy: 0.9411764705882353
Linear Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.83      0.91         6

    accuracy                           0.94        17
   macro avg       0.96      0.92      0.93        17
weighted avg       0.95      0.94      0.94        17



In [358]:
import unittest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class TestSpamEmailClassifiers(unittest.TestCase):

    # Test Logistic Regression
    def test_logistic_regression_accuracy(self):
        logistic_accuracy = accuracy_score(y_test, y_pred_logistic_class)
        self.assertGreaterEqual(logistic_accuracy, 0.7, "Logistic Regression accuracy is too low.")
    
    def test_logistic_regression_precision(self):
        # Precision threshold for Logistic Regression (spam class)
        spam_precision = precision_score(y_test, y_pred_logistic_class, pos_label=1)
        self.assertGreaterEqual(spam_precision, 0.75, "Logistic Regression precision for spam is too low.")
    
    def test_logistic_regression_recall(self):
        # Recall threshold for Logistic Regression (spam class)
        spam_recall = recall_score(y_test, y_pred_logistic_class, pos_label=1)
        self.assertGreaterEqual(spam_recall, 0.5, "Logistic Regression recall for spam is too low.")
    
    def test_logistic_regression_f1(self):
        # F1-score threshold for Logistic Regression (spam class)
        spam_f1 = f1_score(y_test, y_pred_logistic_class, pos_label=1)
        self.assertGreaterEqual(spam_f1, 0.6, "Logistic Regression F1-score for spam is too low.")

    # Test Linear Regression (adapted for classification)
    def test_linear_regression_accuracy(self):
        linear_accuracy = accuracy_score(y_test, y_pred_linear_class)
        self.assertGreaterEqual(linear_accuracy, 0.7, "Linear Regression accuracy is too low.")
    
    def test_linear_regression_precision(self):
        # Precision threshold for Linear Regression (spam class)
        spam_precision = precision_score(y_test, y_pred_linear_class, pos_label=1)
        self.assertGreaterEqual(spam_precision, 0.75, "Linear Regression precision for spam is too low.")
    
    def test_linear_regression_recall(self):
        # Recall threshold for Linear Regression (spam class)
        spam_recall = recall_score(y_test, y_pred_linear_class, pos_label=1)
        self.assertGreaterEqual(spam_recall, 0.5, "Linear Regression recall for spam is too low.")
    
    def test_linear_regression_f1(self):
        # F1-score threshold for Linear Regression (spam class)
        spam_f1 = f1_score(y_test, y_pred_linear_class, pos_label=1)
        self.assertGreaterEqual(spam_f1, 0.6, "Linear Regression F1-score for spam is too low.")

# Run the unit tests for both models
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


........
----------------------------------------------------------------------
Ran 8 tests in 0.078s

OK
