In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
path = "Resources/spam_email_dataset.csv"
spam_email_df = pd.read_csv(path)

# Review the DataFrame
spam_email_df.head()

Unnamed: 0,Email,Subject,Sender,Recipient,Date,Time,Attachments,Link Count,Word Count,Uppercase Count,Exclamation Count,Question Count,Dollar Count,Punctuation Count,HTML Tags Count,Spam Indicator
0,mikerusso@example.net,Even hotel community church.,emilyscott@example.org,gregorysmith@example.org,13-02-2023,04:13,3,0,191,32,3,3,0,11,1,1
1,waynebailey@example.org,Try themselves guess fight white agreement thu...,annwhite@example.net,gonzalezdaniel@example.net,09-08-2023,06:15,3,9,45,1,2,1,2,8,2,1
2,jill43@example.com,Environmental commercial off seem any conference.,david88@example.net,michellebaker@example.net,16-05-2023,01:32,3,9,52,1,4,0,2,10,1,1
3,johnsonkaren@example.org,Smile real TV father commercial day increase.,lindaalvarez@example.com,schroedertodd@example.com,25-04-2023,14:50,2,3,75,6,0,2,0,8,2,0
4,markwilson@example.org,Fast stage he oil institution.,vstafford@example.com,emilywilliams@example.com,11-07-2023,21:44,3,5,299,279,0,2,0,6,1,1


In [3]:
# Remove non-float columns
spam_email_df = spam_email_df.drop(columns=['Email', 'Subject', 'Sender', 'Recipient', 'Date', 'Time'])

In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = spam_email_df['Spam Indicator']

# Separate the X variable, the features
X = spam_email_df.drop(columns=['Spam Indicator'])

In [5]:
# Review the y variable Series
y.head(10)

0    1
1    1
2    1
3    0
4    1
5    1
6    0
7    0
8    0
9    0
Name: Spam Indicator, dtype: int64

In [6]:
# Review the X variable DataFrame
X.head(10)

Unnamed: 0,Attachments,Link Count,Word Count,Uppercase Count,Exclamation Count,Question Count,Dollar Count,Punctuation Count,HTML Tags Count
0,3,0,191,32,3,3,0,11,1
1,3,9,45,1,2,1,2,8,2
2,3,9,52,1,4,0,2,10,1
3,2,3,75,6,0,2,0,8,2
4,3,5,299,279,0,2,0,6,1
5,1,9,268,256,4,2,2,7,0
6,3,6,117,2,1,3,2,1,0
7,1,1,163,61,0,3,1,12,4
8,2,5,79,47,4,2,1,5,2
9,0,4,130,92,2,3,1,0,4


In [7]:
# Check the balance of our target values
y.value_counts()

0    3018
1    2982
Name: Spam Indicator, dtype: int64

In [8]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [9]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=500, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [10]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1
5,1,0
6,1,1
7,1,1
8,0,0
9,1,0


In [11]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.49640700891047573

In [12]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[411, 343],
       [412, 334]], dtype=int64)

In [13]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.50      0.55      0.52       754
           1       0.49      0.45      0.47       746

    accuracy                           0.50      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       0.50      0.50      0.50      1500



In [14]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)

In [15]:
# Count the distinct values of the resampled labels data
y_over.nunique()

2

In [16]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)

results.head(10)

Unnamed: 0,Predictions,Actual
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1
5,1,0
6,1,1
7,1,1
8,0,0
9,1,0


In [17]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.4932762531912019

In [18]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[380, 374],
       [386, 360]], dtype=int64)

In [19]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.50      0.50      0.50       754
           1       0.49      0.48      0.49       746

    accuracy                           0.49      1500
   macro avg       0.49      0.49      0.49      1500
weighted avg       0.49      0.49      0.49      1500

