In [52]:
# Dataset added to directory for ease of access during project.

# Import libraries

# Use pandas for data cleaning
import pandas as pd
import re

In [53]:
# Read CSV file
df = pd.read_csv("emails.csv")

# Display the first 5 rows to verify the code worked.
print(df.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [54]:
# Check size of dataset
print("This is what the dataset looks like to start.")
rows, columns = df.shape
print(f"The number of rows: {rows}")
print(f"The number of columns: {columns}")


This is what the dataset looks like to start.
The number of rows: 5728
The number of columns: 2


In [55]:
# Remove null values

df.dropna(inplace=True)

In [56]:
# Compare size of dataset after removing null values

# Check size of dataset
print("This is what the dataset looks like to after removing null values.")
rows, columns = df.shape
print(f"The number of rows: {rows}")
print(f"The number of columns: {columns}")

This is what the dataset looks like to after removing null values.
The number of rows: 5728
The number of columns: 2


In [57]:
# There were no missing values

In [58]:
# Get an idea of the text contents -- view first row
with pd.option_context('display.max_colwidth', None):
    cell_content = df.iloc[2, 0]
    print(cell_content)

Subject: unbelievable new homes made easy  im wanting to show you this  homeowner  you have been pre - approved for a $ 454 , 169 home loan at a 3 . 72 fixed rate .  this offer is being extended to you unconditionally and your credit is in no way a factor .  to take advantage of this limited time opportunity  all we ask is that you visit our website and complete  the 1 minute post approval form  look foward to hearing from you ,  dorcas pittman


In [59]:
# Check if same number of spam/non-spam e-mails
counts = df.iloc[:, 1].value_counts()
print("Counts of 1s and 0s in Column 2 (Index 1):\n")
print(counts)

# To get counts:
if 1 in counts:
    count_of_ones = counts[1]
    print(f"\nNumber of 1's: {count_of_ones}")
else:
    print("\nNumber of 1's: 0")

if 0 in counts:
    count_of_zeros = counts[0]
    print(f"Number of 0's: {count_of_zeros}")
else:
    print("Number of 0's: 0")

Counts of 1s and 0s in Column 2 (Index 1):

spam
0    4360
1    1368
Name: count, dtype: int64

Number of 1's: 1368
Number of 0's: 4360


In [60]:
# Most of the data is non-spam.
# Will need to adjust weights for heavier penalty for mis-classifying the minority class.
# If we simply get rid of extras, that's A LOT of wasted data!
# XGBoost or LightGBM handles class imbalance better.

In [61]:
# Feature Extraction
# Number of exclamation marks -- high sense of urgency

output_file = 'emails_with_features.csv'
email_column_index = 0  # the first column

try:
    first_column_name = df.columns[email_column_index]
    df['Exclamation_Count'] = df[first_column_name].astype(str).str.count('!')

    # Display the first few rows to show the result
    print("\n--- First 5 Rows of the Updated Data ---")
    print(df.head())

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_file, index=False, header=False)
    print(f"\nSuccessfully saved the result to '{output_file}'.")

except FileNotFoundError:
    print(f"Error: The file '{df}' was not found. Please ensure it is in the same directory as your Python script.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- First 5 Rows of the Updated Data ---
                                                text  spam  Exclamation_Count
0  Subject: naturally irresistible your corporate...     1                  0
1  Subject: the stock trading gunslinger  fanny i...     1                  0
2  Subject: unbelievable new homes made easy  im ...     1                  0
3  Subject: 4 color printing special  request add...     1                  2
4  Subject: do not have money , get software cds ...     1                  1

Successfully saved the result to 'emails_with_features.csv'.


In [62]:
print(df.columns)

Index(['text', 'spam', 'Exclamation_Count'], dtype='object')


In [63]:
# Certain words flagged for urgency

# First FLAGGED_WORDS commented out b/c incorrectly flagged legitimate e-mails
# FLAGGED_WORDS = ["hurry", "limited time", "last chance", "owe", "failure", "final", "notification"]

FLAGGED_WORDS = ["immediate", "failure", "final", "check out", "offer"]

def count_flagged_words(text):
    total_count = 0

    if pd.isna(text):
        return 0

    text_lower = str(text).lower()

    for word in FLAGGED_WORDS:
        total_count += text_lower.count(word)

    return total_count

try:
    column_name = df.columns[0] # Name of 1st column
    df[column_name] = df[column_name].astype(str)
    df['flagged_words'] = df[column_name].apply(count_flagged_words)

    print("\n--- First 5 Rows of the Updated Data ---")
    print(df.head())

except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- First 5 Rows of the Updated Data ---
                                                text  spam  Exclamation_Count  \
0  Subject: naturally irresistible your corporate...     1                  0   
1  Subject: the stock trading gunslinger  fanny i...     1                  0   
2  Subject: unbelievable new homes made easy  im ...     1                  0   
3  Subject: 4 color printing special  request add...     1                  2   
4  Subject: do not have money , get software cds ...     1                  1   

   flagged_words  
0              0  
1              0  
2              1  
3              0  
4              0  


In [64]:
# Count totals in whole dataset

total_exclamations = df['Exclamation_Count'].sum()
total_flagged_words = df['flagged_words'].sum()

print(f"Total number of exclamation marks across the whole dataset: {int(total_exclamations)}")
print(f"Total number of flagged words across the whole dataset: {int(total_flagged_words)}")

Total number of exclamation marks across the whole dataset: 5175
Total number of flagged words across the whole dataset: 2221


In [65]:
# Compare features with spam and non-spam groups
spam_groups = df.groupby('spam')

# 2. Calculate the mean of the feature columns for each group
comparison_result = spam_groups[['Exclamation_Count', 'flagged_words']].mean()

# Rename the index values for better readability
comparison_result.rename(index={0: 'Not Spam (0)', 1: 'Spam (1)'}, inplace=True)

print("--- Average Feature Counts by Spam Status ---")
print(comparison_result)

--- Average Feature Counts by Spam Status ---
              Exclamation_Count  flagged_words
spam                                          
Not Spam (0)           0.427752       0.321789
Spam (1)               2.419591       0.597953


In [66]:
# Initial count of flagged_words indicates they are appearing more frequently in the non-spam emails. We need to change the flagged-words we are selecting.

In [67]:
# Second change to FLAGGED_WORDS made very little difference.
# Get top 10 words from the spam folder. These will become contents for FLAGGED_WORDS

# Filter for spam emails
spam_df = df[df['spam'] == 1]

# Extract the 'text' column and perform cleaning/counting
top_10_spam_words = (
    spam_df['text']
    .str.lower()
    # Remove all punctuation and special characters
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
)

print(top_10_spam_words)

_       13556
the      8975
to       8165
and      6517
of       5629
you      4920
a        4695
in       3879
your     3730
for      3186
Name: count, dtype: int64


In [68]:
# Compare to entire dataset

# Extract the 'text' column and perform cleaning/counting
top_10_words_all = (
    df['text']
    .str.lower()
    # Remove all punctuation and special characters
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split(expand=True)
    .stack()
    .value_counts()
    .head(10)
)

print(top_10_words_all)

the    50110
to     41736
and    27478
of     23757
a      19945
you    19154
in     17975
i      17421
_      17234
for    16696
Name: count, dtype: int64


In [69]:
output_file = 'emails_with_features.csv'
email_column_index = 0  # the first column

try:
    first_column_name = df.columns[email_column_index]
    df['Question_Count'] = df[first_column_name].astype(str).str.count(r'\?')

    # Display the first few rows to show the result
    print("\n--- First 5 Rows of the Updated Data ---")
    print(df.head())

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_file, index=False, header=False)
    print(f"\nSuccessfully saved the result to '{output_file}'.")

except FileNotFoundError:
    print(f"Error: The file '{df}' was not found. Please ensure it is in the same directory as your Python script.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


--- First 5 Rows of the Updated Data ---
                                                text  spam  Exclamation_Count  \
0  Subject: naturally irresistible your corporate...     1                  0   
1  Subject: the stock trading gunslinger  fanny i...     1                  0   
2  Subject: unbelievable new homes made easy  im ...     1                  0   
3  Subject: 4 color printing special  request add...     1                  2   
4  Subject: do not have money , get software cds ...     1                  1   

   flagged_words  Question_Count  
0              0               0  
1              0               0  
2              1               0  
3              0               0  
4              0               1  

Successfully saved the result to 'emails_with_features.csv'.


In [70]:
# Compare spam and non-spam e-mails

import pandas as pd

df = pd.read_csv('emails_with_features.csv', header=None)

# Assign the new column names
df.columns = ['text', 'spam', 'Exclamation_Marks', 'flagged_words', 'Question_Count']

print(df.head())

# Create Boolean Feature Columns
df['has_question'] = df['text'].astype(str).str.contains(r'\?')

df['has_exclamation'] = df['text'].astype(str).str.contains(r'!')

# Group and Calculate Percentages
comparison_result = (
    df.groupby('spam')[['has_question', 'has_exclamation']]
    .mean() * 100
)

# Format and Display the Results
# Rename the index and columns for a clear final output
comparison_result.rename(
    index={0: 'Non-Spam', 1: 'Spam'},
    columns={
        'has_question': 'Percent with ?',
        'has_exclamation': 'Percent with !'
    },
    inplace=True
)

print(comparison_result)


                                                text  spam  Exclamation_Marks  \
0  Subject: naturally irresistible your corporate...     1                  0   
1  Subject: the stock trading gunslinger  fanny i...     1                  0   
2  Subject: unbelievable new homes made easy  im ...     1                  0   
3  Subject: 4 color printing special  request add...     1                  2   
4  Subject: do not have money , get software cds ...     1                  1   

   flagged_words  Question_Count  
0              0               0  
1              0               0  
2              1               0  
3              0               0  
4              0               1  
          Percent with ?  Percent with !
spam                                    
Non-Spam       37.362385       22.339450
Spam           39.985380       59.649123
