In [None]:
#import packages
import pandas as pd
import numpy as np

In [None]:
import nltk
from nltk.corpus import words #looking through each word in the email and see if it's an actual word in english then we're gonna add it to a dictionary

In [None]:
vocabulary = {} #key:word, value:index (filled in by build_vocab)
data = pd.read_csv('/content/emails.csv') # NOTE(review): absolute path, presumably a Colab upload location -- adjust when running elsewhere
nltk.download('words') #downloading the words resource so that words.words() can be read
set_words = set(words.words()) #set built from the NLTK word list; used for membership tests when building the vocabulary

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Number of distinct entries in the NLTK word set.
len(set_words)

235892

Data Exploration

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(5728, 2)

In [None]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
# Count of missing values in each column.
data.isnull().sum()

text    0
spam    0
dtype: int64

In [None]:
# Full text (column 0) of the email in row 2.
data.iloc[2,0]

'Subject: unbelievable new homes made easy  im wanting to show you this  homeowner  you have been pre - approved for a $ 454 , 169 home loan at a 3 . 72 fixed rate .  this offer is being extended to you unconditionally and your credit is in no way a factor .  to take advantage of this limited time opportunity  all we ask is that you visit our website and complete  the 1 minute post approval form  look foward to hearing from you ,  dorcas pittman'

In [None]:
# Distinct label values present in the `spam` column.
unique_values = data['spam'].unique()
unique_values

array([1, 0])

In [None]:
# Full text (column 0) of the email in row 2500.
data.iloc[2500,0]

'Subject: carnegie mellon recruiting  i received the following email this afternoon .  - kevin k .  - - - - - - - - - - - - - - - - - - - - - - forwarded by kevin kindall / corp / enron on 11 / 16 / 2000  05 : 22 pm - - - - - - - - - - - - - - - - - - - - - - - - - - -  sallygould on 11 / 16 / 2000 03 : 38 : 44 pm  to : kevin . kindall @ enron . com  cc :  subject : carnegie mellon recruiting  kevin ,  jean eisel asked that i connect with you about recruiting comp . finance  students .  please contact me with questions you might have about the recruiting  process or if you have some dates in mind for coming to campus .  i look forward to hearing from you .  regards ,  sally gould  recruiting coordinator  gsia - carnegie mellon university  412 - 268 - 1311  412 - 268 - 4146 ( fx )'

Building the vocabulary from the words seen in emails.csv<br>


In [None]:
def build_vocab(curr_email, vocab=None, known_words=None): #take words and store each at a specific index
  """Add every new, recognised word of one email to the vocabulary.

  Words are compared AND stored in lowercase, so "Hello" and "hello" share a
  single entry. Each new word receives the next free index, which keeps the
  indices contiguous (0 .. len(vocab) - 1).

  Args:
    curr_email: iterable of word tokens (strings) from one email.
    vocab: dict mapping word -> index, updated in place. Defaults to the
      module-level `vocabulary`.
    known_words: container of accepted (lowercase) words. Defaults to the
      module-level `set_words`.

  Returns:
    None; `vocab` is modified in place.
  """
  if vocab is None:
    vocab = vocabulary
  if known_words is None:
    known_words = set_words

  for word in curr_email:
    key = word.lower()
    if key not in vocab and key in known_words:
      # Use the current size as the index: it can never skip a value,
      # even if the same word is seen again in a different case.
      vocab[key] = len(vocab)

In [None]:
%%capture
# Feed every email (column 0 of `data`) through build_vocab, one at a time.
if __name__ == '__main__':
  for i, raw_text in enumerate(data.iloc[:, 0]):
    # Whitespace-split the raw text into word tokens.
    curr_email = raw_text.split()
    # Progress line, emitted before this email's words are added.
    print(f"Current email is {i}/{data.shape[0]} and the length of vocabulary is {len(vocabulary)}")

    build_vocab(curr_email)

In [None]:
# Number of words collected in the vocabulary.
len(vocabulary)

12011

Go through each email and map it into the X, y dataset: each email becomes a word-frequency array (one count per vocabulary word).<br>
We have 12,011 words in our vocabulary, so each training example has 12,011 features.

In [None]:
# X[i, j] = number of times vocabulary word j occurs in email i.
X = np.zeros((data.shape[0], len(vocabulary)))
# Labels are copied straight from column 1 (`spam`) for EVERY row, so an email
# that contains no vocabulary word still keeps its true label. Kept as float
# to match the dtype of the zero-initialised array used previously.
y = data.iloc[:, 1].to_numpy(dtype=float)

for i in range(data.shape[0]):
  email = data.iloc[i,0].split()

  for email_word in email:
    # Look the word up by its lowercase form -- the same form that is used
    # for the membership test -- so mixed-case tokens cannot raise KeyError.
    column = vocabulary.get(email_word.lower())
    if column is not None:
      X[i, column] += 1

In [None]:
# X only ever holds whole-number word counts (zeros incremented by 1), so it is
# written as integers: the default '%.18e' format spends roughly 25 characters
# on every cell of this large matrix. np.loadtxt returns floats by default, so
# code reading X.txt back sees the same values as before.
np.savetxt('X.txt', X, fmt='%d', delimiter=',')
# Labels are 0/1 and are written one per line as integers.
np.savetxt('y.txt', y, fmt='%d')

Naive Bayes

In [None]:
class NaiveBayes():
  """Gaussian Naive Bayes classifier for dense feature matrices.

  Every feature is modelled, per class, as an independent normal distribution
  described by its mean and variance. Class labels are expected to be the
  integers 0 .. num_classes - 1, because `fit` selects the rows of class `c`
  with `y == c` for `c in range(num_classes)`.
  """

  def __init__(self,X,y): #X -> all the training samples and features, y -> target class
    """Record the training-set dimensions; no statistics are computed here.

    Args:
      X: 2-D array of shape (num_examples, num_features).
      y: 1-D array of class labels, one per row of X.
    """
    self.num_examples,self.num_features = X.shape
    self.num_classes = len(np.unique(y))
    self.eps = 1e-6 # added to every variance so a zero-variance feature causes neither log(0) nor division by zero

  def fit(self,X,y):# training phase of naive bayes
    """Estimate the per-class feature means, variances and class priors.

    Args:
      X: 2-D training matrix (rows are examples).
      y: 1-D array of labels aligned with the rows of X.
    """
    self.classes_mean = {}
    self.classes_variance = {}
    self.classes_prior = {}

    for c in range(self.num_classes):
      X_c = X[y==c] # pick out the emails of one class (spam / not spam)

      self.classes_mean[c] = np.mean(X_c,axis=0) # per-feature mean for this class
      self.classes_variance[c] = np.var(X_c,axis=0) # per-feature variance for this class
      self.classes_prior[c] = X_c.shape[0]/self.num_examples # fraction of training rows in this class

  def predict(self,X): #prediction
    """Return the most probable class index for every row of X.

    Args:
      X: 2-D array of shape (n, num_features); n may differ from the number
        of training examples.

    Returns:
      1-D integer array of length n holding the arg-max class per row.
    """
    # One row of scores per INPUT example (not per training example), so that
    # inputs of any length -- including a single email -- are handled.
    probs = np.zeros((X.shape[0],self.num_classes))

    for c in range(self.num_classes):
      prior = self.classes_prior[c]
      likelihood = self.density_function(X,self.classes_mean[c],self.classes_variance[c])
      probs[:,c] = likelihood + np.log(prior) # log(likelihood * prior); working in log space prevents underflow

    return np.argmax(probs,1)

  def density_function(self,x, mean, variance): #gaussian naive bayes -> prob is normally distributed
    """Log-density of each row of `x` under independent per-feature normals.

    Args:
      x: 2-D array of shape (n, num_features).
      mean: 1-D array of per-feature means.
      variance: 1-D array of per-feature variances (`self.eps` is added).

    Returns:
      1-D array of length n with the summed log-density of every row.
    """
    const = -((self.num_features/2)*np.log(2*np.pi)) - 0.5*(np.sum(np.log(variance + self.eps)))
    probs = 0.5* np.sum(np.power(x - mean,2)/(variance + self.eps),1)
    return const - probs # a product of densities becomes a sum once logs are taken

  def is_spam(self, email, vocab=None):
    """Classify one raw email string.

    The text is split on whitespace and each token is lowercased before the
    vocabulary lookup, matching the lowercase test used when the training
    features were built.

    Args:
      email: raw email text.
      vocab: dict mapping word -> feature column. Defaults to the
        module-level `vocabulary`.

    Returns:
      The predicted class index for the email (1 is treated as spam by the
      caller).
    """
    if vocab is None:
      vocab = vocabulary

    # Width comes from the fitted model so the row always matches `predict`.
    email_features = np.zeros((1, self.num_features))
    for word in email.split():
      column = vocab.get(word.lower())
      if column is not None:
        email_features[0, column] += 1

    prediction = self.predict(email_features)
    return prediction[0]


# Reload the saved dataset from disk, then fit and predict on the same rows.
if __name__ == '__main__':

  # X.txt is comma-delimited; y.txt holds one label per line.
  X = np.loadtxt('X.txt',delimiter=',')
  y = np.loadtxt('y.txt')

  # The constructor only records dimensions; `fit` computes the statistics.
  NB = NaiveBayes(X,y)
  NB.fit(X,y)
  y_pred = NB.predict(X)

In [None]:
# Check the dimensions of the feature matrix and the label vector.
print(X.shape)
print(y.shape)

(5728, 12011)
(5728,)


In [None]:
# Fit on the whole dataset and score on those same rows, so the number printed
# below is training accuracy rather than accuracy on held-out data.
NB = NaiveBayes(X,y)
NB.fit(X,y)
y_pred = NB.predict(X)

# Fraction of rows whose predicted label equals the true label.
print(np.mean(y_pred == y)) #accuracy

0.9149790502793296


In [None]:
# Predicted class index for every row of X.
y_pred

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# Read one email from the user and report the model's verdict (1 means spam).
test_email = input("Enter the email:")
is_spam = NB.is_spam(test_email)
print("The email is classified as spam." if is_spam == 1 else "The email is not classified as spam.")

Enter the email:Subject: Urgent Opportunity - Act Now for Huge Profits!  Hello Sarah Miller,  We hope this message finds you well. We have a time-sensitive investment opportunity that is too good to miss!  Exciting Opportunity: Our groundbreaking investment scheme guarantees massive returns in just a few days!  Join our exclusive group of successful investors who have already benefited from this incredible offer.  Act now and secure your financial future!  Best regards, John Reynolds Investment Mastery Group
The email is classified as spam.


In [None]:
'''Spam:
Subject: Urgent Opportunity - Act Now for Huge Profits!

Hello Sarah Miller,

We hope this message finds you well. We have a time-sensitive investment opportunity that is too good to miss!

Exciting Opportunity: Our groundbreaking investment scheme guarantees massive returns in just a few days!

Join our exclusive group of successful investors who have already benefited from this incredible offer.

Act now and secure your financial future!

Best regards,
John Reynolds
Investment Mastery Group
'''

In [None]:
'''Not Spam:
Subject: Follow-Up on Our Recent Meeting

Dear Alex Thompson,

I hope this email finds you well. I wanted to follow up on our recent meeting and express my appreciation for the insightful discussion we had.

During our conversation, we touched upon the upcoming project deadline. I found your perspectives on streamlining the workflow particularly interesting and believe there is potential for collaboration on process improvement initiatives.

I would love to schedule another meeting at your convenience to delve deeper into these opportunities. Please let me know a time that works best for you.

Thank you once again for your time and valuable insights. I look forward to continuing our conversation.

Best regards,

Emily Rodriguez
Project Manager
ABC Corporation
emily.rodriguez@email.com
(555) 123-4567'''