In [11]:
import pandas as pd 
import numpy as np
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
from collections import defaultdict

In [12]:
data = pd.read_csv('spam_ham_dataset.csv')


In [13]:
def split_subject_body(text):
    # Splitting by the first occurrence of "\r\n" which seems to separate the subject from the body
    parts = text.split("\r\n", 1)
    subject = parts[0].replace("Subject: ", "").strip() if len(parts) > 0 else ""
    body = parts[1].strip() if len(parts) > 1 else ""
    return subject, body

# Apply the function to the dataframe
data[['subject', 'body']] = data.apply(lambda row: pd.Series(split_subject_body(row['text'])), axis=1)

In [14]:
def count_words_clean_subject(subject):
    # Remove punctuation
    clean_subject = re.sub(r'[^\w\s]', '', subject)
    # Count the words
    word_count = len(clean_subject.split())
    return word_count

# Apply the function to the subject column to update the word count
data['subject_length'] = data['subject'].apply(count_words_clean_subject)
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,subject,body,subject_length
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol ; meter # : 988291,this is a follow up to the note i gave you on ...,4
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,"hpl nom for january 9 , 2001",( see attached file : hplnol 09 . xls )\r\n- h...,6
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat,"ho ho ho , we ' re around to that most wonderf...",2
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,"photoshop , windows , office . cheap . main tr...",abasements darer prudently fortuitous undergon...,6
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,re : indian springs,this deal is to book the teco pvr revenue . it...,3


In [15]:
def count_words_excluding_subject(text):
    subject_end_index = text.find("\n\n")
    if subject_end_index != -1:
        content = text[subject_end_index:].strip()
    else:
        content = text
        
    word_count = len(content.split())
    return word_count

In [16]:
data['body_length'] = data['text'].apply(count_words_excluding_subject)

In [17]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,subject,body,subject_length,body_length
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol ; meter # : 988291,this is a follow up to the note i gave you on ...,4,67
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,"hpl nom for january 9 , 2001",( see attached file : hplnol 09 . xls )\r\n- h...,6,23
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat,"ho ho ho , we ' re around to that most wonderf...",2,550
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,"photoshop , windows , office . cheap . main tr...",abasements darer prudently fortuitous undergon...,6,48
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,re : indian springs,this deal is to book the teco pvr revenue . it...,3,70


In [18]:
!pip install pandas nltk scikit-learn



In [19]:
def preprocess_and_count_words(texts):
    # Combining all texts into one large string(Without spaces)
    combined_text = " ".join(texts)
    # Removing punctuation
    translator = str.maketrans('', '', string.punctuation)
    text_nopunc = combined_text.translate(translator)
    text_lower = text_nopunc.lower()
    words = text_lower.split()
    filtered_words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    word_counts = Counter(filtered_words)
    return word_counts

In [20]:
spam_df = data[data['label'] == 'spam']
word_frequencies_spam = preprocess_and_count_words(spam_df['text'])
print(word_frequencies_spam.most_common(25))

[('subject', 1657), ('s', 1316), ('3', 1239), ('com', 992), ('http', 983), ('2', 858), ('company', 728), ('1', 720), ('0', 698), ('e', 631), ('www', 587), ('00', 585), ('information', 520), ('font', 515), ('5', 510), ('td', 504), ('t', 502), ('4', 494), ('statements', 476), ('email', 474), ('price', 471), ('d', 469), ('7', 457), ('new', 432), ('nbsp', 418)]


In [21]:
top_25_spam_words = [word for word, frequency in word_frequencies_spam.most_common(25)]
top_25_spam_words

['subject',
 's',
 '3',
 'com',
 'http',
 '2',
 'company',
 '1',
 '0',
 'e',
 'www',
 '00',
 'information',
 'font',
 '5',
 'td',
 't',
 '4',
 'statements',
 'email',
 'price',
 'd',
 '7',
 'new',
 'nbsp']

In [22]:
def count_spam_words_in_email(email, spam_words):
    words = email.lower().split()
    return sum(word in spam_words for word in words)

In [24]:
data['spam_word_count'] = data['text'].apply(count_spam_words_in_email, spam_words=set(top_25_spam_words))
data.head(10)

Unnamed: 0.1,Unnamed: 0,label,text,label_num,subject,body,subject_length,body_length,spam_word_count
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol ; meter # : 988291,this is a follow up to the note i gave you on ...,4,67,4
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,"hpl nom for january 9 , 2001",( see attached file : hplnol 09 . xls )\r\n- h...,6,23,0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat,"ho ho ho , we ' re around to that most wonderf...",2,550,16
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,"photoshop , windows , office . cheap . main tr...",abasements darer prudently fortuitous undergon...,6,48,0
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,re : indian springs,this deal is to book the teco pvr revenue . it...,3,70,2
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0,ehronline web address change,this message is intended for ehronline users o...,4,98,3
6,2793,ham,Subject: spring savings certificate - take 30 ...,0,spring savings certificate - take 30 % off,save 30 % when you use our customer appreciati...,6,385,11
7,4185,spam,Subject: looking for medication ? we ` re the ...,1,looking for medication ? we ` re the best sour...,it is difficult to make our material condition...,8,164,3
8,2641,ham,Subject: noms / actual flow for 2 / 26\r\nwe a...,0,noms / actual flow for 2 / 26,we agree\r\n- - - - - - - - - - - - - - - - - ...,6,170,8
9,1870,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",0,"nominations for oct . 21 - 23 , 2000",( see attached file : hplnl 021 . xls )\r\n- h...,6,25,0
