In [1]:
import pandas as pd
import os

In [2]:
ham_path = r"C:\Sculpture\NLP_PRojects\Email_Spam_Detection\archive\enron6\ham"
spam_path = r"C:\Sculpture\NLP_PRojects\Email_Spam_Detection\archive\enron6\spam"

In [3]:
ham_emails = [os.path.join(ham_path, f) for f in os.listdir(ham_path)]
spam_emails = [os.path.join(spam_path, f) for f in os.listdir(spam_path)]

In [4]:
dataset = {
    "text" : [],
    "label" : []
}

In [5]:
for email in ham_emails:
    with open(email, "r", encoding="utf-8", errors="ignore") as f:
        dataset["text"].append(f.read())
        dataset['label'].append("ham")

for email in spam_emails:
    with open(email, "r", encoding="utf-8", errors="ignore") as f:
        dataset['text'].append(f.read())
        dataset['label'].append("spam")

In [6]:
dataframe = pd.DataFrame(dataset)
dataframe.head()

Unnamed: 0,text,label
0,Subject: key dates and impact of upcoming sap ...,ham
1,Subject: transportation to resort\nplease be i...,ham
2,Subject: human resources organization\nas enro...,ham
3,"Subject: what do you want to know today ?\n"" a...",ham
4,"Subject: tw weekly , 6 - 9 - 00\nplease see th...",ham


In [7]:
dataframe.shape

(6000, 2)

In [8]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6000 non-null   object
 1   label   6000 non-null   object
dtypes: object(2)
memory usage: 93.9+ KB


In [9]:
dataframe.nunique()

text     5989
label       2
dtype: int64

In [10]:
print(dataframe.loc[dataframe['label'] == "ham"].shape)
dataframe.loc[dataframe['label'] == "spam"].shape

(1500, 2)


(4500, 2)

In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [12]:
def preprocess(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop])

In [13]:
dataframe['Processed_Text'] = dataframe['text'].apply(preprocess)

In [14]:
dataframe['Processed_Text'][0]

'subject key date impact upcoming sap implementation \n week project apollo conduct final sap \n implementation \x01 implementation impact approximately 12 000 new \n user plus exist system user sap bring new dynamic enron \n enhance timely flow sharing specific project human resource \n procurement financial information business unit \n continent \n final implementation retire multiple disparate system replace \n common integrated system encompass process include \n payroll timekeepe benefit project management numerous financial \n process \n employee empower update view personal information \n intranet base ehronline single end sap s self service \n functionality enron s global information system gis \n thing individual able update personal information include \n w 4 address personal banking information manage individual \n time new time entry tool view benefit election view \n personal payroll information line \n enron employee pay corporate payroll houston exclude \n azurix employe

In [15]:
from sklearn.model_selection import train_test_split

x = dataframe['Processed_Text']
y = dataframe['label']

In [16]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=42
)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

x_train_vex = vectorizer.fit_transform(x_train)
x_test_vex = vectorizer.transform(x_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_vex, y_train)

In [32]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test_vex)
y_pred


array(['spam', 'spam', 'ham', ..., 'spam', 'spam', 'spam'], dtype='<U4')

In [21]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         ham       0.73      0.99      0.84       243
        spam       1.00      0.91      0.95       957

    accuracy                           0.92      1200
   macro avg       0.86      0.95      0.90      1200
weighted avg       0.94      0.92      0.93      1200



In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.99      0.73      0.84       330
        spam       0.91      1.00      0.95       870

    accuracy                           0.92      1200
   macro avg       0.95      0.86      0.90      1200
weighted avg       0.93      0.92      0.92      1200



In [28]:
def predict_email(text):
    process = preprocess(text)
    vectorize = vectorizer.transform([process])
    prediction = model.predict(vectorize)
    return prediction[0]

In [31]:
new  = dataframe['text'][0]
print(predict_email(new))

ham


In [30]:
dataframe['text'][0]

"Subject: key dates and impact of upcoming sap implementation\nover the next few weeks , project apollo and beyond will conduct its final sap\nimplementation \x01 ) this implementation will impact approximately 12 , 000 new\nusers plus all existing system users . sap brings a new dynamic to enron ,\nenhancing the timely flow and sharing of specific project , human resources ,\nprocurement , and financial information across business units and across\ncontinents .\nthis final implementation will retire multiple , disparate systems and replace\nthem with a common , integrated system encompassing many processes including\npayroll , timekeeping , benefits , project management , and numerous financial\nprocesses .\nemployees will be empowered to update and / or view their personal information\nvia the intranet - based ehronline - - a single front - end to sap ' s self service\nfunctionality and enron ' s global information system ( gis ) . among other\nthings , individuals will be able to up