In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('enron_spam_data.csv')

In [3]:
ham_emails = data[data['Spam/Ham'] == 'ham']
spam_emails = data[data['Spam/Ham'] == 'spam']

In [4]:
ham_to_delete = ham_emails.sample(n=2000, random_state=42)
spam_to_delete = spam_emails.sample(n=2000, random_state=42)

In [5]:
data_cleaned = data.drop(ham_to_delete.index).drop(spam_to_delete.index)

In [6]:
data_cleaned.to_csv('cleaned_emails.csv',index = False)

In [8]:
df = pd.read_csv('cleaned_emails.csv')

In [9]:
df = df.drop(columns =['Date'])

In [10]:
df.head

<bound method NDFrame.head of        Message ID                                            Subject  \
0               0                       christmas tree farm pictures   
1               1                           vastar resources , inc .   
2               2                       calpine daily gas nomination   
3               4                          meter 7268 nov allocation   
4               5                           mcmullen gas for 11 / 99   
...           ...                                                ...   
29711       33708                                           proposal   
29712       33709                         cure premature ejaculation   
29713       33710                                   need your vics ?   
29714       33711  = ? iso - 8859 - 1 ? q ? good _ news _ c = eda...   
29715       33713              the next generation online pharmacy .   

                                                 Message Spam/Ham  
0                                    

In [11]:
df['Subject'] = df['Subject'].fillna('')
df['Message'] = df['Message'].fillna('')
df['Spam/Ham']  = df['Spam/Ham'].fillna('')

In [12]:
df.head
df['content'] = df['Subject'] + ' ' + df['Message']

In [13]:
X = df['content']
y = df['Spam/Ham']

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [18]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf,y_train)

In [19]:
y_pred = nb_model.predict(X_test_tfidf)

In [22]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print("Confusion Matrix: \n",conf_matrix)

Accuracy: 0.9899046550757151
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      4456
        spam       0.99      0.99      0.99      4459

    accuracy                           0.99      8915
   macro avg       0.99      0.99      0.99      8915
weighted avg       0.99      0.99      0.99      8915

Confusion Matrix: 
 [[4410   46]
 [  44 4415]]


<font color = 'Blue'><b>Naive Bayes</b> Multinomial</font>

In [21]:
def classify_emails(subject,message):
    combined_txt = subject + '' + message
    input_tfidf = vectorizer.transform([combined_txt])
    prediction = nb_model.predict(input_tfidf)[0]
    return 'spam' if prediction == 'spam' else 'ham'

subject = "Congratulations! You've won a prize"
message = "Click here to claim your $1,000,000 prize now!"
result = classify_emails(subject, message)
print(f"This email is: {result}")

This email is: spam
