In [5]:
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [9]:
# Load the dataset
df = pd.read_csv('subset.csv')  # Adjust file name as needed

# Display the structure
print(df.head())


                                        file  \
0             kaminski-v/all_documents/2500.   
1                 hayslett-r/sent_items/375.   
2                       germany-c/appala/86.   
3  kaminski-v/c/technote/mail/techmemos/257.   
4                        farmer-d/tufco/716.   

                                             message  
0  Message-ID: <21731374.1075856243409.JavaMail.e...  
1  Message-ID: <15611337.1075862281661.JavaMail.e...  
2  Message-ID: <30542256.1075853724942.JavaMail.e...  
3  Message-ID: <19614710.1075857036430.JavaMail.e...  
4  Message-ID: <13594729.1075854345152.JavaMail.e...  


In [11]:
def clean_email_content(content):
    # Remove email metadata
    metadata_pattern = r"^(Message-ID:|From:|To:|Subject:|Mime-Version:|Content-Type:|X-.*:).*\n?"
    content = re.sub(metadata_pattern, '', content, flags=re.MULTILINE)
    
    # Remove non-alphanumeric characters
    content = re.sub(r'\W+', ' ', content)
    
    # Convert to lowercase
    content = content.lower()
    
    return content

# Apply cleaning function
df['cleaned_message'] = df['message'].apply(clean_email_content)

# Display cleaned data
print(df[['file', 'cleaned_message']].head())


                                        file  \
0             kaminski-v/all_documents/2500.   
1                 hayslett-r/sent_items/375.   
2                       germany-c/appala/86.   
3  kaminski-v/c/technote/mail/techmemos/257.   
4                        farmer-d/tufco/716.   

                                     cleaned_message  
0  date thu 21 dec 2000 05 36 00 0800 pst content...  
1  date mon 19 nov 2001 12 56 12 0800 pst content...  
2  date fri 10 mar 2000 04 28 00 0800 pst cc carr...  
3  date mon 10 jan 2000 07 35 00 0800 pst cc vinc...  
4  date fri 16 mar 2001 03 45 00 0800 pst daren j...  


In [13]:
# Assume spam emails are stored in directories with "spam" in their path
df['label'] = df['file'].apply(lambda x: 1 if 'spam' in x else 0)  # 1 for spam, 0 for ham


In [15]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Transform the messages into TF-IDF features
X = vectorizer.fit_transform(df['cleaned_message']).toarray()

# Labels
y = df['label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


In [19]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [21]:
import joblib

# Save the model and TF-IDF vectorizer
joblib.dump(model, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Load for inference
loaded_model = joblib.load('naive_bayes_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [23]:
def classify_email(email_content):
    cleaned_content = clean_email_content(email_content)
    transformed_content = loaded_vectorizer.transform([cleaned_content]).toarray()
    prediction = loaded_model.predict(transformed_content)
    return "Spam" if prediction[0] == 1 else "Ham"

# Example email
new_email = "Congratulations! You've won a $1,000 gift card. Claim now!"
print(classify_email(new_email))


Ham
