 Implement e-mail spam filtering using text classification algorithm with appropriate dataset

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the dataset (replace 'emails.csv' with your actual file path)
data = pd.read_csv('emails.csv')
data.head()

# Split the data into features (text) and target (spam)
X = data['text']
y = data['spam']

# Convert text to numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2)

# Create an SVM classifier
svm_classifier = SVC(kernel='linear')

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
class_report = classification_report(y_test, y_pred, target_names=['Not Spam', 'Spam'])
print("Classification Report:\n", class_report)

# Define a function to classify email subjects
def classify_email(subject):
    cleaned_subject = re.sub(r'^Subject:\s*', '', subject)  # Remove "Subject:" prefix
    vectorized_subject = vectorizer.transform([cleaned_subject])
    prediction = svm_classifier.predict(vectorized_subject)
    if prediction[0] == 1:
        return "Spam"
    else:
        return "Not Spam"

# Ask the user to enter an email subject
user_input = input("Enter an email subject: ")
classification_result = classify_email(user_input)
print("Classification:", classification_result)


Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

    Not Spam       0.99      1.00      1.00       886
        Spam       1.00      0.98      0.99       260

    accuracy                           1.00      1146
   macro avg       1.00      0.99      0.99      1146
weighted avg       1.00      1.00      1.00      1146



Enter an email subject:  hello this is recruter from cisco


Classification: Not Spam


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import random
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [7]:

# Convert text data into numerical features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])  # 'text' column contains the email content
y = df['spam']  # 'spam' column contains labels (1 for spam, 0 for ham)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Print one example from y_pred
random_index = random.randint(0, len(y_pred) - 1)  # Get a random index
print(f"Example Prediction:\nText: {X_test[random_index].toarray()}\nPredicted Spam: {y_pred[random_index]}")

Accuracy: 0.9912739965095986
Example Prediction:
Text: [[0 0 0 ... 0 0 0]]
Predicted Spam: 1
