In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [None]:
# Load the raw e-mail dataset and give its columns short, fixed names.
# NOTE(review): the path points into a Colab Google Drive folder; presumably
# Drive must already be mounted — no mount call is visible in this notebook.
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/email.csv"
COLUMN_NAMES = ['label', 'message']

df = pd.read_csv(DATA_PATH)
df.columns = COLUMN_NAMES  # raises ValueError unless the CSV has exactly two columns


In [None]:
# Clean the raw frame: encode labels as integers and lowercase the messages.
LABEL_CODES = {'ham': 0, 'spam': 1}

# Strip whitespace and lowercase before mapping; anything that is not
# 'ham'/'spam' (including missing labels) maps to NaN and is dropped below.
normalized_labels = df['label'].astype(str).str.strip().str.lower()

# Drop rows with an unknown label OR a missing message. The message check must
# happen before astype(str), which would otherwise turn NaN into the literal
# text "nan" and feed it to the model as a real message.
cleaned = (
    df.assign(label=normalized_labels.map(LABEL_CODES))
      .dropna(subset=['label', 'message'])
)

# Build the result in a temporary and validate it before rebinding `df`, so an
# accidental second run of this cell (labels are already 0/1 and no longer
# match 'ham'/'spam') fails loudly instead of silently emptying `df`.
if cleaned.empty:
    raise ValueError(
        "No rows with a 'ham'/'spam' label remain; if this cell was already "
        "run, reload the data before running it again."
    )

df = cleaned.assign(
    label=cleaned['label'].astype(int),                   # 0/1 ints, not 0.0/1.0 floats
    message=cleaned['message'].astype(str).str.lower(),   # lowercase all messages
)


In [None]:
# Features and labels.
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 3000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)

# Keep the matrix sparse: train_test_split and MultinomialNB both accept
# scipy sparse input, and a dense (n_messages x 3000) float array wastes
# memory on what is almost entirely zeros.
X = tfidf.fit_transform(df['message'])
y = df['label']

# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so vocabulary and IDF weights see the test messages.
# Reported test metrics may be slightly optimistic; consider fitting on the
# training split only.


In [None]:
# Hold out 20% of the data for evaluation with a fixed seed for reproducibility.
# The classes are imbalanced (far fewer spam than ham messages), so stratify on
# the label to keep the same ham/spam ratio in both the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Train a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train, y_train)
model  # show the fitted estimator as the cell output


In [None]:
# Score the held-out split and print each metric under its heading.
y_pred = model.predict(X_test)

evaluation = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
]
for heading, value in evaluation:
    print(heading, value)


Accuracy: 0.9847533632286996

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       966
         1.0       1.00      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.99      0.98      0.98      1115


Confusion Matrix:
 [[966   0]
 [ 17 132]]


In [None]:
# Function to predict new messages
def predict_message(message, vectorizer=None, classifier=None):
    """Classify a single message as 'SPAM' or 'HAM'.

    Args:
        message: The message text (a ``str``).
        vectorizer: Object with a ``transform(list_of_str)`` method that turns
            text into model features. Defaults to the notebook-level ``tfidf``.
        classifier: Object with a ``predict(features)`` method returning one
            label per row. Defaults to the notebook-level ``model``.

    Returns:
        'SPAM' if the predicted label equals 1, otherwise 'HAM'.
    """
    # Fall back to the notebook globals so existing single-argument calls keep
    # working; passing explicit objects lets the function be reused with
    # artifacts reloaded from disk.
    if vectorizer is None:
        vectorizer = tfidf
    if classifier is None:
        classifier = model

    # Lowercase to mirror the preprocessing applied to the training messages.
    features = vectorizer.transform([message.lower()])

    # A single-element batch goes in, so take the only prediction back out.
    # The features are passed as returned (no densifying step is needed).
    prediction = classifier.predict(features)[0]
    return 'SPAM' if prediction == 1 else 'HAM'

# Hand-written sample messages used as a quick sanity check of the classifier.
examples = [
    "Congratulations! You've won a free iPhone. Click here to claim now!",
    "Hey, are we still meeting for dinner tonight?",
    "This is not spam. It's a business proposal from your friend.",
    "URGENT: Your account has been compromised. Log in immediately to secure it.",
    "Reminder: Your dentist appointment is tomorrow at 3 PM.",
]

# Print every sample together with the label the model assigns to it.
for text in examples:
    print(f"Message: {text}\n→ Prediction: {predict_message(text)}\n")


Message: Congratulations! You've won a free iPhone. Click here to claim now!
→ Prediction: SPAM

Message: Hey, are we still meeting for dinner tonight?
→ Prediction: HAM

Message: This is not spam. It's a business proposal from your friend.
→ Prediction: HAM

Message: URGENT: Your account has been compromised. Log in immediately to secure it.
→ Prediction: SPAM

Message: Reminder: Your dentist appointment is tomorrow at 3 PM.
→ Prediction: HAM



In [None]:
# Persist the trained classifier to the working directory.
# joblib.dump returns the list of written file names, shown as the cell output.
model_path = 'spam_classifier_model.joblib'
joblib.dump(model, model_path)



['spam_classifier_model.joblib']

In [None]:
# Persist the fitted TF-IDF vectorizer; it is needed alongside the model to
# turn new text into the same feature space at prediction time.
vectorizer_path = 'tfidf_vectorizer.joblib'
joblib.dump(tfidf, vectorizer_path)


['tfidf_vectorizer.joblib']

In [None]:
# Colab-only helper, imported here so the rest of the notebook does not
# depend on the google.colab package being available.
from google.colab import files

# Download both saved artifacts through the browser.
for artifact in ('spam_classifier_model.joblib', 'tfidf_vectorizer.joblib'):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>