# Problem statement

In [None]:
To build a machine learning model that can accurately classify whether an email message is spam (unwanted/junk email) or 
ham (legitimate email), based on its text content.

In [None]:
# work roadmap

Load and preprocess a dataset of labeled email/SMS messages.
Convert text messages into numerical format using TF-IDF Vectorization.
Train a Naive Bayes classifier to detect spam messages.
Evaluate the model using appropriate metrics such as accuracy, precision, recall, and F1-score.
Allow real-time testing of new messages to check if they are spam or not.

You now have a fully working spam email detector using:
Preprocessing
TF-IDF
Naive Bayes ML
Evaluation metrics
Real-time testing

# 1. Import libraries

In [108]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import string
import re

# Gathering data

In [110]:
data = pd.read_csv("email_data.csv")
print(data.head())


   Unnamed: 0 label                                            message
0           0   ham  Go until jurong point, crazy.. Available only ...
1           1   ham                      Ok lar... Joking wif u oni...
2           2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3           3   ham  U dun say so early hor... U c already then say...
4           4   ham  Nah I don't think he goes to usf, he lives aro...


# 3. cleaning data

In [112]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ''.join(char for char in text if char not in string.punctuation)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text


In [114]:
data['message'] = data['message'].apply(clean_text)

In [116]:
data

Unnamed: 0.1,Unnamed: 0,label,message
0,0,ham,go until jurong point crazy available only in ...
1,1,ham,ok lar joking wif u oni
2,2,spam,free entry in a wkly comp to win fa cup final ...
3,3,ham,u dun say so early hor u c already then say
4,4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...,...
5567,5567,spam,this is the nd time we have tried contact u u ...
5568,5568,ham,will Ã¼ b going to esplanade fr home
5569,5569,ham,pity was in mood for that soany other suggestions
5570,5570,ham,the guy did some bitching but i acted like id ...


# 4. convert data

In [83]:
data.drop('Unnamed: 0',axis = 1, inplace = True)

In [85]:
data

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will Ã¼ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [87]:
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# 5. split data

In [89]:
X = data['message']      # input text
y = data['label']        # spam or ham

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 6. create vector

In [91]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# 7. train model

In [58]:
model = MultinomialNB()
model.fit(X_train_vec, y_train)


# 8. evaluate model

In [65]:
y_train_pred = model.predict(X_train_vec)

print("ðŸ“˜ Training Metrics:")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Precision:", precision_score(y_train, y_train_pred))
print("Recall:", recall_score(y_train, y_train_pred))
print("F1 Score:", f1_score(y_train, y_train_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))



ðŸ“˜ Training Metrics:
Accuracy: 0.9762171864482836
Precision: 1.0
Recall: 0.822742474916388
F1 Score: 0.9027522935779817
Confusion Matrix:
 [[3859    0]
 [ 106  492]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      3859
           1       1.00      0.82      0.90       598

    accuracy                           0.98      4457
   macro avg       0.99      0.91      0.94      4457
weighted avg       0.98      0.98      0.98      4457



1

# 9. check on single data

In [21]:
def check_email(text):
    vec = vectorizer.transform([text])
    result = model.predict(vec)[0]
    return "Spam" if result == 1 else "Not Spam"

# Try it out!
print(check_email("Congratulations! You've won a free iPhone. Click here to claim."))


Spam


In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
docs = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love deep learning and machine learning"
]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(docs)

# Get the feature names (words)
features = vectorizer.get_feature_names_out()

# Convert to dense format for display
import pandas as pd
df = pd.DataFrame(tfidf_matrix.toarray(), columns=features)

# Display the TF-IDF table
print(df)


    amazing       and      deep        is  learning      love   machine
0  0.000000  0.000000  0.000000  0.000000  0.522842  0.673255  0.522842
1  0.608845  0.000000  0.000000  0.608845  0.359594  0.000000  0.359594
2  0.000000  0.480984  0.480984  0.000000  0.568154  0.365801  0.284077
