In [44]:
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
dataset = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]


In [46]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [47]:
dataset.columns = ['label', 'text']  # change the collumn name
dataset.head(4)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [48]:
dataset.isnull().sum()

label    0
text     0
dtype: int64

In [49]:
dataset['label'] = dataset['label'].map({'ham': 0, 'spam': 1})


In [50]:
#Text Cleaning & Preprocessing

In [51]:
nltk.download('stopwords')
ps = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
    return " ".join(words)

dataset['cleaned_text'] = dataset['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to C:\Users\Avishank
[nltk_data]     Dwivedi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# Convert Text to Numeric (TF-IDF)

In [53]:
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(dataset['cleaned_text']).toarray()
y = dataset['label'].values

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [55]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Create and train the model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on test set
y_pred = nb_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.97847533632287

Confusion Matrix:
 [[965   0]
 [ 24 126]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [56]:
import joblib

# Save the model
joblib.dump(nb_model, "spam_classifier_model.pkl")

# Save the TF-IDF vectorizer
joblib.dump(tfidf, "tfidf_vectorizer.pkl")



['tfidf_vectorizer.pkl']

In [57]:
import joblib

# Load model and vectorizer
model = joblib.load("spam_classifier_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Text preprocessing function
def preprocess_text(text):
    import string
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    import nltk
    nltk.download('stopwords')

    ps = PorterStemmer()
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
    return " ".join(words)

# Example user input
sample = "You won a free ticket to Bahamas!"
cleaned = preprocess_text(sample)
vectorized = vectorizer.transform([cleaned])
prediction = model.predict(vectorized)

print("🔴 Spam" if prediction[0] == 1 else "🟢 Ham")


🟢 Ham


[nltk_data] Downloading package stopwords to C:\Users\Avishank
[nltk_data]     Dwivedi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
