In [1]:
!pip install scikit-learn pandas matplotlib



In [2]:
import pandas as pd

# Download and load the dataset directly from UCI
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip

# Read the file
df = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['label', 'message'])
df.head()

--2025-06-26 04:27:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [   <=>              ] 198.65K   328KB/s    in 0.6s    

2025-06-26 04:27:35 (328 KB/s) - ‘smsspamcollection.zip’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Any missing values?
print(df.isnull().sum())

# How many spam and ham messages?
df['label'].value_counts()


label      0
message    0
dtype: int64


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
import string

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

df['clean_msg'] = df['message'].apply(clean_text)
df[['message', 'clean_msg']].head()


Unnamed: 0,message,clean_msg
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [5]:
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
df[['label', 'label_num']].head()


Unnamed: 0,label,label_num
0,ham,0
1,ham,0
2,spam,1
3,ham,0
4,ham,0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['clean_msg'])
y = df['label_num']


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Create the model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on test data
y_pred = model.predict(X_test)

# Show how good the model is
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("✅ Precision:", precision_score(y_test, y_pred))
print("✅ Recall:", recall_score(y_test, y_pred))
print("✅ F1 Score:", f1_score(y_test, y_pred))

✅ Accuracy: 0.9659192825112107
✅ Precision: 1.0
✅ Recall: 0.7449664429530202
✅ F1 Score: 0.8538461538461538


In [11]:
def predict_spam(message):
    import string
    # Clean message
    message = message.lower()
    message = ''.join([char for char in message if char not in string.punctuation])

    # Vectorize it
    message_vec = tfidf.transform([message])

    # Predict
    prediction = model.predict(message_vec)[0]

    return "💬 SPAM 🚨" if prediction == 1 else "💬 NOT SPAM ✅"

In [12]:
predict_spam("Congratulations! You’ve won a free car. Click here to claim it.")
predict_spam("Hey, just checking in about our meeting tomorrow.")

'💬 NOT SPAM ✅'

In [13]:
predict_spam("Congratulations! You’ve won a free car. Click here to claim it.")


'💬 SPAM 🚨'

In [14]:
import pickle

# Save the trained model
with open('spam_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the TF-IDF vectorizer too
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [15]:
from google.colab import files
files.download('spam_model.pkl')
files.download('tfidf_vectorizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
import string

def clean_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

df['clean_msg'] = df['message'].apply(clean_text)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['clean_msg'])
y = df['label'].map({'ham': 0, 'spam': 1})

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X, y)

In [17]:
import pickle

# Save again
with open('spam_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [18]:
from google.colab import files
files.download('spam_model.pkl')
files.download('tfidf_vectorizer.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>