SMS Spam Detection using NLP


In [1]:
# Problem Statement
# Classify SMS messages as spam or ham (not spam).

In [2]:
import pandas as pd
# Data source: the "SMS Spam Collection" dataset, stored as spam.csv and read
# with latin-1 encoding. Only the label column (v1) and the message column (v2)
# are kept, and both get descriptive names.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .loc[:, ["v1", "v2"]]
    .rename(columns={"v1": "labels", "v2": "text"})
)
print(df.head())


  labels                                               text
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Values that are already encoded pass through unchanged, so re-running this
# cell is safe (a plain .map(dict) would turn the already-mapped 0/1 values
# into NaN on a second run).
LABEL_TO_ID = {'ham': 0, 'spam': 1}
encoded_labels = df['labels'].map(lambda label: LABEL_TO_ID.get(label, label))

# Fail loudly on anything that is neither a known label nor an encoded id,
# instead of letting NaN or stray strings reach the model.
unexpected_labels = set(encoded_labels.unique()) - set(LABEL_TO_ID.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(map(str, unexpected_labels))}"
    )

# Only overwrite the column once the encoding has been validated.
df['labels'] = encoded_labels.astype('int64')

In [4]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: tokenizer models for
# word_tokenize, the stop-word lists, and WordNet for the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer tables from it; on older releases that download simply reports
# an unknown package and returns False without raising.
# quiet=True keeps the per-package log (which includes local filesystem paths)
# out of the saved notebook output.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

# English stop words as a set for O(1) membership checks during cleaning.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADITHYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADITHYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADITHYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def preprocessor_text(text):
    """Normalise one message into a space-separated string of lemmas.

    Steps: lowercase, tokenize with ``word_tokenize``, keep purely alphabetic
    tokens, drop English stop words (``stop_words``), lemmatize what is left
    with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as an empty message; other non-string values are
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is missing.
    """
    # Missing values have no `.lower()`; return an empty document instead of
    # raising AttributeError in the middle of a DataFrame.apply.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    tokens = word_tokenize(str(text).lower())
    # Single pass: filter (alphabetic, not a stop word) and then lemmatize,
    # in the same order as the separate steps would apply.
    cleaned = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(cleaned)

In [6]:
# Store the cleaned version of every message next to the raw `text` column,
# which is left untouched.
df['clean_text'] = df['text'].map(preprocessor_text)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out half of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on the label — consider whether
# that matters given the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['labels'],
    test_size=0.5,
    random_state=42,
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over unigrams and bigrams, with max_features=5000.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer so nothing from it leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier on the TF-IDF features.
# class_weight='balanced' is passed, presumably because spam is the minority
# class (375 of 2786 test rows in the recorded run).
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)

model.fit(X_train_vec, y_train)

In [10]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / f1 report.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9680545585068198
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2411
           1       0.85      0.93      0.89       375

    accuracy                           0.97      2786
   macro avg       0.92      0.95      0.93      2786
weighted avg       0.97      0.97      0.97      2786



In [11]:
def predict_sms(msg):
    """Classify a single SMS with the fitted TF-IDF + logistic-regression model.

    The message is cleaned with ``preprocessor_text`` and vectorised with the
    already-fitted ``vectorizer`` before being passed to ``model``.

    Returns "Spam" when the predicted label is 1, otherwise "Not Spam".
    """
    features = vectorizer.transform([preprocessor_text(msg)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

# Quick sanity check on one spam-like and one ordinary message.
for sample in (
    "Congratulations! You won a free ticket",
    "Can we talk tomorrow morning?",
):
    print(predict_sms(sample))

Spam
Not Spam


Using a Transformer Model Instead of Classic ML


In [13]:
from transformers import pipeline

# Zero-shot classification with a pretrained NLI model: the candidate labels
# are supplied at call time and nothing is trained on the SMS data.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

labels = ["spam", "not spam"]

# Single example: print the whole result dict (sequence, labels, scores).
sms = "Congratulations! You won a free lottery ticket"
result = classifier(sms, labels)
print(result)

messages = [
    "Win a free iPhone now!",
    "Are we meeting tomorrow?",
    "Limited time offer, click now",
]

# Several examples: print only the first label of each result (the one with
# the highest score in the recorded output).
# NOTE(review): in the recorded run every example, including the obviously
# promotional ones, comes back as "not spam" — this setup is not a reliable
# spam detector as configured.
for msg in messages:
    result = classifier(msg, labels)
    print(msg, "→", result["labels"][0])

Device set to use cpu


{'sequence': 'Congratulations! You won a free lottery ticket', 'labels': ['not spam', 'spam'], 'scores': [0.9142686724662781, 0.08573128283023834]}
Win a free iPhone now! → not spam
Are we meeting tomorrow? → not spam
Limited time offer, click now → not spam
