#Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score

#Loading Dataset

In [2]:
# Load the latin-1 encoded SMS CSV and keep only the label and message columns.
df = pd.read_csv("smsspamdetection.csv", encoding='latin-1')[['label', 'msg']]
df

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#Declaring feature & target column

In [3]:
# Double brackets keep the feature as a one-column DataFrame; the target is a Series.
x, y = df[['msg']], df['label']

#Checking whether the target labels are balanced

In [4]:
# Count how many messages carry each label to see how balanced the classes are.
y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


#Declaring Train & Test data

In [5]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although the labels are imbalanced —
# consider stratify=y if the class ratio should match in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

#Converting Feature column using **TfidfVectorizer**

In [6]:
# Fit the TF-IDF vectorizer on the training messages only, then apply that same
# fitted vectorizer to the test messages.
vectr=TfidfVectorizer()
# NOTE(review): xtrain/xtest are rebound here from the 'msg' DataFrames to the
# vectorizer output, so re-running this cell without first re-running the split
# cell will presumably fail on xtrain['msg'].
xtrain=vectr.fit_transform(xtrain['msg'])
xtest=vectr.transform(xtest['msg'])

#Importing Library for **LogisticRegression** Model

In [7]:
from sklearn.linear_model import LogisticRegression

#Creating and training **LogisticRegression** model

In [8]:
# Fit a logistic-regression classifier with default settings on the TF-IDF features.
model = LogisticRegression().fit(xtrain, ytrain)
model

#**LogisticRegression** Model Evaluation

In [9]:
# Evaluate the logistic-regression model on the train and test splits.
ypred = model.predict(xtest)
LR_trainscore, LR_testscore = model.score(xtrain, ytrain), model.score(xtest, ytest)
LR_conmatrix = confusion_matrix(ytest, ypred)
# pos_label='ham' makes F1 and precision describe the 'ham' class.
# NOTE(review): for spam detection, the 'spam' class metrics may be more informative.
LR_f1score, LR_precscore = (
    metric(ytest, ypred, pos_label='ham') for metric in (f1_score, precision_score)
)

In [30]:
# Print the logistic-regression scores followed by the confusion matrix.
print(f"LogisticRegression Model:\nTrain-score: {LR_trainscore}\nTest-score: {LR_testscore}\nf1 score: {LR_f1score}\nprecision score: {LR_precscore}\n{LR_conmatrix}")

LogisticRegression Model:
Train-score: 0.9759928202827014
Test-score: 0.9668161434977578
f1 score: 0.9811320754716981
precision score: 0.9658634538152611
[[962   3]
 [ 34 116]]


#Importing Library for **DecisionTree** Model

In [11]:
from sklearn.tree import DecisionTreeClassifier

#Creating and Training **DecisionTree** model by using **GridSearchCV**

In [12]:
# Grid-search a decision tree over split criterion, depth and minimum-sample
# settings with 5-fold cross-validation on the training features.
# random_state is fixed so the selected tree and its scores are reproducible
# across runs; it reuses the seed value already used for the train/test split.
pip=Pipeline(steps=[("model",DecisionTreeClassifier(random_state=10))])
param={
    "model__criterion":['entropy','gini'],
    "model__max_depth":[5,10,15,20],
    "model__min_samples_split":[2,4,6,8],
    "model__min_samples_leaf":[2,4,6,8],
}
model1=GridSearchCV(pip,param,cv=5,n_jobs=-1)
model1.fit(xtrain,ytrain)

#**DecisionTree** Model Evaluation

In [13]:
# Evaluate the grid-searched decision tree on the train and test splits.
ypred = model1.predict(xtest)
DT_trainscore, DT_testscore = model1.score(xtrain, ytrain), model1.score(xtest, ytest)
DT_conmatrix = confusion_matrix(ytest, ypred)
# pos_label='ham' makes F1 and precision describe the 'ham' class.
DT_f1score, DT_precscore = (
    metric(ytest, ypred, pos_label='ham') for metric in (f1_score, precision_score)
)

In [14]:
# Print the decision-tree scores followed by the confusion matrix.
print(f"DecisionTree Model\nTrain-score: {DT_trainscore}\nTest-score: {DT_testscore}\nf1 score: {DT_f1score}\nprecision score: {DT_precscore}\n{DT_conmatrix}")


DecisionTree Model
Train-score: 0.9858649315683194
Test-score: 0.9506726457399103
f1 score: 0.9720101781170484
precision score: 0.955
[[955  10]
 [ 45 105]]


#Importing Library for **SVC** Model

In [15]:
from sklearn.svm import SVC

#Creating and Training **SVC** Model by using **GridSearchCV**

In [16]:
# Compare three SVC kernels with 5-fold cross-validation on the training features.
param = {"model__kernel": ['linear', 'sigmoid', 'rbf']}
pip = Pipeline(steps=[("model", SVC())])
model2 = GridSearchCV(pip, param, cv=5, n_jobs=-1)
model2.fit(xtrain, ytrain)

#**SVC** Model Evaluation

In [17]:
# Evaluate the grid-searched SVC on the train and test splits.
ypred = model2.predict(xtest)
SVC_trainscore, SVC_testscore = model2.score(xtrain, ytrain), model2.score(xtest, ytest)
SVC_conmatrix = confusion_matrix(ytest, ypred)
# pos_label='ham' makes F1 and precision describe the 'ham' class.
SVC_f1score, SVC_precscore = (
    metric(ytest, ypred, pos_label='ham') for metric in (f1_score, precision_score)
)

In [18]:
# Print the SVC scores followed by the confusion matrix.
print(f"SVC Model\nTrain-score: {SVC_trainscore}\nTest-score: {SVC_testscore}\nf1 score: {SVC_f1score}\nprecision score: {SVC_precscore}\n{SVC_conmatrix}")


SVC Model
Train-score: 0.9966345075162666
Test-score: 0.9856502242152466
f1 score: 0.9917525773195877
precision score: 0.9866666666666667
[[962   3]
 [ 13 137]]


#Importing Library for **KNN** Model

In [19]:
from sklearn.neighbors import KNeighborsClassifier

#Creating and Training **KNN** Model


In [20]:
# Fit a k-nearest-neighbours classifier with default settings on the TF-IDF features.
model3 = KNeighborsClassifier().fit(xtrain, ytrain)
model3

#**KNN** Model Evaluation

In [21]:
# Evaluate the k-nearest-neighbours model on the train and test splits.
ypred = model3.predict(xtest)
KNN_trainscore, KNN_testscore = model3.score(xtrain, ytrain), model3.score(xtest, ytest)
KNN_conmatrix = confusion_matrix(ytest, ypred)
# pos_label='ham' makes F1 and precision describe the 'ham' class.
KNN_f1score, KNN_precscore = (
    metric(ytest, ypred, pos_label='ham') for metric in (f1_score, precision_score)
)

In [22]:
# Print the KNN scores followed by the confusion matrix.
print(f"KNN Model\nTrain-score: {KNN_trainscore}\nTest-score: {KNN_testscore}\nf1 score: {KNN_f1score}\nprecision score: {KNN_precscore}\n{KNN_conmatrix}")

KNN Model
Train-score: 0.923042405205295
Test-score: 0.9040358744394619
f1 score: 0.9474717722140402
precision score: 0.9001865671641791
[[965   0]
 [107  43]]


#Importing Library for **MultinomialNB** Model

In [23]:
from sklearn.naive_bayes import MultinomialNB

#Creating and Training **MultinomialNB** Model


In [24]:
# Fit a multinomial naive-Bayes classifier with default settings on the TF-IDF features.
model4 = MultinomialNB().fit(xtrain, ytrain)
model4

#**MultinomialNB** Model Evaluation

In [25]:
# Evaluate the multinomial naive-Bayes model on the train and test splits.
ypred = model4.predict(xtest)
NB_trainscore, NB_testscore = model4.score(xtrain, ytrain), model4.score(xtest, ytest)
NB_conmatrix = confusion_matrix(ytest, ypred)
# pos_label='ham' makes F1 and precision describe the 'ham' class.
NB_f1score, NB_precscore = (
    metric(ytest, ypred, pos_label='ham') for metric in (f1_score, precision_score)
)

In [26]:
# Print the MultinomialNB scores followed by the confusion matrix.
print(f"MultinomialNB Model\nTrain-score: {NB_trainscore}\nTest-score: {NB_testscore}\nf1 score: {NB_f1score}\nprecision score: {NB_precscore}\n{NB_conmatrix}")

MultinomialNB Model
Train-score: 0.974646623289208
Test-score: 0.9596412556053812
f1 score: 0.9772151898734177
precision score: 0.9554455445544554
[[965   0]
 [ 45 105]]


##I have tested 5 models (LogisticRegression, DecisionTreeClassifier, SVC, KNeighborsClassifier, MultinomialNB). After evaluation, SVC performs best on the test set, so I'm choosing the SVC model for SMS spam detection.

#Predicting with new input

In [27]:
# Vectorize one unseen message with the already-fitted TF-IDF vectorizer,
# then classify it with the grid-searched SVC.
newx1 = vectr.transform(
    [
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845"
    ]
)
model2.predict(newx1)

array(['spam'], dtype=object)

#Using an **LLM Model** (Text Classification) from **HuggingFace** to verify whether my SVC model's prediction is valid

In [28]:
import getpass
import os
import requests

# Never hardcode the access token in the notebook: read it from the environment
# and prompt for it (without echoing) only when it is missing.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face access token: ")

API_URL = "https://router.huggingface.co/hf-inference/models/mrm8488/bert-tiny-finetuned-sms-spam-detection"
headers = {
    "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within 60 seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    # Fail loudly on HTTP errors instead of returning the error body as if it were a result.
    response.raise_for_status()
    return response.json()

output = query({
    "inputs": "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 0845",
})
print(output)

[[{'label': 'LABEL_1', 'score': 0.8987986445426941}, {'label': 'LABEL_0', 'score': 0.10120132565498352}]]


####(LABEL_0 = not_spam, LABEL_1 = spam). Here the LLM model gives a score of 0.898 for LABEL_1, higher than the 0.101 for LABEL_0, so it predicts that the input is spam. This agrees with our SVC model's prediction for this message, which suggests that our SVC model is performing well.

#Using **LLM Model "SmolLM3"** (Text Generation) from **HuggingFace** to explain why our SVC model is predicting 'spam'.

In [33]:
import getpass
import os
import requests

# Never hardcode the access token in the notebook: read it from the environment
# and prompt for it (without echoing) only when it is missing.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face access token: ")

API_URL = "https://router.huggingface.co/hf-inference/models/HuggingFaceTB/SmolLM3-3B/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
}

def query(payload):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within 120 seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    # Surface HTTP errors here; otherwise an error body would only show up later
    # as a KeyError when the "choices" field is read.
    response.raise_for_status()
    return response.json()

response = query({
    "messages": [
        {
            "role": "user",
            "content": "I have given input: 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&Cs apply 0845' and got output: 'spam' from SVC ml model which classifies sms spam/not spam.Give conclusion."
        }
    ],
    "model": "HuggingFaceTB/SmolLM3-3B"
})

# Show only the text of the first returned choice.
print(response["choices"][0]["message"]['content'])

<think>
Okay, let's see. The user provided an input SMS message and got a "spam" classification from an SMS spam detection model. They want a conclusion based on this.

First, I need to analyze the input message. The message is: "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&Cs apply 0845". 

Looking at the content, it mentions a free entry to a weekly competition to win FA Cup tickets. The competition is on May 21, 2005. There's an offer to receive an entry question by texting FA to 87121, with a standard text rate. Also, there are Terms and Conditions and a call number 0845.

Now, the model classified this as spam. Let's think about the factors that would lead to a spam classification. Typically, spam messages have certain characteristics:

1. **Urgency or Time Constraints**: Phrases like "only until [date]" or "limited time offer" can indicate urgency.
2. **Promotions or Offers**: If the message is promo

#**Conclusion:**
##The model likely classified the input as spam due to the presence of promotional language, urgency, call-to-action in text, and technical spam patterns. These elements are classic indicators of spam, which the model has learned to recognize through training data.