In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import string
import re

In [2]:
sms_data=pd.read_csv('./data/spam.csv',encoding='latin',names=['v1', 'v2', 'v3','v4','v5'])
sms_data.describe()



Unnamed: 0,v1,v2,v3,v4,v5
count,5573,5573,50,12,6
unique,3,5170,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [3]:
sms_data.head()

Unnamed: 0,v1,v2,v3,v4,v5
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


# preprocessing

In [4]:
sms_data.shape

(5573, 5)

In [5]:
sms_data.isnull().sum()

v1       0
v2       0
v3    5523
v4    5561
v5    5567
dtype: int64

In [6]:
#drop all null columns
columns_to_drop=['v3','v4','v5']
sms_data.drop(columns=columns_to_drop,axis=1, inplace=True)

# Print the DataFrame to verify the change
print(sms_data.head())


     v1                                                 v2
0    v1                                                 v2
1   ham  Go until jurong point, crazy.. Available only ...
2   ham                      Ok lar... Joking wif u oni...
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...
4   ham  U dun say so early hor... U c already then say...


In [7]:
sms_data.shape

(5573, 2)

In [8]:
#encoding spam as 1 and ham as 0
from sklearn.preprocessing import LabelEncoder


In [9]:
encoder=LabelEncoder()
data=encoder.fit_transform(sms_data['v1'])
data

array([2, 0, 0, ..., 0, 0, 0])

In [10]:
sms_data['result']=data
sms_data.drop('v1',axis=1,inplace=True)
sms_data = sms_data.drop(0, axis=0)

In [11]:
sms_data

Unnamed: 0,v2,result
1,"Go until jurong point, crazy.. Available only ...",0
2,Ok lar... Joking wif u oni...,0
3,Free entry in 2 a wkly comp to win FA Cup fina...,1
4,U dun say so early hor... U c already then say...,0
5,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5568,This is the 2nd time we have tried 2 contact u...,1
5569,Will Ì_ b going to esplanade fr home?,0
5570,"Pity, * was in mood for that. So...any other s...",0
5571,The guy did some bitching but I acted like i'd...,0


In [12]:
#re
def clean_words(description):
    description =  description.lower()
    # Remove URLs
    description = re.sub(r'http\S+', '', description)
    # Remove punctuations
    description = re.sub(f'[{string.punctuation}]','',description)
    # Remove numbers
    description = re.sub(f'[{string.digits}]','',description)
    # Remove single charachters 
    description = re.sub(r'\s+[a-zA-Z]\s+', ' ', description)
    description = re.sub(r'<[^<>=]+>','', description)  # replacing HTML text with a blank
    description = re.sub('[^\s]+@[^\s]+','emailaddr', description) # replace email addresses with 'emailaddr'
    description = re.sub(r'[$]+','dollar', description) 

    return description

In [13]:
sms_data['sms']=sms_data["v2"].apply(clean_words)
sms_data

Unnamed: 0,v2,result,sms
1,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazy available only in ...
2,Ok lar... Joking wif u oni...,0,ok lar joking wif oni
3,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in wkly comp to win fa cup final tk...
4,U dun say so early hor... U c already then say...,0,u dun say so early hor c already then say
5,"Nah I don't think he goes to usf, he lives aro...",0,nah dont think he goes to usf he lives around ...
...,...,...,...
5568,This is the 2nd time we have tried 2 contact u...,1,this is the nd time we have tried contact u h...
5569,Will Ì_ b going to esplanade fr home?,0,will ì going to esplanade fr home
5570,"Pity, * was in mood for that. So...any other s...",0,pity was in mood for that soany other suggest...
5571,The guy did some bitching but I acted like i'd...,0,the guy did some bitching but acted like id be...


In [14]:
x=sms_data['sms']
y=sms_data['result']


In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization=TfidfVectorizer()
xv_train=vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# creating model and finding accuracy

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

In [29]:
logistic_model=LogisticRegression()
logistic_model.fit(xv_train,y_train)

In [30]:
y_pred=logistic_model.predict(xv_test)

In [31]:
logistic_model.score(xv_test,y_test)

0.9612347451543432

In [34]:
lr_accuracy=accuracy_score(y_test,y_pred)
lr_accuracy

0.9612347451543432

In [35]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1201
           1       0.99      0.72      0.84       192

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



#using svm

In [36]:
from sklearn.svm import SVC

In [37]:
svm_model = SVC()
svm_model.fit(xv_train, y_train)

In [38]:
# Make predictions on the test data
y_pred_svm = svm_model.predict(xv_test)

# Calculate accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy using SVM: {accuracy_svm}')


Accuracy using SVM: 0.9791816223977028


In [39]:
# Print other metrics if needed
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1201
           1       0.99      0.85      0.92       192

    accuracy                           0.98      1393
   macro avg       0.99      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Conclusion:


Both models performed exceptionally well, achieving high accuracy rates above 96%.

The SVM model demonstrated a slightly higher accuracy (97.92%) compared to the Logistic Regression model (96.12%).