# Task-4: SMS Spam Classification

Description: Build an AI model that can classify SMS messages as spam or
legitimate. Use techniques like TF-IDF or word embeddings with
classifiers like Naive Bayes, Logistic Regression, or Support Vector
Machines to identify spam messages

## Collecting & Unzipping data

In [1]:
import zipfile
data=zipfile.ZipFile('archive.zip')
data.extractall()
data.close()

## Importing libraries & dependencies

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import nltk
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ariji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading the data

In [4]:
data=pd.read_csv('spam.csv',encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data.shape

(5572, 5)

## Preprocessing data

In [6]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [7]:
# Removing irrelevant columns
cols=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data.drop(columns=cols,inplace=True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
data.isnull().sum()

v1    0
v2    0
dtype: int64

In [10]:
# Removing stopwords
from nltk.corpus import stopwords
sw = stopwords.words('english')

def stopword(text) : 
    txt = [word.lower() for word in text.split() if word.lower() not in sw]
    return txt 

In [11]:
data['v2']=data['v2'].apply(stopword)
data.head()

Unnamed: 0,v1,v2
0,ham,"[go, jurong, point,, crazy.., available, bugis..."
1,ham,"[ok, lar..., joking, wif, u, oni...]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor..., u, c, already, sa..."
4,ham,"[nah, think, goes, usf,, lives, around, though]"


### Stemming

In [12]:
from nltk.stem.snowball import SnowballStemmer 

ss = SnowballStemmer("english")

def stemming(text) : 
    text = [ss.stem(word) for word in text if word.split()]
    return "".join(text)


In [13]:
data['v2']=data['v2'].apply(stemming)

In [14]:
data.head()

Unnamed: 0,v1,v2
0,ham,"gojurongpoint,crazy..availbugingreatworldlaebu..."
1,ham,oklar...jokewifuoni...
2,spam,freeentri2wklicompwinfacupfinaltkts21stmay2005...
3,ham,udunsayearlihor...ucalreadisay...
4,ham,"nahthinkgoeusf,livearoundthough"


### TF-IDF Vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer 

tfid_vect = TfidfVectorizer()

tfid_matrix = tfid_vect.fit_transform(data['v2'])

print(f"Type :{type(tfid_matrix)} , Matrix at 0 : {tfid_matrix[0]} , Shape : {tfid_matrix.shape}")

Type :<class 'scipy.sparse._csr.csr_matrix'> , Matrix at 0 :   (0, 1827)	0.5056391989470028
  (0, 1030)	0.5056391989470028
  (0, 2166)	0.48268727087494234
  (0, 3635)	0.5056391989470028 , Shape : (5572, 12124)


In [16]:
array = tfid_matrix.todense()

In [17]:
array

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
df1 = pd.DataFrame(array)

In [19]:
df1[df1[10]!=0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12114,12115,12116,12117,12118,12119,12120,12121,12122,12123
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df1['v1']=data['v1']

In [21]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12115,12116,12117,12118,12119,12120,12121,12122,12123,v1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham


In [22]:
dataset=df1.copy()

In [23]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Columns: 12125 entries, 0 to v1
dtypes: float64(12124), object(1)
memory usage: 515.4+ MB


In [24]:
dataset.to_csv('cleaned_data.csv')

## Splittiing the data

In [25]:
from sklearn.model_selection import train_test_split
x=dataset.drop(columns=['v1'])
y=dataset['v1']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=34)

In [26]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((4457, 12124), (4457,), (1115, 12124), (1115,))

## Building Model

In [27]:
from sklearn.naive_bayes import BernoulliNB , MultinomialNB , GaussianNB 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,classification_report
classifier=[
    ['LogisticRegression : ',LogisticRegression(max_iter=1000)],
    ['Gaussian Naive Bayes : ',GaussianNB()],
    ['Bernoulli Naive Bayes : ',BernoulliNB()],
    ['Multinomial Naive Bayes : ',MultinomialNB()],
    ['SVM : ',SVC()],
]

In [28]:
for name,clf in classifier:
    clf.fit(x_train,y_train)
    y_pred=clf.predict(x_test)
    print(f'{name} : {accuracy_score(y_test,y_pred)}')

LogisticRegression :  : 0.8887892376681614
Gaussian Naive Bayes :  : 0.46905829596412557
Bernoulli Naive Bayes :  : 0.9022421524663677
Multinomial Naive Bayes :  : 0.9210762331838565
SVM :  : 0.9228699551569507


In [29]:
cv=SVC()
cv.fit(x_train,y_train)
y_pred=cv.predict(x_test)
print(accuracy_score(y_pred,y_test))

0.9228699551569507


In [30]:
pickle.dump(cv,open('best_model.pkl','wb'))