<a href="https://colab.research.google.com/github/aabhapingle/spam-classifier/blob/main/spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from google.colab import files
  
  
# Opens Colab's upload dialog; `uploaded` is indexed by file name below and yields that file's raw bytes.
uploaded = files.upload()

Saving SMSSpamCollection to SMSSpamCollection


In [8]:
import pandas as pd
import io

Important: when loading the data this way, the key used to index `uploaded` in `read_csv` must match the uploaded file's name exactly, including its extension (this file has none).

In [9]:
# Wrap the uploaded bytes in a file-like buffer, then parse them as a
# tab-separated table whose two columns are named explicitly.
sms_buffer = io.BytesIO(uploaded['SMSSpamCollection'])
messages = pd.read_csv(sms_buffer, sep='\t', names=["label", "message"])

`sep='\t'` tells `read_csv` to split each line on tab characters, because the label and the message in this file are tab-separated.

In [10]:
messages[:3]

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [11]:
import re
import nltk
from nltk.corpus import stopwords

In [17]:
# Fetch NLTK's stopword corpus; stopwords.words('english') is used in the preprocessing step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
from nltk.stem.porter import PorterStemmer  # for stemming words

In [19]:
# Single stemmer instance, reused for every word during preprocessing.
ps = PorterStemmer()

In [20]:
# Will hold one cleaned, stemmed string per message.
corpus = []

In [21]:
corpus

[]

In [22]:
# Build the cleaned corpus: one normalised, stemmed string per SMS message.

# Hoisted out of the loop and stored as a set: the stopword list is the same
# for every word, and set membership avoids a linear scan per lookup.
stop_words = set(stopwords.words('english'))


def clean_message(text):
    """Return `text` reduced to lower-cased, stemmed, non-stopword tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, stopwords are dropped, the
    remaining words are stemmed with `ps`, and the stems are joined with
    single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)


# Iterate over the column values directly (no dependence on the index labels)
# and rebuild the list from scratch so re-running this cell never duplicates rows.
corpus = [clean_message(text) for text in messages['message']]


In [24]:
corpus[:4]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say']

Creating the bag-of-words model

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
cv = CountVectorizer(max_features = 5000) # keep only the 5000 most frequent terms in the corpus
# each column corresponds to one word of the vocabulary
# each row corresponds to one message
# each entry is the number of times that word occurs in that message
# (0 if the word does not occur)

In [36]:
# Learn the vocabulary from the corpus and encode every message as a row of term counts.
# NOTE(review): the vectorizer is fitted on all messages before the train/test split,
# so the vocabulary is chosen using test messages too — confirm this is acceptable.
term_counts = cv.fit_transform(corpus)
# Convert the vectorizer's output to a regular dense NumPy array.
X = term_counts.toarray()

In [37]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
X.size

27860000

In [41]:
X.shape  # one row per message; the number of columns is capped at 5000 by max_features

(5572, 5000)

In [42]:
# Number of entries a 5572-row matrix would have with 6296 columns instead of 5000.
# NOTE(review): 6296 is presumably the vocabulary size without max_features — confirm.
5572*6296

35081312

Converting labels to numerical values

In [48]:
# One-hot encode the labels: one indicator column per distinct label value.
y = pd.get_dummies(messages['label'])

In [50]:
y[:4]

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0


In [51]:
# Keep only the 'spam' indicator as the target vector (ham -> 0, spam -> 1);
# the 'ham' column carries the same information and is the one being dropped.
# The target is derived directly from `messages` and selected by column name, so
# this cell can be re-run safely (`y = y.iloc[:, 1].values` fails on a second
# run because `y` is then already a NumPy array without `.iloc`).
y = pd.get_dummies(messages['label'])['spam'].values

In [52]:
y.shape

(5572,)

In [54]:
y # ham -> 0 spam -> 1

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

Train Test Split

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [57]:
# Training model using Naive Bayes Classifier

In [58]:
X_train.shape

(4457, 5000)

In [59]:
X_test.shape

(1115, 5000)

In [63]:
from sklearn.naive_bayes import MultinomialNB

In [64]:
# Create a multinomial Naive Bayes classifier and train it on the training split.
spam_detect_model = (
    MultinomialNB()
    .fit(X_train, y_train)
)

In [65]:
y_pred = spam_detect_model.predict(X_test) # predicted labels for the held-out test messages

In [66]:
from sklearn.metrics import confusion_matrix

In [67]:
# Rows are the true labels and columns the predicted labels, in label order (0 = ham, 1 = spam).
confusion_m = confusion_matrix(y_test, y_pred)

In [68]:
confusion_m

array([[946,   9],
       [  8, 152]])

In [69]:
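# 946 + 152 = 1098 of the 1115 test messages are classified correctly (946 ham, 152 spam);
# 9 ham messages are wrongly flagged as spam and 8 spam messages are missed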

In [70]:
from sklearn.metrics import accuracy_score

In [71]:
# Fraction of test messages whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

In [72]:
accuracy

0.9847533632286996