In [1]:
import pandas as pd

In [2]:
# The file is tab-separated: the first field is the label (spam/ham) and the
# second is the message text, so the two columns are named explicitly here.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)


In [3]:
# Show the loaded frame; the notebook repr abbreviates the middle rows.
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#PRE-PROCESSING

In [17]:
#Importing Libraries
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Stemming is used for this dataset because the semantic relations between
# words do not matter here; those are better captured by other methods such
# as word2vec or BERT.
ps = PorterStemmer()

In [7]:
# Clean every message: keep letters only, lowercase, drop English stop words,
# stem what is left, and store the result as one space-joined string.

# Build the stop-word set once up front; it is the same for every word, so
# there is no reason to rebuild it inside the per-word filter.
stop_words = set(stopwords.words('english'))

corpus = []
for message in df['message']:  # iterate the values directly, independent of index labels
    review = re.sub('[^a-zA-Z]', ' ', message)  # every character outside a-z / A-Z becomes a space
    review = review.lower().split()  # lowercase, then split on whitespace into words
    review = [ps.stem(word) for word in review if word not in stop_words]  # remove stop words, stem the rest
    corpus.append(' '.join(review))

#Bag of words

In [8]:
# Bag of Words: count the occurrences of (at most) 2500 vocabulary terms
# in each cleaned message.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus)
X = X.toarray()  # convert the vectorizer output to a plain array

In [9]:
# Inspect the bag-of-words array (one row per message).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# Encode the label column as a 0/1 target vector: 1 = spam, 0 = ham.
# Comparing against 'spam' directly avoids depending on the positional order
# of the dummy columns, and the explicit cast keeps an integer dtype on every
# pandas version (get_dummies returns bool columns by default from pandas 2.0).
y = (df['label'] == 'spam').astype('uint8').values

In [11]:
# Inspect the encoded label vector.
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [12]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

#Naive Bayes Classifier

In [13]:
# Train a multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB
spam_detector_model = MultinomialNB()
spam_detector_model = spam_detector_model.fit(X_train, y_train)

In [14]:
# Predict labels for the held-out test rows.
y_pred = spam_detector_model.predict(X_test)

# Accuracy and Confusion Matrix

In [15]:
# Confusion matrix of true test labels against predicted labels.
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_m

array([[946,   9],
       [  7, 153]])

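946 + 153 = 1099 test messages are predicted correctly (the diagonal of the matrix) and 9 + 7 = 16 are predicted incorrectly: 9 ham messages were flagged as spam and 7 spam messages were predicted as ham.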

In [16]:
# Accuracy: fraction of test rows whose predicted label equals the true label.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9856502242152466