In [9]:
import pandas as pd
import seaborn as sns
# Load the tab-separated SMS corpus. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
messages=pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["Label","Message"],
)

# Pre-processing

In [2]:
# Preview the first rows to confirm the two columns were parsed as expected.
messages.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Preview the last rows to check the file was read through to the end.
messages.tail()

Unnamed: 0,Label,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [5]:
# Column dtypes, non-null counts and memory footprint.
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Label    5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Summary of the text columns: count, number of unique values, most frequent value and its frequency.
messages.describe()

Unnamed: 0,Label,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# (rows, columns) of the loaded frame.
messages.shape

(5572, 2)

In [8]:
# Per-column flag: True if the column contains at least one missing value.
messages.isnull().any()

Label      False
Message    False
dtype: bool

In [13]:
import re  #used for regular expressions
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords #to remove extra words like the,in,with,of,for etc.
from nltk.stem.porter import PorterStemmer  #used for stemming words-reduces the words to the root words

# Build the cleaned corpus: one normalised string per SMS message.
ps=PorterStemmer()
# Fetch the stop-word list once and keep it as a set. Calling
# stopwords.words('english') for every token (as before) rebuilt the list each
# time and tested membership with a linear scan.
english_stopwords=set(stopwords.words('english'))
corpus=[]
# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for message in messages['Message']:
  review=re.sub('[^a-zA-Z]',' ',message) #replace every character except a-z and A-Z with a space
  review=review.lower().split()
  # Drop stop words first (checked on the unstemmed token), then stem what remains.
  review=[ps.stem(word) for word in review if word not in english_stopwords]
  review=' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Bag-of-words model

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: token counts restricted to the 3000 most frequent terms.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split, so test messages influence which terms are kept. Consider
# fitting on the training portion only and calling transform() on the test portion.
cv=CountVectorizer(max_features=3000)
x=cv.fit_transform(corpus).toarray() #dense count matrix, one row per message
# Target vector: 1 for spam, 0 for ham. Comparing against the label directly
# avoids relying on the column order of pd.get_dummies and on its dtype
# (bool instead of integer from pandas 2.0 onwards).
y=(messages['Label']=='spam').astype(int).to_numpy() #y is dependent feature

In [15]:
# Inspect the count matrix (NumPy abbreviates large arrays with "...").
print(x)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
# Inspect the label vector: 1 marks spam, 0 marks ham.
print(y)

[0 0 1 ... 0 0 0]


# Train/test split


In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the observations as the test set; the fixed random_state
# makes the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [18]:
# Training feature matrix.
print(x_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
# Held-out test feature matrix.
print(x_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
# Training labels.
print(y_train)

[0 0 0 ... 0 1 0]


In [21]:
# Held-out test labels.
print(y_test)

[0 0 0 ... 0 0 0]


# Train using the Naive Bayes classification method

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training counts and labels.
spam_detection_model=MultinomialNB().fit(
    X=x_train,
    y=y_train,
)

In [23]:
# Predict labels for the held-out test messages.
y_pred=spam_detection_model.predict(x_test)

In [24]:
# Predicted labels for the test set.
print(y_pred)

[0 0 0 ... 0 0 0]


# Comparison between y_pred and y_test using a confusion matrix

In [25]:
from sklearn.metrics import confusion_matrix
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm=confusion_matrix(y_true=y_test,y_pred=y_pred)
print(cm)

[[962   6]
 [  5 142]]


# Accuracy of the model

In [26]:
from sklearn.metrics import accuracy_score
# Fraction of test messages classified correctly. The classes are imbalanced
# (far more ham than spam), so read this together with the confusion matrix.
accuracy_score(y_true=y_test,y_pred=y_pred)

0.9901345291479821