In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('SMSSpamCollection' , sep='\t', names=['label','messages'])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label       5572 non-null object
messages    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Packages for NLP
# 1. NLTK (Natural Language Processing Toolkit)
# 2. Spacy 
# 3. sklearn

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashantn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
#Text Preprocessing
# 1. Remove Punctuations
# 2. Convert Sentences into words
# 3. Remove the Stopwords
# 4. Words are in lower case

In [10]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
d = 'Welcome To SL !'

In [20]:
from nltk.corpus import stopwords
def textPreprocessing(feature):
    removePunctuations = [character for character in feature if character not in string.punctuation]
    sentencesWithoutPunctuation = ''.join(removePunctuations)
    words = sentencesWithoutPunctuation.split(" ")
    finalWords = [word for word in words if word.lower() not in stopwords.words('english')]
    return finalWords

In [21]:
textPreprocessing(d)

['Welcome', 'SL', '']

In [22]:
#Use SKLearn to create BOW

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing)
finalWordVector = wordVector.fit(data['messages'])

In [23]:
bagWords = wordVector.transform(data['messages'])

In [24]:
bagWords

<5572x11427 sparse matrix of type '<class 'numpy.int64'>'
	with 51591 stored elements in Compressed Sparse Row format>

In [29]:
#Apply TFIDF on BOW

from sklearn.feature_extraction.text import TfidfTransformer
tfidfobject = TfidfTransformer().fit(bagWords)

In [30]:
featureArray = tfidfobject.transform(bagWords)

In [31]:
featureArray

<5572x11427 sparse matrix of type '<class 'numpy.float64'>'
	with 51591 stored elements in Compressed Sparse Row format>

In [32]:
#naiveBayes ---> MultinomialNB Classifier (For Text Data)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(featureArray,data['label'])


In [33]:
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
model.score(featureArray,data['label'])

0.9791816223977028

In [35]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data['label'],model.predict(featureArray))
cm

array([[4825,    0],
       [ 116,  631]])

In [36]:
# Client is not fine if ham sms goes to spam folder

In [39]:
#Deployment Example
inputSMS = input("Enter SMS to Detect: ")
preproces = textPreprocessing(inputSMS)
bow = finalWordVector.transform(preproces)
feature = tfidfobject.transform(bow)
print("Classify: ",model.predict(feature)[0])

Enter SMS to Detect: Welcome To Simplilearn. Enjoy ML class
Classify:  ham


In [40]:
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression().fit(featureArray,data['label'])
model2.score(featureArray,data['label'])




0.9641062455132807

In [41]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data['label'],model2.predict(featureArray))
cm

array([[4821,    4],
       [ 196,  551]])