In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [2]:
# load the data set
df = pd.read_table('SMSSpamCollection', header=None, encoding='utf-8')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [6]:
# convert the labels to binary values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = encoder.fit_transform(classes)

print(labels[0:20])

[0 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1]


In [8]:
# store the text messages
all_sms = df[1]
print(all_sms[-5:])

5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: 1, dtype: object


In [9]:
# replace email addresses
clean_sms = all_sms.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')

# replace urls
clean_sms = clean_sms.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# replace money symbols
clean_sms = clean_sms.str.replace(r'€|\$', 'moneysymbol')

# replace phone numbers
clean_sms = clean_sms.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

# replace numbers
clean_sms = clean_sms.str.replace(r'\d+(\.\d+)?', 'number')

# replace punctuations
clean_sms = clean_sms.str.replace(r'[^\w\d\s]', ' ')

# replace whitespaces
clean_sms = clean_sms.str.replace(r'\s+', ' ')

# replace leading and trailing whitespaces
clean_sms = clean_sms.str.replace(r'^\s+|\s+?$', '')

# change words to lower case
clean_sms = clean_sms.str.lower()
print(clean_sms)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [10]:
# remove stop words
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))
clean_sms = clean_sms.apply(lambda x: ' '.join(w for w in x.split() if w not in stopWords))

# stemming
ps = nltk.PorterStemmer()
clean_sms = clean_sms.apply(lambda x: ' '.join(ps.stem(w) for w in x.split()))

In [11]:
print(clean_sms[:5])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri number wkli comp win fa cup final t...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: 1, dtype: object


In [12]:
# tokenize the sms
from nltk.tokenize import word_tokenize
allWords = []
for sms in clean_sms:
    words = word_tokenize(sms)
    for w in words:
        allWords.append(w)
        
allWords = nltk.FreqDist(allWords)

In [13]:
# print most common words
print('Total number of words: {}'.format(len(allWords)))
print('20 Most common words: {}'.format(allWords.most_common(20)))

Total number of words: 6562
20 Most common words: [('number', 3071), ('u', 1207), ('call', 679), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 252), ('good', 248), ('want', 247), ('text', 231)]


In [16]:
# use 2500 most common words as features
wordFeatures = allWords.most_common(2500)
wordFeatures = [item[0] for item in wordFeatures]
print(wordFeatures[:5])

['number', 'u', 'call', 'go', 'get']


In [17]:
# function to fine the feature words in sms
def findFeatures(message):
    words = word_tokenize(message)
    features = {}
    for word in wordFeatures:
        #print(word)
        features[word] = (word in words)
    return features

In [20]:
# test the findFeatures function
features = findFeatures(clean_sms[0])
for key, value in features.items():
    if value == True:
        print(key)
print(clean_sms[0])

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine
buffet
go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [22]:
# find the feature words in all the sms and make feature set
messages = list(zip(clean_sms, labels))

seed = 10
np.random.seed = seed
np.random.shuffle(messages)

# find features in all the texts
featureSets = [(findFeatures(sms), labels) for sms, labels in messages]

In [24]:
# training testing split
from sklearn import model_selection
train, test = model_selection.train_test_split(featureSets, test_size=0.2, random_state=seed)

In [26]:
# import different classifiers from scikitlearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import model_selection

In [28]:
# get all the potential models
modelNames = ['SVC Linear', 'K-Nearest Neighbours', 'Decision Tree',
              'Random Forest', 'Logistic Regression', 'Naive Bayes', 'SGD Classifier']

classifiers = [SVC(kernel='linear'),
               KNeighborsClassifier(n_neighbors=3),
               DecisionTreeClassifier(),
               RandomForestClassifier(),
               LogisticRegression(),
               MultinomialNB(),
               SGDClassifier(max_iter=150)]

models = list(zip(modelNames, classifiers))

In [29]:
# wrap the models is nltk classifier and train the models
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test)*100
    print('{}: Accuracy {}'.format(name, accuracy))

SVC Linear: Accuracy 98.65470852017937
K-Nearest Neighbours: Accuracy 95.96412556053812
Decision Tree: Accuracy 97.13004484304932
Random Forest: Accuracy 98.83408071748879
Logistic Regression: Accuracy 98.56502242152466
Naive Bayes: Accuracy 98.29596412556054
SGD Classifier: Accuracy 98.65470852017937


In [32]:
# remove low performing classifiers and run the voting classifier
from sklearn.ensemble import VotingClassifier

modelNames = ['SVC Classifier', 'Random Forest', 'Logistic Regression', 'Naive Bayes', 'SGD Classifier']

classifiers = [SVC(kernel='linear'),
               RandomForestClassifier(),
               LogisticRegression(),
               MultinomialNB(),
               SGDClassifier(max_iter=150)]

models = list(zip(modelNames, classifiers))

nltk_voting = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_voting.train(train)
accuracy = nltk.classify.accuracy(nltk_voting, test)*100
print('Voting Classifier Accuracy {}'.format(accuracy))

Voting Classifier Accuracy 98.65470852017937


In [33]:
# class label predictions on test data
sms_features, labels = zip(*test) # unzip the test data
predictions = nltk_voting.classify_many(sms_features)

In [35]:
# accuracy report and confusion matrix
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       973
           1       1.00      0.89      0.94       142

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [36]:
# print the number of misslabeled sms
pd.DataFrame(confusion_matrix(labels, predictions), index=[['actual', 'actual'], ['Not-Spam', 'Spam']],
            columns=[['predicted', 'predicted'], ['Not-Spam', 'Spam']])

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,Not-Spam,Spam
actual,Not-Spam,973,0
actual,Spam,15,127
