In [30]:
import sys
import nltk
#nltk.download('stopwords')
import sklearn
import pandas as pd
import numpy as np


print("Python:{}".format(sys.version))
print("NLTK:{}".format(nltk.__version__))
print("sklearn:{}".format(sklearn.__version__))
print("pandas:{}".format(pd.__version__))
print("numpy:{}".format(np.__version__))


Python:3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
NLTK:3.4.5
sklearn:0.22.2.post1
pandas:0.25.3
numpy:1.18.1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andguez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Load the Dataset

In [31]:
import pandas as pd
import numpy as np

# Load the dataset of sms messages
df = pd.read_table("SMSSpamCollection",header= None, encoding='utf-8')

In [32]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [33]:
# check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the Data

In [34]:
# conver class labels to binary values, 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [35]:
# store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [36]:
# use regular expressions to remplace email addresses, urls, phone numbers, other numbers, symbols

# replace email addresses with 'emailaddr'

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

#replace  mony symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$','moneysymb')

#replace 10 digit phone number with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

# replace normal numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?','numbr')

In [37]:
# remove punctation

processed = processed.str.replace(r'[^\w\d\s]',' ')

# replace whitespave between terms with a single space
processed = processed.str.replace(r'\s+',' ')

#remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$','')

In [38]:
# change word to lower case - HolLe, HELLO, hello are all the same
processed = processed.str.lower()

In [39]:
# remove stop words from text messages

from nltk.corpus import stopwords


stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [40]:
#remove word stems using a Porter stemmer

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [41]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [57]:
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
#Creating a bag of words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

In [58]:
# print the total number of words and the  most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [108]:
#use the 1500 most common words as features
words_features = [x[0] for x in list(all_words.most_common(3000))] 

In [109]:
#define a find_features function
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in words_features:
        features[word] = (word in words)
    
    return features

#lets see an example
features = find_features(processed[0])
print(processed[0])
for key,value in features.items():
    if value == True:
        print(key)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat
go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine
buffet


In [110]:
# find features for all messages
messages = list(zip(processed,Y))

#define a seed for repriducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call find_features function for SMS messages
featuresets = [(find_features(text),label) for (text,label) in messages]

In [111]:
#split training and testing data sets using sklearn
from sklearn import model_selection

training,testing = model_selection.train_test_split(featuresets,test_size = 0.25,random_state = seed)

In [112]:
print('Training: {}'.format(len(training)))
print('Testing: {}'.format(len(testing)))

Training: 4179
Testing: 1393


## 4.Sckit-Learn Classifiers with NLTK

In [113]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [114]:
# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Navie Bayes','SVM Line']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(), 
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names,classifiers))

In [115]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing) * 100
    print('{}: Accurency: {}'.format(name,accuracy))

K Nearest Neighbors: Accurency: 92.17516152189519
Decision Tree: Accurency: 97.84637473079684
Random Forest: Accurency: 98.06173725771716
Logistic Regression: Accurency: 98.7078248384781
SGD Classifier: Accurency: 98.49246231155779
Navie Bayes: Accurency: 98.06173725771716
SVM Line: Accurency: 98.56424982053123


In [119]:
# ensemble method - voting classifier
from sklearn.ensemble import VotingClassifier

# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Navie Bayes','SVM Line']

classifiers = [
    #KNeighborsClassifier(),
    #DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(), 
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names,classifiers))

nltk_esemble = SklearnClassifier(VotingClassifier(estimators = models,voting = 'hard',n_jobs = -1))
nltk_esemble.train(training)
accuracy = nltk.classify.accuracy(nltk_esemble,testing) * 100
print('Emsable Accurency: {}'.format(accuracy))

Emsable Accurency: 98.77961234745155


In [120]:
# make class label prediction for testing set
text_features, labels = list(zip(*testing))

prediction = nltk_esemble.classify_many(text_features)

In [121]:
# print a confusion matrix and a classification report
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels,prediction),
    index= [['actual','actual'],['ham','spam']],
    columns = [['predicted','predicted'],['ham','spam']]
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1176
           1       1.00      0.92      0.96       217

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1176,0
actual,spam,17,200
