1.Importing the required libraries

In [2]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np


2.Load the dataset

In [3]:
df = pd.read_table('SMSSpamCollection' , header=None , encoding = 'utf-8')

In [4]:
# Print useful information about the data
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Check class distribution

classes = df[0]
print(classes.value_counts())


ham     4825
spam     747
Name: 0, dtype: int64


3.Preprocess the data

In [7]:
# Convert class labels to binary values

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [8]:
# Store sms message data

text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [10]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddr')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [11]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [13]:
# Change words to lower case as being in capitals doesn't convey any meaning

processed = processed.str.lower()
print(processed)


0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [14]:
# remove stop words like 'the' etc. as they do not add any value 
from nltk.corpus import stopwords

# remove stop words from messages
stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))

In [20]:
# Remove word stems using Porter stemmer

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

4.Generate features by tokenizing

In [22]:
# Begin the process of tokenizing
from nltk.tokenize import word_tokenize

# Create a bag of words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [23]:
# Print the total number of words and the most common words

print('Number of words : {}'.format(len(all_words)))
print('15 most commong words : {}'.format(all_words.most_common(15)))

Number of words : 6579
15 most commong words : [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [24]:
# Use 1500 most common words as features

word_features = list(all_words.keys())[:1500]


In [25]:
# Define a function that will determine if any of the above 1500 words are present in the review

def find_features(message):
    words = word_tokenize(message)
    features= {}
    for word in word_features:
        features[word]=(word in words)
        
        return features
        
    

In [34]:
# Finding features of all messages
messages = zip(processed,Y)

#Define a seed for reproducibility
seed = 1

np.random_seed = seed


# Find features for all messages

featuresets =[(find_features(text), label) for (text,label) in messages]


In [36]:
# Splitting the featuresets into training and test sets for training and testing the model respectively

from sklearn import model_selection

(training,testing)= model_selection.train_test_split(featuresets, test_size =0.25 ,random_state =seed)


In [37]:
print(len(training))

print(len(testing))

4179
1393


5.Training various classifiers using the training and testing datasets

In [55]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel ='linear'))

#Train the model using training data
model.train(training)

#Test the model and measure accuracy using testing data
accuracy = nltk.classify.accuracy(model,testing)*100
print('SVC Accuracy :{}'.format(accuracy))


SVC Accuracy :86.71931083991386


In [59]:
# Now training on various other models

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

names =["K Nearest Neighbors","Decision Tree","Random forest" ,"Logistic Regression","SGD CLassifier" "Naive Bayes","SVM Linear"]

classifiers = [KNeighborsClassifier(),
              DecisionTreeClassifier(),
              RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter=100),
              MultinomialNB(),
              SVC(kernel='linear')]

models =zip(names,classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))
    

K Nearest Neighbors Accuracy: 86.71931083991386
Decision Tree Accuracy: 86.71931083991386
Random forest Accuracy: 86.71931083991386
Logistic Regression Accuracy: 86.71931083991386
SGD CLassifierNaive Bayes Accuracy: 86.71931083991386
SVM Linear Accuracy: 86.71931083991386


In [None]:
#Using the voting classifier to get the best accuracy
from sklearn.ensemble import VotingClassifier

names =["K Nearest Neighbors","Decision Tree","Random forest" ,"Logistic Regression","SGD CLassifier" "Naive Bayes","SVM Linear"]

classifiers = [KNeighborsClassifier(),
              DecisionTreeClassifier(),
              RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter=100),
              MultinomialNB(),
              SVC(kernel='linear')]

models = list(zip(names,classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators= models,voting='hard',n_jobs= -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)*100

print(accuracy)

In [None]:
# Make class label predictions
txt_features,labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)