In [1]:
import nltk
import sklearn
import pandas
import numpy


In [2]:
import pandas as pd
import numpy as np
df = pd.read_table('SMSSpamCollection',header=None,encoding = 'utf-8')


In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(classes)
print(y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [6]:
text_messages = df[1]

In [None]:
print(text_messages)

In [8]:
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [9]:
processed = processed.str.replace(r'[^\w\d\s]',' ')

In [10]:
processed = processed.str.replace(r'\s+', ' ' )
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [11]:
processed = processed.str.lower()

In [12]:
print(processed[:10])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


In [None]:
from nltk.corpus import stopwords
words = set(stopwords.words('english'))


In [15]:
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in words))

In [16]:
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [17]:
print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


In [22]:
from nltk.tokenize import word_tokenize

all = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all.append(w)
        
all = nltk.FreqDist(all)

In [25]:
#word_features = list(all.keys())[:1500]
word_features = list(dict(all.most_common(1500)).keys())

In [26]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

features = find_features(processed[0])
for key,value in features.items():
    if value == True:
        print(key)
    
        


go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [33]:
messages = list(zip(processed,y))
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

featuresets = [(find_features(text),label) for (text,label) in messages]

In [34]:
from sklearn import model_selection
training,testing = model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [35]:
print(len(training))
print(len(testing))

4179
1393


In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [37]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

In [38]:
models = zip(names,classifiers)

In [39]:
from nltk.classify.scikitlearn import SklearnClassifier
for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing)*100
    print("{}:{}".format(name,accuracy))
    

K Nearest Neighbors:94.68772433596554
Decision Tree:97.5592246949031
Random Forest:97.98994974874373
Logistic Regression:98.92318736539842
SGD Classifier:98.34888729361091
Naive Bayes:98.49246231155779
SVM Linear:98.7078248384781


In [41]:
from sklearn.ensemble import VotingClassifier
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names,classifiers))
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard',n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble,testing)*100
print("{}".format(accuracy))


98.92318736539842


  if diff:


In [43]:
txt_features,labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

  if diff:


In [44]:
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1211
          1       0.98      0.94      0.96       182

avg / total       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,4
actual,spam,11,171
