# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# Importing dataset

In [2]:
df = pd.read_table('SMSSPamCollection',header = None)

  """Entry point for launching an IPython kernel.


In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
print(df[0].value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# Data Preprocessing

## Label Encoding

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [7]:
y = label_encoder.fit_transform(df[0])

In [8]:
X = df[1]

In [9]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

# Cleaning the text

In [10]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers, regexlib.com

# Replace email addresses with 'email'
X = X.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

# Replace URLs with 'webaddress'
X = X.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

# Replace money symbols with 'moneysymbol'
X = X.str.replace(r'£|\$', 'moneysymbol')
    
# Replace 10 digit phone numbers
X = X.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbers')
    
# Replace numbers with 'number'
X = X.str.replace(r'\d+(\.\d+)?', 'number')

# Remove punctuation
X = X.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
X = X.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
X = X.str.replace(r'^\s+|\s+?$', '')

# change words to lower case
X = X.str.lower()

print(X)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been number wee...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile number months or more u r enti...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from number to number ...
12      urgent you have won a number week free members...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [11]:
# for removing stop words and stemming words
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 5572):
    review = X[i]
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /Users/LEE/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature Generating

In [12]:
# Creating bag of words model
from nltk.tokenize import word_tokenize

all_words = []

for word in corpus:
    words = word_tokenize(word)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [13]:
all_words

FreqDist({'number': 2759, 'u': 1207, 'call': 674, 'go': 456, 'get': 451, 'ur': 391, 'gt': 318, 'lt': 316, 'come': 304, 'moneysymbolnumb': 303, ...})

In [14]:
len(all_words)

6574

In [15]:
# Most common 1000 words
word_features = list(all_words.keys())[:1000]

In [16]:
# The find_features function will determine which of the 1000 word features are contained in the review
def find_features(text):
    words = word_tokenize(text)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

# Lets see an example!
features = find_features(X[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
n
great
world
la
e
buffet
cine
got
wat


In [17]:
# for all the messages
messages = list(zip(corpus, y))

np.random.seed = 0
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

# Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

training, testing = train_test_split(featuresets, test_size = 0.2, random_state = 0)
print(len(training))
print(len(testing))

4457
1115


# Using Classifiers

In [19]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define models to train
names = ["Logistic Regression", "K Nearest Neighbors", "SVM Linear", "SVM rbf", "Decision Tree", "Random Forest"]

classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    SVC(kernel = 'linear'),
    SVC(kernel = 'rbf'),
    DecisionTreeClassifier(),
    RandomForestClassifier()
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))



Logistic Regression Accuracy: 98.02690582959642
K Nearest Neighbors Accuracy: 92.82511210762333
SVM Linear Accuracy: 98.02690582959642




SVM rbf Accuracy: 89.5067264573991
Decision Tree Accuracy: 96.7713004484305




Random Forest Accuracy: 96.59192825112108
