In [56]:
# import libraries
import sys
import pandas as pd
import numpy as np
import nltk
import sklearn

In [57]:
# load dataframe https://www.kaggle.com/datatattle/email-classification-nlp
from google.colab import drive
drive.mount('/content/drive')

df_train = pd.read_csv('/content/SMS_train.csv', header=None, encoding = 'ISO-8859-1')[1:]
df_test = pd.read_csv('/content/SMS_test.csv', header=None, encoding = 'ISO-8859-1')[1:]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
# check to see if data loaded correctly, gain some insight into data
# print(df_train.info())
# print(df_train.head())
# print(df_test.info())
# print(df_test.head())

In [59]:
# check class distribution
classes_train = df_train[2]
classes_test = df_test[2]
# print("train:\n",classes_train.value_counts())
# print("test:\n",classes_test.value_counts())

In [60]:
# begin preprocessing data
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
Y_train = encoder.fit_transform(classes_train)
Y_test = encoder.fit_transform(classes_test)

# print(Y_train[:10])
# print(Y_test[:10])

In [61]:
# store the email message data
messages_train = df_train[1]
messages_test = df_test[1]
# print(messages_train[:10])
# print(messages_test[:10])

In [91]:
# preprocess message data

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
ps = nltk.PorterStemmer()

def process_messages(processed_arg):
  # change all words to lower case
  processed = processed_arg.str.lower()

  # use regular expressions to clean data

  # Replace email addresses with 'email'
  processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

  # Replace URLs with 'webaddress'
  processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

  # Replace money symbols with 'moneysymb'
  processed = processed.str.replace(r'£|\$', 'moneysymb')
      
  # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
  processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
      
  # Replace numbers with 'numbr'
  processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

  # Remove punctuation
  processed = processed.str.replace(r'[^\w\d\s]', ' ')

  # Replace whitespace between terms with a single space
  processed = processed.str.replace(r'\s+', ' ')

  # Remove leading and trailing whitespace
  processed = processed.str.replace(r'^\s+|\s+?$', '')

  # Remove stop words from text messages
  processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

  # Remove word stems using a Porter stemmer
  return processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

processed_train = process_messages(messages_train)
# print(processed_train)
processed_test = process_messages(messages_test)
# print(processed_test)

In [94]:
from nltk.tokenize import word_tokenize

# create bag of words
all_words = []

for message in processed_train:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [95]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 2484
Most common words: [('numbr', 478), ('u', 207), ('call', 124), ('go', 81), ('get', 75), ('ur', 66), ('moneysymbnumbr', 57), ('free', 47), ('come', 46), ('ok', 45), ('time', 44), ('gt', 43), ('day', 42), ('lt', 41), ('like', 40)]


In [103]:
# use 1000 most common words as features
word_features = list(all_words.keys())[:1000]

# The find_features function will determine which of the 1000 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [107]:
messages_train = list(zip(processed_train, Y_train))
messages_test = list(zip(processed_test, Y_test))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages_train)
np.random.shuffle(messages_test)

# call find_features function for each email
training = [(find_features(text), label) for (text, label) in messages_train]
testing = [(find_features(text), label) for (text, label) in messages_test]

In [111]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 51.2
Decision Tree Accuracy: 95.19999999999999
Random Forest Accuracy: 88.0
Logistic Regression Accuracy: 90.4
SGD Classifier Accuracy: 92.80000000000001
Naive Bayes Accuracy: 93.60000000000001
SVM Linear Accuracy: 91.2


In [113]:
# Making a Voting Classifier with the top performing models

from sklearn.ensemble import VotingClassifier

names = ["Decision Tree", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    DecisionTreeClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 91.2


In [114]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [115]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        49
           1       1.00      0.89      0.94        76

    accuracy                           0.94       125
   macro avg       0.93      0.95      0.93       125
weighted avg       0.94      0.94      0.94       125



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,49,0
actual,spam,8,68
