In [1]:
#import NLTK library and pandas
import nltk
import pandas as pd

In [2]:
# read the csv file to a dataframe
messages = pd.read_csv('SPAM text message 20170820 - Data.csv')

In [3]:
# first 5 rows of the dataframe
messages.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# to clean the text from commonly used words and punctuation, we import these
import string
from nltk.corpus import stopwords

In [5]:
# create a function that can take in a message and clean it, removing puncutation and "stopwords", then returning the "cleaned" version
def text_process(mess):
    """
    1. remove punctuation
    2. remove stop words
    3. return list of clean text
    """
    
    nopunc = [letter for letter in mess if letter not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [6]:
# CountVectorizer coverts the messages into 'bags of words', which build a large sparse matrix for the entire message set
# Term Frequency, Inverse Document Frequency measures the importance of each word in the message and compare it against all the messages to detemine its importance
# we use Naive Bayes for the text classifier
# Pipeline is used to simplify the action of feeding in training and test data for cross validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [7]:
# A pipleine estimator is built, combining the above processes
pipeline = Pipeline([
    ('bag of words',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifer',MultinomialNB())
])

In [8]:
# Split our data into training and testing sets
from sklearn.model_selection import train_test_split

In [9]:
X_train,X_test,y_train,y_test = train_test_split(messages['Message'],messages['Category'],test_size=0.33)

In [10]:
# Fit the data to the pipeline estimator
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('bag of words',
                 CountVectorizer(analyzer=<function text_process at 0x1a22866560>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifer',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [11]:
# We create predictions
pred = pipeline.predict(X_test)

In [12]:
from sklearn.metrics import confusion_matrix,classification_report

In [13]:
# Here's the results of how well our model performed at predicitng 'spam' messages from 'ham' (regular) messages
print(confusion_matrix(y_test,pred))
print('')
print(classification_report(y_test,pred))

[[1608    0]
 [  62  169]]

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1608
        spam       1.00      0.73      0.84       231

    accuracy                           0.97      1839
   macro avg       0.98      0.87      0.91      1839
weighted avg       0.97      0.97      0.96      1839

