In [1]:
from nltk.corpus import stopwords
import pandas as pd
import string


In [2]:
# Load the CSV, decoding it as latin-1 and supplying the six column names explicitly.
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)

In [3]:
# Display the loaded frame as the cell output (rendered truncated: first and last rows only).
data

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
# Clean a message for the bag-of-words step: strip punctuation, drop "stopwords",
# and return the remaining tokens.

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the NLTK stop-word list so the corpus is read once, not once per word.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def text_process(mess, stop_words=None):
    """
    Tokenise a message into a list of cleaned words.

    1. remove punctuation (every character in ``string.punctuation``)
    2. split on whitespace
    3. remove stop words (compared case-insensitively)
    4. return the surviving tokens with their original casing

    Parameters
    ----------
    mess : str
        The raw message text.
    stop_words : container of str, optional
        Lower-case words to discard. Anything supporting ``in`` works; a set
        is fastest. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        Tokens that are not stop words; empty if nothing survives.

    Notes
    -----
    Punctuation is stripped *before* the stop-word comparison, so a stop-word
    entry that itself contains punctuation (an apostrophe, for instance) can
    never match a token.
    """
    # `is None` rather than truthiness, so an explicitly empty container is honoured.
    if stop_words is None:
        stop_words = _english_stop_words()

    nopunc = mess.translate(_PUNCTUATION_TABLE)
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [5]:
# CountVectorizer converts the messages into 'bags of words', building one large sparse count matrix for the entire message set
# TfidfTransformer (Term Frequency - Inverse Document Frequency) weights each word by how often it occurs in a message relative to how common it is across all messages, to determine its importance
# We use Naive Bayes (MultinomialNB) as the text classifier
# Pipeline chains these steps, which simplifies feeding training and test data through the same processing (e.g. for cross validation)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [6]:
# Build a single pipeline estimator that chains the vectoriser, the tf-idf
# weighting and the classifier; the step names are the strings given here.
pipeline = Pipeline(steps=[
    ('bag of words', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifer', MultinomialNB()),
])

In [7]:
# Split our data into training and testing sets
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 33% of the rows for evaluation. The fixed random_state makes the
# shuffle, and so every metric computed from this split, repeatable across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['target'], test_size=0.33, random_state=42
)

In [9]:
# Train every step of the pipeline on the training split
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('bag of words',
                 CountVectorizer(analyzer=<function text_process at 0x1a196ed8c0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifer',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [10]:
# Predict a label for each held-out message
pred = pipeline.predict(X=X_test)

In [23]:
from sklearn.metrics import confusion_matrix,classification_report


In [24]:
# Confusion matrix and per-class metrics for the two `target` classes, 0 and 4 (presumably negative vs. positive tweet sentiment; this dataset has no 'spam'/'ham' labels)
# Emit the confusion matrix, a blank line, then the classification report.
print(
    confusion_matrix(y_test, pred),
    '',
    classification_report(y_test, pred),
    sep='\n',
)

[[214399  49190]
 [ 75375 189036]]

              precision    recall  f1-score   support

           0       0.74      0.81      0.77    263589
           4       0.79      0.71      0.75    264411

    accuracy                           0.76    528000
   macro avg       0.77      0.76      0.76    528000
weighted avg       0.77      0.76      0.76    528000

