In [4]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

stop = stopwords.words('english')
stop.append('')
porter = PorterStemmer()

def del_stop(text):
    return list(set(text).difference(set(stop)))

def tokenizer_porter(text):
    return [porter.stem(word) for word in text]

def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1,3), analyzer = 'word', binary = False)
    clf = MultinomialNB()
    return Pipeline([('vect', tfidf_ngrams), ('clf', clf)])


In [5]:
labels = {'ham': 0, 'spam': 1}
df = pd.DataFrame()

In [6]:
text = open('SMSSpamCollection.txt', 'r')
symbol = ['\t','\n' '\\', '/']

In [7]:
for i in text:
    for j in symbol:
        i = i.replace(j, ' ')
    i = i.replace('\t', ' ')
    i = i.replace('\n', ' ')
    i = i.replace('\\', ' ')
    i = i.lower()
    i = i.split(' ')
    df = df.append([[labels[i[0]], i[1:]]], ignore_index = True)

In [8]:
df.columns = ['spam', 'text']


In [9]:
X_train = df.loc[:5000, 'text'].values
X_test = df.loc[5000:, 'text'].values
y_train = df.loc[:5000, 'spam'].values
y_test = df.loc[5000:, 'spam'].values

X = df.loc[:, 'text'].values

In [10]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer_porter],
                'clf__alpha': [0.5, 0.75, 1] 
              },
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer':  [tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None ,'l1', 'l2'],
                'clf__alpha': [0.5, 0.75, 1]
              },
              ]

MNB_tfidf = Pipeline([('vect', tfidf),
                     ('clf',  MultinomialNB())])

gs_MNB_tfidf = GridSearchCV(MNB_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [11]:
gs_MNB_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  4.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...ed6ae8>], 'vect__use_idf': [False], 'vect__norm': [None, 'l1', 'l2'], 'clf__alpha': [0.5, 0.75, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [12]:
print('Best parameter set: %s ' % gs_MNB_tfidf.best_params_)
print('CV Accuracy: %.5f' % gs_MNB_tfidf.best_score_)

Best parameter set: {'clf__alpha': 0.5, 'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x10aed6ae8>, 'vect__use_idf': False} 
CV Accuracy: 0.98480


In [13]:
clf = gs_MNB_tfidf.best_estimator_
print('Test Accuracy: %.5f' % clf.score(X_test, y_test))

Test Accuracy: 0.99652


In [14]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer_porter],
               'clf__alpha': [0.5, 0.75, 1]
              },
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer':  [tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
                'clf__alpha': [0.5, 0.75, 1]
              },
              ]

BNB_tfidf = Pipeline([('vect', tfidf),
                     ('clf',  BernoulliNB())])

gs_BNB_tfidf = GridSearchCV(BNB_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [15]:
gs_BNB_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ... vocabulary=None)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...ter at 0x10aed6ae8>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__alpha': [0.5, 0.75, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [16]:
print('Best parameter set: %s ' % gs_BNB_tfidf.best_params_)
print('CV Accuracy: %.5f' % gs_BNB_tfidf.best_score_)

Best parameter set: {'clf__alpha': 0.5, 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x10aed6ae8>} 
CV Accuracy: 0.97760


In [17]:
clf = gs_BNB_tfidf.best_estimator_
print('Test Accuracy: %.5f' % clf.score(X_test, y_test))

Test Accuracy: 0.98084
