In [12]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC, LinearSVR, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
import string
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
df = pd.read_csv('./dataset/spam.csv', encoding='latin-1').drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
X = df.message
y = LabelEncoder().fit_transform(df['class'])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=y,
                                                    test_size=0.2)

In [46]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words=STOP_WORDS)),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())
])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
               'clf__fit_prior': (True, False),
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='f1')
gs_clf = gs_clf.fit(X_train, y_train)

In [47]:
y_pred_test = gs_clf.predict(X_test)

In [48]:
confusion_matrix(y_test, y_pred_test)

array([[964,   2],
       [ 10, 139]])

In [49]:
gs_clf.best_params_

{'clf__alpha': 0.01,
 'clf__fit_prior': True,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2)}

**Next step**
 - tuning more parameters on this model
 - develop this model in app.py
 - develop html index & prediction page
 - create CSS for this 2 html 

In [51]:
model = Pipeline([('vect', CountVectorizer(stop_words=STOP_WORDS, 
                                            ngram_range=(1, 2))),
                      ('clf', MultinomialNB(alpha=1e-2))
])

model.fit(X, y)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words={"'d", "'ll", "'m", "'re", "'s",
                                             "'ve", 'a', 'about', 'above',
                                             'across', 'after', 'afterwards',
                                             'again', 'against', 'all',
                                             'almost', 'alone', 'along',
                                             'already', 'also', 'although',
                                             'always', 'am', 'among', 'amongst',
         

In [52]:
y_pred = model.predict(X)
confusion_matrix(y, y_pred)

array([[4820,    5],
       [   0,  747]])

In [54]:
print(model.predict(['Bonjour comment ça va ?']))

[0]
