In [31]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC, LinearSVR, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.linear_model import SGDClassifier

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
# Load the SMS dataset (latin-1 encoded) and discard the three
# 'Unnamed: N' columns, keeping the `class` and `message` columns used below.
df = (
    pd.read_csv('./dataset/spam.csv', encoding='latin-1')
      .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Features are the raw message strings; targets are the class labels turned
# into integer codes by LabelEncoder (presumably ham -> 0, spam -> 1, since
# the encoder numbers classes in sorted order -- verify via `classes_`).
X = df['message']
y = LabelEncoder().fit_transform(df['class'])

In [6]:
# Hold out 20% of the samples for testing. Stratifying on `y` keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [13]:
# Text pipeline: token counts -> TF-IDF re-weighting -> multinomial Naive Bayes.
# spaCy's STOP_WORDS is a `set`, but scikit-learn's parameter validation only
# accepts 'english', a list, or None for `stop_words` (a set raises
# InvalidParameterError on fit in recent releases). Passing a sorted list
# satisfies the check and keeps the value deterministic between runs.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=sorted(STOP_WORDS))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Search grid; keys follow the `<pipeline step>__<parameter>` convention.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3, 1e-4),        # Naive Bayes smoothing strength
}

# Exhaustive grid search on all available cores. No `scoring` is given, so the
# estimator's default scorer is used to rank candidates.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [15]:
# Predict labels for the held-out test messages with the fitted grid search.
y_pred_test = gs_clf.predict(X_test)

In [17]:
# Test-set confusion matrix: rows are true labels, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[964,   2],
       [ 10, 139]])

In [39]:
# Same pipeline as before, but this time the grid search ranks candidates by F1.
# spaCy's STOP_WORDS is a `set`, but scikit-learn's parameter validation only
# accepts 'english', a list, or None for `stop_words` (a set raises
# InvalidParameterError on fit in recent releases). Passing a sorted list
# satisfies the check and keeps the value deterministic between runs.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=sorted(STOP_WORDS))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Search grid; keys follow the `<pipeline step>__<parameter>` convention.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3, 1e-4),        # Naive Bayes smoothing strength
}

# scoring='f1' selects the best candidate by F1 on the positive class (label 1,
# presumably spam -- confirm against the label encoding) rather than by the
# default scorer, which matters because the classes are imbalanced.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring='f1')
gs_clf = gs_clf.fit(X_train, y_train)

# Evaluate the selected model on the held-out test split.
y_pred_test = gs_clf.predict(X_test)

# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred_test)

array([[964,   2],
       [ 10, 139]])

In [40]:
# Hyper-parameter combination selected by the grid search.
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [41]:
# Confusion matrix on the training split (rows = true labels, columns =
# predictions), for comparison with the test-set result.
y_pred_train = gs_clf.predict(X_train)

confusion_matrix(y_true=y_train, y_pred=y_pred_train)

array([[3856,    3],
       [   0,  598]])

**Next steps**
 - Tune more hyperparameters of this model
 - Integrate this model into app.py
 - Develop the HTML index and prediction pages
 - Create CSS for these two HTML pages