## Natural Language Processing (NLP) & Text Classification with Naive Bayes

Data source :https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [51]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [52]:
df = pd.read_csv('SMSSpamCollection.csv',sep='\t', names=['Status','Message'])

In [53]:
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Status     5572 non-null object
Message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [55]:
# how many Status=='spam'?
len(df[df.Status=='spam'])

747

In [56]:
# how many Status=='ham'?
len(df[df.Status=='ham'])

4825

In [57]:
# change 'ham' to '1' and 'spam' to '0':
# use Broadcasting:
df.loc[df["Status"]=='ham',"Status"] = 1
df.loc[df["Status"]=='spam',"Status"] = 0

In [58]:
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Status     5572 non-null object
Message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [60]:
df["Status"].value_counts()

1    4825
0     747
Name: Status, dtype: int64

In [61]:
# features:
X = df["Message"]

In [62]:
# target:
y = df["Status"]

In [63]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [64]:
cv = CountVectorizer()

In [65]:
# count number of words in a df or in a list of text strings:
X_trans = cv.fit_transform(["Hi How are you How are you doing", "Hi what's up", "Wow that's awesome"])

In [66]:
print(X_trans)

  (0, 2)	1
  (0, 9)	2
  (0, 0)	2
  (0, 4)	2
  (0, 3)	1
  (1, 6)	1
  (1, 7)	1
  (1, 3)	1
  (2, 1)	1
  (2, 5)	1
  (2, 8)	1


In [67]:
# Rows are each document (or each text string)
# Columns are each word. Use cv.get_feature_names() to get each word.
X_trans = X_trans.toarray()
X_trans

array([[2, 0, 1, 1, 2, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [68]:
X_trans.shape

(3, 10)

In [30]:
# .get_feature_names() shows the names for each column:
# This is also the number of words in your bag-of-words.

cv.get_feature_names()

['are', 'awesome', 'doing', 'hi', 'how', 'that', 'up', 'what', 'wow', 'you']

In [31]:
# each word & its counts:

cv.vocabulary_

{'are': 0,
 'awesome': 1,
 'doing': 2,
 'hi': 3,
 'how': 4,
 'that': 5,
 'up': 6,
 'what': 7,
 'wow': 8,
 'you': 9}

In [32]:
# cv.inverse_transform gives each document's word list
cv.inverse_transform(X_trans)

[array(['are', 'doing', 'hi', 'how', 'you'],
       dtype='<U7'), array(['hi', 'up', 'what'],
       dtype='<U7'), array(['awesome', 'that', 'wow'],
       dtype='<U7')]

----
Now, let's train with the ham/spam df:

In [69]:
cv1 = CountVectorizer()

In [70]:
X_trans1 = cv1.fit_transform(X)

In [71]:
X_trans1 = X_trans1.toarray()
X_trans1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [72]:
# Now we have successfully transformed the X_train data from text to numerical features usable for machine learning.
# So now we can do any ML algorithms (Classification, NaiveBayes, etc.) to X_train.


In [106]:
# Convert back to check each document's word list.
# for the X_train df: each row is a document; each column is a word.

cv1.inverse_transform(X_trans1)

[array(['amore', 'available', 'buffet', 'bugis', 'cine', 'crazy', 'go',
        'got', 'great', 'in', 'jurong', 'la', 'only', 'point', 'there',
        'until', 'wat', 'world'],
       dtype='<U34'), array(['joking', 'lar', 'ok', 'oni', 'wif'],
       dtype='<U34'), array(['08452810075over18', '2005', '21st', '87121', 'apply', 'comp',
        'cup', 'entry', 'fa', 'final', 'free', 'in', 'may', 'question',
        'rate', 'receive', 'std', 'text', 'tkts', 'to', 'txt', 'win', 'wkly'],
       dtype='<U34'), array(['already', 'dun', 'early', 'hor', 'say', 'so', 'then'],
       dtype='<U34'), array(['around', 'don', 'goes', 'he', 'here', 'lives', 'nah', 'think',
        'though', 'to', 'usf'],
       dtype='<U34'), array(['50', 'and', 'back', 'been', 'chgs', 'darling', 'for', 'freemsg',
        'fun', 'hey', 'it', 'like', 'no', 'now', 'ok', 'rcv', 'send',
        'some', 'std', 'still', 'tb', 'there', 'to', 'up', 'week', 'word',
        'xxx', 'you'],
       dtype='<U34'), array(['aids', 'b

In [74]:
# Check the first document (first row)'s word list.

cv1.inverse_transform(X_trans1[0])

[array(['amore', 'available', 'buffet', 'bugis', 'cine', 'crazy', 'go',
        'got', 'great', 'in', 'jurong', 'la', 'only', 'point', 'there',
        'until', 'wat', 'world'],
       dtype='<U34')]

In [77]:
# Compare to original X_train's first row:
X.iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

----
## Use TfidfVectorizer:

When you have large datasets, it's better to use TfidfVectorizer than CountVectorizer.

TF-IDF stands for 

`Term-Frequency X Inverse Document Frequency`.

In the standard `CountVectorizer` model above, we used just the term frequency in a document of words in our vocabulary. 
In TF-IDF, we weight this term frequency by the inverse of its popularity in all documents. 
For example, if the word "movie" showed up in all the documents, it would not have much predictive value. 
It could actually be considered a stopword. By weighing its counts by 1 divided by its overall frequency, 
we downweight it. We can then use this TF-IDF weighted features as inputs to any classifier. 
**TF-IDF is essentially a measure of term importance, and of how discriminative a word is in a corpus.** 
 

Now, let's train with the ham/spam df:

In [74]:
tv = TfidfVectorizer(min_df=1, stop_words='english')

In [75]:
X_trans2 = tv.fit_transform(X)

In [76]:
print(X_trans2)

  (0, 4248)	0.350974793771
  (0, 5779)	0.274516665356
  (0, 2282)	0.271805817771
  (0, 1284)	0.262517699531
  (0, 1719)	0.296504924062
  (0, 3555)	0.193878669458
  (0, 8285)	0.237407158009
  (0, 4374)	0.296504924062
  (0, 1717)	0.335043378172
  (0, 2007)	0.296504924062
  (0, 3515)	0.164538318188
  (0, 1064)	0.350974793771
  (0, 8083)	0.196103322364
  (1, 5377)	0.271894406942
  (1, 4410)	0.408325854926
  (1, 4216)	0.523680433204
  (1, 8191)	0.431629575855
  (1, 5403)	0.546624314131
  (2, 3280)	0.116760286502
  (2, 2889)	0.364402259602
  (2, 8243)	0.192879844072
  (2, 2123)	0.196869828236
  (2, 8203)	0.149533154919
  (2, 3018)	0.475509428526
  (2, 2341)	0.204185153803
  :	:
  (5567, 2781)	0.232324875812
  (5567, 309)	0.243174910383
  (5567, 710)	0.250051234574
  (5567, 5840)	0.28016724436
  (5567, 165)	0.336894073438
  (5567, 5297)	0.336894073438
  (5568, 3813)	0.365205977748
  (5568, 3484)	0.364236977683
  (5568, 3267)	0.559709862066
  (5568, 2921)	0.648599173768
  (5569, 5022)	0.506896

In [77]:
X_trans2 = X_trans2.toarray()
X_trans2

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [78]:
X_trans2.shape

(5572, 8444)

In [80]:
tv.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [81]:
tv.vocabulary_

{'jurong': 4248,
 'point': 5779,
 'crazy': 2282,
 'available': 1284,
 'bugis': 1719,
 'great': 3555,
 'world': 8285,
 'la': 4374,
 'buffet': 1717,
 'cine': 2007,
 'got': 3515,
 'amore': 1064,
 'wat': 8083,
 'ok': 5377,
 'lar': 4410,
 'joking': 4216,
 'wif': 8191,
 'oni': 5403,
 'free': 3280,
 'entry': 2889,
 'wkly': 8243,
 'comp': 2123,
 'win': 8203,
 'fa': 3018,
 'cup': 2341,
 'final': 3135,
 'tkts': 7569,
 '21st': 412,
 '2005': 403,
 'text': 7437,
 '87121': 794,
 'receive': 6158,
 'question': 6052,
 'std': 7077,
 'txt': 7754,
 'rate': 6104,
 'apply': 1141,
 '08452810075over18': 77,
 'dun': 2751,
 'say': 6496,
 'early': 2770,
 'hor': 3840,
 'nah': 5124,
 'don': 2663,
 'think': 7492,
 'goes': 3479,
 'usf': 7892,
 'lives': 4562,
 'freemsg': 3287,
 'hey': 3757,
 'darling': 2398,
 'week': 8129,
 'word': 8276,
 'like': 4512,
 'fun': 3342,
 'tb': 7372,
 'xxx': 8350,
 'chgs': 1961,
 'send': 6584,
 '50': 616,
 'rcv': 6116,
 'brother': 1689,
 'speak': 6959,
 'treat': 7687,
 'aids': 997,
 'pate

In [82]:
tv.inverse_transform(X_trans2[0])

[array(['amore', 'available', 'buffet', 'bugis', 'cine', 'crazy', 'got',
        'great', 'jurong', 'la', 'point', 'wat', 'world'],
       dtype='<U34')]

In [83]:
X.iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

----
## Naive Bayes

Now that features X are transformed into X_trans2 (from text to numerical values), we can do a text classification with Naive Bayes:

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [84]:
# To do classification with Naive Bayes, we need the target to be integer values:
y = y.astype('int')

In [89]:
# train-test split the data:
X_train, X_test, y_train, y_test = train_test_split(X_trans2, y, test_size=0.2, random_state=4)

In [90]:
# use scikit-learn's MultinomialNB() classifier with default parameters:
clf = MultinomialNB()

# train the classifier over the training set:
clf.fit(X_train, y_train)

# get the accuracy scores for both the training and the test sets:
score_train = clf.score(X_train, y_train)
score_test = clf.score(X_test, y_test)

# print the accuracy scores for both the training and the test sets:
print('The accuracy score for the training set is {:.4f}'.format(score_train))
print('The accuracy score for the testing set is {:.4f}'.format(score_test))

The accuracy score for the training set is 0.9805
The accuracy score for the testing set is 0.9587


----
### Making Predictions

Let's predict either the first email in X_train is spam or ham:

In [94]:
# X_train[0] in text form:
tv.inverse_transform(X_train[0])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'online',
        'replying', 'sleeping', 'spys', 'wat'],
       dtype='<U34')]

In [93]:
y_train[0]  # real y is 'ham'

1

In [99]:
clf.predict(X_train)

array([1, 1, 0, ..., 1, 1, 0])

In [100]:
clf.predict(X_train)[0]   # correct classification

1

In [97]:
clf.predict_proba(X_train)  # two columns with first column being '0' and second column being '1'

array([[ 0.00287374,  0.99712626],
       [ 0.01937702,  0.98062298],
       [ 0.99235401,  0.00764599],
       ..., 
       [ 0.01719932,  0.98280068],
       [ 0.04203405,  0.95796595],
       [ 0.54241261,  0.45758739]])

In [101]:
clf.predict_proba(X_train)[0]  # first email in X_train has 0.99712626 probability of being classified as '1'.

array([ 0.00287374,  0.99712626])

----
## Industry Standard - Use Pipeline

In real world, we use pipeline.

In [134]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.metrics import classification_report, confusion_matrix

In [107]:
df = pd.read_csv('SMSSpamCollection.csv',sep='\t', names=['Status','Message'])
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [119]:
# features:
X = df["Message"]

# target:
y = df["Status"]


In [None]:
# train-test split:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


In [120]:
# X_train

In [121]:
# y_train

In [122]:
# Make a pipeline:

pipe = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())  
pipe

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...alse, use_idf=True)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [123]:
# Fit the pipeline:  

# Now we can directly pass message text data and the pipeline will do our pre-processing for us! 
# We can treat it as a model/estimator API:

pipe.fit(X_train, y_train)


Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...alse, use_idf=True)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [130]:
# Predictions on the testing data:

y_test_hat = pipe.predict(X_test)


In [131]:
# classification table:
print(classification_report(y_test, y_test_hat))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98       980
       spam       1.00      0.63      0.77       135

avg / total       0.96      0.96      0.95      1115



----
#### Instead of train-test split, use cross-validation on the whole data set:

In [132]:
predicted = cross_val_predict(pipe, X, y, cv=3, n_jobs=-1)

In [133]:
# classification table:
print(classification_report(y, predicted))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.97      4825
       spam       1.00      0.67      0.80       747

avg / total       0.96      0.96      0.95      5572



----
Try another model: random forest

In [135]:
pipe2 = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())  
pipe2

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [137]:
predicted2 = cross_val_predict(pipe2, X, y, cv=3, n_jobs=-1)

In [138]:
# classification table:
print(classification_report(y, predicted2))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      4825
       spam       0.99      0.78      0.88       747

avg / total       0.97      0.97      0.97      5572



----
Try another pipeline:  

In [139]:
pipe3 = make_pipeline(TfidfVectorizer(), MultinomialNB())  
pipe3

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [140]:
predicted3 = cross_val_predict(pipe3, X, y, cv=3, n_jobs=-1)

In [141]:
# classification table:
print(classification_report(y, predicted3))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.97      4825
       spam       1.00      0.67      0.80       747

avg / total       0.96      0.96      0.95      5572

