## TextBlob is a text processing library for Python
https://textblob.readthedocs.io/en/dev/

## The machine learning functions in TextBlob wrap NLTK
https://github.com/sloria/TextBlob

https://github.com/nltk/nltk/tree/develop/nltk/classify

## NLTK Naive Bayes
https://github.com/nltk/nltk/blob/develop/nltk/classify/naivebayes.py

In [66]:
import re
import random
import collections

from os import listdir

from nltk.corpus import stopwords
from string import punctuation

from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer instance. In the cells shown here it is only
# referenced from a commented-out `stemmer.stem(token)` line, so stemming is
# not actually applied to the tokens.
stemmer = SnowballStemmer(language='english', ignore_stopwords=True)

In [9]:
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
# Loads the training subset of the 20-newsgroups corpus into `texts` and lists
# the attributes of the returned object.
# NOTE(review): `texts` is rebound by the document-loading cell further down,
# so this looks like a leftover example -- confirm whether it is still needed.
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train')
dir(texts)

sklearn.datasets.base.Bunch

In [67]:
import os

doc_path = 'C:\\Users\\Artur_Zahreba\\Desktop\\WorkFusion\\P&G\\sample email June 15\\DocsTxt'

# Every entry directly under doc_path is treated as one document class; the
# entry name is used as the label of the .txt files found inside it.
doc_types = sorted(listdir(doc_path))

# (label, filename, text) triples for every .txt document found.
doc_texts = []

for doc_type in doc_types:
    type_dir = os.path.join(doc_path, doc_type)
    filenames = [name for name in sorted(listdir(type_dir)) if name.endswith('.txt')]
    for filename in filenames:
        # The with-statement closes the file; no explicit close() is needed.
        with open(os.path.join(type_dir, filename), 'r') as ffile:
            doc_texts.append((doc_type, filename, ffile.read()))

# No seed is set in this cell, so the document order (and therefore any
# positional train/test split made later) changes from run to run.
random.shuffle(doc_texts)
lables, _, texts = zip(*doc_texts)

# Sorting the distinct labels makes the label -> integer mapping reproducible;
# iterating a bare set gives an order that may differ between interpreter runs.
lable_dict = {lable: idx for idx, lable in enumerate(sorted(set(lables)))}

lables_int = [lable_dict[lable] for lable in lables]

# Lightweight container with sklearn-like field names.
# NOTE: unlike sklearn's Bunch, `target_names` here holds ONE LABEL PER
# DOCUMENT (parallel to `data` and `target`), not the list of distinct classes.
txts_tpl = collections.namedtuple('txts_tpl', ['data', 'target_names', 'target'])
texts = txts_tpl(texts, lables, lables_int)

print(len(texts.target))

194


In [68]:
# Labels of the first 20 documents (fewer if the corpus is smaller).
# `texts.target_names` holds one label per document, parallel to `texts.data`,
# so this is a sample of the labels rather than a class -> number mapping.
for i, target_name in enumerate(texts.target_names[:20]):
    print(i, target_name)

0 Approved
1 Approved
2 Approved
3 Approved
4 Approved
5 Other
6 Approved
7 Approved
8 Approved
9 Approved
10 Approved
11 Other
12 Approved
13 Approved
14 Approved
15 Approved
16 Approved
17 Other
18 Approved
19 Approved


In [69]:
# Display the raw text of the first document in `texts.data`.
texts.data[0]

"From: Velasquez, Tyrone\nSent: Friday, May 12, 2017 8:53 AM\nTo: Ng, RuBee; Lu, Darryl\nCc: Le, Wenyi; onetravelretail,\n\nIon; Tan, HerWen; Low, Jevon; Shin, Jiwon\nSubject: RE: Approval Needed - Release TRAVEL\n\nRETAIL Order | 2002825715 DFS VENTURE SINGAPORE PTE LTD [SK-II ] 13 May 2017\n\nThanks\n\nRubee. Approved\n\nTyrone\n\nTyrone\n\nBob C. Velasquez\n\nOut\n\nof Office (AMJ):\nLeaves:\nMay\n\n25 / Jun 9, 12-16\n\nPublic\n\nHolidays:\n\nJun\n\n26\n\nThis electronic message transmission\n\ncontains information which may be confidential. The information is intended for\n\nthe use of the individual or entity named above. If you are not the intended\n\nrecipient, and have received this electronic transmission in error, please\n\nnotify sender then delete immediately.\n\nThis message may contain Employee\n\nPersonally Identifiable Information (PII).\n\nBeforesharing this information with an internal or external contact, it\n\nis your responsibility to make sure there is a valid bus

In [70]:
# TextBlob tokenizes text on its own; this shows what its tokenizer produces
# for the lower-cased first document.
# https://github.com/sloria/TextBlob/blob/dev/textblob/classifiers.py
from textblob.tokenizers import word_tokenize
first_doc_lower = texts.data[0].lower()
first_doc_tokens = list(word_tokenize(first_doc_lower))
print(first_doc_tokens)

['from', ':', 'velasquez', ',', 'tyrone', 'sent', ':', 'friday', ',', 'may', '12', ',', '2017', '8:53', 'am', 'to', ':', 'ng', ',', 'rubee', ';', 'lu', ',', 'darryl', 'cc', ':', 'le', ',', 'wenyi', ';', 'onetravelretail', ',', 'ion', ';', 'tan', ',', 'herwen', ';', 'low', ',', 'jevon', ';', 'shin', ',', 'jiwon', 'subject', ':', 're', ':', 'approval', 'needed', '-', 'release', 'travel', 'retail', 'order', '|', '2002825715', 'dfs', 'venture', 'singapore', 'pte', 'ltd', '[', 'sk-ii', ']', '13', 'may', '2017', 'thanks', 'rubee', '.', 'approved', 'tyrone', 'tyrone', 'bob', 'c.', 'velasquez', 'out', 'of', 'office', '(', 'amj', ')', ':', 'leaves', ':', 'may', '25', '/', 'jun', '9', ',', '12-16', 'public', 'holidays', ':', 'jun', '26', 'this', 'electronic', 'message', 'transmission', 'contains', 'information', 'which', 'may', 'be', 'confidential', '.', 'the', 'information', 'is', 'intended', 'for', 'the', 'use', 'of', 'the', 'individual', 'or', 'entity', 'named', 'above', '.', 'if', 'you', 'ar

In [57]:
import collections

from nltk.tokenize import word_tokenize

# Positional 80/20 split: the first num_training documents are used for
# training, the remainder for testing.
num_training = int(len(texts.data) * 0.8)
num_testing = len(texts.data) - num_training

# First get the vocabulary. We are creating a vocabulary to limit the features,
# since each word will eventually be a feature.
# https://docs.python.org/2/library/collections.html#collections.Counter
# Only training documents contribute. They are joined with a space so that the
# last token of one document is not fused with the first token of the next.
all_text = ' '.join(doc.lower() for doc in texts.data[:num_training])

# English stop words, punctuation characters and a few e-mail specific tokens.
_stopwords = set(stopwords.words('english') + list(punctuation) +
                 ["pg.com", "from", "sent", "cc", "please", "thank"])

# Make a list of words; we need to tokenize ourselves to get this list.
# all_text is already lower-cased, so the tokens need no further case folding.
tokens = [token for token in word_tokenize(all_text) if token not in _stopwords]

# Get the most common words. A word is kept when it occurs at least 5 times
# and at most num_training times (a token count compared against the number of
# training documents).
cnt = collections.Counter(tokens).most_common()
vocab = [k for k, v in cnt if 5 <= v <= num_training]
print("The size of vocabulary is: ", len(vocab))
print(vocab[:20])
# checking in a set is much faster than checking in a list
vocab = set(vocab)


def _labelled_example(i):
    """Return (vocabulary-filtered lower-cased text, label) for document i.

    `texts.target_names` as built by the document-loading cell holds one label
    per document, so it is indexed by the document position. Indexing it with
    the integer class id `texts.target[i]` (as was done before) returns the
    label of document 0 or 1 instead of the label of document i.
    NOTE(review): if `texts` is an sklearn Bunch (class-name list in
    `target_names`), the lookup must go through `texts.target[i]` instead.
    """
    kept = [t.lower() for t in word_tokenize(texts.data[i]) if t.lower() in vocab]
    return (' '.join(kept), texts.target_names[i])


# Now create the training and testing data, filtering out words not in our vocabulary
# This is important because each word is a feature, and you don't want too many features
training_data = [_labelled_example(i) for i in range(num_training)]
testing_data = [_labelled_example(i)
                for i in range(num_training, num_training + num_testing)]
print(training_data[0])

The size of vocabulary is:  1767
['service', 'sk2', 'june', '2ml', 'wingyan.c', 'march', 'stock', 'cream', 'pls', 'code', 'mask', 'operations', 'teo', 'ship', 'tr', 'ng.rb', 'kindly', 'us', 'operation', 'check']
('wu david 25 ginny seek tw jun incentive ginny 25 wu david wu.da.2 wingyan.c seek tw jun incentive david reseek jun incentive due code change drop accordingly 25 ginny liang.gl.1 seek tw jun incentive ginny code change kindly reseek amended form nice day analyst service operation ginny 25 seek tw jun incentive need resend case 25 10:16 ginny liang.gl.1 wu david wu.da.2 wingyan.c seek tw jun incentive ginny pls see update incentive proceed except ww sku cant ship pls asia sku instead ean code product description unit cost total cost quantity gcas stock ctmz ever rich sk2 aa ageless beauty cream duo set rna crm 24 sk2 rna pwr rdcl age ess duo set rna ess 24 4979006052404 330ml 9.46 82248447 4979006067088 r.n.a.power radical age cream 100g 60 82248646 mask deluxe set 24 cant ship

In [58]:
# The standard TextBlob Naive Bayes classifier re-parses the whole text of the corpus
# for each record. This makes it slow. I fixed that and included the fix in the
# course materials
# NOTE(review): `our_textblob_classifiers` is not part of TextBlob -- presumably
# the patched module shipped with the course materials; it must be importable
# from the notebook's working directory. The line below is the stock alternative.
#from textblob.classifiers import NaiveBayesClassifier
from our_textblob_classifiers import NaiveBayesClassifier

# Train. Takes a while
# training_data is a list of (text, label) tuples.
cl = NaiveBayesClassifier(training_data)

In [59]:
# Shows what the features look like and what the important ones are
# Very helpful for debugging and understanding data
# NOTE(review): the argument is presumably the maximum number of features to
# list, as in NLTK's show_informative_features(n) -- confirm for this module.
cl.show_informative_features(20)

Most Informative Features
         contains(avoid) = True           Approv : Other  =     20.9 : 1.0
       contains(related) = True           Approv : Other  =     20.9 : 1.0
        contains(result) = True           Approv : Other  =     17.7 : 1.0
        contains(direct) = True           Approv : Other  =     17.7 : 1.0
        contains(packed) = True           Approv : Other  =     17.7 : 1.0
       contains(matters) = True           Approv : Other  =     17.7 : 1.0
     contains(regarding) = True           Approv : Other  =     17.7 : 1.0
         contains(12:19) = True           Approv : Other  =     14.4 : 1.0
         contains(click) = True           Approv : Other  =     14.4 : 1.0
        contains(access) = True           Approv : Other  =     14.4 : 1.0
  contains(image004.jpg) = True           Approv : Other  =     14.4 : 1.0
        contains(placed) = True           Approv : Other  =     14.4 : 1.0
          contains(cuts) = True           Approv : Other  =     14.4 : 1.0

In [60]:
# Accuracy of the classifier on the held-out testing_data.
# NOTE(review): the original remark "baseline is 5% because we have 20 classes"
# was written for the 20-newsgroups data. The e-mail labels printed earlier in
# this notebook are only 'Approved' and 'Other', so the meaningful baseline is
# presumably the majority-class rate -- TODO confirm against the label counts.
print("Accuracy: ", cl.accuracy(testing_data))

Accuracy:  0.7692307692307693


In [64]:
from nltk.tokenize import word_tokenize

testDoc = r'C:\Users\Artur_Zahreba\Desktop\WorkFusion\P&G\sample email June 15\Emails\Q\TEST\RE Approval DFS HPP Qty Split - DFS OKI.txt'
with open(testDoc, 'r') as ffile:
    testText = ffile.read()
# The classifier was trained on lower-cased text restricted to `vocab`, so the
# raw e-mail is put through the same filtering before it is classified;
# otherwise its tokens would not line up with the trained word features.
# NOTE(review): the earlier remark that short texts classify poorly may partly
# have been caused by classifying unprocessed text -- re-check after this change.
test_tokens = word_tokenize(testText)
testItemText = ' '.join([t.lower() for t in test_tokens if t.lower() in vocab])
cl.classify(testItemText)

'Other'

In [65]:
# Compare predicted and actual labels for the first ten test examples,
# printing the first 80 characters of each (already filtered) text.
for item_text, actual_label in testing_data[:10]:
    predicted_label = cl.classify(item_text)
    print(item_text[:80])
    print("Predicted: {}, Actual: {}".format(predicted_label, actual_label))

behalf 27 ond jfm allowance processed eta analyst service operation 27 chia.ic f
Predicted: Other, Actual: Other
september 29 2016 ho soriano paul toll asia operations quek josephine honghupron
Predicted: Other, Actual: Approved
jevon june 5 le wenyi shin jiwon chan meilay dietrich wendi 2002587038 america 0
Predicted: Other, Actual: Other
ginny march 31 chai winnie location set apr request ship alone since theres oos 
Predicted: Approved, Actual: Other
march 14 chai winnie soriano paul chan meilay honghupronnie ayard henri ginny lo
Predicted: Approved, Actual: Other
27 5:24 ond jfm allowance told stocks tomorrow tomorrow instead 27 chia.ic fw on
Predicted: Other, Actual: Other
shin jiwon march 31 jevon chan meilay pannese marianna retail| logistics 31 mar 
Predicted: Other, Actual: Other
shin jiwon 6 dietrich wendi jevon chan meilay 2002708880 starboard cruise servic
Predicted: Other, Actual: Other
koh yueee 17 sharma shweta jevon shin jiwon chan meilay tr delay due payment tim
Predic

## Exercise: play with the hyperparameters (like the vocabulary size), and see if you can improve the performance.