In [2]:
from __future__ import print_function
import re
import pandas as pd
import numpy as np
from os import listdir
from collections import Counter
from bs4 import BeautifulSoup
import sklearn
import xgboost as xgb
import nltk
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

import math 
import gensim

import pickle

from string import punctuation
from nltk.corpus import stopwords

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer



In [3]:
# English Snowball stemmer; ignore_stopwords=True asks NLTK to leave stop words unstemmed.
stemmer = SnowballStemmer('english', ignore_stopwords=True)

# gaps=True makes the pattern describe the separators, so this splits on whitespace runs.
# Disabled alternative that matched the tokens themselves:
# tokenizer = RegexpTokenizer(r'[a-z\d]{2,}')
tokenizer = RegexpTokenizer(pattern=r'\s+', gaps=True)

## Get Texts

In [4]:
# Root folder of the training corpus: the loading cell treats every sub-folder as a
# document class and reads the .txt files inside it.
# NOTE(review): machine-specific absolute Windows path — adjust before running elsewhere.
doc_path = r'C:\Users\Artur_Zahreba\Desktop\WorkFusion\P&G\Approval By Email\Training Set\v5\Training Set\ALL_FIXED'

In [5]:
# Every sub-folder of doc_path is one document class; its name is used as the label.
doc_types = sorted(listdir(doc_path))

# One (doc_type, filename, text) tuple per document, in sorted folder / file order.
doc_texts = []

for doc_type in doc_types:
    # Build the class folder path once instead of once per file.
    type_dir = doc_path + '\\' + doc_type
    # Only .txt files are documents; anything else in the folder is skipped.
    filenames = [name for name in sorted(listdir(type_dir)) if name.endswith('.txt')]
    for filename in filenames:
        # The with-block closes the file on exit, so no explicit close() is needed.
        with open(type_dir + '\\' + filename, 'r') as ffile:
            txt = ffile.read()
        doc_texts.append((doc_type, filename, txt))

print(len(doc_texts))

737


In [6]:
# Unzip the (doc_type, filename, text) tuples into parallel sequences; the filenames
# are not needed here and land in the throwaway name `_`.
lables, _, texts = zip(*doc_texts)

In [7]:
lables = list(lables)

# List comprehensions instead of map(): on Python 3 map() is lazy, which would break
# the texts[0] preview below and the later len(texts) / np.array(texts).
# NOTE(review): stemmer.stem() receives a whole document rather than a single token,
# so it presumably does little beyond lower-casing — confirm this is intended.
texts = [stemmer.stem(doc) for doc in texts]
# Split on whitespace; the tokens are re-joined with single spaces in the next step.
texts = [tokenizer.tokenize(doc) for doc in texts]
# Replace whitespace-delimited all-digit tokens with a single space.
# NOTE(review): each match consumes its trailing whitespace, so the second of two
# adjacent numbers ("1 2") is not matched and survives.
texts = [re.sub(r'\s\d+\s', ' ', ' '.join(tokens)) for tokens in texts]
texts[0]

u'from: liang, anna sent: wed may 10:05:28 cest to: chia, ian; nino, daisyanne; onetravelretail, ion; muthuraman, alagammai cc: tan, herwen; neo, joyce; eunice, chiam subject: : [approval] nro - kingpower tester request approve anna : chia, ian : 16:00 : nino, daisyanne <nino.d@pg.com>; onetravelretail, ion <onetravelretail.im@pg.com>; muthuraman, alagammai <muthuraman.am@pg.com> : tan, herwen <tan.hw@pg.com>; neo, joyce <neo.j.1@pg.com>; liang, anna <liang.an.2@pg.com>; eunice, chiam <eunice.ec@pg.com> : re: [approval] nro - kingpower tester request hi anna, please approve attached nro. hi daisy, thanks! thank you! ian the metropolis, north buona vista drive, #21-07, the metropolis tower 2, singapore ooo date: 12 may 2017, please expect delayed response on enquiries and emails. all decisions on pricing, promotion, distribution, assortment and shelving are at the sole discretion of the retailer. this electronic message transmission contains information which may be confidential. the in

In [8]:
# Class name -> integer id. A set has no defined iteration order, so the names are
# sorted to make the ids reproducible between runs; reverse order reproduces the
# mapping recorded in this notebook's output ({'Approved': 1, 'Other': 0}).
lable_dict = {name: idx for idx, name in enumerate(sorted(set(lables), reverse=True))}
lable_dict

{'Approved': 1, 'Other': 0}

In [9]:
# Reverse lookup: integer class id -> class name.
inv_lable_dict = {class_id: class_name for class_name, class_id in lable_dict.items()}
print('Number of classes:', len(inv_lable_dict))
print(inv_lable_dict)

Number of classes: 2
{0: 'Other', 1: 'Approved'}


In [10]:
# Integer-encoded labels, aligned with `texts`. Built as a real list because a lazy
# map() object (Python 3) would turn np.array(lables_int) into a 0-d object array.
lables_int = [lable_dict[name] for name in lables]

## Vectorize and Split

In [11]:
# Split document *indices* 80/20 with a fixed seed, so that texts and labels can both
# be sliced with the same index lists in the next cell.
# NOTE(review): the split is not stratified by class — confirm that is acceptable.
train_ind, test_ind = train_test_split(range(len(texts)), test_size = 0.2, random_state=42)

In [12]:
# Array versions of the corpus and the labels, so both can be indexed by the split lists.
X = np.array(texts)
y = np.array(lables_int)

# Slice documents and labels with the same index lists to keep them aligned.
X_train, X_test = X[train_ind], X[test_ind]
y_train, y_test = y[train_ind], y[test_ind]

In [13]:
# TF-IDF over unigrams and bigrams of lower-cased tokens matching [a-z\d]{2,}.
# min_df / max_df are floats, which scikit-learn documents as fractions of documents.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    token_pattern=r'[a-z\d]{2,}',
    ngram_range=(1,2),
    lowercase=True,
    strip_accents='unicode',
    stop_words="english",
    min_df=0.3,
    max_df=0.9,
    max_features=None,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm=u'l2',
)
# Fit on the training documents only.
vectorizer.fit(X_train)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.3,
        ngram_range=(1, 2), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=False,
        token_pattern='[a-z\\d]{2,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# Persist the fitted vectorizer. The with-block guarantees the file handle is flushed
# and closed; the bare open() used before left it open.
with open(r"vectorizers\tfidf\vectorizer2.pk", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [15]:
# Replace the raw-text arrays with their TF-IDF matrices.
# Note: this rebinds X_train / X_test, so the cell cannot be re-run without redoing the split.
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [17]:
# Sanity check: shape of the training matrix and class counts of the training labels.
print(X_train.shape)
print(Counter(y_train))

(589, 201)
Counter({0: 472, 1: 117})


In [18]:
# Sanity check: shape of the test matrix and class counts of the test labels.
print(X_test.shape)
print(Counter(y_test))

(148, 201)
Counter({0: 114, 1: 34})


In [17]:
# Wrap the feature matrices and their labels in XGBoost's DMatrix container.
D_train, D_test = (
    xgb.DMatrix(X_train, label=y_train),
    xgb.DMatrix(X_test, label=y_test),
)

## Parameters

In [18]:
# Base booster settings; later cells add further keys to this dict.
param = {
    'nthread': 4,         # number of parallel threads
    'booster': 'gbtree',  # tree based model
    # Step size shrinkage used in update to prevent overfitting.
    # Range [0, 1], default 0.3
    # If overfitting is observed, reduce eta and increase num_round at the same time.
    'eta': 0.3,
}

# Number of boosting rounds passed to xgb.train.
num_round = 200

In [19]:
# Model-complexity controls, added to `param` in a single update() call.
param.update({
    # Minimum loss reduction required to make a split.
    # Range [0, infinity], default 0 - no control
    'gamma': 0.2,
    # Maximum depth of a tree.
    # Range [1, infinity], default 6
    'max_depth': 2,
    # Minimum sum of instance weight needed in a child.
    # Range [0, infinity], default 1
    'min_child_weight': 1.0,
})

# Disabled: maximum delta step allowed for each tree's weight estimation
# (helps convergence when the right probability matters).
# param['max_delta_step'] = 10
# Range [0, infinity], default 0 - no control

In [20]:
# Row / column subsampling, intended to make the model more robust to noise.
param.update({
    # Subsample ratio of the training instances. Range (0, 1], default 1
    'subsample': 0.5,
    # Subsample ratio of columns when constructing each tree. Range (0, 1], default 1
    'colsample_bytree': 0.5,
    # Column subsample ratio, per tree level.
    'colsample_bylevel': 0.5,
})

In [21]:
# Regularization terms (filed under "For Linear Booster" in the original notes).
param.update({
    'lambda': 5.0,  # L2 regularization term on weights
    'alpha': 0.0,   # L1 regularization term on weights
})

# Disabled: L2 regularization on the bias term.
# param['lambda_bias'] = 0

In [22]:
# Learning objective.
# Disabled alternative - logistic regression for binary classification, output probability:
# param['objective'] = 'binary:logistic'
param.update({
    # Multiclass classification using the softmax objective; requires num_class.
    'objective': 'multi:softmax',
    # One class per distinct label name.
    'num_class': len(lable_dict),
})

In [23]:
# Balance the positive and negative weights: with 0/1 labels this is
# (#label-0 rows) / (#label-1 rows + 1); the +1 avoids a division by zero.
# NOTE(review): scale_pos_weight is documented for binary objectives; it presumably has
# no effect together with 'multi:softmax' — confirm.
param['scale_pos_weight'] = 1.0 * (y_train.shape[0] - y_train.sum()) / (y_train.sum() + 1)

# Evaluation metric reported for the watchlist.
# 'merror' (multiclass error rate) matches the 'multi:softmax' objective and the metric
# used in the cross-validation section. XGBoost documents multiclass 'auc' only for
# 'multi:softprob', so the earlier 'auc' setting is kept below for reference only.
param['eval_metric'] = 'merror'
# param['eval_metric'] = 'auc' # only care about ranking order
# Options: 'rmse', 'logloss', 'error', 'auc', 'merror', 'mlogloss'

param['silent'] = 0

# param['one_drop'] = 1
# param['rate_drop'] =  0.2
param['updater'] = 'grow_histmaker,refresh'
# param['sample_type'] =  "weighted"
# param['normalize_type'] = "forest"


## Training

In [None]:
# Datasets whose metric is printed after every boosting round. Each entry needs its own
# name: with both called 'eval' the train and test columns could not be told apart.
watchlist = [(D_train, 'train'), (D_test, 'eval')]
bst = xgb.train(param, D_train, num_round, watchlist)

In [72]:
# Predictions of the trained booster for the hold-out set.
xxx = bst.predict(D_test)

# Share of hold-out documents whose prediction equals the true label. The element-wise
# numpy comparison replaces map(...).count(1), which only works on Python 2 where
# map() returns a list.
accuracy = float((xxx == y_test).sum()) / len(y_test)
print(accuracy)

0.878378378378


In [73]:
# Write the trained booster to disk (relative path, so it resolves against the
# notebook's working directory).
bst.save_model(r'models\xgb\2classes_xgb_tfidf_v1.model')

In [76]:
# TODO:
# Plot xgb metrics

# Feature-importance plot for the trained booster.
xgb.plot_importance(bst)

# Quick 5-fold cross-validation over the training DMatrix.
# The earlier call was R syntax (params= / data= / label= / nround= / prediction = TRUE)
# and referenced the undefined name `params`; the Python API takes the parameter dict,
# a DMatrix and num_boost_round. `metrics` is passed explicitly so the check does not
# depend on param['eval_metric'].
res = xgb.cv(param, D_train, num_boost_round=2, nfold=5, metrics='merror')
print(res)

# Tree rendering needs the Graphviz `dot` executable on PATH (the recorded output of
# this cell is an ExecutableNotFound error); kept last so the steps above still run
# when Graphviz is missing.
xgb.plot_tree(bst, num_trees=2)
xgb.to_graphviz(bst, num_trees=2)

ExecutableNotFound: failed to execute ['dot', '-Tpng'], make sure the Graphviz executables are on your systems' PATH

# Cross-validation

In [37]:
# Rebuild labels and texts from the raw corpus so this section does not depend on the
# hold-out section's variables.
lables, _, texts = zip(*doc_texts)

lables = list(lables)
# List comprehensions instead of map(): on Python 3 map() is lazy and the later
# np.array(texts) would not produce an array of documents.
# NOTE(review): stemmer.stem() receives a whole document rather than a single token,
# so it presumably does little beyond lower-casing — confirm this is intended.
texts = [stemmer.stem(doc) for doc in texts]
# Split on whitespace; the tokens are re-joined with single spaces in the next step.
texts = [tokenizer.tokenize(doc) for doc in texts]
# Replace whitespace-delimited all-digit tokens with a single space.
texts = [re.sub(r'\s\d+\s', ' ', ' '.join(tokens)) for tokens in texts]

In [38]:
# Disabled experiment: replace years / dates with placeholder tokens.
# texts = map(lambda x: re.sub(r'(?<=[^\w])20\d\d(?=[^\w]?)', 'year_mask', x), texts)
# texts = map(lambda x: re.sub(r'year_mask +(0[1-9]|1[0-2]) +([0-2][1-9]|3[01])', 'date_mask', x), texts)
# texts = map(lambda x: re.sub(r'year_mask +([0-2][1-9]|3[01]) +(0[1-9]|1[0-2])', 'date_mask', x), texts)
# texts = map(lambda x: re.sub(r'([0-2][1-9]|3[01]) +(0[1-9]|1[0-2]) +year_mask', 'date_mask', x), texts)
# texts = map(lambda x: re.sub(r'(0[1-9]|1[0-2]) +([0-2][1-9]|3[01]) +year_mask', 'date_mask', x), texts)

# Class name -> integer id. Sorted because a set has no defined iteration order; reverse
# order reproduces the mapping recorded earlier in this notebook
# ({'Approved': 1, 'Other': 0}).
lable_dict = {name: idx for idx, name in enumerate(sorted(set(lables), reverse=True))}
# Reverse lookup: integer id -> class name.
inv_lable_dict = {idx: name for name, idx in lable_dict.items()}

# Built from a list: np.array(map(...)) would be a 0-d object array on Python 3.
lables_int = np.array([lable_dict[name] for name in lables])

# Disabled: a single vectorizer fitted on the whole corpus (the CV loop fits one per fold).
# vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(sublinear_tf=False, max_df=1.0, min_df=5, use_idf=True, 
#                                                              stop_words='english', lowercase=True, max_features=None,
#                                                              strip_accents = 'unicode', ngram_range=(1,3), norm=u'l2',
#                                                              smooth_idf=True, token_pattern=r'[a-z]{3,}')

# X = vectorizer.fit_transform(texts)
# X holds the raw documents; they are vectorized inside each CV fold.
X = np.array(texts)
y = lables_int

In [1]:
# Dataset overview: array shape, number of classes and documents per class.
print(X.shape)
print('# classes:', len(lable_dict))
for class_name, class_id in lable_dict.items():
    # Pad the class name so the counts line up for names of up to 10 characters.
    padding = ' '*(10-len(class_name))
    print(class_name, padding,':', list(lables_int).count(class_id))
print(lable_dict)
print(Counter(y))

NameError: name 'X' is not defined

In [40]:
# Booster settings for the cross-validation run.
param = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'eta': 0.3,
    'gamma': 0.1,
    'min_child_weight': 0.1,
    # Computed from this section's full label array `y`. The previous `y_train` was
    # whatever an earlier cell happened to leave behind (hidden state).
    # NOTE(review): scale_pos_weight is documented for binary objectives; it presumably
    # has no effect together with 'multi:softmax' — confirm.
    'scale_pos_weight': 1.0 * (y.shape[0] - y.sum()) / (y.sum() + 1),
    # 'max_delta_step': 10,
    'max_depth': 5,
    'silent': 1,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'colsample_bylevel': 0.5,
    'alpha': 0.0,
    'lambda': 5.0,
    'nthread': 4,
    'num_class': len(lable_dict),
    # 'one_drop': 1,
    # 'rate_drop': 0.2,
    'updater': 'grow_histmaker,prune',
    # 'sample_type': "weighted",
    # 'normalize_type': "forest",
    'eval_metric': 'merror',
}
# Number of boosting rounds per fold.
num_round = 200

In [41]:
# 5-fold splitter with shuffling and a fixed seed.
# NOTE(review): plain KFold does not stratify by class — consider whether the class
# imbalance calls for a stratified splitter.
cv = sklearn.model_selection.KFold(n_splits=5, random_state=42, shuffle=True)

In [42]:
accs = []                       # accuracy of each fold
incorrect_classes = Counter()   # true class name -> number of misclassified documents
incorrect_files = Counter()     # filename -> number of folds in which it was misclassified
common_mistakes = Counter()     # (predicted name, true name) -> count

# enumerate() replaces the hand-maintained fold counter; i / j are the train / test
# index arrays of the fold.
for cc, (i, j) in enumerate(cv.split(range(len(doc_texts)))):
    print ('Fold', cc, '...')
    X_train, X_test = X[i], X[j]
    y_train, y_test = y[i], y[j]

    # A fresh vectorizer per fold, fitted on the training part only, so no statistics
    # of the test documents leak into the features.
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        sublinear_tf=False,
        max_df=0.9,
        min_df=0.3,
        use_idf=True,
        stop_words='english',
#         stop_words=_stopwords,
        lowercase=True,
        max_features=None,
        strip_accents = 'unicode',
        ngram_range=(1,3),
        norm=u'l2',
        smooth_idf=True,
        token_pattern=r'[a-z\d]{2,}'
    )

    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    D_train = xgb.DMatrix(X_train, label = y_train)
    D_test = xgb.DMatrix(X_test, label = y_test)

    bst = xgb.train(param, D_train, num_round)
    xxx = bst.predict(D_test)

    # Element-wise comparison instead of map(...).count(1), which needs Python 2's
    # list-returning map().
    accuracy = float((xxx == y_test).sum()) / len(y_test)
    accs.append(accuracy)

    # Book-keeping for every misclassified document of this fold.
    for pr in range(len(xxx)):
        if xxx[pr] != y_test[pr]:
            # Cast to int so the lookup does not rely on float predictions hashing
            # like the integer dict keys.
            pred_name = inv_lable_dict[int(xxx[pr])]
            true_name = inv_lable_dict[int(y_test[pr])]
            incorrect_classes[true_name] += 1
            # j[pr] maps the position inside the fold back to the corpus index.
            incorrect_files[doc_texts[j[pr]][1]] += 1
            common_mistakes[(pred_name, true_name)] += 1

    print ('---------------------------------\n', 'Done')
    print ('Fold', cc, 'accuracy :\t', accuracy, '\n')

# Mean accuracy over all folds.
cv_acc = sum(accs)/len(accs)
print ('%s-Fold cross-validation accuracy over %s classes for the basic BST model is:' % (cv.n_splits,len(lable_dict)), cv_acc)

Fold 0 ...
---------------------------------
 Done
Fold 0 accuracy :	 0.865771812081 

Fold 1 ...
---------------------------------
 Done
Fold 1 accuracy :	 0.851351351351 

Fold 2 ...
---------------------------------
 Done
Fold 2 accuracy :	 0.898648648649 

Fold 3 ...
---------------------------------
 Done
Fold 3 accuracy :	 0.898648648649 

Fold 4 ...
---------------------------------
 Done
Fold 4 accuracy :	 0.878378378378 

5-Fold cross-validation accuracy over 2 classes for the basic BST model is: 0.878559767822


In [43]:
# True classes ranked by how often their documents were misclassified.
incorrect_classes.most_common()

[('Approved', 61), ('Other', 29)]

In [44]:
# Number of distinct misclassified files, then the (predicted, true) pairs by frequency.
print(len(incorrect_files))
print(common_mistakes.most_common())
print()
# Files ranked by how many times they were misclassified.
incorrect_files.most_common()

85
[(('Other', 'Approved'), 61), (('Approved', 'Other'), 29)]



[('RE   Approval  BC OND JFM allowance.txt', 2),
 ('RE    NRO request  DFS SG   SK-II miniature .txt', 2),
 ('RE  Approval Needed - Release TRAVEL RETAIL  2002825715  DFS VENTURE SINGAPORE PTE LTD   SKII   17 APR 2017.txt',
  2),
 ('Approval  NRO - Kingpower Tester request.txt', 2),
 ('RE  New PO# G124293 for Procter & Gamble   (1).txt', 2),
 ('RE  NRO FTE 2ml Sachet - Hainan Order.txt', 1),
 ('FW   Approval  GWP DFS Aug 2017 First Class Beauty Event .txt', 1),
 ('FW  IShopChangi Program .txt', 1),
 ('RE  Approval Needed - Release TRAVEL RETAIL Order   2002834049 HARRODS DC (THATCHAM)  SK-II   19 May 2017.txt',
  1),
 ('RE    NRO request BC incentive allowance    JFM 2017  ( ANZ   SG DFS   INDO  .txt',
  1),
 ("RE  Copy of SKII_NRO_Order_Form - FY1617 -RWS Jul-Sep'16 Hampers contest xlsx (8).txt",
  1),
 ('RE  Shilla FITW Lucky Draw prize NRO.txt', 1),
 ('RE   Approval  Ageless Beauty Travel Kit NRO.txt', 1),
 ('RE   seek help  SG LGS remnant (2).txt', 1),
 ('RE  NRO - Additional teste

In [45]:
# F-scores of the most recently trained booster (the last CV fold, if the cells ran in
# order), as (feature id, score) pairs sorted by descending score.
fscores = bst.get_fscore()
feature_imp = sorted(fscores.items(), key=lambda item: item[1], reverse=True)
feature_imp

[('f204', 53),
 ('f28', 31),
 ('f26', 28),
 ('f251', 25),
 ('f44', 22),
 ('f203', 22),
 ('f105', 22),
 ('f27', 20),
 ('f41', 18),
 ('f53', 18),
 ('f8', 18),
 ('f188', 17),
 ('f0', 17),
 ('f260', 17),
 ('f179', 17),
 ('f245', 16),
 ('f12', 16),
 ('f194', 15),
 ('f51', 15),
 ('f220', 15),
 ('f180', 14),
 ('f130', 14),
 ('f11', 14),
 ('f16', 14),
 ('f9', 14),
 ('f271', 14),
 ('f200', 13),
 ('f244', 13),
 ('f158', 13),
 ('f111', 12),
 ('f33', 12),
 ('f72', 12),
 ('f163', 11),
 ('f205', 11),
 ('f100', 11),
 ('f29', 11),
 ('f30', 11),
 ('f129', 11),
 ('f240', 11),
 ('f207', 11),
 ('f18', 11),
 ('f4', 11),
 ('f61', 11),
 ('f168', 10),
 ('f208', 10),
 ('f112', 10),
 ('f134', 10),
 ('f232', 10),
 ('f32', 10),
 ('f3', 10),
 ('f246', 10),
 ('f94', 10),
 ('f252', 10),
 ('f19', 10),
 ('f267', 10),
 ('f73', 10),
 ('f42', 9),
 ('f47', 9),
 ('f199', 9),
 ('f107', 9),
 ('f133', 9),
 ('f233', 9),
 ('f171', 9),
 ('f183', 8),
 ('f25', 8),
 ('f132', 8),
 ('f85', 8),
 ('f241', 8),
 ('f242', 8),
 ('f155', 8)

In [46]:
# Feature names of the current vectorizer (despite its name, `feature_dict` is used as
# a sequence here), then a lookup from booster feature ids 'f<column index>' to names.
feature_dict = vectorizer.get_feature_names()
inv_feature_dict = {'f'+ str(column): term for column, term in enumerate(feature_dict)}

In [47]:
# Print the features by importance with their readable names; `a` collects the names
# in the same order.
a = []
for feature_id, score in feature_imp:
    feature_name = inv_feature_dict[feature_id]
    a.append(feature_name)
    # Pad the name so the scores line up for names of up to 30 characters.
    print(feature_name, ' ' * (30 - len(feature_name)), score)

pm                              53
approved                        31
approval                        28
thank                           25
com cc                          22
pls                             22
hi                              22
approve                         20
cest                            18
com upcoming                    18
12                              18
ooo                             17
05                              17
tr                              17
nro                             17
subject approval                16
17                              16
order                           15
com subject                     15
retail                          15
om                              14
ion cc                          14
16                              14
65                              14
13                              14
usd                             14
pg com subject                  13
skii                            13
muthuraman pg       