Simple Naive Bayes classifier

In [59]:
import re
import sklearn.datasets as skds
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

In [60]:
# First tokenization
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(ds.data)

print X_train_counts.shape
print count_vectorizer.vocabulary_

(4, 347)
{u'gr\xf6\xdfe': 148, u'rendir': 265, u'charles': 62, u'ansehen': 24, u'germar': 140, u'bereits': 51, u'southwestern': 297, u'compararlo': 69, u'religious': 263, u'jahre': 168, u'paris': 239, u'state': 300, u'to': 310, u'aus': 40, u'hermann': 155, u'hatte': 152, u'apos': 26, u'l\xe4nge': 196, u'beau': 44, u'de': 80, u'lendenwirbel': 189, u'vuelven': 326, u'algunos': 18, u'\xe9diteur': 343, u'llama': 192, u'during': 97, u'morceaux': 210, u'qu': 255, u'becken': 47, u'die': 85, u'ausgestellt': 41, u'vingt': 323, u'dixi\xe8me': 93, u'qotsa': 254, u'kasaragod': 174, u'm\xfasica': 217, u'karnataka': 173, u'collectively': 66, u'are': 29, u'entretiens': 108, u'districts': 92, u'weitere': 330, u'remporta': 264, u'coqu\xe9au': 73, u'kupferschiefer': 179, u'sur': 305, u'rite': 271, u'guns': 149, u'publia': 253, u'pregunto': 247, u'latin': 187, u'diese': 86, u'royale': 276, u'r\xe9pertoire': 277, u'abb\xe9': 12, u'nicht': 224, u'\xe9tait': 344, u'sind': 294, u'schleusingen': 286, u'aunque

In [61]:
# Term Frequency Transformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape

(4, 347)


In [62]:
# Naive Bayes classification
# Trained the classifier on our documents
clf = MultinomialNB().fit(X_train_tf, ds.target)

docs_new = ["Hello, how are you?", "Yo me guesta juegar con sus bano"]
docs_counts = count_vectorizer.transform(docs_new)
docs_tf = tf_transformer.transform(docs_counts)
print docs_counts.data
print docs_tf


# Display predictions
predicted = clf.predict(docs_tf)
for doc, category in zip(docs_new, predicted):
    print ('%r => %s' % (doc, ds.target_names[category]))

[1 1 1]
  (0, 29)	1.0
  (1, 71)	0.707106781187
  (1, 203)	0.707106781187
'Hello, how are you?' => english
'Yo me guesta juegar con sus bano' => spanish


----

**load_corpus**
Returns the experimental corpus

In [63]:
def load_corpus(source="../data/experimental_corpus", encoding='UTF-8'):
    return skds.load_files(source, encoding=encoding)

In [64]:
def load_dev_corpus():
    return load_corpus(source='../data/dev_corpus')

**alpha_only_token_pattern**

Returns a token pattern for unicode alphabetical character words only.  No numbers or underscores allowed.8

In [65]:
def alpha_only_token_pattern():
    return '(?u)\\b[^\W_0-9]{1,}\\b'

_Experiment 1:  Naive Bayes using Feature Counts only_

Word features are vectorized per:
- Alpha only tokens, single characters allowed
- binary representation only (token exists or does not)
- all lowercase
- UTF-8 encoding
- no stopwords used

In [74]:
corpus = load_dev_corpus()

print "Loaded {0} samples".format(len(corpus.data))

# Classification labels
for index, name in enumerate(corpus.target_names):
    print "Category {0} is {1}".format(index, name)

    
# Starting with a simple Binary Word Tokenizer,  requiring tokens at least 2 letters in length 
# eliminate numerical features
cv = CountVectorizer(analyzer='word',binary=True, lowercase=True, encoding='UTF-8', 
                     token_pattern=alpha_only_token_pattern())
voc = cv.fit_transform(corpus.data)
print "The shape of the vocabulary: {0}".format(voc.shape[0])


# Print the entire set of feature names
print "{0} features were found in the corpus.".format(len(cv.get_feature_names()))
for i, name in enumerate(cv.get_feature_names()[0:20]):
    print u"Feature {0} is {1}. length: {2}".format(i, name, len(name))

    
# Check for numbers
numbers = re.compile('\d')
for feature in cv.get_feature_names():
    if numbers.search(feature):
        print u"Feature {0} has numbers in it.".format(feature)

print "Towards is {0}".format(cv.vocabulary_['towards'])
print u"First training sample is:\n{0}".format(corpus.data[0])
print "Feature vector is:\n{0}".format(voc[0].toarray())    

fet1 = cv.vocabulary_['the']
print u'Checking for presence of {0}. Indicatoris {1}'.format('the', voc[0].toarray()[0][fet1])
fet2 = cv.vocabulary_['acquis']
print u'Checking for presence of {0}. Indicator is {1}'.format('acquis', voc[0].toarray()[0][fet2])

print cv.stop_words_

Loaded 400 samples
Category 0 is de
Category 1 is en
Category 2 is es
Category 3 is fr
The shape of the vocabulary: 400
17272 features were found in the corpus.
Feature 0 is a. length: 1
Feature 1 is aa. length: 2
Feature 2 is aaaaa. length: 5
Feature 3 is aachen. length: 6
Feature 4 is ab. length: 2
Feature 5 is abaissée. length: 8
Feature 6 is abajo. length: 5
Feature 7 is abandon. length: 7
Feature 8 is abandona. length: 8
Feature 9 is abandonado. length: 10
Feature 10 is abandoned. length: 9
Feature 11 is abandonne. length: 9
Feature 12 is abandonner. length: 10
Feature 13 is abandonnera. length: 11
Feature 14 is abandonné. length: 9
Feature 15 is abandonó. length: 8
Feature 16 is abattant. length: 8
Feature 17 is abattue. length: 7
Feature 18 is abbakka. length: 7
Feature 19 is abbaye. length: 6
Towards is 15251
First training sample is:
A large part of the C++ library is based on the STL. This provides useful tools as containers (for example vectors and lists), iterators to provi

In [75]:
for name, ident in zip(corpus.target_names, corpus.target):
    print "Target name: {0}, id: {1}".format(name, ident)
    
print len(corpus.target)

Target name: de, id: 1
Target name: en, id: 3
Target name: es, id: 3
Target name: fr, id: 1
400


In [76]:
## Try splits
train_corpus, test_corpus, train_targets, test_targets = train_test_split(corpus.data, corpus.target, test_size=.1)

### Test w/ cross validation

In [77]:
# Load dev corpus
dev_corpus = load_dev_corpus()

# Create our vectorizer for the corpus
vectorizer = CountVectorizer(analyzer='word', binary=True, lowercase=True, encoding='UTF-8', 
                            token_pattern=alpha_only_token_pattern())
exp_data = vectorizer.fit_transform(corpus.data)

# Train a Naive Bayes classifier on our vectorized corpus
classifier = MultinomialNB().fit(exp_data, dev_corpus.target)

# Simple classification test
docs_new = ["Hello, how are you?", "Yo me guesta juegar con sus bano", 'hamburger on wheat with peanuts']
docs_counts = vectorizer.transform(docs_new)
predicted = classifier.predict(docs_counts)
for doc, category in zip(docs_new, predicted):
    print '{0} => {1}'.format(doc, dev_corpus.target_names[category])
    
# Cross validation
X = exp_data
y = dev_corpus.target
scores = cross_val_score(classifier, X, y, cv=5)
print scores

Hello, how are you? => en
Yo me guesta juegar con sus bano => es
hamburger on wheat with peanuts => en
[ 1.      0.9875  1.      1.      1.    ]
