In [150]:
# Imports
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [151]:
# Training Data
training_raw = pd.read_table("../data/training_data.dat")
df_training = pd.DataFrame(training_raw)
df_training.describe()

Unnamed: 0,Barcode,UnitRRP,CategoryID
count,526.0,526.0,526.0
mean,9771880000000.0,416.119772,441.404943
std,2941241000.0,173.972146,197.59635
min,9770002000000.0,0.0,1.0
25%,9770886000000.0,330.0,528.0
50%,9771081000000.0,395.0,529.0
75%,9771744000000.0,475.0,531.0
max,9790951000000.0,1299.0,532.0


In [152]:
# test Data
test_raw = pd.read_table("../data/test_data.dat")
df_test = pd.DataFrame(test_raw)
df_test.describe()

Unnamed: 0,Barcode,UnitRRP,CategoryID
count,191.0,191.0,191.0
mean,9771487000000.0,399.528796,410.816754
std,2319726000.0,185.199147,221.482429
min,9770004000000.0,0.0,1.0
25%,9770140000000.0,304.5,528.0
50%,9771350000000.0,390.0,529.0
75%,9772042000000.0,440.0,530.0
max,9781910000000.0,1200.0,532.0


In [153]:
# target names
target_categories = ['Unclassified','Art','Aviation','Boating','Camping /Walking /Climbing','Collecting']
target_values = ['1','528','529','530','531','532']

In [154]:
# features
feature_names_integers = ['Barcode','UnitRRP']
training_data_integers = df_training[feature_names_integers].values
training_data_integers[:3]

array([[9771471058036,           340],
       [9770300169189,           399],
       [9781909786417,           795]], dtype=int64)

In [155]:
df_training['Description'][:3]

0        Todays Pilot
1               Pilot
2    Classic Airliner
Name: Description, dtype: object

In [156]:
# Rather than Vectorizing the string as a whole do each word
count_vect = CountVectorizer()
count_vect.fit(df_training['Description'])
training_data_description_vect_matrix = count_vect.transform(df_training['Description'])
training_data_description_vect_matrix.shape

(526, 305)

In [157]:
training_data_description_vect_matrix.__class__

scipy.sparse.csr.csr_matrix

In [158]:
training_data_description_vect_matrix

<526x305 sparse matrix of type '<type 'numpy.int64'>'
	with 1081 stored elements in Compressed Sparse Row format>

In [159]:
# So we work with the vectorized text along side the barcode and price, convert it to an array
training_data_description_vect = training_data_description_vect_matrix.toarray()
training_data_description_vect[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [160]:
print('Description: "{}" - "todays" word number: {} "pilot" word number: {}').format(
    df_training['Description'][0],count_vect.vocabulary_.get(u'todays'),count_vect.vocabulary_.get(u'pilot'))

Description: "Todays Pilot" - "todays" word number: 275 "pilot" word number: 216


In [161]:
# Using numpy's hstack append the vectorized text to the barcode and price
training_data_combined = np.hstack((training_data_integers,training_data_description_vect))
training_data_combined[0]

array([9771471058036,           340,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,           

In [162]:
# Train the model
model = DecisionTreeClassifier(random_state=511)
target = df_training["CategoryID"].values
model.fit(training_data_combined, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=511, splitter='best')

In [163]:
# Do all this again for the test data
test_data_integers = df_test[feature_names_integers].values
test_data_description_vect_matrix = count_vect.transform(df_test['Description'])
test_data_description_vect = test_data_description_vect_matrix.toarray()
test_data_combined = np.hstack((test_data_integers,test_data_description_vect))
test_data_combined[0]

array([9770306563172,           370,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             1,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,             0,
                   0,             0,             0,           

In [164]:
predicted = model.predict(test_data_combined)

In [165]:
expected = df_test["CategoryID"].values

In [166]:
print(metrics.classification_report(expected, predicted,    target_names=target_categories))

                            precision    recall  f1-score   support

              Unclassified       0.55      0.14      0.22        43
                       Art       0.29      0.60      0.39        20
                  Aviation       0.73      0.56      0.63        54
                   Boating       0.51      0.75      0.61        28
Camping /Walking /Climbing       0.41      0.73      0.52        15
                Collecting       0.77      0.74      0.75        31

               avg / total       0.59      0.54      0.52       191



In [167]:
print(metrics.confusion_matrix(expected, predicted))

[[ 6 16  4 12  5  0]
 [ 1 12  6  1  0  0]
 [ 0 10 30  4  9  1]
 [ 2  0  1 21  0  4]
 [ 0  1  0  1 11  2]
 [ 2  2  0  2  2 23]]


In [168]:
metrics.accuracy_score(expected, predicted, normalize=True, sample_weight=None)

0.53926701570680624

So an increase from 46%/47% to 54%

## Stop words

As we sure in the text processing session some words just add noise to the data set.

So would adding the english stop words help?

In [169]:
count_vect_stop = CountVectorizer(stop_words='english')
count_vect_stop.fit(df_training['Description'])
training_data_stop_description_vect_matrix = count_vect_stop.transform(df_training['Description'])
training_data_stop_description_vect = training_data_stop_description_vect_matrix.toarray()
training_data_stop_combined = np.hstack((training_data_integers,training_data_stop_description_vect))
model = DecisionTreeClassifier(random_state=511)
model.fit(training_data_stop_combined, target)
test_data_stop_integers = df_test[feature_names_integers].values
test_data_stop_description_vect_matrix = count_vect_stop.transform(df_test['Description'])
test_data_stop_description_vect = test_data_stop_description_vect_matrix.toarray()
test_data_stop_combined = np.hstack((test_data_stop_integers,test_data_stop_description_vect))
predicted_stop = model.predict(test_data_stop_combined)
print(metrics.confusion_matrix(expected, predicted))
metrics.accuracy_score(expected, predicted_stop, normalize=True, sample_weight=None)

[[ 6 16  4 12  5  0]
 [ 1 12  6  1  0  0]
 [ 0 10 30  4  9  1]
 [ 2  0  1 21  0  4]
 [ 0  1  0  1 11  2]
 [ 2  2  0  2  2 23]]


0.53403141361256545

In [170]:
count_vect_stop.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [171]:
count_vect_stop = CountVectorizer(stop_words=['the'])
count_vect_stop.fit(df_training['Description'])
training_data_stop_description_vect_matrix = count_vect_stop.transform(df_training['Description'])
training_data_stop_description_vect = training_data_stop_description_vect_matrix.toarray()
training_data_stop_combined = np.hstack((training_data_integers,training_data_stop_description_vect))
model = DecisionTreeClassifier(random_state=511)
model.fit(training_data_stop_combined, target)
test_data_stop_integers = df_test[feature_names_integers].values
test_data_stop_description_vect_matrix = count_vect_stop.transform(df_test['Description'])
test_data_stop_description_vect = test_data_stop_description_vect_matrix.toarray()
test_data_stop_combined = np.hstack((test_data_stop_integers,test_data_stop_description_vect))
predicted_stop = model.predict(test_data_stop_combined)
print(metrics.confusion_matrix(expected, predicted))
metrics.accuracy_score(expected, predicted_stop, normalize=True, sample_weight=None)

[[ 6 16  4 12  5  0]
 [ 1 12  6  1  0  0]
 [ 0 10 30  4  9  1]
 [ 2  0  1 21  0  4]
 [ 0  1  0  1 11  2]
 [ 2  2  0  2  2 23]]


0.52356020942408377

In [None]:
metrics.accuracy_score(expected, predicted_stop, normalize=True, sample_weight=None)