# EPOS Data Set composition

New product information coming into a central system for categorisation

In [125]:
# Imports
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [126]:
# Training Data
training_raw = pd.read_table("../data/training_data.dat")
df_training = pd.DataFrame(training_raw)
df_training.head()

Unnamed: 0,Barcode,Description,UnitRRP,CategoryID,Category
0,9771471058036,Todays Pilot,340,529,Aviation
1,9770300169189,Pilot,399,529,Aviation
2,9781909786417,Classic Airliner,795,529,Aviation
3,9771440132057,International Artist,475,528,Art
4,9771362031988,Canal Boat,499,530,Boating


In [127]:
# test Data
test_raw = pd.read_table("../data/test_data.dat")
df_test = pd.DataFrame(test_raw)
df_test.head()

Unnamed: 0,Barcode,Description,UnitRRP,CategoryID,Category
0,9770306563172,Air International,370,529,Aviation
1,9770306563189,Air International,380,529,Aviation
2,9770306563196,Air International,395,529,Aviation
3,9770306563202,Air International,410,529,Aviation
4,9770306563257,Air International,460,529,Aviation


In [128]:
# target names
target_categories = ['Unclassified','Art','Aviation','Boating','Camping /Walking /Climbing','Collecting']
target_values = ['1','528','529','530','531','532']

In [129]:
# features
feature_names = ['Barcode','Description','UnitRRP']

In [130]:
# Extract features from panda
training_data = df_training[feature_names].values
training_data[:3]

array([[9771471058036L, 'Todays Pilot', 340L],
       [9770300169189L, 'Pilot', 399L],
       [9781909786417L, 'Classic Airliner', 795L]], dtype=object)

In [131]:
# Extract target results from panda
target = df_training["CategoryID"].values

In [132]:
# Create classifier class
model_dtc = DecisionTreeClassifier()

In [133]:
# train model
model_dtc.fit(training_data, target)

ValueError: could not convert string to float: Usa Today

We fail here because the description column is a string.
Lets try again without the description.

In [134]:
# features
feature_names_integers = ['Barcode','UnitRRP']

In [135]:
# Extra features from panda (without description)
training_data_integers = df_training[feature_names_integers].values
training_data_integers[:3]

array([[9771471058036,           340],
       [9770300169189,           399],
       [9781909786417,           795]], dtype=int64)

In [136]:
# train model again
model_dtc.fit(training_data_integers, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [137]:
# Extract test data and test the model
test_data_integers = df_test[feature_names_integers].values
test_target = df_test["CategoryID"].values
expected = test_target
predicted_dtc = model_dtc.predict(test_data_integers)

In [138]:
print(metrics.classification_report(expected, predicted_dtc,    target_names=target_categories))

                            precision    recall  f1-score   support

              Unclassified       0.33      0.05      0.08        43
                       Art       0.31      0.55      0.39        20
                  Aviation       0.50      0.56      0.53        54
                   Boating       0.47      0.57      0.52        28
Camping /Walking /Climbing       0.42      0.53      0.47        15
                Collecting       0.53      0.61      0.57        31

               avg / total       0.44      0.45      0.41       191



In [139]:
print(metrics.confusion_matrix(expected, predicted_dtc))

[[ 2  0 24  9  5  3]
 [ 1 11  1  2  0  5]
 [ 1 16 30  1  4  2]
 [ 0  4  3 16  1  4]
 [ 2  0  2  0  8  3]
 [ 0  5  0  6  1 19]]


In [140]:
metrics.accuracy_score(expected, predicted, normalize=True, sample_weight=None)

0.44502617801047123

In [141]:
predicted[:5]

array([529, 529, 529, 529, 529], dtype=int64)

Lets try a different Classifier

Linear classifiers (SVM, logistic regression, a.o.) with SGD training.

In [142]:
from sklearn.linear_model import SGDClassifier

In [143]:
# Create classifier class
model_sgd = SGDClassifier()

In [144]:
# train model again
model_sgd.fit(training_data_integers, target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [145]:
predicted_sgd = model_sgd.predict(test_data_integers)

In [146]:
print(metrics.classification_report(expected, predicted_sgd,    target_names=target_categories))

                            precision    recall  f1-score   support

              Unclassified       0.00      0.00      0.00        43
                       Art       0.00      0.00      0.00        20
                  Aviation       0.00      0.00      0.00        54
                   Boating       0.00      0.00      0.00        28
Camping /Walking /Climbing       0.00      0.00      0.00        15
                Collecting       0.16      1.00      0.28        31

               avg / total       0.03      0.16      0.05       191



In [147]:
print(metrics.confusion_matrix(expected, predicted_sgd))

[[ 0  0  0  0  0 43]
 [ 0  0  0  0  0 20]
 [ 0  0  0  0  0 54]
 [ 0  0  0  0  0 28]
 [ 0  0  0  0  0 15]
 [ 0  0  0  0  0 31]]


In [148]:
metrics.accuracy_score(expected, predicted_sgd, normalize=True, sample_weight=None)

0.16230366492146597