In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import dmc
from process import processed_data

Tensorflow not installed


In [3]:
# Load the processed frame and split it using the stored 'rawMirrored' id lists.
data = processed_data()
train_ids, test_ids = dmc.loading.load_ids('rawMirrored')
train, test = dmc.preprocessing.split_train_test(data, train_ids, test_ids)
# Drop the 'Unnamed: 0' column (presumably a leftover CSV index — confirm) after the
# split, in the same position as before. `columns=` replaces the positional axis
# argument of `drop('Unnamed: 0', 1)`, which pandas deprecated and no longer
# accepts from version 2.0 on.
data = data.drop(columns='Unnamed: 0')

## Evaluate and boost Naive Bayes on the mirrored data

In [5]:
# Number of rows in the training split; later cells slice X and y at this position.
# NOTE(review): a later cell rebinds `train` to an (X, y) tuple, so re-running this
# cell afterwards would set offset to 2 — run it while `train` is still the split frame.
offset = len(train)
# Transform the full frame into X and y, leaving out the listed columns.
# NOTE(review): slicing at `offset` presumes the training rows come first in `data` — confirm.
X, y = dmc.transformation.transform(data, ignore_features=['returnQuantity', 'orderID', 'orderDate', 'customerID'],
                                    binary_target=False)

In [5]:
# Per-column feature labels; a later cell turns them into a boolean column mask for X.
# NOTE(review): unlike the transform call that builds X, 'customerID' is not ignored
# here — confirm the labels still line up one-to-one with the columns of X.
fts = dmc.transformation.transform_feature_matrix_ph(data, ignore_features=['returnQuantity', 'orderID', 'orderDate'])

In [6]:
from dmc.classifiers import NaiveBayes, NaiveBayesM

In [7]:
# Split the transformed data at `offset` into (features, target) pairs.
# NOTE(review): this rebinds `train` and `test`, which previously held the results of
# split_train_test, to tuples. Cells that read `test.returnQuantity` or pass them to
# dmc.ensemble.Ensemble need the earlier objects — consider distinct names.
train = X[:offset], y[:offset]
test = X[offset:], y[offset:]

In [14]:
# standard naive bayes
# Baseline: build NaiveBayes from the training pair (presumably fits on construction)
# and call it on the test features to get predictions.
clf = NaiveBayes(train[0], train[1])
res = clf(test[0])
# NOTE(review): arguments are (predicted, actual) here but (actual, predicted) in the
# boosted cell — confirm that dmc.evaluation.precision is symmetric.
precision = dmc.evaluation.precision(res, test[1])
print(precision, ' using ', str(NaiveBayes))

0.6461661409398811  using  <class 'dmc.classifiers.NaiveBayes'>


In [8]:
# Run dmc's leave-one-out feature evaluation with NaiveBayes over the labels in `fts`.
# A later cell reads the result through its `.index` and `.decrement` attributes.
# NOTE(review): the test pair is passed in, so any feature selection derived from this
# result has seen the test data.
feature_evaluation = dmc.evaluation.evaluate_features_leaving_one_out(train[0], train[1], test[0], test[1],
                                                                      fts, NaiveBayes)

In [15]:
# boosted with target set knowledge
negative_features = set(feature_evaluation.index[feature_evaluation.decrement < 0])
fts_series = pd.Series(fts).apply(lambda x: False if x in negative_features else True)
fts_mask = np.array(fts_series)
X_tr, X_cl = train[0].T[fts_mask].T, test[0].T[fts_mask].T
print('Train and Evaluate')
clf = NaiveBayes(X_tr, train[1])
prec = dmc.evaluation.precision(test[1], clf(X_cl))
print(prec)

Train and Evaluate
0.6541070114757841


## Use an ensemble and optimize the set of used features for each split

In [4]:
# Build the ensemble from the train/test split.
# NOTE(review): the execution count (In [4]) shows this ran directly after loading, when
# `train`/`test` still held the results of split_train_test; in notebook order they
# have been rebound to (X, y) tuples by the time this cell is reached.
ensemble = dmc.ensemble.Ensemble(train, test)

In [None]:
# Number of splits held by the ensemble (not used again in the visible cells).
splits = len(ensemble.splits)
# TODO: 'find out' looks like a placeholder for the scalers argument; this cell has
# never been executed (In [None]).
ensemble.transform(scalers='find out', ignore_features=None)

## Evaluate final precision

In [44]:
# Compare three classifiers on the rows where y > 0: train on the first n_fit
# of those rows and score the remainder with dmc_cost_relative.
# Note: the loop variable rebinds the global `clf` to a classifier class, and the
# printed label says 'precision' although the value is a relative cost.
classifiers = [dmc.classifiers.DecisionTree, dmc.classifiers.NaiveBayes, dmc.classifiers.Forest]
n_fit = 900000  # rows used for training; everything after this index is held out
has_return = y > 0
Xn, yn = X[has_return], y[has_return]
for clf in classifiers:
    c = clf(Xn[:n_fit], yn[:n_fit])
    pred = c(Xn[n_fit:])
    score = dmc.evaluation.dmc_cost_relative(pred, yn[n_fit:])
    print('precision', score, 'with', clf)

precision 0.00435064935065 with <class 'dmc.classifiers.DecisionTree'>
precision 0.00391233766234 with <class 'dmc.classifiers.NaiveBayes'>
precision 0.00393181818182 with <class 'dmc.classifiers.Forest'>


Test on the training data:
```
precision 8.27814569536e-07 with <class 'dmc.classifiers.DecisionTree'>
precision 0.00368625827815 with <class 'dmc.classifiers.NaiveBayes'>
precision 8.27814569536e-07 with <class 'dmc.classifiers.Forest'>
```

Test on 25%, Train on 75%:
```
precision 0.00435064935065 with <class 'dmc.classifiers.DecisionTree'>
precision 0.00391233766234 with <class 'dmc.classifiers.NaiveBayes'>
precision 0.00393181818182 with <class 'dmc.classifiers.Forest'>
```

Evaluation of the rule-based approach versus the classifier approach

In [39]:
def switch_predictor(quantity):
    """Map an ordered quantity to a return quantity with a fixed rule.

    Quantities 1 and 2 give 1, quantities 3 and 4 give 2, and quantity 5
    gives 3. Any other value prints 'FAIL' and returns None.
    """
    if quantity in (1, 2):
        return 1
    if quantity in (3, 4):
        return 2
    if quantity == 5:
        return 3
    # No rule for this quantity: report it and return None.
    print('FAIL')
    return None

# Apply the rule to the test rows whose returnQuantity is greater than zero.
# NOTE(review): this needs `test` to be the frame from split_train_test (it reads the
# `returnQuantity` and `quantity` columns), not the (X, y) tuple assigned in an earlier cell.
evalframe = test[test.returnQuantity > 0]
pred = evalframe.quantity.apply(switch_predictor)
# The label says 'precision', but the printed value comes from dmc_cost_relative.
print('precision', dmc.evaluation.dmc_cost_relative(pred, evalframe.returnQuantity))

precision 0.00285924286603


Predicting with the switch above gives a lower relative cost (≈0.0029) than any of the classifiers evaluated above (≈0.0039–0.0044), so it is the most precise approach unless one trains a specially optimized regressor/classifier.