# Two step classification benchmark
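The purpose of this notebook is to benchmark two-step classification against one-step classification. In the two-step approach, a first classifier predicts a binary target, and that prediction is appended as an additional feature for a second classifier that predicts the final return quantity. The advantage of a two-step approach is that most classifiers (especially SVM) have significantly shorter training times. This notebook therefore evaluates how precision behaves under both approaches and identifies the best classifier for predicting the final return quantity.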

In [21]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
import process as p
import dmc

In [7]:
# Load the preprocessed data and remove every column whose name contains
# 'Prob'. All matching columns are dropped in one call with an explicit
# `axis=1`: the positional form `df.drop(c, 1)` is rejected by pandas >= 2.0,
# and dropping column by column copies the frame once per column.
df = p.processed_data()
df = df.drop([col for col in df.columns if 'Prob' in col], axis=1)

In [52]:
def predict_return_quantity_direct(df, tr_size, te_size):
    """Benchmark one-step prediction of the non-binary target.

    The data frame is transformed once with ``binary_target=False``. The first
    ``tr_size`` rows are used for training and the next ``te_size`` rows for
    testing. Every classifier in ``p.basic`` except the last one is trained
    and evaluated on that split.

    Returns a list with one ``(precision, cost, classifier_name)`` tuple per
    classifier, in the order of ``p.basic``.
    """
    features, target = dmc.transformation.transform(
        df, scaler=dmc.normalization.scale_features, binary_target=False)
    test_end = tr_size + te_size
    X_train, y_train = features[:tr_size], target[:tr_size]
    X_test, y_test = features[tr_size:test_end], target[tr_size:test_end]

    def evaluate(classifier):
        # Constructing the classifier trains it; calling the instance predicts.
        predicted = classifier(X_train, y_train)(X_test)
        return (dmc.evaluation.precision(predicted, y_test),
                dmc.evaluation.dmc_cost(predicted, y_test),
                str(classifier))

    return [evaluate(classifier) for classifier in p.basic[:-1]]
        
def predict_return_quantity_twostep(df, tr_size, te_size):
    """Benchmark two-step prediction of the non-binary target.

    Step one trains each classifier on the binary target
    (``binary_target=True``) and predicts it for the test rows. Step two
    trains a fresh instance of the same classifier on the non-binary target,
    with the binary label appended as an extra feature column: the true
    label for the training rows and the step-one prediction for the test rows.

    The first ``tr_size`` rows are used for training and the next ``te_size``
    rows for testing. Every classifier in ``p.basic`` except the last one is
    evaluated.

    Returns a list with one ``(precision, cost, classifier_name)`` tuple per
    classifier, in the order of ``p.basic``, measured on the step-two
    predictions.
    """
    # Must be a list: one result tuple is appended per classifier below.
    results = []
    X, Y = dmc.transformation.transform(df, scaler=dmc.normalization.scale_features,
                                        binary_target=True)
    Y_fin = dmc.transformation.transform_target_vector(df, binary=False)
    te_end = tr_size + te_size
    train = X[:tr_size], Y[:tr_size]
    test = X[tr_size:te_end], Y[tr_size:te_end]

    # The true binary labels and the second-stage training matrix do not
    # depend on the classifier, so they are built once outside the loop.
    # NOTE(review): assumes Y is a 1-D vector so that csr_matrix(Y).T is a
    # single column -- confirm against dmc.transformation.transform.
    Y_csr = csr_matrix(Y).T
    train_fin = hstack([train[0], Y_csr[:tr_size]]), Y_fin[:tr_size]
    Y_fin_test = Y_fin[tr_size:te_end]

    for classifier in p.basic[:-1]:
        # Step one: predict the binary target for the test rows.
        clf = classifier(train[0], train[1])
        res = clf(test[0])
        res_csr = csr_matrix(res).T
        # Step two: the predicted label becomes the extra test feature.
        test_fin = hstack([test[0], res_csr]), Y_fin_test
        clf_fin = classifier(train_fin[0], train_fin[1])
        res_fin = clf_fin(test_fin[0])
        precision = dmc.evaluation.precision(res_fin, test_fin[1])
        cost = dmc.evaluation.dmc_cost(res_fin, test_fin[1])
        results.append((precision, cost, str(classifier)))
    return results

In [54]:
def benchmark_prediction_target(df, tr_size, te_size):
    """Print the one-step and two-step benchmark results for one sample.

    The data is shuffled with ``p.shuffle``, and a copy of its first
    ``tr_size + te_size`` rows is passed to both benchmarks so they are
    evaluated on the same rows. The direct result is printed first, then the
    two-step result.
    """
    n_rows = tr_size + te_size
    sample = p.shuffle(df)[:n_rows].copy()
    for predict in (predict_return_quantity_direct, predict_return_quantity_twostep):
        print(predict(sample, tr_size, te_size))

In [55]:
# Run both benchmarks on the same shuffled sample: 5000 rows for training and
# the following 20000 rows for testing.
benchmark_prediction_target(df, tr_size=5000, te_size=20000)

[(0.56775, 8681, "<class 'dmc.classifiers.DecisionTree'>"), (0.636, 7303, "<class 'dmc.classifiers.Forest'>"), (0.6285000000000001, 7452, "<class 'dmc.classifiers.NaiveBayes'>"), (0.5862499999999999, 8297, "<class 'dmc.classifiers.SVM'>")]
[(0.5666, 8668, "<class 'dmc.classifiers.DecisionTree'>"), (0.6327499999999999, 7345, "<class 'dmc.classifiers.Forest'>"), (0.6304000000000001, 7392, "<class 'dmc.classifiers.NaiveBayes'>"), (0.5881000000000001, 8238, "<class 'dmc.classifiers.SVM'>")]
