# Hoeffding Adaptive Tree

## Doc
  * https://scikit-multiflow.github.io/scikit-multiflow/skmultiflow.classification.trees.html
  * https://scikit-multiflow.github.io/scikit-multiflow/skmultiflow.data.file_stream.html
  * https://scikit-multiflow.github.io/scikit-multiflow/skmultiflow.evaluation.evaluate_prequential.html

In [73]:
from skmultiflow.trees import HoeffdingTree
from skmultiflow.trees.hoeffding_adaptive_tree import HAT
from skmultiflow.data.file_stream import FileStream
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
import numpy as np

In [92]:
from skmultiflow.metrics import ClassificationMeasurements
# Instantiate skmultiflow's ClassificationMeasurements.
# NOTE(review): no other visible cell references this object — confirm it is still needed.
classMEtrics = ClassificationMeasurements()

In [76]:
def compute_y_weight(y):
    """Return, for every sample in ``y``, the relative frequency of its class.

    Each label is mapped to ``count(label) / len(y)``, so samples of the
    most frequent class receive the largest weight.

    The mapping is done with a single lookup through the inverse indices
    returned by ``np.unique``. Replacing values class by class in place is
    unsafe: a weight that has just been written can be numerically equal to
    a class label processed later and would then be overwritten a second
    time (e.g. ``y = [0, 0, 0, 0.75]``).

    Parameters
    ----------
    y : array-like
        Class labels. Any dtype accepted by ``np.unique`` works, including
        non-numeric labels.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y`` holding the per-sample
        class frequency. An empty input yields an empty array.
    """
    labels = np.asarray(y)
    _, inverse, counts = np.unique(
        labels, return_inverse=True, return_counts=True
    )
    frequencies = counts / float(counts.sum())
    # ravel + reshape keeps the result aligned with the input regardless of
    # the shape in which this NumPy version returns the inverse indices.
    return frequencies[inverse.ravel()].reshape(labels.shape)

# Comparing Hoeffding Tree (HT) and Hoeffding Adaptive Tree (HAT)

In [103]:

# Column indices 2..44, handed to FileStream as its fourth positional argument.
# NOTE(review): presumably the categorical feature columns — confirm against the CSV.
pCat_features_idx = list(range(2,45))

# Setup the File Stream
# NOTE(review): the positional arguments -1 and 1 look like the target column
# index and the number of targets — confirm against the installed skmultiflow
# FileStream signature.
stream = FileStream("../data_output/export_dataframe_0v4.csv", -1, 1,  pCat_features_idx)
stream.prepare_for_use()
# Disabled: draw a first batch of 10000 samples and derive per-sample weights from it.
# NOTE(review): later cells still read X and y_weights, which are only assigned
# in these two disabled lines — they rely on state left by an earlier run.
#X, y = stream.next_sample(10000)
#y_weights = compute_y_weight(y)

# Both classifiers are built with identical arguments.
cfiers = [HoeffdingTree(nominal_attributes=[0,1],split_confidence=1e-10), 
          HAT(nominal_attributes=[0,1],split_confidence=1e-10)
         ]
# Disabled: manual weighted pre-training call (references an undefined `classifier`).
#classifier.partial_fit(X,y,[0,1],y_weights )
# Prequential evaluation: 10000-sample pre-train, 10000-sample batches, capped at
# 1000000 samples and max_time=1000; the summary goes to summary_hat.txt and
# plotting is switched off.
evaluator = EvaluatePrequential(pretrain_size=10000, max_samples=1000000, batch_size=10000, n_wait=200, max_time=1000,
                                 output_file='summary_hat.txt', show_plot=False, metrics=['accuracy','running_time','model_size']
                               )
# One evaluation run over both classifiers; later cells index the returned
# object as models[0] and models[1].
models = evaluator.evaluate(stream=stream, model=cfiers, model_names=['HT', 'HAT']);
# 0.8499
# NOTE(review): unexplained figure, presumably an accuracy from an earlier run — confirm or remove.

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 10000 sample(s).
Evaluating...

Time limit reached (1000.00s). Evaluation stopped.
Processed samples: 340000
Mean performance:
HT - Accuracy     : 0.8380
HT - Training time (s)  : 159.37
HT - Testing time  (s)  : 31.66
HT - Total time    (s)  : 191.03
HT - Size (kB)          : 1884258.3047
HAT - Accuracy     : 0.8441
HAT - Training time (s)  : 26.51
HAT - Testing time  (s)  : 10.25
HAT - Total time    (s)  : 36.76
HAT - Size (kB)          : 644.3047


In [104]:
# Count how many samples of X the first model assigns to each predicted class.
# NOTE(review): X has no active assignment in the visible cells (the only one is
# commented out) — confirm it is defined before running this cell.
np.unique(models[0].predict(X),return_counts=True)

(array([0, 1]), array([9415,  585]))

In [113]:
# Count how many samples of X the second model assigns to each predicted class.
# NOTE(review): X has no active assignment in the visible cells (the only one is
# commented out) — confirm it is defined before running this cell.
np.unique(models[1].predict(X),return_counts=True)

(array([0]), array([10000]))

In [89]:
# Distinct weight values in y_weights and how often each occurs.
# NOTE(review): y_weights has no active assignment in the visible cells (the only
# one is commented out) — confirm it is defined before running this cell.
np.unique(y_weights,return_counts=True)

(array([0.1976, 0.8024]), array([1976, 8024]))

In [112]:
# Show the configuration string of the first evaluated model.
models[0].get_info()

"HoeffdingTree(binary_split=False, grace_period=200, leaf_prediction='nba',\n              max_byte_size=33554432, memory_estimate_period=1000000,\n              nb_threshold=0, no_preprune=False, nominal_attributes=[0, 1],\n              remove_poor_atts=False, split_confidence=1e-10,\n              split_criterion='info_gain', stop_mem_management=False,\n              tie_threshold=0.05)"

In [111]:
# Show the configuration string of the second evaluated model.
models[1].get_info()

"HAT(binary_split=False, grace_period=200, leaf_prediction='nba',\n    max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n    no_preprune=False, nominal_attributes=[0, 1], remove_poor_atts=False,\n    split_confidence=1e-10, split_criterion='info_gain',\n    stop_mem_management=False, tie_threshold=0.05)"