In [1]:
import numpy as np
import math
from tqdm.notebook import tqdm
from copy import deepcopy
from sklearn.metrics import accuracy_score, roc_auc_score as auc
from skmultiflow.trees.hoeffding_tree import HoeffdingTreeClassifier

In [2]:
# Read the dataset.
# `read_dataset` comes from a `datasets` module (presumably project-local -- TODO confirm)
# and returns a (features, labels) pair.
from datasets import read_dataset

features, labels = read_dataset()
# Convert to a NumPy array; later cells index it as 2-D (`features[i, :]`).
features = np.array(features)

In [3]:
# Experiment constants.
# initial_limit: number of leading samples that form the initial training set;
# every sample from this index onwards is used for evaluation.
initial_limit = 100

In [4]:
# Draw a seed, apply it to NumPy's global RNG, and display it so that the run
# can be reproduced later by hard-coding the displayed value here.
# (Previously the seed was only drawn and shown, never applied to any RNG.)
seed = np.random.randint(65536)
np.random.seed(seed)
seed

19268

In [5]:
# Initial training set: the first `initial_limit` samples and their labels.
# The values are taken as-is; no standardization is applied in this cell.
x, y = features[:initial_limit], labels[:initial_limit]

In [6]:
# Distinct label values over the whole dataset; passed as `classes` when fitting.
classes = np.unique(labels)

In [7]:
# Create the Hoeffding Tree classifier with its default hyper-parameters
# (no constructor arguments are passed).
clf = HoeffdingTreeClassifier()

In [8]:
# Initial Train Method: fit once on the initial training window only.
# The model is not updated again before it is evaluated in the next cell.
clf.fit(X=x, y=y, classes=classes)

HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                        leaf_prediction='nba', max_byte_size=33554432,
                        memory_estimate_period=1000000, nb_threshold=0,
                        no_preprune=False, nominal_attributes=None,
                        remove_poor_atts=False, split_confidence=1e-07,
                        split_criterion='info_gain', stop_mem_management=False,
                        tie_threshold=0.05)

In [9]:
# Evaluate the Initial Train Method on every sample after the initial window.
y_pred = clf.predict(features[initial_limit:, :])
# The metric is plain accuracy (accuracy_score), not ROC AUC, so store it under
# an accurately named, method-specific variable that is not reused elsewhere.
accuracy_initial_train = accuracy_score(y_true=labels[initial_limit:], y_pred=y_pred)
auc_value = accuracy_initial_train  # legacy name, kept for backward compatibility
accuracy_initial_train

0.5644076793771565

In [10]:
# Regular Update Method, step 1: reset the classifier and re-fit it on the
# initial training window. The incremental partial fits happen in the next cell.
# NOTE(review): the recorded repr shows remove_poor_atts=None after reset(),
# versus False for the freshly constructed classifier -- confirm this is benign.
clf.reset()
clf.fit(X=x, y=y, classes=classes)

HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                        leaf_prediction='nba', max_byte_size=33554432,
                        memory_estimate_period=1000000, nb_threshold=0,
                        no_preprune=False, nominal_attributes=None,
                        remove_poor_atts=None, split_confidence=1e-07,
                        split_criterion='info_gain', stop_mem_management=False,
                        tie_threshold=0.05)

In [11]:
# Regular Update Method: walk over the stream one sample at a time; each sample
# is first predicted with the current model and then used to update the model.
y_pred = np.zeros(shape=(len(labels) - initial_limit))

# The `with` block closes the bar when the loop finishes. The initial training
# window is credited exactly once (it used to be added twice, which pushed the
# counter past `total` by `initial_limit`).
with tqdm(total=len(features)) as progress_bar:
    progress_bar.update(initial_limit)
    for idx in range(initial_limit, len(labels)):
        sample = [features[idx, :]]
        # Predict before training so the sample is unseen when it is scored.
        y_pred[idx - initial_limit] = clf.predict(sample)[0]
        clf.partial_fit(X=sample, y=[labels[idx]], classes=classes)
        progress_bar.update(1)

  0%|          | 0/45312 [00:00<?, ?it/s]

In [12]:
# Accuracy of the Regular Update Method over all samples after the initial window.
# The metric is plain accuracy (accuracy_score), not ROC AUC, so store it under
# an accurately named, method-specific variable that is not reused elsewhere.
accuracy_regular_update = accuracy_score(y_true=labels[initial_limit:], y_pred=y_pred)
auc_value = accuracy_regular_update  # legacy name, kept for backward compatibility
accuracy_regular_update

0.7793284968592409

In [13]:
# Regular Retrain Method, step 1: start from a clean model trained on the
# initial training window.
# interval: number of samples per predict-then-retrain chunk.
interval = 100
clf.reset()
# Pass the full label set (`classes`) instead of np.unique(y): the first
# window may not contain every class, and the other methods use `classes` too.
clf.fit(X=x, y=y, classes=classes)

HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                        leaf_prediction='nba', max_byte_size=33554432,
                        memory_estimate_period=1000000, nb_threshold=0,
                        no_preprune=False, nominal_attributes=None,
                        remove_poor_atts=None, split_confidence=1e-07,
                        split_criterion='info_gain', stop_mem_management=False,
                        tie_threshold=0.05)

In [14]:
# Regular Retrain Method: predict each chunk of `interval` samples with the
# current model, then discard the model and retrain it on that chunk only.
y_pred_chunks = []  # per-chunk predictions, joined once at the end
idx = initial_limit

# The `with` block closes the bar on exit. The initial training window is
# credited exactly once (it used to be added twice).
with tqdm(total=len(features)) as progress_bar_new:
    progress_bar_new.update(initial_limit)

    while idx + interval < len(labels):
        chunk = slice(idx, idx + interval)
        y_pred_chunks.append(clf.predict(features[chunk, :]))
        # Reset the classifier and retrain on the chunk that was just predicted.
        # The full label set (`classes`) is passed rather than only the labels
        # seen in the initial window, matching the other methods.
        clf.reset()
        clf.partial_fit(X=features[chunk, :], y=labels[chunk], classes=classes)
        idx += interval
        progress_bar_new.update(interval)

    # Predict for the remaining samples (the last chunk, at most `interval` long)
    y_pred_chunks.append(clf.predict(features[idx:, :]))
    progress_bar_new.update(len(labels) - idx)

# Join all chunks in a single call instead of growing the array inside the loop.
# The leading empty float array keeps the dtype promotion of the original code.
y_pred = np.hstack([np.empty(shape=0), *y_pred_chunks])

  0%|          | 0/45312 [00:00<?, ?it/s]

In [15]:
# Accuracy of the Regular Retrain Method over all samples after the initial window.
# The metric is plain accuracy (accuracy_score), not ROC AUC, so store it under
# an accurately named, method-specific variable that is not reused elsewhere.
accuracy_regular_retrain = accuracy_score(y_true=labels[initial_limit:], y_pred=y_pred)
auc_value = accuracy_regular_retrain  # legacy name, kept for backward compatibility
accuracy_regular_retrain

0.7511722551534991