Streaming KNN ADWIN Classification

In [1]:
import os

import numpy as np
import pandas as pd

from skmultiflow.data import AGRAWALGenerator
from skmultiflow.data import ConceptDriftStream
from skmultiflow.data import DataStream
from skmultiflow.data.file_stream import FileStream
from skmultiflow.data.led_generator_drift import LEDGeneratorDrift
from skmultiflow.data.sea_generator import SEAGenerator
from skmultiflow.data.stagger_generator import STAGGERGenerator

from skmultiflow.evaluation import EvaluatePrequential

from skmultiflow.lazy import KNNADWINClassifier


# Shared evaluation settings, read by every experiment cell below.
# m  -> passed as `max_samples` to each evaluator (samples to evaluate)
# p  -> passed as `pretrain_size` (samples used to train the model before evaluating)
m, p = 5000, 500

# Metrics requested from each evaluator.
mc = [
    'accuracy',
    'kappa',
    'precision',
    'recall',
    'f1',
    'running_time',
    'model_size',
]

In [2]:
# Data source: the agr_g CSV hosted in the scikit-multiflow streaming-datasets repository.
agr_g_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/agr_g.csv'
)

# Model under test: KNNADWINClassifier built with its default constructor arguments.
knn_adwin = KNNADWINClassifier()

# Evaluation setup driven by the shared settings: sample budget `m`,
# pre-training size `p`, and the metric list `mc`.
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Kept as the last expression so its return value is echoed as the cell output.
evaluator.evaluate(model=knn_adwin, stream=agr_g_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [0.86s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.9202
M0 - Kappa        : 0.8258
M0 - Precision: 0.8358
M0 - Recall: 0.9450
M0 - F1 score: 0.8871
M0 - Training time (s)  : 0.44
M0 - Testing time  (s)  : 0.29
M0 - Total time    (s)  : 0.73
M0 - Size (kB)          : 45.2334


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [8]:
# Data source: the agr_a CSV hosted in the scikit-multiflow streaming-datasets repository.
agr_a_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/agr_a.csv'
)

# A new classifier instance, so nothing learned in an earlier cell is reused.
knn_adwin = KNNADWINClassifier()

# Evaluator configured from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Final expression of the cell: the evaluation run, whose return value is displayed.
evaluator.evaluate(model=knn_adwin, stream=agr_a_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [0.88s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.9202
M0 - Kappa        : 0.8258
M0 - Precision: 0.8358
M0 - Recall: 0.9450
M0 - F1 score: 0.8871
M0 - Training time (s)  : 0.46
M0 - Testing time  (s)  : 0.29
M0 - Total time    (s)  : 0.75
M0 - Size (kB)          : 45.2334


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [11]:
# Data source: the sea_a CSV hosted in the scikit-multiflow streaming-datasets repository.
sea_a_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/sea_a.csv'
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator using the shared sample budget, pre-training size and metrics.
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=sea_a_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [0.78s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.6847
M0 - Kappa        : 0.2848
M0 - Precision: 0.6922
M0 - Recall: 0.8704
M0 - F1 score: 0.7712
M0 - Training time (s)  : 0.28
M0 - Testing time  (s)  : 0.26
M0 - Total time    (s)  : 0.54
M0 - Size (kB)          : 21.0029


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [None]:
# Data source: the sea_g CSV hosted in the scikit-multiflow streaming-datasets repository.
sea_g_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/sea_g.csv'
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings defined in the configuration cell.
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=sea_g_stream)

In [4]:
# Drifting stream built from two STAGGER generators that differ only in their
# classification function (1 before the drift, 2 after it).
# The wrapper is now seeded as well: with random_state=None the stream was left
# unseeded, so repeated runs could differ even though both generators are seeded.
# NOTE(review): the drift is centred on sample 5000 (`position`) with a
# transition `width` of 1000, while every evaluator here is capped at m = 5000
# samples -- most of the transition may lie outside the evaluated window.
# Confirm this is intended.
stag_g_stream = ConceptDriftStream(
    stream=STAGGERGenerator(
        balance_classes=False,
        classification_function=1,
        random_state=112,
    ),
    drift_stream=STAGGERGenerator(
        balance_classes=False,
        classification_function=2,
        random_state=112,
    ),
    position=5000,
    width=1000,
    random_state=112,
    alpha=0,
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(max_samples=m,
                                pretrain_size=p,
                                metrics=mc)

# Last expression of the cell, so the returned value is displayed.
evaluator.evaluate(stream=stag_g_stream, model=knn_adwin)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #####--------------- [25%] [0.19s]



 #################### [100%] [0.82s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.9287
M0 - Kappa        : 0.8530
M0 - Precision: 0.9015
M0 - Recall: 0.9807
M0 - F1 score: 0.9394
M0 - Training time (s)  : 0.27
M0 - Testing time  (s)  : 0.28
M0 - Total time    (s)  : 0.55
M0 - Size (kB)          : 20.8154


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [15]:
# LED generator stream with noise enabled.
# 10 features are swapped with irrelevant ones.
led_g1_stream = LEDGeneratorDrift(
    n_drift_features=10,
    has_noise=True,
    noise_percentage=0.34,
    random_state=112,
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=led_g1_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [6.44s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.2738
M0 - Kappa        : 0.1928
M0 - Precision: 0.2708
M0 - Recall: 0.2730
M0 - F1 score: 0.2693
M0 - Training time (s)  : 3.47
M0 - Testing time  (s)  : 2.43
M0 - Total time    (s)  : 5.90
M0 - Size (kB)          : 112.7432


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [16]:
# LED generator stream with noise enabled.
# 2 features are swapped with irrelevant ones.
led_g2_stream = LEDGeneratorDrift(
    n_drift_features=2,
    has_noise=True,
    noise_percentage=0.34,
    random_state=112,
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=led_g2_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [6.60s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.2713
M0 - Kappa        : 0.1901
M0 - Precision: 0.2688
M0 - Recall: 0.2707
M0 - F1 score: 0.2671
M0 - Training time (s)  : 3.60
M0 - Testing time  (s)  : 2.47
M0 - Total time    (s)  : 6.07
M0 - Size (kB)          : 113.6807


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [17]:
# Data source: the airlines CSV hosted in the scikit-multiflow streaming-datasets repository.
air_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/airlines.csv'
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=air_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [1.26s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.6589
M0 - Kappa        : 0.0179
M0 - Precision: 0.4717
M0 - Recall: 0.0327
M0 - F1 score: 0.0612
M0 - Training time (s)  : 0.85
M0 - Testing time  (s)  : 0.29
M0 - Total time    (s)  : 1.14
M0 - Size (kB)          : 28.3506


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [19]:
# Data source: the elec CSV hosted in the scikit-multiflow streaming-datasets repository.
elec_stream = FileStream(
    'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/elec.csv'
)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(metrics=mc, pretrain_size=p, max_samples=m)

# Run the evaluation; left as the last expression so the result is shown.
evaluator.evaluate(model=knn_adwin, stream=elec_stream)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [0.87s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.8324
M0 - Kappa        : 0.6401
M0 - Precision: 0.8029
M0 - Recall: 0.7440
M0 - F1 score: 0.7723
M0 - Training time (s)  : 0.38
M0 - Testing time  (s)  : 0.34
M0 - Total time    (s)  : 0.72
M0 - Size (kB)          : 65.5771


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [20]:
# Data source: covtype. A local 'covtype.csv' in the working directory is
# still preferred, exactly as before. When it is missing (e.g. on a fresh
# checkout) the cell now falls back to the hosted copy in the same repository
# the other cells load from, instead of failing on the missing file.
# NOTE(review): the hosted file is assumed to match the local one -- confirm.
covtype_local = 'covtype.csv'
covtype_hosted = 'https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/master/covtype.csv'
for_stream = FileStream(covtype_local if os.path.exists(covtype_local) else covtype_hosted)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(max_samples=m,
                                pretrain_size=p,
                                metrics=mc)

# Last expression of the cell, so the returned value is displayed.
evaluator.evaluate(stream=for_stream, model=knn_adwin)

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 500 sample(s).
Evaluating...
 #################### [100%] [15.52s]
Processed samples: 5000
Mean performance:
M0 - Accuracy     : 0.5691
M0 - Kappa        : 0.4836
M0 - Precision: 0.5785
M0 - Recall: 0.4540
M0 - F1 score: 0.4485
M0 - Training time (s)  : 9.21
M0 - Testing time  (s)  : 6.11
M0 - Total time    (s)  : 15.32
M0 - Size (kB)          : 190.4697


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

In [None]:
# Data source: the UCI poker-hand test file. It has no header row, so it is
# read here with explicit column names instead of letting the first record be
# consumed as a header. The names follow the UCI attribute list: suit and rank
# of each of the five cards, then the hand class.
pok_columns = ['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'CLASS']
pok_frame = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data',
    header=None,
    names=pok_columns,
)

# Wrap the frame as a stream. No target index is passed, so DataStream's
# default applies -- presumably the last column (CLASS), as with FileStream;
# verify against the DataStream documentation.
pok_stream = DataStream(pok_frame)

# Default-configured KNNADWINClassifier, created fresh for this experiment.
knn_adwin = KNNADWINClassifier()

# Evaluator built from the shared settings (`m`, `p`, `mc`).
evaluator = EvaluatePrequential(max_samples=m,
                                pretrain_size=p,
                                metrics=mc)

# Last expression of the cell, so the returned value is displayed.
evaluator.evaluate(stream=pok_stream, model=knn_adwin)