In [5]:
!pip install -e ./stream-learn

Obtaining file:///home/jovyan/work/stream-learn
Installing collected packages: stream-learn
  Found existing installation: stream-learn 0.7.1
    Uninstalling stream-learn-0.7.1:
      Successfully uninstalled stream-learn-0.7.1
  Running setup.py develop for stream-learn
Successfully installed stream-learn


In [1]:
!pip install attrs



In [4]:
from strlearn.streams import StreamGenerator

In [5]:
stream = StreamGenerator(n_classes=2, n_drifts=1)

In [6]:
import numpy as np
from attr import attrs, attrib, Factory
from copy import deepcopy
from typing import List, Callable, NewType, Any, Optional, Dict
from sklearn.naive_bayes import GaussianNB
from abc import ABC, abstractmethod

In [7]:
Classifier = NewType('Classifier', Any)
Instance = NewType('Instance', Any)

In [55]:
class OALELabelingStrategy(ABC):
    
    @abstractmethod
    def label(self, x: Instance): raise NotImplementedError

In [63]:
@attrs(auto_attribs=True)
class RandomStrategy(OALELabelingStrategy):
    _threshold_adjustment_step: int = attrib(default=0.1)
        
    def label(self, x: Instance):
        if self._threshold_adjustment_step <= np.random.uniform():
            return True
        return False
        

In [64]:
@attrs(auto_attribs=True)
class UncertaintyStrategy(OALELabelingStrategy):
    _threshold_margin:int = attrib()
    _threshold_adjustment_step: int = attrib()
        
    def label(self, x: Instance):
        return True # TODO(bgulowaty)

In [65]:
@attrs(auto_attribs=True)
class CompositeStrategy(OALELabelingStrategy):
    _main_strategy: OALELabelingStrategy
    _fallback_strategy: OALELabelingStrategy
        
    def label(self, x: Instance):
        labeling = self._main_strategy.label(x)
        
        if labeling is not True:
            labeling = self._fallback_strategy.label(x)
        
        return labeling

In [12]:
def originalPaperLabelingStrategyProvider(uncertainity_strategy_threshold, uncertainity_strategy_margin, random_strategy_parameter):
    return RandomStrategy(random_strategy_parameter)
    

In [301]:
def scikit_gaussian_nb_provider(x, y, classes):
    clf = GaussianNB()
    clf.partial_fit(x,  y, classes=classes)
    
    return clf
    

In [400]:
from abc import ABCMeta, abstractmethod


class BaseEnsemblePredictionCombiner(metaclass=ABCMeta):

    @abstractmethod
    def predict(self, x):
        raise NotImplementedError

In [525]:
@attrs
class WeightedMajorityPredictionCombiner(BaseEnsemblePredictionCombiner):
    _ensemble = attrib()
    _weights = attrib()
    _classes = attrib()

    def predict(self, x):
        all_members_can_return_supports = all([hasattr(clf, 'predict_proba') for clf in self._ensemble])

        if all_members_can_return_supports:
            supports_by_clf = [clf.predict_proba(x) * weight for (weight, clf) in zip(self._weights, self._ensemble)]
            supports_sum_by_sample = sum(supports_by_clf)
            predictions = [self._classes[idx] for idx in np.argmax(supports_sum_by_sample, axis=1)]

        else:
            predictions_by_clf = [clf.predict(x) for clf in self._ensemble]
            supports_by_clf = [
                np.vstack(
                    [(predictions == clazz).T * weight for clazz in self._classes]
                ) for (weight, predictions) in zip(self._weights, predictions_by_clf)
            ]

            supports_sum_by_sample = sum(supports_by_clf)

            predictions = [self._classes[idx] for idx in np.argmax(supports_sum_by_sample, axis=0)]
    
        return predictions

In [475]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [526]:
@attrs(auto_attribs=True)
class OALE(BaseEstimator, ClassifierMixin):
    _classifier_provider: Callable[[List[Any], List[Any], List[Any]], Classifier] = scikit_gaussian_nb_provider
    _block_size: int = 30 # I
    _dynamics_clfs_limit: int = 9 # D
    _initial_selection_ratio: int = 0.5 # r
    _threshold_adjustment_step: int = attrib(default=0.5) # s
    _margin_threshold: int = attrib(default=1) # theta
    _labeling_strategy_provider: Callable[..., OALELabelingStrategy] = originalPaperLabelingStrategyProvider

    
#     _ensemble: List = attrib(factory=list, init=False)
    _classes: List = attrib(factory=list, init=False)
    
    _cache: List = attrib(factory=list, init=False) # of tuples (x,y)
    _processed_instances: int = attrib(default=0, init=False)
    _dynamic_clfs_count: int = attrib(default=0, init=False)
    _theta_m: int = attrib(default=0, init=False)
    
    _stable_clf: Optional[Classifier] = attrib(default=None, init=False)
    _stable_clf_weight: int = attrib(init=False, default=0.5)
    _dynamic_clfs_weights: List[int] = attrib(init=False)
    _dynamic_clfs: List[Classifier] = attrib(factory=list, init=False)
    
    @_dynamic_clfs_weights.default
    def init_dynamic_clfs_weights(self):
        return np.zeros(self._dynamics_clfs_limit + 1)
    
    ## SINGLE INSTANCE
    
    def partial_fit(self, X, y, classes=None):
        X, y = check_X_y(X, y)
        
        for x_single, y_single in zip(X, y):
            self._partial_fit(x_single, y_single, classes)
        
    def _partial_fit(self, x_new, y_new, classes=None):
            
        self._classes = classes
        if self._classes is None:
            self._classes, _ = np.unique(y, return_inverse=True)
        
        new_instance = (x_new, y_new)
        self._processed_instances += 1
        
        if self._processed_instances < self._block_size: # fill the circular array for the first time
            self._cache.append(new_instance)
            
        elif self._processed_instances == self._block_size: # the first fill of the circular array
            self._cache.append(new_instance)
            new_clf = self._create_new_classifier()
            self._dynamic_clfs_count = 1
            self._stable_clf = new_clf # create C_s
            self._dynamic_clfs = np.append(self._dynamic_clfs, deepcopy(new_clf)) #  create the first dynamic classifier
        else: # more instances processed than block size
            i = (self._processed_instances - 1)%self._block_size # i is the current index for a
            self._deal_instance(new_instance, i)
            i = (i + 1)%self._block_size  # i moves circularly
            if i == 0:  # new instances fill A again
                self._dynamic_clfs_count += 1
                new_clf = self._create_new_classifier()
                self._dynamic_clfs = np.append(self._dynamic_clfs, new_clf)
                self._theta_m = self._margin_threshold * 2/len(self._classes) # reset theta_m for UncertaintyStrategy
                if self._dynamic_clfs_count > self._dynamics_clfs_limit: 
                    self._dynamic_clfs = np.delete(self._dynamic_clfs, 0)
                self._update_weights()
        
#         for i in range(0, self._block_size - 1): # the array still have I instances to deal
#             print(f"iteration {i} block size {self._block_size}")
#             print(f"cache size {len(self._cache)}")
#             x = self._cache[i]
#             self._deal_instance(x, i)

        return self
            
    
    def predict(self, x):
        x = check_array(x)
        print(x)
        preds = WeightedMajorityPredictionCombiner(
            ensemble=np.concatenate(([self._stable_clf], self._dynamic_clfs)),
            weights=np.concatenate(([self._stable_clf_weight], self._dynamic_clfs_weights)),
            classes=self._classes)\
        .predict(x)
        
        print(preds)
        
        return preds
                
    def _update_classifier(self, x, y, clf):
        y = [y] if not isinstance(y, np.ndarray) else y
        x = np.array(x)
        x = np.array([x]) if x.ndim == 1 else x
        try:
            clf.partial_fit(x, y, self._classes)
        except Exception as e:
            raise BaseClassifierDoesNotSupportPartialFitting(e)
        
    def _get_randomly_chosen_instances_to_label(self):
        instances_to_label_count = int(self._initial_selection_ratio * len(self._cache))
        random_idxs = np.random.choice(len(self._cache), instances_to_label_count, replace=False)
        
        print(len(self._cache))
        print(random_idxs)
        print(self._cache[0])
        print(instances_to_label_count)
        
        return np.take(self._cache, random_idxs, axis=0)
        
    def _create_new_classifier(self):
        instances = self._get_randomly_chosen_instances_to_label()
        x = np.stack(instances[:, 0]) # TODO(bgulowaty): make this more elegant
        y = np.stack(instances[:, 1])
        if self._stable_clf != None:
            self._update_stable_classifier(x, y)
        
        return self._classifier_provider(x, y, self._classes)
        
        
    def _update_weights(self):
        self._stable_clf_weight = 0.5
        self._dynamic_clfs_weights = [current_weight * (1 - 1/self._dynamics_clfs_limit)
                                                        for current_weight in self._dynamic_clfs_weights]
        self._dynamic_clfs_weights[self._dynamics_clfs_limit] = 1/self._dynamics_clfs_limit
    
    
    def _update_stable_classifier(self, x, y):
        self._update_classifier(x, y, self._stable_clf)
            
    def _get_randomly_selected_cache_instances(self):
        random_instances_count = np.ceil(len(self._cache) * self._selection_ratio)
        instances_with_labels = np.random.choice(self._cache, random_instances_count)
        
        return map(list, zip(*instances_with_labels))
        
        
        
    def _update_dynamic_classifiers(self, x, y):
        for clf in self._dynamic_clfs:
            self._update_classifier(x, y, clf)
    
    
    def _deal_instance(self, new_instance, i):
        x, y = self._cache[i]
        
        ## TODO(bgulowaty)
        labeling_strategy = self._labeling_strategy_provider(0, 0, 0.5)
        
        labeling = labeling_strategy.label(x)
        
        if labeling is True:
            self._update_stable_classifier(x, y)
            self._update_dynamic_classifiers(x, y)
        
        self._cache[i] = new_instance


In [496]:
from strlearn.evaluators import PrequentialEvaluator, TestThenTrainEvaluator

In [467]:
nb = GaussianNB()
nb.fit([x], [y])

GaussianNB(priors=None, var_smoothing=1e-09)

In [469]:
nb.predict([x])

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


array([0])

In [454]:
ens.predict([x])

[0]

In [529]:
evaluator = TestThenTrainEvaluator()
stream = StreamGenerator(chunk_size=1, n_chunks=20000)
first_fit_size = 20
clf = OALE(block_size=first_fit_size)
for i in range(first_fit_size):
    x, y = stream.get_chunk()
    clf.partial_fit(x, y, classes=[0, 1])
evaluator.process(stream, clf)

20
[13  9  0 19 10  7  1 16 18 15]
(array([ 0.44323356, -0.22243235, -0.41011134, -0.7736701 ,  0.24789053,
       -0.55985413,  0.08614155, -1.39403497, -0.6323416 ,  0.16892831,
        1.57063048, -1.79853276,  0.75129384, -2.01594175, -0.29787713,
        0.37813792,  0.05318888, -0.24636633,  0.85935331,  0.87380524]), 1)
10
[[-0.52591772  0.33622082  2.17941229 -0.39876838  0.86046269 -0.70450029
   0.60056307 -0.29628586 -0.9647923  -0.07036195 -0.993891    0.77682733
  -0.85160291  0.62332828 -1.29236639 -0.13992957 -0.4938495  -0.85444435
  -0.60158826 -0.4464835 ]]
[0]


TypeError: only integer scalar arrays can be converted to a scalar index

In [499]:
evaluator.scores_

array([[[0.88      , 0.88141026],
        [0.88      , 0.88461538],
        [0.88      , 0.87820513],
        ...,
        [0.88      , 0.87820513],
        [0.92      , 0.91883117],
        [0.88      , 0.87337662]]])

In [452]:
ens.predict([x])

[0]

In [446]:
ens = OALE()
x, y = stream.get_chunk()
for x, y in zip(x, y):
    ens.partial_fit(x, y, [0, 1])

In [1]:
class BaseClassifierDoesNotSupportPartialFitting(Exception):
    """Provided base classifier does not support partial fitting"""

In [18]:
stream.get_chunk()

(array([[ 0.66357216,  0.4365599 , -0.95318728, ..., -0.1097405 ,
          1.15705383,  0.07174972],
        [ 1.00545866, -0.48418775, -0.08295778, ...,  1.13516185,
          0.52872851,  0.75917814],
        [-0.29395326, -0.52416437,  0.43990781, ..., -0.68436744,
         -1.13641842,  0.32664697],
        ...,
        [ 0.67810943,  0.24399256, -0.41654112, ..., -0.39413079,
         -1.31152763, -0.52695655],
        [-0.96570818, -0.02011439, -1.99643682, ...,  0.34816807,
          0.69816285, -0.00396668],
        [-2.4647308 , -1.11994915, -0.34874583, ..., -1.05118336,
          0.03149365,  0.47387064]]),
 array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
   