In [None]:
import copy
import gc
import tracemalloc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import dill

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import linear_model as sk_linear_model
from sklearn import pipeline as sk_pipeline
from sklearn import preprocessing as sk_preprocessing
from sklearn import metrics as sk_metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from pyspotstream.data.OPM import load_opm
from pyspotstream.misc.drift_generator import generate_drift
from pyspotstream.eval.evaluation import baseline_batch_experiment
from pyspotstream.eval.evaluation import eval_multiple_instances
from pyspotstream.plot.plotmulti import plot_multiple_instances_results
from pyspotstream.eval.evaluation import eval_single_instances
from pyspotstream.data.OPM import get_opm, load_opm

from river import tree as river_tree
from river import compose as river_compose
from river import linear_model as river_linear_model
from river import preprocessing as river_preprocessing
from river import feature_extraction as river_feature_extraction
from river import neural_net as river_nn
from river import optim as river_optim
from river import stream as river_stream
from river import ensemble as river_ensemble
from river import drift as river_drift

%matplotlib inline

# Seed NumPy's legacy global random number generator with a fixed value.
np.random.seed(42)

# Warning filters: the two calls below suppress whole warning categories for
# the remainder of the kernel session.
import warnings
from sklearn import exceptions
# Hide scikit-learn's ConvergenceWarning.
warnings.filterwarnings(action='ignore', category=exceptions.ConvergenceWarning)
# NOTE: this hides *every* RuntimeWarning, not only those raised by the models
# in this notebook, so numerical problems elsewhere will go unnoticed as well.
warnings.filterwarnings(action='ignore', category=RuntimeWarning)

In [None]:
# NOTE(review): presumably makes the OPM data set available locally and, with
# overwrite=False, keeps an already existing copy - confirm against the
# pyspotstream documentation.
get_opm(overwrite=False)

# 1 Load OPM

In [None]:
# Load the numerical OPM features and the target.
X_train, y_train = load_opm(
    data_type="num",
    n=10000,
    sorted=True,
    verbose=False,
)
# The target comes back as a one-column DataFrame; squeeze() turns it into a Series.
y_train = y_train.squeeze()
X_train


# 2 Generate Drift

In [None]:
# Copy first, so the data returned by load_opm is not touched by the drift below.
X_d = copy.deepcopy(X_train)
y_d = np.array(copy.deepcopy(y_train)).reshape(-1,)

# Drift factors handed to generate_drift.
drift = [1.1, 10.0, 0.1, 1.0]

# NOTE(review): mr is presumably one multiplier per row of X_train - confirm
# against generate_drift.
mr = generate_drift(X_train, drift)

# Apply the same multipliers to the "Assessed Value" feature and to the target.
X_d["Assessed Value"] = X_d["Assessed Value"] * mr
y_d = y_d * mr

## Plot the Data with drift:

In [None]:
# Drifted target plotted against the sample index.
x_1 = range(X_d.shape[0])
plt.plot(
    x_1,
    y_d,
    color='grey',
    marker='o',
    linestyle='dashed',
    linewidth=.2,
    markersize=.1,
)
# Red vertical lines at 1/4, 2/4 and 3/4 of the sample count.
for i in (1, 2, 3):
    plt.axvline(x=i * (X_d.shape[0] // 4), color='r', linestyle='-', linewidth=1)

In [None]:
# From this cell on, X_train / y_train refer to the *drifted* data.
# NOTE: the drift cell above reads X_train / y_train, so re-running it after
# this cell would apply the drift on top of already drifted data. Restart the
# kernel and run top to bottom to get reproducible results.
X_train = copy.deepcopy(X_d)
# y_d is a NumPy array here, so the new Series gets a fresh default index.
y_train = pd.Series(copy.deepcopy(y_d))
y_train

# 3 Selection of the metric

* For example `MAE` for regression and `Accuracy` for classification. 
* Should be implemented as an argument of the function that calls the experiments (below).

In [None]:
# Scoring function passed as `metric` to the evaluation calls in this notebook.
metric = sk_metrics.mean_absolute_error
# Axis label for the score plots. It is set explicitly because
# metric.__class__.__name__ only yields "function" for a plain function.
metric_name = "MAE"

# 4 Experimente Batch ML

* The function `baseline_batch_experiment()` implements the classic batch ML evaluation:
  * Die gegebenen N Samples werden in ein Training- und ein Testset unterteilt.
  * Das Modell wird einmal auf dem Trainingsdaten trainiert und erstellt dann Vorhersagen auf den Testdaten
  * Here: 10 % are used for training and 90 % for testing (as default).

## 4.1 Batch Decision Tree Regressor

* Simple example how to call the batch eval with decision trees.
* Should be available for other learners as well.

In [None]:
# baseline_batch_experiment is already imported in the first cell; the
# redundant re-import was removed so all imports stay in one place.

# Work on copies of the (drifted) training data.
X = copy.deepcopy(X_train)
Y = copy.deepcopy(y_train)
model = DecisionTreeRegressor(random_state=0)

# The four returned values are kept as the decision tree baseline
# (time, MAE, memory and model, going by the target names).
dtr_time, dtr_mae, dtr_mem, dtr_model = baseline_batch_experiment(X, Y, model)

## 4.2 Batch Random Forest

* Second simple example how to call the batch eval with random forest.
* Should be available for other learners as well.

In [None]:
# baseline_batch_experiment is already imported in the first cell; the
# redundant re-import was removed so all imports stay in one place.

# Work on copies of the (drifted) training data.
X = copy.deepcopy(X_train)
Y = copy.deepcopy(y_train)

model = RandomForestRegressor(n_estimators=100, random_state=0)

# The four returned values are kept as the random forest baseline
# (time, MAE, memory and model, going by the target names).
rf_time, rf_mae, rf_mem, rf_model = baseline_batch_experiment(X, Y, model)

## 4.3 Select Default Batch Results: dt or rf

In [None]:
# Switch for the batch baseline shown in the plots below:
# True -> decision tree results, False -> random forest results.
DT = True
batch_mae, batch_time, batch_mem = (
    (dtr_mae, dtr_time, dtr_mem) if DT else (rf_mae, rf_time, rf_mem)
)

# 5 Mini-Batch Learning 

* Synonym: Multiple Instance Learning
* Mini-batch learning is implemented via the function `eval_multiple_instances`.

### lin- und logspace

* Standardmäßig wird eine lineare, äquidistante Partitionierung vorgenommen (`linspace`). Alternativ kann auch eine logarithmisch skalierte Partitionierung durchgeführt werden (`logspace`).

### Modelle

* Als Batch Lerner werden die Modelle aus `scikit-learn` verwendet:
  * Klassifikation:
    * LogisticRegression
    * DecisionTreeClassifier
  * Regression:
    * LinearRegression
    * SGDRegressor

* Aus dem `river` Paket werden die folgenden Modelle verwendet:
  * MLPRegressor
  * LinearRegression 
* These models perform really poorly; see the `river` documentation.

* Die Variable `m_river` legt fest, wie die Modelle gebaut (gefitted) und zur Vorhersage verwendet werden (predict).
* Dies geschieht in `scikit-learn` anders als in `river`:
* Die `scikit-learn` Modelle verwenden die Methoden
  * `fit(x,y)` und 
  * `predict(x)`.
* Die `river` Modelle verwenden die Methoden
  * `learn_many(x,y)` und
  * `predict_many(x)`.

In [None]:
# Leftover inspection cell: shows the type of whatever `model` currently is.
# In a top-to-bottom run that is the RandomForestRegressor from section 4.2.
type(model)

In [None]:
start_time = time.time()
# Pipeline: standardize the features, then fit a linear regression.
model = sk_pipeline.make_pipeline(sk_preprocessing.StandardScaler(),
                                  sk_linear_model.LinearRegression())

# The fourth return value is not needed here and is discarded.
m_LinearRegression_times, m_LinearRegression_scores, m_LinearRegression_mem, _ = eval_multiple_instances(X=X_train, y=y_train, model=model)
m_LinearRegression_runtime = time.time() - start_time
# Fixed: the old message mixed a "%s" placeholder with an f-string and
# therefore printed a literal "%s" followed by the number.
print(f"--- {m_LinearRegression_runtime} seconds ---")

In [None]:
start_time = time.time()
# Pipeline: standardize the features, then fit an SGD regressor
# (max_iter=1 and tol=1e-3 as configured below).
model = sk_pipeline.make_pipeline(sk_preprocessing.StandardScaler(),
                                  sk_linear_model.SGDRegressor(max_iter=1, tol=1e-3))

# The fourth return value is not needed here and is discarded.
m_SGDRegressor_times, m_SGDRegressor_scores, m_SGDRegressor_mem, _ = eval_multiple_instances(X=X_train, y=y_train, model=model)
m_SGDRegressor_runtime = time.time() - start_time
# Fixed: the old message mixed a "%s" placeholder with an f-string and
# therefore printed a literal "%s" followed by the number.
print(f"--- {m_SGDRegressor_runtime} seconds ---")

In [None]:
start_time = time.time()
# Same SGD regressor as in the previous cell, but with min-max scaling
# instead of standardization.
model = sk_pipeline.make_pipeline(sk_preprocessing.MinMaxScaler(),
                                  sk_linear_model.SGDRegressor(max_iter=1, tol=1e-3))

# The fourth return value is not needed here and is discarded.
m_SGDRegressor_MinMaxScaler_times, m_SGDRegressor_MinMaxScaler_scores, m_SGDRegressor_MinMaxScaler_mem, _ = eval_multiple_instances(X=X_train, y=y_train, model=model)
m_SGDRegressor_MinMaxScaler_runtime = time.time() - start_time
# Fixed: the old message mixed a "%s" placeholder with an f-string and
# therefore printed a literal "%s" followed by the number.
print(f"--- {m_SGDRegressor_MinMaxScaler_runtime} seconds ---")

In [None]:
start_time = time.time()
# Pipeline: standardize the features, then fit a decision tree regressor.
model = sk_pipeline.make_pipeline(sk_preprocessing.StandardScaler(),
                                  DecisionTreeRegressor(random_state=0))

# The fourth return value is not needed here and is discarded.
m_DTRegressor_times, m_DTRegressor_scores, m_DTRegressor_mem, _ = eval_multiple_instances(X=X_train, y=y_train, model=model)
m_DTRegressor_runtime = time.time() - start_time
# Fixed: the old message mixed a "%s" placeholder with an f-string and
# therefore printed a literal "%s" followed by the number.
print(f"--- {m_DTRegressor_runtime} seconds ---")

### Achtung: Der folgende Code benötigt bereits bei einer Sample Größe von 10000 ca. 2 Minuten!

In [None]:
# Opt-in switch: per the note above, this run already takes about two minutes
# for 10000 samples, so it is disabled by default.
RF_MINIBATCH = False
if RF_MINIBATCH:
    start_time = time.time()
    m_RFRegressor_name = "RFRegressor"
    # NOTE(review): the model is given as the string 'RFRegressor', while the
    # other eval_multiple_instances calls in this notebook pass estimator
    # objects - confirm that string identifiers are still accepted.
    m_RFRegressor_times, m_RFRegressor_scores, m_RFRegressor_mem, _ = eval_multiple_instances(X=X_train, y=y_train, model='RFRegressor', x_part='linspace', metric=metric)
    m_RFRegressor_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_RFRegressor_runtime} seconds ---")

## Mini Batch Lerner aus `river`

* Nicht sonderlich geeignet.

In [None]:
# Opt-in switch for the mini-batch MLP regressor run (off by default).
MLPR = False
if MLPR:
    m_MLPR_name = "MLPR"
    # NOTE(review): the model is selected by the string 'MLPR' - confirm that
    # eval_multiple_instances still accepts string identifiers.
    (
        m_MLPR_times,
        m_MLPR_scores,
        m_MLPR_mem,
        _,
    ) = eval_multiple_instances(
        X=X_train,
        y=y_train,
        model='MLPR',
        x_part='linspace',
        metric=metric,
    )

In [None]:
# Opt-in switch for the mini-batch river linear regression run (off by default).
river_Batch_Regression = False
if river_Batch_Regression:
    m_river_LinearRegression_name = "river_LinearRegression"
    # NOTE: the m_river_LinearRegression_* names written here are also written
    # by the single-instance cell guarded by RIVER_LM further below.
    (
        m_river_LinearRegression_times,
        m_river_LinearRegression_scores,
        m_river_LinearRegression_mem,
        _,
    ) = eval_multiple_instances(
        X=X_train,
        y=y_train,
        model='river_LinearRegression',
        x_part='linspace',
        metric=metric,
    )

# 6 The Plot function 

* Please implement this as a function of the pyspotstream package

## 6.1 Function Definition

## 6.2 Calling the plot function

## 6.2.1 Plotting MAE

In [None]:
# plot_multiple_instances_results is already imported in the first cell; the
# redundant re-import was removed so all imports stay in one place.

# Scores of the mini-batch learners, keyed by the label used in the plot.
algorithm_scores_dict = {
    "LinearRegression": m_LinearRegression_scores,
    "SGDRegressor": m_SGDRegressor_scores,
    "SGDRegressor_MinMaxScaler": m_SGDRegressor_MinMaxScaler_scores,
    "DTRegressor": m_DTRegressor_scores,
}

# The batch baseline MAE is passed as the `default` value.
plot_multiple_instances_results(algorithm_scores_dict, y_label="MAE", default=batch_mae)

## 6.2.2 Plotting Time

In [None]:
# Timings of the mini-batch learners, keyed by the label used in the plot.
algorithm_times_dict = {
    "LinearRegression": m_LinearRegression_times,
    "SGDRegressor": m_SGDRegressor_times,
    "SGDRegressor_MinMaxScaler": m_SGDRegressor_MinMaxScaler_times,
    "DTRegressor": m_DTRegressor_times,
}

# The batch baseline time is passed as the `default` value; log-scaled y axis.
plot_multiple_instances_results(
    algorithm_times_dict,
    y_label="Number of Seconds",
    default=batch_time,
    log_y=True,
)

## 6.2.3 Plotting Mem



In [None]:
# Memory figures of the mini-batch learners, keyed by the label used in the plot.
algorithm_mem_dict = {
    "LinearRegression": m_LinearRegression_mem,
    "SGDRegressor": m_SGDRegressor_mem,
    "SGDRegressor_MinMaxScaler": m_SGDRegressor_MinMaxScaler_mem,
    "DTRegressor": m_DTRegressor_mem,
}

# The batch baseline memory is passed as the `default` value.
plot_multiple_instances_results(
    algorithm_mem_dict,
    y_label="Peak Memory, kB",
    default=batch_mem,
)

# 7 Online Machine Learning

* Single Instance Learning oder Interleaved test-then-train

## 7.1 Die Methode `eval_single_instances()`

In [None]:
# eval_single_instances is already imported in the first cell; the redundant
# re-import was removed so all imports stay in one place.

# Demonstration call with a river linear regression.
m_river_LinearRegression = river_linear_model.LinearRegression(intercept_lr=.1)
# NOTE: this rebinds the global `model` to the fourth return value.
model_times, model_scores, model_mem, model = eval_single_instances(X=X_train,
                                                                    y=y_train,
                                                                    model=m_river_LinearRegression,
                                                                    metric=metric,
                                                                    task="reg")

## 7.2 Daten

* OPM, Task: Regression, no missing values,  no categorical values, d.h. derselbe Datensatz wie für C.MI.ETM.1 und C.SI.ETM.1
* Es wird der komplette Datensatz verwendet, keine explizite Aufteilung in test, validation, training data.


In [None]:
# NOTE: inactive alternative data source (California Housing), kept only for
# reference. The notebook uses the OPM data loaded in section 1; this cell can
# be removed once the alternative is no longer needed.
# dataset = California_Housing(data_format='kaggle', binary_clf=False, drop_nan= True, drop_categorical=True)
# X = dataset.data.data
# y = dataset.data.target

## 7.3 Preprocessing

* Either StandardScaler or MinmaxScaler

In [None]:
# river's standard scaler.
# NOTE(review): `scaler` is not referenced by any later cell visible in this
# notebook - either compose it with the models or remove it.
scaler = river_preprocessing.StandardScaler()

## 7.4 Models

* Users specify models that can be passed to the evaluation function `eval_single_instances()`
* This is much better than `eval_multiple_instances()` from above

In [None]:
# Online (river) models that the cells in section 7.5 pass to
# eval_single_instances(). Each model is created fresh here.

# Linear regression baseline.
m_river_LinearRegression = river_linear_model.LinearRegression(intercept_lr=.1)

# NOTE: despite the "HTR" name this is a Hoeffding *Adaptive* Tree regressor.
m_river_HTR = river_tree.HoeffdingAdaptiveTreeRegressor(grace_period=50,
                                                        model_selector_decay=0.3,
                                                        seed=0)

# Adaptive random forest with default settings.
m_river_RF = river_ensemble.AdaptiveRandomForestRegressor(seed=42)

# Hoeffding adaptive tree with depth limit, EDDM drift detector and EBST splitter.
m_river_HATR = river_tree.HoeffdingAdaptiveTreeRegressor(max_depth=30,
                                                         drift_detector=river_drift.EDDM(warm_start=30),
                                                         grace_period=50,
                                                         splitter=river_tree.splitter.EBSTSplitter(),
                                                         seed=0)

# Plain (non-adaptive) Hoeffding tree with depth limit and EBST splitter.
m_river_HTR_2 = river_tree.HoeffdingTreeRegressor(max_depth=30, splitter=river_tree.splitter.EBSTSplitter())

# Adaptive random forest with depth limit, EBST splitter and ADWIN drift detector.
m_river_ARFR = river_ensemble.AdaptiveRandomForestRegressor(max_depth=20,
                                                            splitter=river_tree.splitter.EBSTSplitter(),
                                                            drift_detector=river_drift.ADWIN(),
                                                            seed=0)

# Streaming random patches ensemble built on a Hoeffding tree base learner.
# Fixed: the former chained assignment `m_river_SRP = model = ...` also rebound
# the global `model`, which earlier cells use for other estimators.
base_model = river_tree.HoeffdingTreeRegressor(grace_period=50)
m_river_SRP = river_ensemble.SRPRegressor(
    model=base_model,
    training_method="patches",
    n_models=3,
    seed=42)

## 7.5 Running the experiments: OML, Single Instance 

* Aufruf von `eval_single_instances()`

In [None]:
# Opt-in switch for the online linear regression run (off by default).
RIVER_LM = False
if RIVER_LM:
    m_river_LinearRegression_name = "m_river_LinearRegression"
    (
        m_river_LinearRegression_times,
        m_river_LinearRegression_scores,
        m_river_LinearRegression_mem,
        m_river_LinearRegression_model,
    ) = eval_single_instances(
        X=X_train, y=y_train, model=m_river_LinearRegression, metric=metric, task="reg"
    )

In [None]:
# Opt-in switch for the online run of m_river_HTR (off by default).
RIVER_HTR = False
if RIVER_HTR:
    start_time = time.time()
    m_river_HTR_name = "m_river_HTR"
    # Fixed: the fourth return value used to be stored in
    # m_river_LinearRegression_model (copy-paste slip), which clobbered the
    # linear regression result and left no m_river_HTR_model.
    m_river_HTR_times, m_river_HTR_scores, m_river_HTR_mem, m_river_HTR_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_HTR,
        metric=metric,
        task="reg")
    m_river_HTR_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_HTR_runtime} seconds ---")

In [None]:
# Opt-in switch for the online adaptive random forest run (off by default).
RIVER_RF = False
if RIVER_RF:
    start_time = time.time()
    m_river_RF_name = "m_river_Random_ForestRegr"
    m_river_RF_times, m_river_RF_scores, m_river_RF_mem, m_river_RF_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_RF,
        metric=metric,
        task="reg")
    m_river_RF_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_RF_runtime} seconds ---")

In [None]:
# Opt-in switch for the online streaming random patches run (off by default).
RIVER_SRP = False
if RIVER_SRP:
    start_time = time.time()
    m_river_SRP_name ="m_river_SRP: Streaming Random Patches ensemble"
    m_river_SRP_times, m_river_SRP_scores, m_river_SRP_mem, m_river_SRP_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_SRP,
        metric=metric,
        task="reg")
    m_river_SRP_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_SRP_runtime} seconds ---")

In [None]:
# The only online run enabled by default; the result plots and the global
# comparison below read its m_river_HATR_* results.
RIVER_HATR = True
if RIVER_HATR:
    start_time = time.time()
    m_river_HATR_name ="m_river_HATR"
    m_river_HATR_times, m_river_HATR_scores, m_river_HATR_mem, m_river_HATR_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_HATR,
        metric=metric,
        task="reg")
    m_river_HATR_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_HATR_runtime} seconds ---")

In [None]:
# Opt-in switch for the online run of the plain Hoeffding tree (off by default).
RIVER_HTR_2 = False
if RIVER_HTR_2:
    start_time = time.time()
    m_river_HTR_2_name ="m_river_HTR_2"
    m_river_HTR_2_times, m_river_HTR_2_scores, m_river_HTR_2_mem, m_river_HTR_2_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_HTR_2,
        metric=metric,
        task="reg")
    m_river_HTR_2_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_HTR_2_runtime} seconds ---")

In [None]:
# Opt-in switch for the online run of the tuned adaptive random forest (off by default).
RIVER_ARFR = False
if RIVER_ARFR:
    start_time = time.time()
    m_river_ARFR_name ="m_river_ARFR"
    m_river_ARFR_times, m_river_ARFR_scores, m_river_ARFR_mem, m_river_ARFR_model = eval_single_instances(
        X=X_train,
        y=y_train,
        model=m_river_ARFR,
        metric=metric,
        task="reg")
    m_river_ARFR_runtime = time.time() - start_time
    # Report the stored runtime instead of taking a second, later measurement.
    print(f"--- {m_river_ARFR_runtime} seconds ---")

## Ergebnisdarstellung

* Aufruf von `plot_multiple_instances_results()`

In [None]:
# Scores of the online learner(s), keyed by the label used in the plot.
# Requires the RIVER_HATR cell above to have run.
online_score_dict = {m_river_HATR_name: m_river_HATR_scores}

# Fixed: the score plot used the batch *memory* value (batch_mem) as its
# `default`; the matching baseline for a score plot is the batch MAE.
plot_multiple_instances_results(online_score_dict, y_label=metric_name, default=batch_mae)

In [None]:
# Timings of the online learner(s), keyed by the label used in the plot.
online_time_dict = {
    m_river_HATR_name: m_river_HATR_times,
}

# The batch baseline time is passed as the `default` value; log-scaled y axis.
plot_multiple_instances_results(
    online_time_dict,
    y_label="Number of Seconds",
    default=batch_time,
    log_y=True,
)

In [None]:
# Memory figures of the online learner(s), keyed by the label used in the plot.
online_mem_dict = {m_river_HATR_name: m_river_HATR_mem}

# Fixed: the memory plot used the batch *time* value (batch_time) as its
# `default`; the matching baseline for a memory plot is batch_mem.
plot_multiple_instances_results(online_mem_dict, y_label="Peak Memory, kB", default=batch_mem)

# 8 Global Comparison

## 8.1 Performance

In [None]:
# Merge mini-batch and online scores into one mapping for the joint plot.
sl = algorithm_scores_dict | online_score_dict # The dict union operator `|` requires Python >= 3.9

In [None]:
# Fixed: the score comparison used the batch *time* value (batch_time) as its
# `default`; the matching baseline for a score plot is the batch MAE.
plot_multiple_instances_results(sl, y_label=metric_name, default=batch_mae, log_y=True)

## 8.2 Zeit

In [None]:
# Merge mini-batch and online timings (dict union requires Python >= 3.9).
tl = algorithm_times_dict | online_time_dict

# The batch baseline time is passed as the `default` value; log-scaled y axis
# and an empty marker string.
plot_multiple_instances_results(
    tl,
    y_label="Number of Seconds",
    default=batch_time,
    log_y=True,
    marker="",
)

## 8.3 Speicher

In [None]:
# Merge mini-batch and online memory figures (dict union requires Python >= 3.9).
ml = algorithm_mem_dict | online_mem_dict

# The batch baseline memory is passed as the `default` value; log-scaled y axis.
plot_multiple_instances_results(
    ml,
    y_label="Peak Memory, kB",
    default=batch_mem,
    log_y=True,
)

## 8.4 Draw Hoeffding Trees

In [None]:
# Render the Hoeffding adaptive tree.
# NOTE(review): this draws the object defined in section 7.4, not the
# m_river_HATR_model returned by eval_single_instances - confirm that the
# evaluation trains the passed model in place, otherwise draw the returned one.
m_river_HATR.draw()

In [None]:
# Show the `data` attribute of the adaptive random forest.
# NOTE(review): with RIVER_RF = False above, m_river_RF is never passed to
# eval_single_instances, so this shows the state of an untrained ensemble.
m_river_RF.data

# Anhang

## Sicherung der Ergebnisse mit `dill`

In [None]:
# Save the whole interpreter session to a time-stamped .dill file in the
# current working directory.
_timestamp = time.strftime("%Y%m%d-%H%M%S")
file_name = f"40_OPM{_timestamp}.dill"
dill.dump_session(file_name)

In [None]:
# Opt-in switch: restore a previously saved session instead of recomputing.
RELOAD = False
# Adjust the file name (including its timestamp) to the session to be restored.
# NOTE: only load session files you created yourself - dill, like pickle, can
# execute arbitrary code while loading.
if RELOAD:
    import dill
    dill.load_session("33_OPM20221122-173554.dill")