## Install river and scikit-learn

In [41]:
pip install "./river/"

Processing ./river
Building wheels for collected packages: river
  Building wheel for river (setup.py) ... [?25ldone
[?25h  Created wheel for river: filename=river-1-cp39-cp39-macosx_10_9_x86_64.whl size=1195855 sha256=0c8a09779057bbafa33e90de91481d1278a41217290b4388a9d61cb884e6dca2
  Stored in directory: /private/var/folders/s0/cs0fw3px6tx5srp431pdvjc80000gn/T/pip-ephem-wheel-cache-zut_xzcu/wheels/d1/6b/a3/83d8a39007debc0733461c491a8263c5af566254c5860a1a1a
Successfully built river
Installing collected packages: river
  Attempting uninstall: river
    Found existing installation: river 1
    Uninstalling river-1:
      Successfully uninstalled river-1
Successfully installed river-1
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install "./scikit-garden/"

Processing ./scikit-garden
Building wheels for collected packages: scikit-garden
  Building wheel for scikit-garden (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-garden: filename=scikit_garden-0.1.3-cp39-cp39-macosx_10_9_x86_64.whl size=530087 sha256=20fe418083ddcd27e2fd796fb87a854e33f904287695b8362e70b74fd4741980
  Stored in directory: /Users/brianburns/Library/Caches/pip/wheels/cc/96/10/f1f98c7ebcbb916ea7b0a696a32ae8d73b3ed876c5d2911053
Successfully built scikit-garden
Installing collected packages: scikit-garden
  Attempting uninstall: scikit-garden
    Found existing installation: scikit-garden 0.1.3
    Uninstalling scikit-garden-0.1.3:
      Successfully uninstalled scikit-garden-0.1.3
Successfully installed scikit-garden-0.1.3
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [1]:
from river import stream
from river.ensemble import AdaptiveRandomForestRegressorCP, AdaptiveRandomForestRegressorQRF
from skgarden.mondrian import RiverMondrianForestRegressor
import datetime
import os
import pickle
from datetime import datetime
import time
import math
import numpy as np
from river import synth
import copy

## Experiment functions

In [2]:
def get_target_variable(file_path):
    """
    Our data files have different names for the target variable.
    This returns the string name of the target variable of a dataset in an arff file.
    """
    data_stream = stream.iter_arff(file_path)
    v = next(iter(data_stream))
    return list(v[0].keys())[-1]

In [3]:
def load_datastream(arff_file):
    target_name = get_target_variable(arff_file)
    datastream = stream.iter_arff(arff_file, target = target_name)
    return datastream

In [4]:
def run_experiment(datastream, dataset_name, models_with_names, metrics_with_names, alpha, parameter_info):
    """
    datastream is a river stream object, not the path of an arff file.
    """
    # Ensure iteration doesn't exhaust datastream
    datastream = list(datastream)
    
    all_results = dict()
    all_performances = dict()
        
    for model, model_name in models_with_names:
        results = []
        performances = dict()
        time_start = time.process_time()
        i = 0
        for x,y in datastream:
            i += 1
            interval = model.predict_interval(x, alpha)
            y_hat = model.predict_one(x)
            results.append([x,y,y_hat, interval, alpha])
            model.learn_one(x,y)
            if i % 10000 == 0:
                print(i)

        time_end = time.process_time()
        time_elapsed = time_end - time_start
        
        for metric, metric_name in metrics_with_names:
            performances[metric_name] = metric(results)
        performances["time"] = time_elapsed
        
        all_results[model_name] = results
        all_performances[model_name] = performances

    # Save results and performances
    experiment_id = str(datetime.now()).replace(":","-").replace(".","-").replace(" ", "-")
    save_experiment(dataset_name, all_results, all_performances, alpha, experiment_id, parameter_info)

    return all_results, all_performances

In [5]:
def save_experiment(dataset_name, results, performances, alpha, experiment_id, parameter_info):
    for model_name in results.keys():
        filepath = "./results/" + dataset_name + "/" + model_name + "/" + experiment_id + "/"
        # Create directory for experiment, if it doesn't already exist
        os.makedirs(filepath, exist_ok=True)
        # save the result
        result = results[model_name]
        with open(filepath + "results.pckl", "wb") as pickle_file:
            pickle.dump(results, pickle_file)
        # save the metrics
        perf = performances[model_name]
        with open(filepath + "metrics.pckl", "wb") as pickle_file:
            pickle.dump(perf, pickle_file)
        with open(filepath + "parameters.txt","w") as params_file:
            params_file.writelines(parameter_info)

## Metrics

In [6]:
# results is a list with elements of the form [x, y, y_hat, interval, alpha]
def remove_inf_results(results):
    filtered_results = filter(lambda x: x[3][0] != -math.inf and x[3][1] != math.inf, results)
    return list(filtered_results)
    
def mean_error_rate(results):
    filtered_results = remove_inf_results(results)
    # sum instances where y isn't in confidence interval
    s = sum([x[1] < x[3][0] or x[1]> x[3][1] for x in filtered_results])
    n = len(filtered_results)
    return s/n
    

def relative_interval_size(results):
    filtered_results = remove_inf_results(results)
    y_vals = [item[1] for item in filtered_results]
    rho = max(y_vals) - min(y_vals)
    # sum length of intervals
    s = sum(x[3][1] - x[3][0] for x in filtered_results)
    n = len(filtered_results)
    return s/(rho*n)

    
def quantile_loss(results):
    filtered_results = remove_inf_results(results)
    alpha = results[0][4]
    a = alpha*relative_interval_size(results)
    def single_interval_loss(y, interval):
        return max(min(interval)-y, y- max(interval), 0)
    s = sum([single_interval_loss(x[1], x[3]) for x in filtered_results])
    y_vals = [item[1] for item in filtered_results]
    rho = max(y_vals) - min(y_vals)
    n = len(filtered_results)
    return a + (s/(n*rho))
         

def utility(results):
    filtered_results = remove_inf_results(results)
    alpha = results[0][4]
    gamma = 2*np.log(2)/alpha
    mer = mean_error_rate(results)
    ris = relative_interval_size(results)
    if mer <= alpha:
        return 1-ris
    else:
        return (1-ris)*np.exp(-gamma*(mer - alpha))

## Small-dataset experiments

In [7]:
experiment_parameters = """Experiment parameters

All river models
    - n_models = 10
    - max_features = "sqrt"
    - aggregation_method = "mean"
    - lambda_value = 1

OnlineCP
    - c_max = 1000
    - update_threshold = 50 (in case of CPExact)

OnlineQRF
    - K = 200

Mondrian forests
    - n_estimators = 10
    - min_samples_split = 2
"""

In [15]:
def instantiate_models():
    models_with_names = [
#         # Mondrian Forest 
#         (RiverMondrianForestRegressor(n_estimators = 10, min_samples_split =2),
#          "MondrianForest"),
        # CP Exact
#         (AdaptiveRandomForestRegressorCP(n_models = 10, max_features = "sqrt", 
#             aggregation_method = "mean", lambda_value = 1, cp_exact = True, c_max = 1000,
#             update_threshold = 50),
#           "CPExact")
        # OnlineQRF
        (AdaptiveRandomForestRegressorQRF(n_models = 10, max_features = "sqrt", 
            aggregation_method = "mean", lambda_value = 1, k_sketch = 200),
         "OnlineQRF"),
         
        # CP Approx
        (AdaptiveRandomForestRegressorCP(n_models = 10, max_features = "sqrt", 
            aggregation_method = "mean", lambda_value = 1, cp_exact = False, c_max = 1000),
         "CPApprox"),
    ]
    return models_with_names

In [8]:
metrics_with_names = [
        (mean_error_rate, "MER"),
        (relative_interval_size, "RIS"),
        (quantile_loss, "QL"),
        (utility, "Utility")
    ]

In [10]:
ds = list(load_datastream("data/stationary/2dplanes.arff"))
dataset_name = "2dplanes"
alpha = 0.1
models_with_names = instantiate_models()

In [198]:
res = run_experiment(ds[0:5000], dataset_name, models_with_names, metrics_with_names, alpha, experiment_parameters)

In [9]:
def run_small_experiments(n_repeats):
    datasets = os.listdir("data/stationary/")
    datasets.remove(".DS_Store")
    for i in range(n_repeats):
        for dataset in datasets:
            print(dataset)
            models_with_names = instantiate_models()
            alpha = 0.1
            datastream = list(load_datastream("data/stationary/" + dataset))
            dataset_name = dataset.replace(".arff","")
            run_experiment(datastream, dataset_name, models_with_names, metrics_with_names, alpha, experiment_parameters)

In [17]:
run_small_experiments(9)

yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsP

## Concept-drift experiments

In [11]:
# Experiment with Friedman data

In [12]:
# Functions which introduce concept-drift to an underlying Friedman datastream of 10 features

In [10]:
def cd_permute(datapoint:dict):
    d = copy.deepcopy(datapoint)
    d[0] = datapoint[3]
    d[1] = datapoint[4]
    d[2] = datapoint[1]
    d[3] = datapoint[0]
    d[4] = datapoint[2]
    return d

In [11]:
def cd_dataset(dataset):
    """
    Permute the positions of the independent variables starting 25% through the dataset.
    Revert to original permutation of independent variables starting 75% through the dataset.
    """
    n = len(dataset)
    n_start = int(0.25*n)
    n_end = int(0.75*n)
    dataset_new = copy.deepcopy(dataset)
    for i in range(n_start, n_end):
        dataset_new[i] = (cd_permute(dataset_new[i][0]), dataset_new[i][1])
    return dataset_new

In [12]:
# Load the Friedman data

In [13]:
datastream = synth.Friedman(seed=28)
datastream = list(datastream.take(1000000))
cd_datastream = cd_dataset(datastream)

In [17]:
# On large concept-drift data, we evaluate metrics on tumbling windows of size 10000.
# We modify our metrics to be evaluated on arithmetically-increasing subsets of our data.

In [14]:
# Given a metric, returns function which computes that metric on arithmetically-increasing
# subsets of the data, with tumbling window = window_size.
def tumbling_metric(metric, window_size):
    def t_metric(results):
        metric_results = []
        for i in range(window_size, len(results), window_size):
            perf = metric(results[i - window_size:i])
            metric_results.append([i, perf])
        return metric_results
    return t_metric

In [16]:
dataset_name = "cd_friedman1_global_abrupt"
models_with_names = instantiate_models()
metrics_with_names = [
        (tumbling_metric(mean_error_rate, 10000), "MER"),
        (tumbling_metric(relative_interval_size, 10000), "RIS"),
        (tumbling_metric(quantile_loss, 10000), "QL"),
        (tumbling_metric(utility, 10000), "Utility")
    ]
alpha = 0.1
parameter_info = """Experiment parameters

All river models
    - n_models = 10
    - max_features = "sqrt"
    - aggregation_method = "mean"
    - lambda_value = 1

OnlineCP
    - c_max = 1000
    - update_threshold = 50 (in case of CPExact)

OnlineQRF
    - K = 200

Mondrian forests
    - n_estimators = 10
    - min_samples_split = 2
"""

In [17]:
s = run_experiment(cd_datastream, dataset_name, models_with_names, metrics_with_names, alpha, parameter_info)

10000


KeyboardInterrupt: 

In [105]:
s[1]

{'MondrianForest': {'MER': [[10000, 0.0407],
   [20000, 0.0255],
   [30000, 0.0222],
   [40000, 0.0239],
   [50000, 0.0234],
   [60000, 0.0207],
   [70000, 0.0211],
   [80000, 0.017],
   [90000, 0.0175],
   [100000, 0.0182],
   [110000, 0.0206],
   [120000, 0.0172],
   [130000, 0.0201],
   [140000, 0.0179],
   [150000, 0.0178],
   [160000, 0.0163],
   [170000, 0.0194],
   [180000, 0.0202],
   [190000, 0.0171],
   [200000, 0.0163],
   [210000, 0.0174],
   [220000, 0.018],
   [230000, 0.0194],
   [240000, 0.0174],
   [250000, 0.0144],
   [260000, 0.1989],
   [270000, 0.1844],
   [280000, 0.1743],
   [290000, 0.1608],
   [300000, 0.1524],
   [310000, 0.1316],
   [320000, 0.1299],
   [330000, 0.1151],
   [340000, 0.1123],
   [350000, 0.0992],
   [360000, 0.091],
   [370000, 0.0884],
   [380000, 0.0893],
   [390000, 0.0779],
   [400000, 0.0723],
   [410000, 0.0749],
   [420000, 0.0689],
   [430000, 0.0622],
   [440000, 0.0611],
   [450000, 0.0566],
   [460000, 0.0532],
   [470000, 0.0501],


In [110]:
cd_datastream[300000]

({0: 0.016410949070087333,
  1: 0.2782558469335449,
  2: 0.8185684229585304,
  3: 0.7683679899654309,
  4: 0.2148203408429732,
  5: 0.984567363665633,
  6: 0.5287647187545143,
  7: 0.7676221057186963,
  8: 0.4693229444473781,
  9: 0.28530809435445836},
 13.320901360071973)

In [111]:
datastream[300000]

({0: 0.7683679899654309,
  1: 0.8185684229585304,
  2: 0.2148203408429732,
  3: 0.016410949070087333,
  4: 0.2782558469335449,
  5: 0.984567363665633,
  6: 0.5287647187545143,
  7: 0.7676221057186963,
  8: 0.4693229444473781,
  9: 0.28530809435445836},
 13.320901360071973)