## Install river and scikit-garden

In [1]:
pip install "./river/"

Processing ./river
Building wheels for collected packages: river
  Building wheel for river (setup.py) ... [?25ldone
[?25h  Created wheel for river: filename=river-1-cp39-cp39-macosx_10_9_x86_64.whl size=1195780 sha256=063aa1c6d588d54b70eda4477e5abfdff7e353a56e9b8ac3353f4324ecf9cc03
  Stored in directory: /private/var/folders/s0/cs0fw3px6tx5srp431pdvjc80000gn/T/pip-ephem-wheel-cache-v3adhme2/wheels/d1/6b/a3/83d8a39007debc0733461c491a8263c5af566254c5860a1a1a
Successfully built river
Installing collected packages: river
  Attempting uninstall: river
    Found existing installation: river 1
    Uninstalling river-1:
      Successfully uninstalled river-1
Successfully installed river-1
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install "./scikit-garden/"

Processing ./scikit-garden
Building wheels for collected packages: scikit-garden
  Building wheel for scikit-garden (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-garden: filename=scikit_garden-0.1.3-cp39-cp39-macosx_10_9_x86_64.whl size=530087 sha256=20fe418083ddcd27e2fd796fb87a854e33f904287695b8362e70b74fd4741980
  Stored in directory: /Users/brianburns/Library/Caches/pip/wheels/cc/96/10/f1f98c7ebcbb916ea7b0a696a32ae8d73b3ed876c5d2911053
Successfully built scikit-garden
Installing collected packages: scikit-garden
  Attempting uninstall: scikit-garden
    Found existing installation: scikit-garden 0.1.3
    Uninstalling scikit-garden-0.1.3:
      Successfully uninstalled scikit-garden-0.1.3
Successfully installed scikit-garden-0.1.3
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
from river import stream
from river.ensemble import AdaptiveRandomForestRegressorCP, AdaptiveRandomForestRegressorQRF
from skgarden.mondrian import RiverMondrianForestRegressor
import datetime
import os
import pickle
from datetime import datetime
import time
import math
import numpy as np

## Experiment functions

In [3]:
def get_target_variable(file_path):
    """
    Return the name of the target variable stored in an arff file.

    The data files do not share a common target name, so the name is looked
    up from the data itself: the first record is read without declaring a
    target, and the last key of its feature dict is returned.
    """
    first_record = next(iter(stream.iter_arff(file_path)))
    features = first_record[0]
    return list(features.keys())[-1]

In [4]:
def load_datastream(arff_file):
    """
    Open an arff file as a river stream.

    The target column is whatever get_target_variable reports for the file.
    """
    return stream.iter_arff(arff_file, target=get_target_variable(arff_file))

In [15]:
def run_experiment(datastream, dataset_name, models_with_names, metrics_with_names, alpha, parameter_info, save=False):
    """
    Evaluate every model on the same stream and compute every metric on its output.

    For each sample the model is first asked for an interval and a point
    prediction, and only afterwards learns from the sample.

    Parameters
    ----------
    datastream : iterable of (x, y) pairs (e.g. a river stream or a list),
        not the path of an arff file.
    dataset_name : str, only used when saving.
    models_with_names : iterable of (model, model_name). Each model must provide
        predict_interval(x, alpha), predict_one(x) and learn_one(x, y).
        The models are trained in place.
    metrics_with_names : iterable of (metric, metric_name). Each metric is
        called with the list of result rows of one model.
    alpha : passed to predict_interval and stored in every result row.
    parameter_info : free text describing the run, only used when saving.
    save : if True, persist the outcome with save_experiment under a
        timestamp-based experiment id. Defaults to False (nothing is written).

    Returns
    -------
    (all_results, all_performances) : two dicts keyed by model name.
        all_results[name] is a list of rows [x, y, y_hat, interval, alpha];
        all_performances[name] maps metric names to values, plus "time",
        the CPU time (time.process_time) spent in the predict/learn loop.
    """
    # Materialise the stream once: a one-shot iterator would otherwise be
    # exhausted after the first model.
    samples = list(datastream)

    all_results = dict()
    all_performances = dict()

    for model, model_name in models_with_names:
        results = []
        time_start = time.process_time()
        for x, y in samples:
            # Predict before learning so the sample is unseen at prediction time
            interval = model.predict_interval(x, alpha)
            y_hat = model.predict_one(x)
            results.append([x, y, y_hat, interval, alpha])
            model.learn_one(x, y)
        time_elapsed = time.process_time() - time_start

        performances = {
            metric_name: metric(results) for metric, metric_name in metrics_with_names
        }
        performances["time"] = time_elapsed

        all_results[model_name] = results
        all_performances[model_name] = performances

    if save:
        # The id is only needed for the output directory name, so build it lazily
        experiment_id = str(datetime.now()).replace(":", "-").replace(".", "-").replace(" ", "-")
        save_experiment(dataset_name, all_results, all_performances, alpha, experiment_id, parameter_info)

    return all_results, all_performances

In [142]:
def save_experiment(dataset_name, results, performances, alpha, experiment_id, parameter_info):
    """
    Write the outcome of one experiment to disk, one directory per model.

    For every model name in `results` the directory
    ./results/<dataset_name>/<model_name>/<experiment_id>/ is created (if
    needed) and filled with:
      - results.pckl    : that model's result rows (pickled)
      - metrics.pckl    : that model's entry of `performances` (pickled)
      - parameters.txt  : `parameter_info`

    `alpha` is accepted but not used by this function.
    """
    for model_name, model_results in results.items():
        out_dir = os.path.join(".", "results", dataset_name, model_name, experiment_id)
        # Create directory for experiment, if it doesn't already exist
        os.makedirs(out_dir, exist_ok=True)

        # Save only this model's rows; dumping the whole `results` dict here
        # would duplicate every model's rows into every model directory.
        with open(os.path.join(out_dir, "results.pckl"), "wb") as pickle_file:
            pickle.dump(model_results, pickle_file)

        # Save the metrics of this model
        with open(os.path.join(out_dir, "metrics.pckl"), "wb") as pickle_file:
            pickle.dump(performances[model_name], pickle_file)

        with open(os.path.join(out_dir, "parameters.txt"), "w") as params_file:
            params_file.writelines(parameter_info)

## Metrics

In [7]:
def remove_inf_results(results):
    """
    Drop every row whose interval is unbounded on either side.

    `results` holds rows of the form [x, y, y_hat, interval, alpha]; a row is
    kept only when its lower bound is not -inf and its upper bound is not +inf.
    """
    return [
        row
        for row in results
        if row[3][0] != -math.inf and row[3][1] != math.inf
    ]
    
def mean_error_rate(results):
    """
    Share of rows whose true value y lies outside the predicted interval.

    Rows with an unbounded interval are ignored (see remove_inf_results).
    """
    finite_rows = remove_inf_results(results)
    # One flag per row: True when y is below the lower or above the upper bound
    misses = [row[1] < row[3][0] or row[1] > row[3][1] for row in finite_rows]
    return sum(misses) / len(finite_rows)
    

def relative_interval_size(results):
    """
    Mean interval width divided by the range (max - min) of the true values.

    Rows with an unbounded interval are ignored (see remove_inf_results).
    """
    finite_rows = remove_inf_results(results)
    targets = [row[1] for row in finite_rows]
    target_range = max(targets) - min(targets)
    # Total width of all intervals
    total_width = sum([row[3][1] - row[3][0] for row in finite_rows])
    return total_width / (target_range * len(finite_rows))

    
def quantile_loss(results):
    """
    alpha * relative_interval_size(results) plus the mean distance by which y
    falls outside its interval, the latter divided by the range of the true values.

    alpha is read from the first row; rows with an unbounded interval are
    ignored (see remove_inf_results).
    """
    finite_rows = remove_inf_results(results)
    alpha = results[0][4]
    size_term = alpha * relative_interval_size(results)

    # Distance from y to the interval; 0 when y is inside it
    total_excess = sum(
        max(min(row[3]) - row[1], row[1] - max(row[3]), 0) for row in finite_rows
    )

    targets = [row[1] for row in finite_rows]
    target_range = max(targets) - min(targets)
    return size_term + (total_excess / (len(finite_rows) * target_range))
         

def utility(results):
    """
    Combine error rate and interval size into a single score.

    With mer = mean_error_rate(results), ris = relative_interval_size(results)
    and alpha read from the first row:
      - mer <= alpha : 1 - ris
      - otherwise    : (1 - ris) * exp(-gamma * (mer - alpha)),
                       where gamma = 2 * ln(2) / alpha

    Rows with unbounded intervals are filtered inside the two metrics, so no
    separate filtering pass is needed here.
    """
    alpha = results[0][4]
    gamma = 2*np.log(2)/alpha
    mer = mean_error_rate(results)
    ris = relative_interval_size(results)
    if mer <= alpha:
        return 1-ris
    # Error rate above alpha: shrink the score exponentially in the excess
    return (1-ris)*np.exp(-gamma*(mer - alpha))

## Running the experiment!

In [14]:
# Free-text summary of the hyperparameters used in this notebook.
# It is handed to run_experiment as its `parameter_info` argument.
experiment_parameters = """Experiment parameters

All river models
    - n_models = 10
    - max_features = "sqrt"
    - aggregation_method = "mean"
    - lambda_value = 1

OnlineCP
    - c_max = 1000

OnlineQRF
    - K = 200

Mondrian forests
    - n_estimators = 10
    - min_samples_split = 2
"""

In [196]:
def instantiate_models():
    """
    Build a new list of (model, name) pairs for one experiment run.

    Every call constructs new model objects, so nothing learned in a previous
    run is carried over.
    """
    # Settings common to both adaptive random forest variants
    forest_settings = dict(
        n_models=10,
        max_features="sqrt",
        aggregation_method="mean",
        lambda_value=1,
    )

    mondrian_forest = RiverMondrianForestRegressor(n_estimators=10, min_samples_split=2)
    cp_approx = AdaptiveRandomForestRegressorCP(cp_exact=False, c_max=1000, **forest_settings)
    online_qrf = AdaptiveRandomForestRegressorQRF(k_sketch=200, **forest_settings)

    # CP Exact is currently left out. To include it, add the pair
    #   (AdaptiveRandomForestRegressorCP(cp_exact=True, c_max=1000, **forest_settings), "CPExact")
    return [
        (mondrian_forest, "MondrianForest"),
        (cp_approx, "CPApprox"),
        (online_qrf, "OnlineQRF"),
    ]

In [190]:
# Metric functions paired with the short names used as keys in the
# per-model performance dict built by run_experiment.
metrics_with_names = [
        (mean_error_rate, "MER"),
        (relative_interval_size, "RIS"),
        (quantile_loss, "QL"),
        (utility, "Utility")
    ]

In [197]:
# Read the whole 2dplanes stream into a list so it can be sliced and reused.
ds = list(load_datastream("data/stationary/2dplanes.arff"))
dataset_name = "2dplanes"
alpha = 0.1  # handed to run_experiment, which passes it to predict_interval
models_with_names = instantiate_models()

In [198]:
# Evaluate on the first 5000 samples only.
# `res` is the tuple (all_results, all_performances) returned by run_experiment.
res = run_experiment(ds[0:5000], dataset_name, models_with_names, metrics_with_names, alpha, experiment_parameters)

In [260]:
def run_small_experiments(n_repeats, data_dir="data/stationary/", alpha=0.1):
    """
    Run run_experiment on every arff dataset in a directory, n_repeats times.

    Parameters
    ----------
    n_repeats : number of passes over the whole set of datasets.
    data_dir : directory that is scanned for *.arff files.
    alpha : value handed to run_experiment for every run.

    The name of each dataset file is printed as it is started. New models are
    created for every dataset in every pass. The value returned by
    run_experiment is not kept, and this function returns None.
    """
    # Keep only arff files. Removing ".DS_Store" by name raised ValueError
    # whenever that file was absent and let other stray files through.
    datasets = [name for name in os.listdir(data_dir) if name.endswith(".arff")]

    for _ in range(n_repeats):
        for dataset in datasets:
            print(dataset)
            models_with_names = instantiate_models()
            datastream = list(load_datastream(os.path.join(data_dir, dataset)))
            dataset_name = os.path.splitext(dataset)[0]
            run_experiment(datastream, dataset_name, models_with_names, metrics_with_names, alpha, experiment_parameters)

In [261]:
# Ten passes over every dataset; each dataset file name is printed when its run starts.
run_small_experiments(10)

yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsPopularity.arff
energy.arff
kin8nm.arff
elevators.arff
cpu_act.arff
house_8L.arff
puma8NH.arff
fried.arff
2dplanes.arff
calHousing.arff
house_16H.arff
ailerons.arff
sulfur.arff
yprop_4_1.arff
newsP

In [112]:
# run_experiment returns (all_results, all_performances), so pick the results
# dict first; its keys are the names from instantiate_models ("OnlineQRF").
r = res[0]["OnlineQRF"]

In [204]:
# List every entry in the stationary-data directory.
s = os.listdir("data/stationary/")

In [205]:
# Drop the macOS metadata entry. Filtering, unlike list.remove, does not raise
# when the entry is absent, so the cell can be re-run safely.
s = [name for name in s if name != ".DS_Store"]

In [206]:
# Show the remaining file names.
s

['puma32H.arff',
 'yprop_4_1.arff',
 'abalone.arff',
 'mv.arff',
 'newsPopularity.arff',
 'energy.arff',
 'kin8nm.arff',
 'elevators.arff',
 'bank32nh.arff',
 'cpu_act.arff',
 'house_8L.arff',
 'puma8NH.arff',
 'fried.arff',
 '2dplanes.arff',
 'calHousing.arff',
 'house_16H.arff',
 'ailerons.arff',
 'sulfur.arff']

In [1]:
# NOTE(review): the recorded output is a NameError from execution count 1,
# i.e. this cell was run before the Metrics cell had defined `utility`.
utility(r)

NameError: name 'utility' is not defined

In [60]:
# Scratch check: True behaves as 1 in arithmetic.
True*0.3

0.3

In [61]:
# Scratch check: max accepts several positional arguments.
max(0,-1, 2)

2

In [62]:
# Scratch check: max also accepts a single iterable.
max([1,2])

2

## Testing timing of a single model

In [16]:
# Read the whole 2dplanes stream into a list so that it can be sliced below.
datastream = list(load_datastream("data/stationary/" + "2dplanes.arff"))
dataset_name = "2dplanes"

In [29]:
# Only the Mondrian forest is timed here. The exact-CP model can be timed
# instead by swapping in the pair
#   (AdaptiveRandomForestRegressorCP(n_models = 10, max_features = "sqrt",
#        aggregation_method = "mean", lambda_value = 1, cp_exact = True, c_max = 1000),
#    "CPExact")
models_with_names = [
    (RiverMondrianForestRegressor(n_estimators=10, min_samples_split=2), "MondrianForest"),
]

In [30]:
# NOTE(review): this repeats the metrics_with_names definition from the
# "Running the experiment!" section with the same four entries.
metrics_with_names = [
        (mean_error_rate, "MER"),
        (relative_interval_size, "RIS"),
        (quantile_loss, "QL"),
        (utility, "Utility")
    ]

In [31]:
# Value handed to run_experiment below.
alpha = 0.1

In [38]:
# The slice covers list indices 4001 through 4999, i.e. 999 samples.
results_and_perf = run_experiment(datastream[4001:5000], dataset_name, models_with_names, metrics_with_names, alpha, experiment_parameters)

In [40]:
# Element 1 of the returned tuple is the per-model performance dict.
results_and_perf[1]

{'MondrianForest': {'MER': 0.12712712712712712,
  'RIS': 0.291950277255733,
  'QL': 0.03453494973727581,
  'Utility': 0.48611853211072303,
  'time': 8.894488000000024}}