## Install the local river and scikit-garden packages

In [None]:
# Install the submodules
# pip install river
# pip install scikit-learn

In [1]:
pip install "./river/"

Processing ./river
Building wheels for collected packages: river
  Building wheel for river (setup.py) ... [?25ldone
[?25h  Created wheel for river: filename=river-1-cp39-cp39-macosx_10_9_x86_64.whl size=1195780 sha256=063aa1c6d588d54b70eda4477e5abfdff7e353a56e9b8ac3353f4324ecf9cc03
  Stored in directory: /private/var/folders/s0/cs0fw3px6tx5srp431pdvjc80000gn/T/pip-ephem-wheel-cache-v3adhme2/wheels/d1/6b/a3/83d8a39007debc0733461c491a8263c5af566254c5860a1a1a
Successfully built river
Installing collected packages: river
  Attempting uninstall: river
    Found existing installation: river 1
    Uninstalling river-1:
      Successfully uninstalled river-1
Successfully installed river-1
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install "./scikit-garden/"

Processing ./scikit-garden
Building wheels for collected packages: scikit-garden
  Building wheel for scikit-garden (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-garden: filename=scikit_garden-0.1.3-cp39-cp39-macosx_10_9_x86_64.whl size=530087 sha256=20fe418083ddcd27e2fd796fb87a854e33f904287695b8362e70b74fd4741980
  Stored in directory: /Users/brianburns/Library/Caches/pip/wheels/cc/96/10/f1f98c7ebcbb916ea7b0a696a32ae8d73b3ed876c5d2911053
Successfully built scikit-garden
Installing collected packages: scikit-garden
  Attempting uninstall: scikit-garden
    Found existing installation: scikit-garden 0.1.3
    Uninstalling scikit-garden-0.1.3:
      Successfully uninstalled scikit-garden-0.1.3
Successfully installed scikit-garden-0.1.3
You should consider upgrading via the '/Users/brianburns/ml/my_env/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [49]:
# NOTE: `import datetime` must stay before `from datetime import datetime`;
# the later binding (the class) is what `datetime.now()` relies on below.
import datetime
import math
import os
import pickle
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
from river import stream
from river.ensemble import AdaptiveRandomForestRegressorCP, AdaptiveRandomForestRegressorQRF
from skgarden.mondrian import RiverMondrianForestRegressor

In [13]:
def get_target_variable(file_path):
    """
    Look up the target column name of an arff dataset.

    The data files do not share a common target name, so this reads the first
    record of the file and returns the name of its last attribute.
    """
    records = stream.iter_arff(file_path)
    first_record = next(iter(records))
    # first_record[0] is the feature mapping; its last key is taken as the target
    attribute_names = list(first_record[0].keys())
    return attribute_names[-1]

In [11]:
def load_datastream(arff_file):
    """
    Open an arff file as a river stream with its target column selected.

    The target name is discovered with get_target_variable and passed on to
    stream.iter_arff; the resulting stream object is returned unchanged.
    """
    target_column = get_target_variable(arff_file)
    return stream.iter_arff(arff_file, target=target_column)

In [53]:
def run_experiment(datastream, dataset_name, models, alpha):
    """
    Evaluate each model on the stream with a predict-then-learn loop and save the results.

    Parameters:
        datastream: iterable of (x, y) pairs (a river stream object or a list),
            not the path of an arff file. It is materialised once so that every
            model sees the same samples.
        dataset_name: name used by save_experiment to build the output path.
        models: iterable of models exposing predict_interval(x, alpha),
            predict_one(x) and learn_one(x, y).
        alpha: value forwarded to predict_interval and stored with every row.

    For every sample the row [x, y, y_hat, interval, alpha] is recorded *before*
    the model learns from that sample. Rows are keyed by the model's class name,
    so two models of the same class share one result list.

    Prints the CPU time (time.process_time) spent on each model, one value per
    line, and returns None. An empty `models` list is allowed: nothing is
    evaluated and nothing is printed.
    """
    # Ensure iteration doesn't exhaust datastream
    samples = list(datastream)
    results = {}
    times = {}

    for model in models:
        model_name = type(model).__name__
        # setdefault keeps rows already collected for another model of the same class
        model_results = results.setdefault(model_name, [])
        time_start = time.process_time()
        for x, y in samples:
            interval = model.predict_interval(x, alpha)
            y_hat = model.predict_one(x)
            model_results.append([x, y, y_hat, interval, alpha])
            model.learn_one(x, y)
        times.setdefault(model_name, []).append(time.process_time() - time_start)

    # Timestamp-based id with characters that are awkward in paths replaced by "-"
    experiment_id = str(datetime.now()).replace(":", "-").replace(".", "-").replace(" ", "-")
    save_experiment(dataset_name, results, alpha, experiment_id)

    for elapsed_runs in times.values():
        for elapsed in elapsed_runs:
            print(elapsed)

In [47]:
def save_experiment(dataset_name, results, alpha, experiment_id):
    """
    Pickle each model's results into its own experiment directory.

    Parameters:
        dataset_name: name of the dataset, used as a path component.
        results: dict mapping model name -> list of result rows.
        alpha: accepted for interface compatibility; not used here.
        experiment_id: identifier of this run, used as a path component.

    For every model the file
    ./results/<dataset_name>/<model_name>/<experiment_id>/results.pckl
    is written. Each file contains only that model's rows (results[model_name]),
    not the whole `results` dict.
    """
    for model_name, model_results in results.items():
        experiment_dir = os.path.join(".", "results", dataset_name, model_name, experiment_id)
        # Create directory for experiment, if it doesn't already exist
        os.makedirs(experiment_dir, exist_ok=True)
        with open(os.path.join(experiment_dir, "results.pckl"), "wb") as pickle_file:
            pickle.dump(model_results, pickle_file)

In [8]:
# Models to evaluate.
# NOTE(review): `models` is reassigned in a later cell before run_experiment is called.
models = [
    AdaptiveRandomForestRegressorCP(aggregation_method="mean", lambda_value=1),
]

In [None]:
# NOTE(review): unfinished duplicate of the `run_experiment` defined in an earlier cell.
# Executing this cell rebinds `run_experiment` to this stub, which ignores its
# arguments and calls `load_datastream()` without the `arff_file` argument that
# function requires. Delete this cell or finish it under a different name.
def run_experiment(datastream, dataset_name, models, alpha):

    ds = load_datastream()

In [37]:
# Experiment configuration for the 2dplanes dataset.
dataset_name = "2dplanes"
alpha = 0.1
models = [AdaptiveRandomForestRegressorQRF(lambda_value=1)]
# Materialise the stream into a list so it can be sliced below.
ds = list(load_datastream("data/stationary/2dplanes.arff"))

In [55]:
# Run the experiment on the first 1000 samples only.
run_experiment(ds[:1000], dataset_name, models, alpha)

1.523071999999999


In [64]:
# Defining the metrics
# results of the form 
# list([x, y, y_hat, interval, alpha])
def remove_inf_results(results):
    """
    Keep only the rows whose prediction interval is bounded on both sides.

    Each row has the form [x, y, y_hat, interval, alpha]. A row is dropped when
    its lower bound is -inf or its upper bound is +inf.

    Returns a list (not a one-shot iterator) so callers can take its length and
    iterate over it more than once.
    """
    return [
        row
        for row in results
        if row[3][0] != -math.inf and row[3][1] != math.inf
    ]
    
def mean_error_rate(results):
    """
    Fraction of rows whose true value y lies outside the prediction interval.

    Rows have the form [x, y, y_hat, interval, alpha]; rows with an unbounded
    interval are removed first (see remove_inf_results).

    Returns NaN when no row with a bounded interval is left.
    """
    # list() so that len() and iteration work even if the helper returns an iterator
    rows = list(remove_inf_results(results))
    if not rows:
        return float("nan")
    # sum instances where y isn't in confidence interval
    misses = sum(row[1] < row[3][0] or row[1] > row[3][1] for row in rows)
    return misses / len(rows)
    

def relative_interval_size(results):
    """
    Mean interval width divided by the range of the observed y values.

    Rows have the form [x, y, y_hat, interval, alpha]; rows with an unbounded
    interval are removed first (see remove_inf_results). The range rho is
    max(y) - min(y) over the remaining rows.

    Returns NaN when no row is left or when rho is zero (all y equal), since
    the ratio is undefined in both cases.
    """
    # list() so the rows can be traversed several times and counted
    rows = list(remove_inf_results(results))
    if not rows:
        return float("nan")
    y_vals = [row[1] for row in rows]
    rho = max(y_vals) - min(y_vals)
    if rho == 0:
        return float("nan")
    # sum length of intervals
    total_width = sum(row[3][1] - row[3][0] for row in rows)
    return total_width / (rho * len(rows))

    
def quantile_loss(results):
    """
    Normalised interval loss: alpha * relative interval size + mean miss distance / rho.

    Rows have the form [x, y, y_hat, interval, alpha]; rows with an unbounded
    interval are removed first (see remove_inf_results). With n remaining rows
    and rho = max(y) - min(y) over them, the value returned is

        alpha * sum(width) / (rho * n) + sum(miss) / (n * rho)

    where miss is how far y falls outside its interval (0 when inside).
    alpha is read from the first row of `results`.

    Returns NaN when no row is left or when rho is zero.
    """
    # list() so the rows can be traversed several times and counted
    rows = list(remove_inf_results(results))
    if not rows:
        return float("nan")
    alpha = results[0][4]
    n = len(rows)
    y_vals = [row[1] for row in rows]
    rho = max(y_vals) - min(y_vals)
    if rho == 0:
        return float("nan")

    def single_interval_loss(y, interval):
        # distance from y to the interval; 0 when y lies inside it
        return max(min(interval) - y, y - max(interval), 0)

    # Width term computed from the same filtered rows (equals relative interval size)
    relative_width = sum(row[3][1] - row[3][0] for row in rows) / (rho * n)
    miss_total = sum(single_interval_loss(row[1], row[3]) for row in rows)
    return alpha * relative_width + (miss_total / (n * rho))
         

def utility(results):
    """
    Combine interval tightness and error rate into a single score.

    Rows have the form [x, y, y_hat, interval, alpha]; alpha is read from the
    first row. With mer = mean_error_rate(results) and
    ris = relative_interval_size(results):

        mer <= alpha : 1 - ris
        otherwise    : (1 - ris) * exp(-gamma * (mer - alpha)),  gamma = 2*ln(2)/alpha

    Returns NaN for empty `results`.
    """
    if not results:
        return float("nan")
    alpha = results[0][4]
    mer = mean_error_rate(results)
    ris = relative_interval_size(results)
    if mer <= alpha:
        return 1 - ris
    # Penalise the excess error rate exponentially
    gamma = 2 * np.log(2) / alpha
    return (1 - ris) * np.exp(-gamma * (mer - alpha))

In [58]:
# Scratch check: bools behave as integers in arithmetic (False counts as 0).
False + 2

2

In [59]:
# Scratch check: False multiplied by a float gives 0.0.
False*0.2

0.0

In [60]:
# Scratch check: True multiplied by a float leaves the float unchanged (True counts as 1).
True*0.3

0.3

In [61]:
# Scratch check: max() accepts several positional arguments.
max(0,-1, 2)

2

In [62]:
# Scratch check: max() also accepts a single iterable.
max([1,2])

2