# Running experiments on the artificial data without online weight sharing

In [1]:
import os
import pandas as pd
import numpy as np

### Find time series files

In [2]:
def find_csv_files(path):
    """Return the paths of all ``.csv`` entries located directly in ``path``.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to make the order of the returned list (and of everything derived from it)
    reproducible between runs and machines.

    Parameters
    ----------
    path : str
        Directory to scan. The scan is not recursive.

    Returns
    -------
    list of str
        ``os.path.join(path, name)`` for every entry whose name ends with
        ``'.csv'``, ordered by name.
    """
    return [
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if filename.endswith('.csv')
    ]

# Paths of all CSV time series found directly under '../data/raw/ING'.
# NOTE(review): the variable name (and the notebook title) say "artificial",
# while the directory is the raw ING data - confirm which dataset is intended.
artificial_ts_files = find_csv_files('../data/raw/ING')

### Create model input preprocessing

In [3]:
from river import compose

In [4]:
def get_daily_dummies_timestamp(x):
    """One-hot encode the day of week of ``x['timestamp']``.

    Returns a dict with the keys ``day_0`` ... ``day_6``; the entry whose
    index equals ``x['timestamp'].dayofweek`` is 1, every other entry is 0.
    """
    weekday = x['timestamp'].dayofweek

    dummies = {}
    for day in range(7):
        dummies[f'day_{day}'] = int(day == weekday)
    return dummies

def get_hour_dummies_timestamp(x):
    """One-hot encode the hour of day of ``x['timestamp']``.

    Returns a dict with the keys ``hour_0`` ... ``hour_23``; the entry whose
    index equals ``x['timestamp'].hour`` is 1, every other entry is 0.
    """
    current_hour = x['timestamp'].hour

    return dict(
        (f'hour_{hour}', int(hour == current_hour))
        for hour in range(24)
    )

# Calendar feature step placed in front of every model pipeline below.
# It bundles the day-of-week and hour-of-day dummy encoders in a river
# TransformerUnion; later cells call `.transform_one(...)` on it and read the
# keys of the result, i.e. the output is used as one dict of features.
extract_features = compose.TransformerUnion(
    get_daily_dummies_timestamp, get_hour_dummies_timestamp
)

### Definition of models and detectors

In [32]:
from river.forest import ARFRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

In [33]:
from src.models import ARBatch, AROnline, ARFRegressorVirtualDrift
from src.detectors import ADWIN, FEDD, AdaptiveFEDD, KSWIN, NoDrift

In [34]:
from multiprocessing.managers import BaseManager
from src.detectors.adaptive_fedd import MetadataManager, AdaptiveFeatureExtarctor

In [35]:
# Seed handed to every stochastic component created in this notebook.
random_seed = 42

# --- Seasonality, expressed in hourly observations -------------------------
day_seasonality = 24                        # one day
week_seasonality = 7 * day_seasonality      # one week

# --- Forecasting model ------------------------------------------------------
model_window_size = day_seasonality         # lags taken from the last day
model_train_size = model_window_size * 14   # two weeks of training data
model_horizon = day_seasonality             # forecast one day ahead
model_validation_size = day_seasonality * 2 # two days used for validation

# --- Drift detector ---------------------------------------------------------
stride = 6                                  # 6 hours, a quarter of a day
detector_window_size = week_seasonality     # the detector looks at the last week
detector_train_size = model_train_size      # same training span as the model
# How many stride-sized shifts of the detector window fit into the training span.
detector_train_size_n_shifts = (detector_train_size - detector_window_size) // stride

# Number of features Adaptive FEDD observes.
adaptive_fedd_observed_features = 15

Detectors

In [36]:
# Template ADWIN detector; the creator functions below pass a `.clone()` of it
# as `drift_detector` to each AdaptiveFeatureExtarctor they build.
# The grace period equals the number of detector-window shifts in the training span.
adwin_virtual_drift = ADWIN(delta=0.002, grace_period=detector_train_size_n_shifts, clock=4)

In [None]:
def metadata_manager_creator(random_seed=random_seed):
    """Create a new ``MetadataManager`` from the stored feature metadata CSV.

    The CSV is loaded with its first column as the index and its columns are
    renamed to the fixed schema listed below. Every call re-reads the file and
    returns a separate manager. ``random_seed`` defaults to the value the
    notebook-level ``random_seed`` had when this function was defined.
    """
    column_names = ['features', 'weight', 'true_positives', 'false_positives', 'n_truth', 'n_detected']

    feature_metadata = pd.read_csv(
        '../data/interim/2024_04_11_artificial_data/feature_metadata.csv',
        index_col=0,
    )
    feature_metadata.columns = column_names

    return MetadataManager(metadata=feature_metadata, random_seed=random_seed)

In [None]:
def adaptive_fedd_arf_creator(is_warning, distance_with_weights=False):
    """Build an ``AdaptiveFEDD`` detector for use inside ``ARFRegressorVirtualDrift``.

    ``is_warning`` selects the ADWIN ``delta`` of the detector: 0.02 for the
    warning detector, 0.002 for the drift detector. ``distance_with_weights``
    is forwarded unchanged. The detector is created with ``queue_data=False``
    and gets its own metadata manager and its own clone of the ADWIN template.
    """
    delta = 0.02 if is_warning else 0.002

    feature_extractor = AdaptiveFeatureExtarctor(
        metadata=metadata_manager_creator(),
        drift_detector=adwin_virtual_drift.clone(),
    )

    return AdaptiveFEDD(
        window_size=detector_window_size,
        stride=stride,
        queue_data=False,
        n_observed_features=adaptive_fedd_observed_features,
        feature_extractor=feature_extractor,
        delta=delta,
        grace_period=detector_train_size_n_shifts,
        clock=4,
        distance_with_weights=distance_with_weights,
    )

In [None]:
def adaptive_fedd_creator(distance_with_weights=False):
    """Build a stand-alone ``AdaptiveFEDD`` detector for the batch experiments.

    The detector is created with ``queue_data=True`` and a fixed
    ``delta=0.002``; ``distance_with_weights`` is forwarded unchanged. Each
    call creates a new metadata manager and a new clone of the ADWIN template.
    """
    feature_extractor = AdaptiveFeatureExtarctor(
        metadata=metadata_manager_creator(),
        drift_detector=adwin_virtual_drift.clone(),
    )

    return AdaptiveFEDD(
        window_size=detector_window_size,
        stride=stride,
        queue_data=True,
        n_observed_features=adaptive_fedd_observed_features,
        feature_extractor=feature_extractor,
        delta=0.002,
        grace_period=detector_train_size_n_shifts,
        clock=4,
        distance_with_weights=distance_with_weights,
    )

Models

In [40]:
# Autoregressive pipelines with exogenous calendar dummies ("ARX"):
# the calendar features are piped into an AR model using the last
# `model_window_size` observations (p).
# NOTE(review): y_hat_min / y_hat_max presumably clip predictions to [0, 100] - confirm in src.models.

# Batch variant: additionally receives the training size.
arx_batch = (
    extract_features | 
    ARBatch(p=model_window_size, train_size=model_train_size, y_hat_min=0, y_hat_max=100)
)

# Online variant: no training size is passed.
arx_online = (
    extract_features | 
    AROnline(p=model_window_size, y_hat_min=0, y_hat_max=100)
)

In [41]:
# Batch AR pipelines that swap in a tree-based regressor; both regressors are
# seeded with `random_seed` and restricted to a single job (n_jobs=1).
# NOTE(review): n_jobs=1 presumably avoids oversubscription because the
# experiments are parallelised with a process pool later on - confirm.

# LightGBM with L1 and L2 regularisation of 0.1 each.
lightgbm_batch = (
    extract_features | 
    ARBatch(p=model_window_size, train_size=model_train_size, 
            regressor=LGBMRegressor(n_jobs=1, reg_alpha=0.1, reg_lambda=0.1, random_state=random_seed))
)

# scikit-learn random forest with default hyper-parameters.
random_forest_batch = (
    extract_features | 
    ARBatch(p=model_window_size, train_size=model_train_size, 
            regressor=RandomForestRegressor(n_jobs=1, random_state=random_seed))
)

In [42]:
# Plain online Adaptive Random Forest regressor on the calendar features plus
# the last `model_window_size` observations; no virtual drift detector attached.
adaptive_random_forest = (
    extract_features |
    AROnline(p=model_window_size, regressor=ARFRegressor(grace_period=10, seed=random_seed))
)

# Online ARF variant with Adaptive FEDD drift and warning detectors
# (distance_with_weights left at its default, False).
# - p is the larger of the model and detector windows, so enough lags are
#   requested for both column lists below.
#   NOTE(review): assumes AROnline names its lag features "y-1" ... "y-p" - confirm.
# - virtual_drift_columns: the `detector_window_size` most recent lags.
# - model_columns: the `model_window_size` most recent lags plus the calendar
#   dummy names; the names are obtained by transforming a throw-away timestamp
#   (only the keys of the result are used).
adaptive_random_forest_virtual_drift_adaptive_fedd = (
    extract_features |
    AROnline(p=max(model_window_size, detector_window_size), 
            regressor=ARFRegressorVirtualDrift(
                drift_detector=adaptive_fedd_arf_creator(is_warning=False).clone(), warning_detector=adaptive_fedd_arf_creator(is_warning=True).clone(), 
                virtual_drift_columns=[f"y-{i+1}" for i in range(detector_window_size)], 
                model_columns=[f"y-{i+1}" for i in range(model_window_size)] + list(extract_features.transform_one({'timestamp': pd.Timestamp.now()}).keys()),
                seed=random_seed, grace_period=10 # grace period for tree split
            )
    )
)

# Same pipeline as the Adaptive FEDD ARF variant defined just before it, except
# that both detectors are created with distance_with_weights=True.
# - p is the larger of the model and detector windows, so enough lags are
#   requested for both column lists below.
#   NOTE(review): assumes AROnline names its lag features "y-1" ... "y-p" - confirm.
# - model_columns: the `model_window_size` most recent lags plus the calendar
#   dummy names, read from the keys of a transformed throw-away timestamp.
adaptive_random_forest_virtual_drift_adaptive_fedd_with_weights_in_distance = (
    extract_features |
    AROnline(p=max(model_window_size, detector_window_size), 
            regressor=ARFRegressorVirtualDrift(
                drift_detector=adaptive_fedd_arf_creator(is_warning=False, distance_with_weights=True).clone(), 
                warning_detector=adaptive_fedd_arf_creator(is_warning=True, distance_with_weights=True).clone(), 
                virtual_drift_columns=[f"y-{i+1}" for i in range(detector_window_size)], 
                model_columns=[f"y-{i+1}" for i in range(model_window_size)] + list(extract_features.transform_one({'timestamp': pd.Timestamp.now()}).keys()),
                seed=random_seed, grace_period=10 # grace period for tree split
            )
    )
)

### Parameters

In [16]:
from src.experiment import OnlineLearningExperiment, BatchLearningExperiment

In [17]:
# Accumulates one (time_series_path, model_name, experiment) tuple per experiment;
# filled by the loops below and later passed to the process pool.
parameters = []

In [18]:
# online learning
# (experiment name, pipeline) pairs run with OnlineLearningExperiment.
model_list = [
    ('online__arf__virtual_adaptive_fedd_without_weight_sharing', adaptive_random_forest_virtual_drift_adaptive_fedd),
    ('online__arf__virtual_adaptive_fedd_without_weight_sharing_with_weight_in_distance', adaptive_random_forest_virtual_drift_adaptive_fedd_with_weights_in_distance)
]

In [19]:
# Register one OnlineLearningExperiment per (time series, online model) pair.
for time_series_path in artificial_ts_files:
    # Reading and parsing the file does not depend on the model, so it is done
    # once per time series instead of once per (time series, model) pair.
    data = pd.read_csv(time_series_path, index_col=0)
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    for model_name, model_obj in model_list:
        parameters.append(
            (
                time_series_path, model_name, OnlineLearningExperiment(
                    # Each experiment gets its own frame and its own model clone,
                    # so no state is shared between experiments.
                    data=data.copy(), model=model_obj.clone(),
                    initial_grace_period=detector_train_size_n_shifts, horizon=model_horizon, stride=stride
                )
            )
        )

In [22]:
# advanced batch learning with validation
# (name prefix, pipeline) pairs run with BatchLearningExperiment.
# NOTE(review): this rebinds `model_list`, which earlier held the online models;
# re-running the online experiment cell after this one would pick up the batch
# models instead - run the cells strictly top to bottom.
model_list = [
    ('batch__arx', arx_batch),
    ('batch__lightgbm', lightgbm_batch), 
    ('batch__rf', random_forest_batch),
]

In [23]:
# Register two BatchLearningExperiments per (time series, batch model) pair:
# one with the default Adaptive FEDD distance and one with weights in the distance.
for time_series_path in artificial_ts_files:
    # Reading and parsing the file does not depend on the model, so it is done
    # once per time series instead of once per (time series, model) pair.
    data = pd.read_csv(time_series_path, index_col=0)
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    for model_name, model_obj in model_list:
        # (suffix of the experiment name, distance_with_weights flag); the order
        # matches the original order in which the two variants were appended.
        for name_suffix, distance_with_weights in (
            ('', False),
            ('_with_weight_in_distance', True),
        ):
            model_full_name = f"{model_name}__adaptive_fedd_without_weight_sharing{name_suffix}"

            parameters.append(
                (
                    time_series_path, model_full_name, BatchLearningExperiment(
                        # Own frame, model clone and detector per experiment.
                        data=data.copy(), base_model=model_obj.clone(), horizon=model_horizon,
                        base_detector=adaptive_fedd_creator(distance_with_weights=distance_with_weights).clone(),
                        train_size=model_train_size, stride=stride, validation_size=model_validation_size
                    )
                )
            )

### Run experiments

In [24]:
import pickle
from multiprocessing import Pool

In [25]:
# Number of worker processes used for the experiment pool.
N_CPU = 8
# Root directory for the pickled results; one sub-directory per time series
# is created inside it when results are saved.
# NOTE(review): the directory name says "real_data" while the notebook title
# says "artificial data" - confirm the intended output location.
SAVE_PATH = '../data/processed/2024_06_10_real_data'
os.makedirs(SAVE_PATH, exist_ok=True)

In [26]:
def run_experiment_and_save_output(time_series_path, model_full_name, experiment_obj, save_path):
    """Run one experiment to completion and pickle it under ``save_path``.

    The experiment is advanced ``experiment_obj.max_len`` times via
    ``experiment_obj.step()`` and then written to
    ``<save_path>/<time series name>/<model_full_name>.pickle``, where the time
    series name is the file name of ``time_series_path`` without its trailing
    ``.csv`` extension. Progress messages are printed with ``flush=True``.

    Parameters
    ----------
    time_series_path : str
        Path of the source CSV; used for naming and stored in the pickle.
    model_full_name : str
        Experiment name; used as the pickle file name.
    experiment_obj : object
        Must expose ``max_len`` and ``step()`` and be picklable.
    save_path : str
        Root output directory; the per-series sub-directory is created if needed.

    Returns
    -------
    None
    """
    print(f"Starting experiment: {model_full_name} on {time_series_path}", flush=True)

    ts_length = experiment_obj.max_len
    for _ in range(ts_length):
        experiment_obj.step()

    print(f"Finished the experiment: {model_full_name} on {time_series_path}", flush=True)

    # Strip only the trailing extension; a blanket str.replace would also remove
    # '.csv' occurring in the middle of the file name.
    csv_suffix = '.csv'
    ts_name = os.path.basename(time_series_path)
    if ts_name.endswith(csv_suffix):
        ts_name = ts_name[:-len(csv_suffix)]

    path_directory_to_save = os.path.join(save_path, ts_name)
    os.makedirs(path_directory_to_save, exist_ok=True)
    path_to_save_pickle_file = os.path.join(path_directory_to_save, f"{model_full_name}.pickle")

    # The metadata is stored next to the experiment so a pickle is self-describing.
    object_to_save = {
        'path_to_save_pickle_file': path_to_save_pickle_file,
        'time_series_path': time_series_path,
        'model_full_name': model_full_name,
        'experiment_obj': experiment_obj
    }

    with open(path_to_save_pickle_file, 'wb') as f:
        pickle.dump(object_to_save, f)

    print(f"Saved files for the experiment: {model_full_name} on {time_series_path}", flush=True)

In [27]:
# add save path to parameters
# Each entry is rebuilt from its first three fields (path, name, experiment),
# so re-running this cell does not append SAVE_PATH a second time and break
# the four-argument call made by the pool. The list is updated in place.
parameters[:] = [entry[:3] + (SAVE_PATH, ) for entry in parameters]

In [None]:
# Run all experiments in N_CPU worker processes; every parameter tuple is
# unpacked into the arguments of run_experiment_and_save_output.
# NOTE(review): the worker function is defined in the notebook itself, which
# typically only works with the 'fork' start method - confirm on the target platform.
with Pool(N_CPU) as pool:
    pool.starmap(run_experiment_and_save_output, parameters)