In [1]:
import autosklearn.regression
from autoML.utils import evaluate_regression, data_preparation, Log
import time
import shutil
import os
import pandas as pd 

def flat_dicts(dicts):
    """Collapse a two-level mapping into a single flat dict.

    Every inner key is prefixed with its outer key and an underscore, so
    ``{'train': {'r2': 0.9}}`` becomes ``{'train_r2': 0.9}``. Insertion
    order follows the outer mapping first, then each inner mapping; if two
    combined keys collide, the later one wins.
    """
    return {
        f"{outer_key}_{inner_key}": inner_value
        for outer_key, inner_map in dicts.items()
        for inner_key, inner_value in inner_map.items()
    }


def run_autosklearn(X_train, y_train, X_test, y_test, run_time, seed, n_jobs):
    """Fit one AutoSklearnRegressor and report its train/test metrics.

    The hard-coded temp folder is wiped if it already exists, a regressor is
    built with ``run_time`` as ``time_left_for_this_task`` and fitted on the
    training data, and predictions on both splits are scored with
    ``evaluate_regression``. Progress, the leaderboard and the metrics are
    written through a ``Log`` instance.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : held-out features and target used for the test metrics.
    run_time : value forwarded as ``time_left_for_this_task``.
    seed : value forwarded as ``seed``.
    n_jobs : value forwarded as ``n_jobs``.

    Returns
    -------
    tuple
        ``(scores, elapsed_time)``: ``scores`` is the flattened
        ``{'train_<metric>': ..., 'test_<metric>': ...}`` dict produced by
        ``flat_dicts``; ``elapsed_time`` is the wall-clock duration (seconds)
        of constructing and fitting the regressor.
    """
    work_dir = '/data/ephemeral/home/Dongjin/temp'
    # isdir() is already False for a missing path, so a single check suffices.
    if os.path.isdir(work_dir):
        shutil.rmtree(work_dir)

    logger = Log(logger_name="autosklearn")
    logger.log("autosklearn - start")
    logger.log(f"{run_time}: run_time, {seed}: seed")

    # The timer brackets both construction and fitting of the regressor.
    t0 = time.time()
    regressor_kwargs = dict(
        time_left_for_this_task=run_time,
        per_run_time_limit=30,
        tmp_folder=work_dir,
        n_jobs=n_jobs,
        seed=seed,
    )
    regressor = autosklearn.regression.AutoSklearnRegressor(**regressor_kwargs)
    regressor.fit(X_train, y_train)
    elapsed_time = time.time() - t0

    # Predict on both splits before scoring either of them.
    predictions = {
        'train': regressor.predict(X_train),
        'test': regressor.predict(X_test),
    }
    train_score = evaluate_regression(y_train, predictions['train'], 'train')
    test_score = evaluate_regression(y_test, predictions['test'], 'test')

    logger.log(regressor.leaderboard())
    logger.log(f'Autosklearn.regression init to training finished in: {elapsed_time:.1f} s')
    for phase_metrics in (train_score, test_score):
        logger.log_dicts(phase_metrics)

    return flat_dicts({'train': train_score, 'test': test_score}), elapsed_time


In [None]:
# --- Experiment configuration -------------------------------------------
save_name = 'autosklearn'
target_times = [30]  # append list(range(60, 610, 60)) to sweep longer budgets
seeds = [1, 2, 3]
n_jobs = 10
data_path = '/data/ephemeral/home/Dongjin/data/melbourne/melb_split.csv'

# --- Output locations ---------------------------------------------------
# `__file__` exists when this code runs as a script but is normally undefined
# inside a notebook kernel, so fall back to the current working directory.
try:
    py_dir_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    py_dir_path = os.getcwd()
raw_save_path = os.path.join(py_dir_path, f'result/{save_name}_raw.csv')
save_path = os.path.join(py_dir_path, f'result/{save_name}.csv')
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# --- Run every (time budget, seed) combination --------------------------
X_train, y_train, X_test, y_test = data_preparation(data_path)

records = []
for target_time in target_times:
    for seed in seeds:
        # run_autosklearn names its budget parameter `run_time`.
        scores, elapsed_time = run_autosklearn(X_train, y_train, X_test, y_test,
                                               run_time=target_time, seed=seed, n_jobs=n_jobs)
        result = {'target_time': target_time, 'seed': seed, 'elapsed_time': elapsed_time}
        result.update(scores)
        records.append(result)

# Build the frame once from all records instead of enlarging it row by row.
df = pd.DataFrame(records)

# --- Aggregate across seeds and persist ---------------------------------
# NOTE(review): 'sum' over seeds is kept from the original; confirm whether
# a spread statistic such as 'std' was intended instead.
df_summary = df.drop(columns=['seed']).groupby('target_time').agg(['mean', 'sum'])
df.to_csv(raw_save_path)
df_summary.to_csv(save_path)

[2025-02-05 15:24:42] autosklearn - start
[2025-02-05 15:24:42] 30: run_time, 1: seed

Evaluation for train:
R2 Score: 0.8793
Mean Absolute Error (MAE): 135364.6297
Root Mean Squared Error (RMSE): 233779.9077

Evaluation for test:
R2 Score: 0.7706
Mean Absolute Error (MAE): 195485.4764
Root Mean Squared Error (RMSE): 324515.5077
[2025-02-05 15:25:20]           rank  ensemble_weight                 type      cost  duration
model_id                                                                
12           1             0.44    gradient_boosting  0.238716  1.697685
2            2             0.04        random_forest  0.253091  7.911697
15           3             0.26    gradient_boosting  0.255342  4.108240
16           4             0.14  k_nearest_neighbors  0.306216  0.848781
11           5             0.12  k_nearest_neighbors  0.393048  0.913552
[2025-02-05 15:25:20] Autosklearn.regression init to training finished in: 36.2 s
[2025-02-05 15:25:20] R2: 0.8793, MAE: 135364.6297, RM

OSError: Cannot save file into a non-existent directory: 'result'

In [7]:
# Persist the experiment results: per-run rows go to the "_raw" file and the
# per-target_time aggregate goes to the summary file.
# Relies on `df`, `py_dir_path` and `save_name` defined in the experiment cell.
raw_save_path = os.path.join(py_dir_path, f'result/{save_name}_raw.csv')
save_path = os.path.join(py_dir_path, f'result/{save_name}.csv')
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Recompute the summary here so this cell does not depend on whether the
# aggregation in the experiment cell ran before that cell stopped.
df_summary = df.drop(columns=['seed']).groupby('target_time').agg(['mean', 'sum'])

df.to_csv(raw_save_path)
# The summary path receives the aggregate, not a second copy of the raw rows.
df_summary.to_csv(save_path)