In [8]:
import autosklearn.regression
from autoML.utils import evaluate_regression, data_preparation, Log
import time
import shutil
import os
import pandas as pd 

def flat_dicts(dicts):
    flat_dicts = {}
    for phase, metrics in dicts.items():
        for metric_name, value in metrics.items():
            flat_dicts[f"{phase}_{metric_name}"] = value

    return flat_dicts


def run_autosklearn(X_train, y_train, X_test, y_test, run_time, seed):
    tmp_folder = '/data/ephemeral/home/Dongjin/temp'
    if os.path.exists(tmp_folder) and os.path.isdir(tmp_folder):
        shutil.rmtree(tmp_folder)

    log = Log(logger_name="autosklearn")
    log.log("autosklearn - start")
    
    start = time.time()
    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=run_time,
        per_run_time_limit=30,
        tmp_folder=tmp_folder,
        n_jobs=-1,
        seed=seed)

    automl.fit(X_train, y_train)
    end = time.time()

    y_train_pred = automl.predict(X_train)
    y_test_pred = automl.predict(X_test)

    train_score = evaluate_regression(y_train, y_train_pred, 'train') 
    test_score = evaluate_regression(y_test, y_test_pred, 'test')
    scores = flat_dicts({'train': train_score, 'test': test_score})
    elapsed_time = end-start

    log.log(automl.leaderboard())
    log.log(f'Autosklearn.regression init to training finished in: {elapsed_time:.1f} s')
    log.log_dicts(train_score)
    log.log_dicts(test_score)

    return scores, elapsed_time



In [12]:
run_times = list(range(60, 610, 60))
run_times.insert(0, 30)
seeds = [1, 2, 3]
data_path = '/data/ephemeral/home/Dongjin/data/melbourne/melb_split.csv'

X_train, y_train, X_test, y_test = data_preparation(data_path) 

for run_time in run_times:
    for seed in seeds:
        result = {'run_time': run_time, 'seed': seed}
        scores, elapsed_time = run_autosklearn(X_train, y_train, X_test, y_test, run_time, seed)
        result.update(scores)

[2025-02-05 15:06:40] autosklearn - start



Evaluation for train:
R2 Score: 0.8793
Mean Absolute Error (MAE): 135364.6297
Root Mean Squared Error (RMSE): 233779.9077

Evaluation for test:
R2 Score: 0.7706
Mean Absolute Error (MAE): 195485.4764
Root Mean Squared Error (RMSE): 324515.5077
[2025-02-05 15:07:12]           rank  ensemble_weight                 type      cost  duration
model_id                                                                
12           1             0.44    gradient_boosting  0.238716  1.564097
2            2             0.04        random_forest  0.253091  7.561969
15           3             0.26    gradient_boosting  0.255342  3.885651
16           4             0.14  k_nearest_neighbors  0.306216  0.762110
11           5             0.12  k_nearest_neighbors  0.393048  0.833081
[2025-02-05 15:07:12] Autosklearn.regression init to training finished in: 31.2 s
[2025-02-05 15:07:12] R2: 0.8793, MAE: 135364.6297, RMSE: 233779.9077
[2025-02-05 15:07:12] R2: 0.7706, MAE: 195485.4764, RMSE: 324515.5077


AttributeError: 'NoneType' object has no attribute 'info'

In [15]:
result.update(scores)

In [16]:
result

{'run_time': 30,
 'seed': 2,
 'train_R2': 0.8792751610252849,
 'train_MAE': 135364.62973834752,
 'train_RMSE': 233779.90770771875,
 'test_R2': 0.7706333464850197,
 'test_MAE': 195485.47637188574,
 'test_RMSE': 324515.507742028}