In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from autoflow import AutoFlowRegressor

In [2]:
# Load the train/test CSVs and map any literal "NA" strings to NaN.
# A non-inplace, chained `replace` keeps this cell idempotent on re-run.
# (pandas' default NA parsing already treats "NA" as missing, so the replace
# acts as an explicit safeguard.)
train_df = pd.read_csv("../data/train_regression.csv").replace("NA", np.nan)
test_df = pd.read_csv("../data/test_regression.csv").replace("NA", np.nan)

In [3]:
# Preview the first five training rows (rich display as the cell's last expression).
train_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Configure the AutoFlow regressor; the search itself is started by the
# later `trained_pipeline.fit(...)` call.
trained_pipeline = AutoFlowRegressor(
    initial_runs=5,
    run_limit=10,
    n_jobs=1,
    included_regressors=["lightgbm"],  # restrict the candidate regressors to LightGBM
    per_run_time_limit=60,  # NOTE(review): presumably seconds per run — confirm
)

In [6]:
# Column roles handed to `fit`: the "id" role maps to the "Id" column and the
# "target" role maps to the "SalePrice" column.
column_descriptions = dict(id="Id", target="SalePrice")

In [7]:
# Fit on the training frame, also handing over the test frame and the column
# roles. A seeded, shuffled 3-fold KFold is passed as the splitter.
# The call stays the cell's last expression so its return value is displayed.
trained_pipeline.fit(
    X_train=train_df,
    X_test=test_df,
    column_descriptions=column_descriptions,
    splitter=KFold(n_splits=3, shuffle=True, random_state=42),
    fit_ensemble_params=False,  # NOTE(review): presumably skips ensemble fitting — confirm
)

[INFO] [autoflow.manager.data_manager.DataManager] X_train and X_test are both set.
[INFO] [autoflow.hdl.hdl_constructor.HDL_Constructor] Using 'generic_recommend' method to initialize a generic DAG_workflow, 
to Adapt to various data such like NaN and categorical features.
[INFO] [autoflow.estimator.regressor.AutoFlowRegressor] task_id:	badea8b7d26fea85c104fbd9702176f9
[INFO] [autoflow.estimator.regressor.AutoFlowRegressor] hdl_id:	08a9db6bbad09b339494595bfb77e593
[INFO] [autoflow.estimator.regressor.AutoFlowRegressor] experiment_id:	12
[INFO] [dsmac.utils.io.cmd_reader.CMDReader] Output to smac3-output_2020-04-15_09:30:30_231023
[INFO] [dsmac.facade.smac_hpo_facade.SMAC4HPO] Optimizing a deterministic scenario for quality without a tuner timeout - will make SMAC deterministic and only evaluate one configuration per iteration!
[INFO] [dsmac.scenario.scenario.Scenario] No output directory for scenario logging specified -- scenario will not be logged.
[INFO] [dsmac.initial_design.random



AutoFlowRegressor(hdl_constructor=None, highC_cat_threshold=0.5,
                   highR_nan_threshold=0.5, log_config=None, log_file=None,
                   random_state=42,
                   resource_manager=<autoflow.manager.resource_manager.ResourceManager object at 0x7f73f11c83c8>,
                   tuner=None)

In [None]:
# Take the first entry of the pipeline's `hdl_constructors` and call its
# `draw_workflow_space()`; as the cell's last expression, any return value
# is displayed.
# NOTE(review): presumably renders the search-space workflow graph — confirm
# against the AutoFlow documentation.
hdl_constructor = trained_pipeline.hdl_constructors[0]
hdl_constructor.draw_workflow_space()

In [None]:
# Persist the fitted pipeline next to the notebook. joblib picks the
# compression method from the ".bz2" filename extension.
joblib.dump(value=trained_pipeline, filename="autoflow_regression.bz2")

In [None]:
# Reload the pipeline saved above into a separate name, to show that
# prediction works from the serialized artifact.
# NOTE: joblib.load unpickles its input — only load files you trust.
predict_pipeline = joblib.load(filename="autoflow_regression.bz2")

In [None]:
# Run the reloaded pipeline on the test frame and keep the predictions.
# NOTE(review): `test_df` is passed unmodified (all CSV columns included);
# presumably the pipeline selects the feature columns itself — confirm.
result = predict_pipeline.predict(test_df)