# Imports and paths configuration

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

c:\Users\alois\Desktop\Advanced ML\AML_Task_1


In [3]:
import sys
import os
import yaml
from ax.service.ax_client import AxClient
from task1.src.train import main
from task1.utils import unflatten_dict

In [4]:
with open('task1/config/config_ada.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Parameters choice

In [5]:
parameters = [
    {
        "name": "mice.estimator_params.n_estimators",
        "type": "range",
        "bounds": [50, 200],
        "value_type": 'int'
    },
    {
        "name": "mice.estimator_params.max_depth",
        "type": 'range',
        'bounds': [2, 10],
        'value_type': 'int'
    },
    {
        'name': 'mice.mice_params.max_iter',
        'type': 'range',
        'bounds': [5, 20],
        'value_type': 'int'
    },
    {
        'name': 'mice.num_corr_features',
        'type': 'range',
        'bounds': [5, 150],
        'value_type': 'int'
    },
    {
        "name": "lasso_alpha", 
        "type": "range", 
        "bounds": [0, 10],
        "value_type": 'int'
    },
    {
        "name": "mice.simple_params.strategy",
        "type": "choice",
        "values": ["mean", "median", "most_frequent", "constant"],
    },
    {
        "name": "mice.mice_params.imputation_order",
        "type": "choice",
        "values": ["ascending", "descending", "roman", "arabic", "random"],
    },
    {
        "name": "n_estimators",
        "type": "range",
        "bounds": [10, 100],
        "value_type": "int"
    },
    {
        "name": "learning_rate",
        "type": "range",
        "bounds": [0.01, 1.0],
        "value_type": "float"
    }
]


# Tuning

In [6]:
def evaluate_model(parameters):
    # Change any default config here:
    config['train']['model_type'] = 'ada'
    
    config['train']['preprocessing']['feature_selection']['arguments']['alpha'] = parameters['lasso_alpha']/10
    config['train']['model_config']['n_estimators'] = parameters['n_estimators']
    config['train']['model_config']['learning_rate'] = parameters['learning_rate']

    config['train']['preprocessing']['imputation']['method'] = 'mice'
    config['train']['preprocessing']['imputation']['arguments'] = {}
    config['train']['preprocessing']['imputation']['arguments']['estimator_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['mice_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['simple_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['num_corr_features'] = parameters['mice.num_corr_features']


    
    # Parse parameters
    for param_name in parameters:
        param_value = parameters[param_name]
        if param_name.startswith('mice.'):
            param_name_split = param_name.split('.')
            if len(param_name_split) == 3:
                config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]] = param_value
            elif len(param_name_split) == 4:
                config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]][param_name_split[3]] = param_value
    
    print(config)
    return main(config)

In [7]:
ax_client = AxClient()
objective_name = 'preprocessing_tune'
ax_client.create_experiment(
    parameters=parameters,
    objective_name=objective_name,
    minimize=False
)

# Specify the initial parameters according to your configuration
initial_parameters = {
    "mice.estimator_params.n_estimators": 100,  # You can set the initial values as needed
    "mice.estimator_params.max_depth": 5,
    "mice.mice_params.max_iter": 10,
    "mice.num_corr_features": 50,
    "lasso_alpha": 5,
    "n_estimators": 50,
    "learning_rate": 0.1,
    "mice.simple_params.strategy": "mean",  # Choose a strategy from your configuration
    "mice.mice_params.imputation_order": "descending"
}

_, trial_id = ax_client.attach_trial(initial_parameters)
ax_client.complete_trial(trial_id, raw_data=evaluate_model(initial_parameters))






for i in range(50):    
    curr_parameters, trial_index = ax_client.get_next_trial()
    ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate_model(curr_parameters))

[INFO 11-12 13:05:09] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 11-12 13:05:09] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter mice.simple_params.strategy. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-12 13:05:09] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter mice.mice_params.imputation_order. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-12 13:05:09] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='mice.estimator_params.n_estimators', parameter_type=INT, range=[50, 200]

{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 100, 'max_depth': 5}, 'mice_params': {'max_iter': 10, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 50}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.5}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 50, 'learning_rate': 0.1}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 39




Number of selected features: 40




Number of selected features: 41




Number of selected features: 31




Number of selected features: 33




Number of selected features: 33


[INFO 11-12 13:10:47] ax.service.ax_client: Completed trial 0 with data: {'preprocessing_tune': (0.406673, None)}.
[INFO 11-12 13:10:47] ax.service.ax_client: Generated new trial 1 with parameters {'mice.estimator_params.n_estimators': 122, 'mice.estimator_params.max_depth': 3, 'mice.mice_params.max_iter': 9, 'mice.num_corr_features': 66, 'lasso_alpha': 6, 'n_estimators': 22, 'learning_rate': 0.418389, 'mice.simple_params.strategy': 'constant', 'mice.mice_params.imputation_order': 'arabic'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 122, 'max_depth': 3}, 'mice_params': {'max_iter': 9, 'imputation_order': 'arabic'}, 'simple_params': {'strategy': 'constant'}, 'num_corr_features': 66}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.6}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 22, 'learning_rate': 0.41838921517133715}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 25




Number of selected features: 32




Number of selected features: 27




Number of selected features: 27




Number of selected features: 25




Number of selected features: 28


[INFO 11-12 13:15:01] ax.service.ax_client: Completed trial 1 with data: {'preprocessing_tune': (0.393656, None)}.
[INFO 11-12 13:15:01] ax.service.ax_client: Generated new trial 2 with parameters {'mice.estimator_params.n_estimators': 51, 'mice.estimator_params.max_depth': 6, 'mice.mice_params.max_iter': 17, 'mice.num_corr_features': 122, 'lasso_alpha': 8, 'n_estimators': 13, 'learning_rate': 0.279207, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'arabic'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 51, 'max_depth': 6}, 'mice_params': {'max_iter': 17, 'imputation_order': 'arabic'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 122}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.8}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 13, 'learning_rate': 0.2792073409352452}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 21




Number of selected features: 20




Number of selected features: 22




Number of selected features: 25




Number of selected features: 20




Number of selected features: 22


[INFO 11-12 13:31:27] ax.service.ax_client: Completed trial 2 with data: {'preprocessing_tune': (0.390728, None)}.
[INFO 11-12 13:31:27] ax.service.ax_client: Generated new trial 3 with parameters {'mice.estimator_params.n_estimators': 173, 'mice.estimator_params.max_depth': 2, 'mice.mice_params.max_iter': 10, 'mice.num_corr_features': 115, 'lasso_alpha': 9, 'n_estimators': 64, 'learning_rate': 0.873291, 'mice.simple_params.strategy': 'most_frequent', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 173, 'max_depth': 2}, 'mice_params': {'max_iter': 10, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'most_frequent'}, 'num_corr_features': 115}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.9}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 64, 'learning_rate': 0.8732905666623264}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 38




Number of selected features: 43




Number of selected features: 37




Number of selected features: 44




Number of selected features: 30




Number of selected features: 41


[INFO 11-12 13:38:21] ax.service.ax_client: Completed trial 3 with data: {'preprocessing_tune': (0.420403, None)}.
[INFO 11-12 13:38:21] ax.service.ax_client: Generated new trial 4 with parameters {'mice.estimator_params.n_estimators': 83, 'mice.estimator_params.max_depth': 6, 'mice.mice_params.max_iter': 16, 'mice.num_corr_features': 22, 'lasso_alpha': 3, 'n_estimators': 58, 'learning_rate': 0.765872, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 83, 'max_depth': 6}, 'mice_params': {'max_iter': 16, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 22}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.3}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 58, 'learning_rate': 0.7658719615172594}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 96




Number of selected features: 103




Number of selected features: 101




Number of selected features: 90




Number of selected features: 106




Number of selected features: 88


[INFO 11-12 13:44:33] ax.service.ax_client: Completed trial 4 with data: {'preprocessing_tune': (0.408745, None)}.
[INFO 11-12 13:44:33] ax.service.ax_client: Generated new trial 5 with parameters {'mice.estimator_params.n_estimators': 111, 'mice.estimator_params.max_depth': 4, 'mice.mice_params.max_iter': 12, 'mice.num_corr_features': 63, 'lasso_alpha': 6, 'n_estimators': 94, 'learning_rate': 0.960135, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'random'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 111, 'max_depth': 4}, 'mice_params': {'max_iter': 12, 'imputation_order': 'random'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 63}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.6}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 94, 'learning_rate': 0.9601347304973751}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 28




Number of selected features: 27




Number of selected features: 28




Number of selected features: 24




Number of selected features: 24




Number of selected features: 27


[INFO 11-12 13:51:44] ax.service.ax_client: Completed trial 5 with data: {'preprocessing_tune': (0.416261, None)}.
[INFO 11-12 13:51:45] ax.service.ax_client: Generated new trial 6 with parameters {'mice.estimator_params.n_estimators': 126, 'mice.estimator_params.max_depth': 7, 'mice.mice_params.max_iter': 19, 'mice.num_corr_features': 86, 'lasso_alpha': 7, 'n_estimators': 62, 'learning_rate': 0.639519, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 126, 'max_depth': 7}, 'mice_params': {'max_iter': 19, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 86}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.7}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 62, 'learning_rate': 0.6395189069211483}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 24




Number of selected features: 23




Number of selected features: 23




Number of selected features: 25




Number of selected features: 22




Number of selected features: 24


[INFO 11-12 14:29:00] ax.service.ax_client: Completed trial 6 with data: {'preprocessing_tune': (0.421865, None)}.
[INFO 11-12 14:29:00] ax.service.ax_client: Generated new trial 7 with parameters {'mice.estimator_params.n_estimators': 51, 'mice.estimator_params.max_depth': 3, 'mice.mice_params.max_iter': 17, 'mice.num_corr_features': 112, 'lasso_alpha': 3, 'n_estimators': 33, 'learning_rate': 0.836376, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'random'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 51, 'max_depth': 3}, 'mice_params': {'max_iter': 17, 'imputation_order': 'random'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 112}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.3}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 33, 'learning_rate': 0.8363762765936553}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 98




Number of selected features: 102




Number of selected features: 102




Number of selected features: 92




Number of selected features: 106




Number of selected features: 84


[INFO 11-12 14:35:11] ax.service.ax_client: Completed trial 7 with data: {'preprocessing_tune': (0.394254, None)}.
[INFO 11-12 14:35:11] ax.service.ax_client: Generated new trial 8 with parameters {'mice.estimator_params.n_estimators': 157, 'mice.estimator_params.max_depth': 9, 'mice.mice_params.max_iter': 8, 'mice.num_corr_features': 62, 'lasso_alpha': 3, 'n_estimators': 68, 'learning_rate': 0.101642, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 157, 'max_depth': 9}, 'mice_params': {'max_iter': 8, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 62}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.3}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 68, 'learning_rate': 0.10164201835170388}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 98




Number of selected features: 104




Number of selected features: 104




Number of selected features: 101




Number of selected features: 105




Number of selected features: 80


[INFO 11-12 14:48:23] ax.service.ax_client: Completed trial 8 with data: {'preprocessing_tune': (0.41821, None)}.
[INFO 11-12 14:48:23] ax.service.ax_client: Generated new trial 9 with parameters {'mice.estimator_params.n_estimators': 130, 'mice.estimator_params.max_depth': 7, 'mice.mice_params.max_iter': 16, 'mice.num_corr_features': 129, 'lasso_alpha': 7, 'n_estimators': 88, 'learning_rate': 0.619179, 'mice.simple_params.strategy': 'constant', 'mice.mice_params.imputation_order': 'ascending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 130, 'max_depth': 7}, 'mice_params': {'max_iter': 16, 'imputation_order': 'ascending'}, 'simple_params': {'strategy': 'constant'}, 'num_corr_features': 129}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.7}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 88, 'learning_rate': 0.6191790395416319}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 24




Number of selected features: 26




Number of selected features: 23




Number of selected features: 22




Number of selected features: 20




Number of selected features: 24


[INFO 11-12 15:31:59] ax.service.ax_client: Completed trial 9 with data: {'preprocessing_tune': (0.41095, None)}.
[INFO 11-12 15:31:59] ax.service.ax_client: Generated new trial 10 with parameters {'mice.estimator_params.n_estimators': 149, 'mice.estimator_params.max_depth': 5, 'mice.mice_params.max_iter': 8, 'mice.num_corr_features': 73, 'lasso_alpha': 7, 'n_estimators': 91, 'learning_rate': 0.389035, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'roman'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 149, 'max_depth': 5}, 'mice_params': {'max_iter': 8, 'imputation_order': 'roman'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 73}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.7}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 91, 'learning_rate': 0.3890345442947}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 23




Number of selected features: 24




Number of selected features: 23




Number of selected features: 20




Number of selected features: 20




Number of selected features: 23


[INFO 11-12 15:43:30] ax.service.ax_client: Completed trial 10 with data: {'preprocessing_tune': (0.408019, None)}.
[INFO 11-12 15:43:30] ax.service.ax_client: Generated new trial 11 with parameters {'mice.estimator_params.n_estimators': 129, 'mice.estimator_params.max_depth': 4, 'mice.mice_params.max_iter': 12, 'mice.num_corr_features': 58, 'lasso_alpha': 1, 'n_estimators': 25, 'learning_rate': 0.864459, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'ascending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 129, 'max_depth': 4}, 'mice_params': {'max_iter': 12, 'imputation_order': 'ascending'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 58}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.1}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 25, 'learning_rate': 0.8644592835940421}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 362




Number of selected features: 377




Number of selected features: 354




Number of selected features: 363




Number of selected features: 384




Number of selected features: 356


[INFO 11-12 15:52:05] ax.service.ax_client: Completed trial 11 with data: {'preprocessing_tune': (0.404357, None)}.
[INFO 11-12 15:52:05] ax.service.ax_client: Generated new trial 12 with parameters {'mice.estimator_params.n_estimators': 163, 'mice.estimator_params.max_depth': 9, 'mice.mice_params.max_iter': 10, 'mice.num_corr_features': 15, 'lasso_alpha': 5, 'n_estimators': 13, 'learning_rate': 0.554588, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'roman'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 163, 'max_depth': 9}, 'mice_params': {'max_iter': 10, 'imputation_order': 'roman'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 15}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.5}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 13, 'learning_rate': 0.5545877566654235}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 39




Number of selected features: 40




Number of selected features: 38




Number of selected features: 30




Number of selected features: 34




Number of selected features: 34


[INFO 11-12 15:56:48] ax.service.ax_client: Completed trial 12 with data: {'preprocessing_tune': (0.402709, None)}.
[INFO 11-12 15:56:48] ax.service.ax_client: Generated new trial 13 with parameters {'mice.estimator_params.n_estimators': 92, 'mice.estimator_params.max_depth': 2, 'mice.mice_params.max_iter': 7, 'mice.num_corr_features': 82, 'lasso_alpha': 3, 'n_estimators': 85, 'learning_rate': 0.28342, 'mice.simple_params.strategy': 'most_frequent', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 92, 'max_depth': 2}, 'mice_params': {'max_iter': 7, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'most_frequent'}, 'num_corr_features': 82}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.3}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 85, 'learning_rate': 0.28341973611153665}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 204




Number of selected features: 206




Number of selected features: 194




Number of selected features: 192




Number of selected features: 197




Number of selected features: 173


[INFO 11-12 15:59:26] ax.service.ax_client: Completed trial 13 with data: {'preprocessing_tune': (0.397811, None)}.
[INFO 11-12 15:59:26] ax.service.ax_client: Generated new trial 14 with parameters {'mice.estimator_params.n_estimators': 87, 'mice.estimator_params.max_depth': 8, 'mice.mice_params.max_iter': 13, 'mice.num_corr_features': 24, 'lasso_alpha': 2, 'n_estimators': 81, 'learning_rate': 0.087082, 'mice.simple_params.strategy': 'constant', 'mice.mice_params.imputation_order': 'arabic'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 87, 'max_depth': 8}, 'mice_params': {'max_iter': 13, 'imputation_order': 'arabic'}, 'simple_params': {'strategy': 'constant'}, 'num_corr_features': 24}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.2}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 81, 'learning_rate': 0.08708228751085698}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 197




Number of selected features: 205




Number of selected features: 183




Number of selected features: 186




Number of selected features: 178




Number of selected features: 179


[INFO 11-12 16:07:42] ax.service.ax_client: Completed trial 14 with data: {'preprocessing_tune': (0.420267, None)}.
[INFO 11-12 16:07:42] ax.service.ax_client: Generated new trial 15 with parameters {'mice.estimator_params.n_estimators': 112, 'mice.estimator_params.max_depth': 5, 'mice.mice_params.max_iter': 17, 'mice.num_corr_features': 11, 'lasso_alpha': 4, 'n_estimators': 44, 'learning_rate': 0.15316, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'random'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 112, 'max_depth': 5}, 'mice_params': {'max_iter': 17, 'imputation_order': 'random'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 11}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.4}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 44, 'learning_rate': 0.1531595783866942}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 53




Number of selected features: 61




Number of selected features: 65




Number of selected features: 56




Number of selected features: 52




Number of selected features: 49


[INFO 11-12 16:10:42] ax.service.ax_client: Completed trial 15 with data: {'preprocessing_tune': (0.417202, None)}.
[INFO 11-12 16:10:42] ax.service.ax_client: Generated new trial 16 with parameters {'mice.estimator_params.n_estimators': 80, 'mice.estimator_params.max_depth': 8, 'mice.mice_params.max_iter': 19, 'mice.num_corr_features': 37, 'lasso_alpha': 6, 'n_estimators': 65, 'learning_rate': 0.478717, 'mice.simple_params.strategy': 'most_frequent', 'mice.mice_params.imputation_order': 'ascending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 80, 'max_depth': 8}, 'mice_params': {'max_iter': 19, 'imputation_order': 'ascending'}, 'simple_params': {'strategy': 'most_frequent'}, 'num_corr_features': 37}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.6}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 65, 'learning_rate': 0.478716683331877}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 67




Number of selected features: 70




Number of selected features: 59




Number of selected features: 64




Number of selected features: 53




Number of selected features: 54


[INFO 11-12 16:29:50] ax.service.ax_client: Completed trial 16 with data: {'preprocessing_tune': (0.410201, None)}.
[INFO 11-12 16:29:50] ax.service.ax_client: Generated new trial 17 with parameters {'mice.estimator_params.n_estimators': 85, 'mice.estimator_params.max_depth': 9, 'mice.mice_params.max_iter': 16, 'mice.num_corr_features': 74, 'lasso_alpha': 6, 'n_estimators': 72, 'learning_rate': 0.796021, 'mice.simple_params.strategy': 'median', 'mice.mice_params.imputation_order': 'descending'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 85, 'max_depth': 9}, 'mice_params': {'max_iter': 16, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'median'}, 'num_corr_features': 74}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.6}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 72, 'learning_rate': 0.7960205571260304}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 28




Number of selected features: 31




Number of selected features: 28




Number of selected features: 27




Number of selected features: 24




Number of selected features: 27


[INFO 11-12 16:56:45] ax.service.ax_client: Completed trial 17 with data: {'preprocessing_tune': (0.416992, None)}.
[INFO 11-12 16:56:45] ax.service.ax_client: Generated new trial 18 with parameters {'mice.estimator_params.n_estimators': 74, 'mice.estimator_params.max_depth': 7, 'mice.mice_params.max_iter': 9, 'mice.num_corr_features': 45, 'lasso_alpha': 0, 'n_estimators': 48, 'learning_rate': 0.267642, 'mice.simple_params.strategy': 'mean', 'mice.mice_params.imputation_order': 'roman'}.


{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 74, 'max_depth': 7}, 'mice_params': {'max_iter': 9, 'imputation_order': 'roman'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 45}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.0}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 48, 'learning_rate': 0.26764179673045874}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}


KeyboardInterrupt: 

# Get best config and write prediction data

In [8]:
import pandas as pd
results_path = 'results/overview.csv'
# Remove duplicates results, should fix this in train.py
pd.read_csv(results_path).sort_values(by='train_results', ascending=False).drop_duplicates().to_csv(results_path, index=False)
results = pd.read_csv(results_path).sort_values(by='train_results', ascending=False).drop_duplicates().reset_index(drop=True)

In [9]:
results

Unnamed: 0,train/data_paths/features,train/data_paths/label,train/preprocessing/scale,train/preprocessing/imputation/method,train/preprocessing/imputation/arguments/n_neighbors,train/preprocessing/outliers_detection/method,train/preprocessing/feature_selection/method,train/preprocessing/feature_selection/arguments/alpha,train/preprocessing/execution_order,train/model_type,...,evaluate/run,evaluate/data_paths/features,evaluate/data_paths/label,train_results,train/preprocessing/imputation/arguments/estimator_params/n_estimators,train/preprocessing/imputation/arguments/estimator_params/max_depth,train/preprocessing/imputation/arguments/mice_params/max_iter,train/preprocessing/imputation/arguments/mice_params/imputation_order,train/preprocessing/imputation/arguments/simple_params/strategy,train/preprocessing/imputation/arguments/num_corr_features
0,data/X_train.csv,data/y_train.csv,True,knn,20.0,isolation_forest,lasso,0.7,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.448614,,,,,,
1,data/X_train.csv,data/y_train.csv,True,knn,20.0,isolation_forest,lasso,0.7,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.445853,,,,,,
2,data/X_train.csv,data/y_train.csv,True,knn,20.0,isolation_forest,lasso,0.8,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.445686,,,,,,
3,data/X_train.csv,data/y_train.csv,True,knn,8.0,isolation_forest,lasso,1.0,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.443252,,,,,,
4,data/X_train.csv,data/y_train.csv,True,knn,19.0,isolation_forest,lasso,0.8,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.441705,,,,,,
5,data/X_train.csv,data/y_train.csv,True,knn,18.0,isolation_forest,lasso,0.9,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.439079,,,,,,
6,data/X_train.csv,data/y_train.csv,True,knn,20.0,isolation_forest,lasso,0.6,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.428652,,,,,,
7,data/X_train.csv,data/y_train.csv,True,knn,15.0,isolation_forest,lasso,0.8,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.428306,,,,,,
8,data/X_train.csv,data/y_train.csv,True,knn,13.0,isolation_forest,lasso,0.6,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.423359,,,,,,
9,data/X_train.csv,data/y_train.csv,True,knn,8.0,isolation_forest,lasso,0.4,imputation; outliers_detection; feature_selection,ada,...,True,data/X_test.csv,data/y_test_ada.csv,0.422158,,,,,,


#### Get best params and rerun the trial to get predictions

In [10]:
def get_best_params(ax_client):
    print(ax_client.experiment.fetch_data().df.sort_values(by='mean', ascending=False).iloc[0])
    best_trial_idx = ax_client.experiment.fetch_data().df.sort_values(by='mean', ascending=False).iloc[0].trial_index
    best_params = ax_client.experiment.trials[best_trial_idx].arm.parameters
    return best_params


In [14]:
initial_params = get_best_params(ax_client)

arm_name                      6_0
metric_name    preprocessing_tune
mean                     0.421865
sem                           NaN
trial_index                     6
Name: 6, dtype: object


In [15]:
# Rerun trial nb_trial
ax_client = AxClient()
objective_name = 'preprocessing_tune'
ax_client.create_experiment(
    parameters=parameters,
    objective_name=objective_name,
    minimize=False
)
_, trial_id = ax_client.attach_trial(initial_params)
ax_client.complete_trial(trial_id, raw_data=evaluate_model(initial_params))

for i in range(10):    
    curr_parameters, trial_index = ax_client.get_next_trial()
    ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate_model(curr_parameters))

[INFO 11-12 16:58:39] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 11-12 16:58:39] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter mice.simple_params.strategy. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-12 16:58:39] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter mice.mice_params.imputation_order. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-12 16:58:39] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='mice.estimator_params.n_estimators', parameter_type=INT, range=[50, 200]

{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'mice', 'arguments': {'estimator_params': {'n_estimators': 126, 'max_depth': 7}, 'mice_params': {'max_iter': 19, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 86}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.7}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'ada', 'model_config': {'n_estimators': 62, 'learning_rate': 0.6395189069211483}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_ada_mice.csv'}}}




Number of selected features: 24




Number of selected features: 24




Number of selected features: 22




Number of selected features: 25




Number of selected features: 20




Number of selected features: 23


[INFO 11-12 17:37:43] ax.service.ax_client: Completed trial 0 with data: {'preprocessing_tune': (0.405412, None)}.


NameError: name 'n2' is not defined

In [None]:
best_param_dct_linear = results[results['train/model_type'] == 'ada'].iloc[0].to_dict()
del best_param_dct_linear['train_results']

In [None]:
best_param_dct_linear

In [None]:
best_param_config = unflatten_dict(best_param_dct_linear)
best_param_config['evaluate']['data_paths']['label'] = 'results/y_test_svm.csv'
best_param_config['evaluate']['run'] = True
best_param_config['train']['model_config'] = {}
best_param_config['train']['preprocessing']['outliers_detection']['arguments'] = {}

In [None]:
main(best_param_config)

In [None]:
pd.read_csv('results/overview.csv').sort_values(by='train_results', ascending=False).drop_duplicates().reset_index(drop=True)