# Imports and paths configuration

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

c:\Users\alois\Desktop\Advanced ML\AML_Task_1


In [3]:
import sys
import os
import yaml
from ax.service.ax_client import AxClient
from task1.src.train import main
from task1.utils import unflatten_dict

In [4]:
with open('task1/config/default_train_config_mice.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Parameters choice

In [5]:
parameters = [
    {
        "name": "miceV2.estimator_params.n_estimators",
        "type": "range",
        "bounds": [50, 200],
        "value_type": 'int'
    },
    {
        "name": "miceV2.estimator_params.max_depth",
        "type": 'range',
        'bounds': [2, 10],
        'value_type': 'int'
    },
    {
        'name': 'miceV2.mice_params.max_iter',
        'type': 'range',
        'bounds': [5, 20],
        'value_type': 'int'
    },
    {
        'name': 'miceV2.num_corr_features',
        'type': 'range',
        'bounds': [0.1, 0.9],
        'value_type': 'float'
    },
    {
        "name": "lasso_alpha", 
        "type": "range", 
        "bounds": [0, 10],
        "value_type": 'int'
    },
    {
        "name": "ridge_alpha",
        "type": "range",
        "bounds": [0, 10],
        "value_type": 'int'        
    },
    {
        "name": "gamma",
        "type": "range",
        "bounds": [0.001, 0.01],
        "value_type": 'float'
    },
    {
        "name": "miceV2.simple_params.strategy",
        "type": "choice",
        "values": ["mean", "median", "most_frequent", "constant"],
    },
    {
        "name": "miceV2.mice_params.imputation_order",
        "type": "choice",
        "values": ["ascending", "descending", "roman", "arabic", "random"],
    },
]


# Tuning

In [6]:
def evaluate_model(parameters):
    # Change any default config here:
    config['train']['model_type'] = 'krr'
    
    # Parse parameters
    config['train']['preprocessing']['feature_selection']['arguments']['alpha'] = parameters['lasso_alpha']/10
    config['train']['model_config']['alpha'] = parameters['ridge_alpha']/10
    config['train']['model_config']['gamma'] = parameters['gamma']

    config['train']['preprocessing']['imputation']['method'] = 'miceV2'
    config['train']['preprocessing']['imputation']['arguments'] = {}
    config['train']['preprocessing']['imputation']['arguments']['estimator_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['mice_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['simple_params'] = {}
    config['train']['preprocessing']['imputation']['arguments']['num_corr_features'] = parameters['miceV2.num_corr_features']


    
    # Parse parameters
    for param_name in parameters:
        param_value = parameters[param_name]
        if param_name.startswith('miceV2.'):
            param_name_split = param_name.split('.')
            if len(param_name_split) == 3:
                config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]] = param_value
            elif len(param_name_split) == 4:
                config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]][param_name_split[3]] = param_value
        else:
            #config['train']['preprocessing']['imputation']['arguments'][param_name] = param_value
            print("no arg to parse")
    print(config)
    
    return main(config)





In [7]:
ax_client = AxClient()
objective_name = 'preprocessing_tune'
ax_client.create_experiment(
    parameters=parameters,
    objective_name=objective_name,
    minimize=False
)

# Specify the initial parameters according to your configuration
initial_parameters = {
    "miceV2.estimator_params.n_estimators": 100,  # You can set the initial values as needed
    "miceV2.estimator_params.max_depth": 5,
    "miceV2.mice_params.max_iter": 10,
    "miceV2.num_corr_features": 0.5,
    "lasso_alpha": 5,
    "ridge_alpha": 5,
    "gamma": 0.005,
    "miceV2.simple_params.strategy": "mean",  # Choose a strategy from your configuration
    "miceV2.mice_params.imputation_order": "descending"
}

_, trial_id = ax_client.attach_trial(initial_parameters)
ax_client.complete_trial(trial_id, raw_data=evaluate_model(initial_parameters))



for i in range(10):    
    curr_parameters, trial_index = ax_client.get_next_trial()
    ax_client.complete_trial(trial_index=trial_index, raw_data=evaluate_model(curr_parameters))


[INFO 11-08 23:00:45] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 11-08 23:00:45] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter miceV2.simple_params.strategy. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-08 23:00:45] ax.service.utils.instantiation: Inferred value type of ParameterType.STRING for parameter miceV2.mice_params.imputation_order. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
  warn(
  warn(
[INFO 11-08 23:00:45] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='miceV2.estimator_params.n_estimators', parameter_type=INT, range=[50

no arg to parse
no arg to parse
no arg to parse
{'train': {'data_paths': {'features': 'data/X_train.csv', 'label': 'data/y_train.csv'}, 'preprocessing': {'scale': True, 'imputation': {'method': 'miceV2', 'arguments': {'estimator_params': {'n_estimators': 100, 'max_depth': 5}, 'mice_params': {'max_iter': 10, 'imputation_order': 'descending'}, 'simple_params': {'strategy': 'mean'}, 'num_corr_features': 0.5}}, 'outliers_detection': {'method': 'isolation_forest', 'arguments': {}}, 'feature_selection': {'method': 'lasso', 'arguments': {'alpha': 0.5}}, 'execution_order': ['imputation', 'outliers_detection', 'feature_selection']}, 'model_type': 'krr', 'model_config': {'kernel': 'rbf', 'gamma': 0.005, 'alpha': 0.5}, 'cross_validation': 5, 'results_path': 'results/overview.csv'}, 'evaluate': {'run': True, 'data_paths': {'features': 'data/X_test.csv', 'label': 'data/y_test_rbf.csv'}}}
225


# Get best config and write prediction data

In [None]:
import pandas as pd
results_path = 'results/overview.csv'
# Remove duplicates results, should fix this in train.py
pd.read_csv(results_path).sort_values(by='train_results', ascending=False).drop_duplicates().to_csv(results_path, index=False)
results = pd.read_csv(results_path).sort_values(by='train_results', ascending=False).drop_duplicates().reset_index(drop=True)

In [None]:
results

Unnamed: 0,train/data_paths/features,train/data_paths/label,train/preprocessing/scale,train/preprocessing/imputation/method,train/preprocessing/imputation/arguments/estimator_params/n_estimators,train/preprocessing/imputation/arguments/estimator_params/max_depth,train/preprocessing/imputation/arguments/mice_params/max_iter,train/preprocessing/imputation/arguments/mice_params/imputation_order,train/preprocessing/imputation/arguments/simple_params/strategy,train/preprocessing/imputation/arguments/num_corr_features,...,train/model_type,train/model_config/kernel,train/model_config/gamma,train/model_config/alpha,train/cross_validation,train/results_path,evaluate/run,evaluate/data_paths/features,evaluate/data_paths/label,train_results
0,data/X_train.csv,data/y_train.csv,True,mice,200.0,9.0,6.0,descending,mean,12.0,...,krr,rbf,0.004307,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.449828
1,data/X_train.csv,data/y_train.csv,True,mice,200.0,2.0,8.0,ascending,mean,5.0,...,krr,rbf,0.006216,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.449525
2,data/X_train.csv,data/y_train.csv,True,mice,174.0,7.0,7.0,ascending,mean,12.0,...,krr,rbf,0.006247,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.444121
3,data/X_train.csv,data/y_train.csv,True,mice,134.0,10.0,5.0,ascending,median,7.0,...,krr,rbf,0.006037,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.443323
4,data/X_train.csv,data/y_train.csv,True,mice,167.0,9.0,9.0,arabic,constant,15.0,...,krr,rbf,0.004635,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.441701
5,data/X_train.csv,data/y_train.csv,True,mice,119.0,2.0,16.0,descending,mean,5.0,...,krr,rbf,0.004141,0.4,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.438186
6,data/X_train.csv,data/y_train.csv,True,mice,152.0,2.0,8.0,arabic,median,7.0,...,krr,rbf,0.004696,0.7,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.435585
7,data/X_train.csv,data/y_train.csv,True,mice,134.0,7.0,20.0,ascending,mean,15.0,...,krr,rbf,0.00457,0.4,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.435293
8,data/X_train.csv,data/y_train.csv,True,mice,130.0,7.0,5.0,descending,constant,14.0,...,krr,rbf,0.002951,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.431519
9,data/X_train.csv,data/y_train.csv,True,mice,91.0,2.0,9.0,descending,constant,5.0,...,krr,rbf,0.005537,0.7,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.43019


In [None]:
best_param_dct_linear = results[results['train/model_type'] == 'krr'].iloc[0].to_dict()
del best_param_dct_linear['train_results']

In [None]:
best_param_dct_linear

{'train/data_paths/features': 'data/X_train.csv',
 'train/data_paths/label': 'data/y_train.csv',
 'train/preprocessing/scale': True,
 'train/preprocessing/imputation/method': 'mice',
 'train/preprocessing/imputation/arguments/estimator_params/n_estimators': 200.0,
 'train/preprocessing/imputation/arguments/estimator_params/max_depth': 9.0,
 'train/preprocessing/imputation/arguments/mice_params/max_iter': 6.0,
 'train/preprocessing/imputation/arguments/mice_params/imputation_order': 'descending',
 'train/preprocessing/imputation/arguments/simple_params/strategy': 'mean',
 'train/preprocessing/imputation/arguments/num_corr_features': 12.0,
 'train/preprocessing/outliers_detection/method': 'isolation_forest',
 'train/preprocessing/feature_selection/method': 'lasso',
 'train/preprocessing/feature_selection/arguments/alpha': 1.0,
 'train/preprocessing/execution_order': 'imputation; outliers_detection; feature_selection',
 'train/model_type': 'krr',
 'train/model_config/kernel': 'rbf',
 'tra

In [None]:
best_param_config = unflatten_dict(best_param_dct_linear)
best_param_config['evaluate']['data_paths']['label'] = 'Data/y_test_rbf_miceV2.csv'
best_param_config['evaluate']['run'] = True
best_param_config['train']['model_config'] = {}
best_param_config['train']['preprocessing']['outliers_detection']['arguments'] = {}

best_param_config['train']['preprocessing']['feature_selection']['arguments']['alpha'] = {}
best_param_config['train']['model_config']['alpha'] = {}
best_param_config['train']['model_config']['gamma'] = {}

best_param_config['train']['preprocessing']['imputation']['method'] = 'miceV2'
best_param_config['train']['preprocessing']['imputation']['arguments'] = {}
best_param_config['train']['preprocessing']['imputation']['arguments']['estimator_params'] = {}
best_param_config['train']['preprocessing']['imputation']['arguments']['mice_params'] = {}
best_param_config['train']['preprocessing']['imputation']['arguments']['simple_params'] = {}
best_param_config['train']['preprocessing']['imputation']['arguments']['num_corr_features'] = {}


    
# Parse parameters
for param_name in parameters:
    param_value = best_param_config[param_name]
    if param_name.startswith('miceV2.'):
        param_name_split = param_name.split('.')
        if len(param_name_split) == 3:
            best_param_config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]] = param_value
        elif len(param_name_split) == 4:
            best_param_config['train']['preprocessing']['imputation']['arguments'][param_name_split[1]][param_name_split[2]][param_name_split[3]] = param_value
    else:
        #config['train']['preprocessing']['imputation']['arguments'][param_name] = param_value
        print("no arg to parse")

print(best_param_config)

TypeError: unhashable type: 'dict'

In [None]:
main(best_param_config)

TypeError: slice indices must be integers or None or have an __index__ method

In [None]:
pd.read_csv('results/overview.csv').sort_values(by='train_results', ascending=False).drop_duplicates().reset_index(drop=True)

Unnamed: 0,train/data_paths/features,train/data_paths/label,train/preprocessing/scale,train/preprocessing/imputation/method,train/preprocessing/imputation/arguments/estimator_params/n_estimators,train/preprocessing/imputation/arguments/estimator_params/max_depth,train/preprocessing/imputation/arguments/mice_params/max_iter,train/preprocessing/imputation/arguments/mice_params/imputation_order,train/preprocessing/imputation/arguments/simple_params/strategy,train/preprocessing/imputation/arguments/num_corr_features,...,train/model_type,train/model_config/kernel,train/model_config/gamma,train/model_config/alpha,train/cross_validation,train/results_path,evaluate/run,evaluate/data_paths/features,evaluate/data_paths/label,train_results
0,data/X_train.csv,data/y_train.csv,True,mice,200.0,9.0,6.0,descending,mean,12.0,...,krr,rbf,0.004307,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.449828
1,data/X_train.csv,data/y_train.csv,True,mice,200.0,2.0,8.0,ascending,mean,5.0,...,krr,rbf,0.006216,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.449525
2,data/X_train.csv,data/y_train.csv,True,mice,174.0,7.0,7.0,ascending,mean,12.0,...,krr,rbf,0.006247,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.444121
3,data/X_train.csv,data/y_train.csv,True,mice,134.0,10.0,5.0,ascending,median,7.0,...,krr,rbf,0.006037,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.443323
4,data/X_train.csv,data/y_train.csv,True,mice,167.0,9.0,9.0,arabic,constant,15.0,...,krr,rbf,0.004635,0.1,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.441701
5,data/X_train.csv,data/y_train.csv,True,mice,119.0,2.0,16.0,descending,mean,5.0,...,krr,rbf,0.004141,0.4,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.438186
6,data/X_train.csv,data/y_train.csv,True,mice,134.0,7.0,20.0,ascending,mean,15.0,...,krr,rbf,0.00457,0.4,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.435293
7,data/X_train.csv,data/y_train.csv,True,mice,130.0,7.0,5.0,descending,constant,14.0,...,krr,rbf,0.002951,0.2,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.431519
8,data/X_train.csv,data/y_train.csv,True,mice,91.0,2.0,9.0,descending,constant,5.0,...,krr,rbf,0.005537,0.7,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.43019
9,data/X_train.csv,data/y_train.csv,True,mice,50.0,6.0,5.0,descending,constant,12.0,...,krr,rbf,0.006071,0.3,5.0,results/overview.csv,True,data/X_test.csv,data/y_test_rbf.csv,0.427068
