In [1]:
import os
import shutil
import pandas as pd
import mercury as mr
from supervised.automl import AutoML 

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
app = mr.App(title="Train AutoML (advanced) 🤓", 
             description="Train ML pipeline with MLJAR AutoML with more params")

# Train Machine Learning Pipeline with MLJAR AutoML

You can control AutoML behavior with more parameters. This notebook is running autoML in the `Compete` mode. 

You can choose:
- feature preprocessing parameters: golden features and features selection
- select algorithms, stack, and ensemble them,
- set cross-validation strategy (number of folds, stratify and shuffle),
- choose evaluation metric.

### Steps
1. Upload CSV file with data. Data should have column names in the first line.
2. Select input features and target column.
3. Select AutoML training mode, algorithms, and training time limit.
4. Directory with all ML models will be zipped and available to download.

In [4]:
# data_file = mr.File(label="Upload CSV with training data", max_file_size="1MB")

In [5]:
from sklearn.datasets import load_iris
df = load_iris(return_X_y = True, as_frame = True)

In [6]:
# if data_file.filepath is None:
#     mr.Stop()

In [7]:
# df = pd.read_csv(data_file.filepath)

In [8]:
mr.Markdown("### Training data")

### Training data

In [9]:
df

(     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                  5.1               3.5                1.4               0.2
 1                  4.9               3.0                1.4               0.2
 2                  4.7               3.2                1.3               0.2
 3                  4.6               3.1                1.5               0.2
 4                  5.0               3.6                1.4               0.2
 ..                 ...               ...                ...               ...
 145                6.7               3.0                5.2               2.3
 146                6.3               2.5                5.0               1.9
 147                6.5               3.0                5.2               2.0
 148                6.2               3.4                5.4               2.3
 149                5.9               3.0                5.1               1.8
 
 [150 rows x 4 columns],
 0      0
 1      0
 2   

In [10]:
df[0]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [11]:
x_columns = mr.MultiSelect(label="Input features", value=list(df[0].columns)[:-1], 
                           choices=list(df[0].columns))

mercury.MultiSelect

In [15]:
y_column = mr.Select(label="Target", value=list(df[0].columns)[-1], choices=list(df[0].columns))

mercury.Select

In [16]:
if x_columns.value is None or len(x_columns.value) == 0 or y_column.value is None:
    print("Please select input features and target column")
    mr.Stop()

In [17]:
_ = mr.Note("#### Prepare data")

#### Prepare data

In [18]:
golden_features = mr.Checkbox(label="Construct Golden Features")

mercury.Checkbox

In [19]:
features_selection = mr.Checkbox(label="Features Selection")

mercury.Checkbox

In [20]:
_ = mr.Note("#### Algorithms")

#### Algorithms

In [21]:
algos = ["Decision Tree", "Linear", "Random Forest", "Extra Trees", "LightGBM", 
                "Xgboost", "CatBoost", "Neural Network", "Nearest Neighbors"]


In [22]:
algorithms = mr.MultiSelect(label="Algorithms", value=algos, choices=algos)

mercury.MultiSelect

In [23]:
stack_models = mr.Checkbox(label="Stack Models")

mercury.Checkbox

In [24]:
train_ensemble = mr.Checkbox(label="Train Ensemble")

mercury.Checkbox

In [25]:
_ = mr.Note("#### Validation")

#### Validation

In [26]:
folds = mr.Numeric(label="Number of Folds", value=5, min=2, max=100)

mercury.Numeric

In [27]:
shuffle = mr.Checkbox(label="Suffle Samples")

mercury.Checkbox

In [28]:
stratify = mr.Checkbox(label="Stratify Samples")

mercury.Checkbox

In [29]:
eval_metric = mr.Select(label="Evaluation Metric", value="auto", 
                           choices=["auto", "logloss", "f1", "average_precision",
                                    "accuracy", "rmse", "mse", "mae", "r2",
                                    "mape", "spearman", "pearson"])

mercury.Select

In [30]:
time_limit = mr.Select(label="Time Limit (seconds)", value="60", choices=["60", "120", "240", "300"])

mercury.Select

In [43]:
start_training = mr.Button(label="Start Training", style="success")

mercury.Button

In [32]:
output_dir = mr.OutputDir()

In [46]:
automl = AutoML(mode="Compete", 
                algorithms=algorithms.value,
                train_ensemble=train_ensemble.value,
                stack_models=stack_models.value,
                golden_features=golden_features.value,
                features_selection=features_selection.value,
                validation_strategy={
                    "validation_type": "kfold",
                    "k_folds": int(folds.value),
                    "shuffle": shuffle.value,
                    "stratify": stratify.value,
                    "random_seed": 123
                },
                eval_metric=eval_metric.value,
                total_time_limit=int(time_limit.value)
                )

In [47]:
if start_training.clicked:
    mr.Markdown("### AutoML training logs")
    automl.fit(df[0][x_columns.value], df[0][y_column.value])
    
    output_filename = os.path.join(output_dir.path, automl._results_path)
    shutil.make_archive(output_filename, 'zip', automl._results_path)

### AutoML training logs

AutoML directory: AutoML_3
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree rmse 0.213154 trained in 0.63 seconds
2_DecisionTree rmse 0.206743 trained in 0.61 seconds
3_DecisionTree rmse 0.206902 trained in 0.59 seconds
4_Linear rmse 0.194117 trained in 0.69 seconds
* Step default_algorithms will try to check up to 7 models
5_Default_LightGBM rmse 0.199116 trained in 0.83 seconds
6_Default_Xgboost rmse 0.201006 trained in 1.1 seconds
7_De

In [42]:
if automl._best_model is None:
    mr.Stop()

In [None]:
automl.report()