In [1]:
import os
import sys
# Put the parent directory on the import path (presumably so the local
# `atml` package imported below resolves -- confirm against the repo layout).
# The membership check keeps re-runs of this cell from adding duplicates.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd

from atml import AtmlOrchestrator

%load_ext autoreload
%autoreload 2

# Preparation

### Load Data

In [3]:
def load_test_data(data_filepath, feature_columns=None, label_columns=None):
    """Load a CSV file and split it into a feature frame and a label frame.

    Rows with a missing value in *any* CSV column are dropped before the
    split; the surviving rows keep their original row index.

    Parameters
    ----------
    data_filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    feature_columns : list of str, optional
        Columns returned as ``X``. Defaults to
        ``['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']``.
    label_columns : list of str, optional
        Columns returned as ``y``. Defaults to ``['Survived']``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X, y)``: the selected feature columns and the selected label
        columns, row-aligned with each other.

    Raises
    ------
    KeyError
        If a requested feature or label column is not present in the CSV.
    """
    # Resolve defaults inside the function so no mutable default is shared
    # between calls.
    if feature_columns is None:
        feature_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    if label_columns is None:
        label_columns = ['Survived']

    # TODO: remove later -- this drops every row that has a NaN in any column,
    # including columns that are selected neither as features nor as labels.
    df = pd.read_csv(data_filepath).dropna()

    # Selecting with a list of names yields DataFrames for both X and y.
    X = df[feature_columns]
    y = df[label_columns]

    return X, y

X, y = load_test_data("../test/data/binary_data.csv")

In [4]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
1,1,38.0,1,0,71.2833
3,1,35.0,1,0,53.1
6,1,54.0,0,0,51.8625
10,3,4.0,1,1,16.7
11,1,58.0,0,0,26.55


In [5]:
y.head()

Unnamed: 0,Survived
1,1
3,1
6,0
10,1
11,1


### Execution Control

In [6]:
# Example execution-control settings; every value is given as a string.
# NOTE(review): no other cell shown in this notebook references this dict, and
# the defaults logged by AtmlOrchestrator() further down ('concurrency': '8',
# 'max_duration': '30s', 'max_trials': '99999') differ from these values --
# confirm which set is authoritative.
_DEFAULT_CONFIG = {
    "concurrency": "1", # number of concurrent processors
    "max_duration": "1m", # maximum time duration
    "max_trials": "3", # maximum number of trials
    "training_platform": "local", # local, remote
    "tuner": "TPE" # TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
}

### Model Space

In [7]:
# Example model search space: maps a dotted estimator path to a list of
# hyperparameter specs, each giving a "property" name, a "type" and the
# candidate "value" list.
# NOTE(review): the repo printed by later cells lists the same max_depth
# choices but with the string 'None' instead of None -- confirm how the
# library converts it back before the estimator is built.
# NOTE(review): "sklearn.ensemble.forest" is a private scikit-learn module
# path; confirm this key still resolves after a scikit-learn upgrade.
_DEFAULT_SEARCH_SPACE = {
    "sklearn.ensemble.forest.RandomForestClassifier": [
        {"property": "bootstrap", "type": "choice", "value": [True, False]},
        {"property": "max_features", "type": "choice", "value": ['auto', 'sqrt']},
        {"property": "max_depth", "type": "choice", "value": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]},
        {"property": "n_estimators", "type": "choice", "value": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
        {"property": "min_samples_leaf", "type": "choice", "value": [1, 2, 4]},
        {"property": "min_samples_split", "type": "choice", "value": [2, 5, 10]}
    ]
}

# 1. Design Considerations
* User-Friendliness is one of the most important considerations
* Decouple User Flow development from Model development
* An End-to-End working prototype has been developed, and Shuji's SimpleClassifier has been integrated into it with little hard-coding

### 1.1 Default Run
* Users do not need to provide any configuration; they only need to supply the training dataset
* AutoML will search all the available models, using their default search spaces, for the best model
* The execution is bounded by a maximum duration and a maximum number of trials
* Each model will get a Web UI for tracking the progress and results
* At any point in time, the user can stop the execution and get the best model
* A SQLite database is used to store the execution and model information and artefacts

### 1.2 Customized Run - In addition to the default functionalities
* User is able to choose the models as well as change the default search space
* User is able to change the execution mechanism, e.g. the user can run the execution in parallel to speed it up

# 2. Default Run - No Configuration Needed

In [8]:
auto_classifier = AtmlOrchestrator()

[I 200619 20:43:13 atml_impl:23] run_id: 20200619204313
[I 200619 20:43:13 atml_impl:24] run_dir: C:\Users\Admin\nni\runs\20200619204313
[I 200619 20:43:13 atml_impl:25] with_default: True
[I 200619 20:43:13 model_repo_manager:16] with_default: True
[D 200619 20:43:13 nni_exec_manager:23] Default NNI Execution Config: {'base_port': '8080', 'concurrency': '8', 'max_duration': '30s', 'max_trials': '99999', 'training_platform': 'local', 'tuner': 'TPE', 'optimize_mode': 'maximize'}


In [9]:
auto_classifier.run(X, y)

[I 200619 20:43:17 nni_exec_manager:50] Starting Hyperparameter Tuning with sklearn.ensemble.forest.RandomForestClassifier
[I 200619 20:43:28 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Started


Experiment finished. 6 seconds taken. Number of trials succeeded: 7


[I 200619 20:44:12 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Stopped


In [10]:
all_trials = auto_classifier.get_all_trials()
all_trials

Unnamed: 0,RUN_ID,EXP_ID,TRIAL_ID,MODEL_KEY,METRIC,MODEL,PARAMS,EXCEPTION
0,20200619204313,ukqCuWWK,NRpDi,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 90, 'max_fea...",
1,20200619204313,ukqCuWWK,PKprl,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 70, 'max_feat...",
2,20200619204313,ukqCuWWK,YhUB0,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 60, 'max_feat...",
3,20200619204313,ukqCuWWK,cTsYo,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 10, 'max_fea...",
4,20200619204313,ukqCuWWK,iOut2,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 70, 'max_fea...",
5,20200619204313,ukqCuWWK,rpEoN,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 50, 'max_fea...",
6,20200619204313,ukqCuWWK,Hj1Q1,sklearn.ensemble.forest.RandomForestClassifier,0.675676,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 10, 'max_fea...",


In [11]:
best_model_details = auto_classifier.get_best_model_details()
best_model_details

RUN_ID                                          20200619204313
EXP_ID                                                ukqCuWWK
TRIAL_ID                                                 NRpDi
MODEL_KEY       sklearn.ensemble.forest.RandomForestClassifier
METRIC                                                0.702703
MODEL        (DecisionTreeClassifier(class_weight=None, cri...
PARAMS       {'bootstrap': False, 'max_depth': 90, 'max_fea...
EXCEPTION                                                 None
Name: 0, dtype: object

In [12]:
model = best_model_details['MODEL']
X[0:5]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
1,1,38.0,1,0,71.2833
3,1,35.0,1,0,53.1
6,1,54.0,0,0,51.8625
10,3,4.0,1,1,16.7
11,1,58.0,0,0,26.55


In [13]:
model.predict(X[0:5])

array([1, 1, 0, 1, 1], dtype=int64)

# 3. Customized Run - Choose Model & Change Default Search Space

In [15]:
auto_classifier = AtmlOrchestrator()

[I 200619 20:48:38 atml_impl:23] run_id: 20200619204838
[I 200619 20:48:38 atml_impl:24] run_dir: C:\Users\Admin\nni\runs\20200619204838
[I 200619 20:48:38 atml_impl:25] with_default: True
[I 200619 20:48:38 model_repo_manager:16] with_default: True
[D 200619 20:48:38 nni_exec_manager:23] Default NNI Execution Config: {'base_port': '8080', 'concurrency': '8', 'max_duration': '30s', 'max_trials': '99999', 'training_platform': 'local', 'tuner': 'TPE', 'optimize_mode': 'maximize'}


In [16]:
auto_classifier.model_manager.repo

{'sklearn.ensemble.forest.RandomForestClassifier': {'bootstrap': {'_type': 'choice',
   '_value': [True, False]},
  'max_features': {'_type': 'choice', '_value': ['auto', 'sqrt']},
  'max_depth': {'_type': 'choice',
   '_value': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 'None']},
  'n_estimators': {'_type': 'choice',
   '_value': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
  'min_samples_leaf': {'_type': 'choice', '_value': [1, 2, 4]},
  'min_samples_split': {'_type': 'choice', '_value': [2, 5, 10]}}}

In [17]:
# Optional: choose which models to search.
# Import from the public `sklearn.ensemble` package; the private
# `sklearn.ensemble.forest` module path is deprecated in newer scikit-learn
# releases, while the public path exposes the same class on every version.
from sklearn.ensemble import RandomForestClassifier
auto_classifier.choose_models([RandomForestClassifier()])

In [18]:
# Optional
auto_classifier.set_model_search_param(RandomForestClassifier(), 'max_depth', [2, 4, 8])

In [19]:
auto_classifier.model_manager.repo

{'sklearn.ensemble.forest.RandomForestClassifier': {'bootstrap': {'_type': 'choice',
   '_value': [True, False]},
  'max_features': {'_type': 'choice', '_value': ['auto', 'sqrt']},
  'max_depth': {'_type': 'choice', '_value': [2, 4, 8]},
  'n_estimators': {'_type': 'choice',
   '_value': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
  'min_samples_leaf': {'_type': 'choice', '_value': [1, 2, 4]},
  'min_samples_split': {'_type': 'choice', '_value': [2, 5, 10]}}}

In [20]:
auto_classifier.run(X, y)

[I 200619 20:48:49 nni_exec_manager:50] Starting Hyperparameter Tuning with sklearn.ensemble.forest.RandomForestClassifier
[I 200619 20:49:00 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Started


Experiment finished. 6 seconds taken. Number of trials succeeded: 8


[I 200619 20:49:42 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Stopped


In [21]:
trials = auto_classifier.get_all_trials()
trials

Unnamed: 0,RUN_ID,EXP_ID,TRIAL_ID,MODEL_KEY,METRIC,MODEL,PARAMS,EXCEPTION
0,20200619204838,jSTusAbz,D3cxR,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 4, 'max_featu...",
1,20200619204838,jSTusAbz,OcY4F,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 8, 'max_featu...",
2,20200619204838,jSTusAbz,P7ozR,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 4, 'max_featu...",
3,20200619204838,jSTusAbz,klRFK,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 2, 'max_featu...",
4,20200619204838,jSTusAbz,qc4oR,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 2, 'max_featu...",
5,20200619204838,jSTusAbz,Omgm3,sklearn.ensemble.forest.RandomForestClassifier,0.675676,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 4, 'max_feat...",
6,20200619204838,jSTusAbz,RBKmy,sklearn.ensemble.forest.RandomForestClassifier,0.648649,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 8, 'max_feat...",
7,20200619204838,jSTusAbz,ywFvc,sklearn.ensemble.forest.RandomForestClassifier,0.648649,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 8, 'max_feat...",


In [22]:
auto_classifier.get_best_model_details()

RUN_ID                                          20200619204838
EXP_ID                                                jSTusAbz
TRIAL_ID                                                 D3cxR
MODEL_KEY       sklearn.ensemble.forest.RandomForestClassifier
METRIC                                                0.702703
MODEL        (DecisionTreeClassifier(class_weight=None, cri...
PARAMS       {'bootstrap': True, 'max_depth': 4, 'max_featu...
EXCEPTION                                                 None
Name: 0, dtype: object

# 4. Customized Run - Parallel Run

In [23]:
auto_classifier = AtmlOrchestrator()

[I 200619 20:49:47 atml_impl:23] run_id: 20200619204947
[I 200619 20:49:47 atml_impl:24] run_dir: C:\Users\Admin\nni\runs\20200619204947
[I 200619 20:49:47 atml_impl:25] with_default: True
[I 200619 20:49:47 model_repo_manager:16] with_default: True
[D 200619 20:49:47 nni_exec_manager:23] Default NNI Execution Config: {'base_port': '8080', 'concurrency': '8', 'max_duration': '30s', 'max_trials': '99999', 'training_platform': 'local', 'tuner': 'TPE', 'optimize_mode': 'maximize'}


In [24]:
auto_classifier.get_available_models_and_spaces()

{'sklearn.ensemble.forest.RandomForestClassifier': {'bootstrap': {'_type': 'choice',
   '_value': [True, False]},
  'max_features': {'_type': 'choice', '_value': ['auto', 'sqrt']},
  'max_depth': {'_type': 'choice',
   '_value': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 'None']},
  'n_estimators': {'_type': 'choice',
   '_value': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
  'min_samples_leaf': {'_type': 'choice', '_value': [1, 2, 4]},
  'min_samples_split': {'_type': 'choice', '_value': [2, 5, 10]}}}

In [25]:
auto_classifier.choose_models([RandomForestClassifier()])

In [26]:
# Optional to run in parallel
auto_classifier.set_exec_config_param('concurrency', '3')

In [27]:
auto_classifier.nni_executor.exec_config

{'base_port': '8080',
 'concurrency': '3',
 'max_duration': '30s',
 'max_trials': '99999',
 'training_platform': 'local',
 'tuner': 'TPE',
 'optimize_mode': 'maximize'}

In [28]:
auto_classifier.run(X, y)

[I 200619 20:49:51 nni_exec_manager:50] Starting Hyperparameter Tuning with sklearn.ensemble.forest.RandomForestClassifier
[I 200619 20:50:02 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Started


Experiment finished. 6 seconds taken. Number of trials succeeded: 3


[I 200619 20:50:43 nni_exec_manager:100] http://localhost:8080 for sklearn.ensemble.forest.RandomForestClassifier Stopped


In [29]:
trials = auto_classifier.get_all_trials()
trials

Unnamed: 0,RUN_ID,EXP_ID,TRIAL_ID,MODEL_KEY,METRIC,MODEL,PARAMS,EXCEPTION
0,20200619204947,uSZJT1cQ,dVWhk,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 50, 'max_fea...",
1,20200619204947,uSZJT1cQ,dq3wD,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 40, 'max_feat...",
2,20200619204947,uSZJT1cQ,pXWpy,sklearn.ensemble.forest.RandomForestClassifier,0.702703,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 70, 'max_feat...",
3,20200619204947,uSZJT1cQ,diTdo,sklearn.ensemble.forest.RandomForestClassifier,0.675676,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 100, 'max_fe...",
4,20200619204947,uSZJT1cQ,gmTwe,sklearn.ensemble.forest.RandomForestClassifier,0.648649,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': True, 'max_depth': 10, 'max_feat...",
5,20200619204947,uSZJT1cQ,hJfQe,sklearn.ensemble.forest.RandomForestClassifier,0.621622,"(DecisionTreeClassifier(class_weight=None, cri...","{'bootstrap': False, 'max_depth': 70, 'max_fea...",


In [30]:
auto_classifier.get_best_model_details()['MODEL']

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=50, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)