In [3]:
from azureml.widgets import RunDetails
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.train.automl import AutoMLConfig
from azureml.train import automl
from azureml.core import Experiment
from datetime import datetime
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset
import azureml.core
import logging
import time
import os
import datetime as dt


# Report which azureml-core SDK version this notebook is running against.
print("SDK version: {0}".format(azureml.core.VERSION))

# Connect to the workspace described by the default config.json file.
ws = Workspace.from_config()

# Fetch the registered "hospitalization" dataset from the workspace.
registered_datasets = ws.datasets
aml_dataset = registered_datasets['hospitalization']

# Materialize the dataset as a pandas DataFrame purely for inspection:
# first a peek at the leading rows, then the summary statistics.
# NOTE: only the last expression of a notebook cell is rendered, so these
# calls display nothing unless they end the cell or are wrapped in print().
full_df = aml_dataset.to_pandas_dataframe()
full_df.head(5)
full_df.describe()

# Columns that are not used for training. One name per line so the list can be
# reviewed and diffed; the original single-line list contained
# 'total_cases_per_million' twice, which is de-duplicated here.
unused_columns = [
    'iso_code',
    'new_deaths_smoothed',
    'total_cases_per_million',
    'new_cases_per_million',
    'new_cases_smoothed_per_million',
    'total_deaths_per_million',
    'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
    'icu_patients',
    'icu_patients_per_million',
    'hosp_patients',
    'hosp_patients_per_million',
    'weekly_icu_admissions',
    'weekly_icu_admissions_per_million',
    'weekly_hosp_admissions',
    'weekly_hosp_admissions_per_million',
    'total_tests_per_thousand',
    'new_tests_per_thousand',
    'new_tests_smoothed',
    'new_tests_smoothed_per_thousand',
    'tests_per_case',
    'new_vaccinations_smoothed',
    'total_vaccinations_per_hundred',
    'new_vaccinations_smoothed_per_million',
]

# Drop the unused columns on the dataset itself, so the trimmed dataset is
# what gets split and handed to AutoML further down.
aml_dataset = aml_dataset.drop_columns(unused_columns)

# Re-materialize as pandas to check the remaining schema and statistics.
full_df = aml_dataset.to_pandas_dataframe()
full_df.describe()

# Split using Azure Tabular Datasets (Better for Remote Compute)
# https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py#random-split-percentage--seed-none-

# 90% of the rows go to training, the remainder to test; the fixed seed keeps
# the split reproducible between runs.
train_dataset, test_dataset = aml_dataset.random_split(0.9, seed=1)

# Use pandas DataFrames only to check the data.
train_dataset_df = train_dataset.to_pandas_dataframe()
# This assignment was commented out although the next statement reads
# test_dataset_df, which raised NameError on a fresh kernel.
test_dataset_df = test_dataset.to_pandas_dataframe()

# Fill missing values in the test frame with the per-column mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises if the frame has any non-numeric column.
# NOTE(review): despite its name, y_test holds the whole test frame, not just
# the label column - confirm that this is intended.
y_test = test_dataset_df.fillna(test_dataset_df.mean(numeric_only=True))
print(train_dataset_df.describe())
# print(test_dataset_df.describe())

# Enumerate the compute targets registered in the workspace (the result is
# only displayed when this is the last expression of a notebook cell).
ComputeTarget.list(ws)

# Remote compute target used for the AutoML run.
# Further docs on Remote Compute Target: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-remote

# Name of the training cluster to reuse or create.
amlcompute_cluster_name = "computeCluster"

# The cluster counts as "found" only if a target with this name exists in the
# workspace AND it is of type AmlCompute.
cts = ws.compute_targets
found = amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute'

if found:
    print('Found existing training cluster.')
    # Reuse the existing cluster. Equivalent alternative:
    # aml_remote_compute = ComputeTarget(ws, amlcompute_cluster_name)
    aml_remote_compute = cts[amlcompute_cluster_name]
else:
    print('Creating a new training cluster...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D12_V2",
        # vm_priority='lowpriority',  # optional
        max_nodes=20,
    )
    # Create the cluster.
    aml_remote_compute = ComputeTarget.create(
        ws,
        amlcompute_cluster_name,
        provisioning_config,
    )

print('Checking cluster status...')
# Wait for provisioning to finish, for at most 20 minutes. min_node_count=0
# means we do not wait for any node to actually be up; if it were omitted the
# cluster's own scale settings would be used.
aml_remote_compute.wait_for_completion(
    show_output=True,
    min_node_count=0,
    timeout_in_minutes=20,
)

# Additional details of the current AmlCompute status.
aml_remote_compute.get_status().serialize()

# Primary metrics are documented at:
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#primary-metric

# Query the metrics that are valid for a regression task.
automl.utilities.get_primary_metrics('regression')

# Local folder passed to AutoML as its project path.
project_folder = './automl'
os.makedirs(project_folder, exist_ok=True)

# Column the regression model is trained to predict.
target_column_name = "new_deaths"

# All AutoML settings in one place. This is a regression task optimised for
# normalized_root_mean_squared_error (an error metric, so lower is better),
# validated with 5-fold cross validation, and capped at 15 minutes of
# experiment time with early stopping enabled.
# Options that were tried and left disabled: a list of models to exclude
# (NOTE(review): the original spelled it "blacked_models" - check the
# AutoMLConfig docs for the exact parameter name) and
# iteration_timeout_minutes=5.
automl_settings = {
    "compute_target": aml_remote_compute,
    "task": 'regression',
    "primary_metric": 'normalized_root_mean_squared_error',
    "experiment_timeout_minutes": 15,
    "training_data": train_dataset,
    "label_column_name": target_column_name,
    "n_cross_validations": 5,
    "enable_early_stopping": True,
    "featurization": 'auto',
    "debug_log": 'automated_ml_errors.log',
    "verbosity": logging.INFO,
    "path": project_folder,
}

automl_config = AutoMLConfig(**automl_settings)

# PaperCut?: Why is drop_column_names only supported by Time Series Forecast? - If used for classification, you get:
# drop_column_names= ['EmployeeCount','EmployeeNumber','Over18','StandardHours'], # Clean up dataset by dropping not needed columns
# WARNING - Received unrecognized parameter: drop_column_names ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
# In documentation it doesn't state that it is only supported for Forecast...:
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py

# Explanation of Settings: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train#configure-your-experiment-settings

# AutoMLConfig info on: 
# https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig

# Build an experiment name that embeds the current date and hour, so runs
# submitted within the same hour are grouped under one experiment.
# NOTE(review): the "classif-" prefix is kept so existing experiment names
# stay stable, although the task configured here is a regression.
now = datetime.now()
time_string = now.strftime("%m-%d-%Y-%H")
experiment_name = "classif-automl-remote-{0}".format(time_string)
print(experiment_name)

experiment = Experiment(workspace=ws, name=experiment_name)

# Wall-clock timing measured from this client, around the submit call.
start_time = time.time()

run = experiment.submit(automl_config, show_output=True)

print('Manual run timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % (time.time() - start_time))

RunDetails(run).show()
run_details = run.get_details()

# Service-side timing taken from the run's own start/end timestamps.
# Like: 2020-01-12T23:11:56.292703Z
# Drop the fractional seconds, and also a trailing "Z" in case a timestamp
# has no fractional part (split(".") alone would leave the "Z" and make
# strptime fail).
utc_format = "%Y-%m-%dT%H:%M:%S"
end_time_utc_str = run_details['endTimeUtc'].split(".")[0].rstrip("Z")
start_time_utc_str = run_details['startTimeUtc'].split(".")[0].rstrip("Z")

# The timestamps are UTC, so attach the UTC timezone before converting to
# epoch seconds. The previous time.mktime() call interpreted them as local
# time, which skews the difference by an hour whenever the run spans a local
# daylight-saving change.
timestamp_end = datetime.strptime(end_time_utc_str, utc_format).replace(tzinfo=dt.timezone.utc).timestamp()
timestamp_start = datetime.strptime(start_time_utc_str, utc_format).replace(tzinfo=dt.timezone.utc).timestamp()

parent_run_time = timestamp_end - timestamp_start
print('Run Timing: --- %s seconds needed for running the whole Remote AutoML Experiment ---' % (parent_run_time))

# Retrieve the best child run together with its fitted model.
best_run, fitted_model = run.get_output()
print(best_run)
print(fitted_model)


#Test Model
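# A TabularDataset has no .pop() (see the AttributeError recorded in the cell output),
# so convert it to a pandas DataFrame before separating the label column.
# X_test = test_dataset.to_pandas_dataframe()
# y_test = X_test.pop(target_column_name).values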

# X_test.head()

# # forecast returns the predictions and the featurized data, aligned to X_test.
# # This contains the assumptions that were made in the forecast
# y_predictions, X_trans = fitted_model.forecast(X_test)

# assign_dict = {'predicted': y_predictions, target_column_name: y_test}
# df_all = X_test.assign(**assign_dict)

# from azureml.automl.core.shared import constants
# from azureml.automl.runtime.shared.score import scoring
# from matplotlib import pyplot as plt

# # use automl scoring module
# scores = scoring.score_regression(
#     y_test=df_all[target_column_name],
#     y_pred=df_all['predicted'],
#     metrics=list(constants.Metric.SCALAR_REGRESSION_SET))

# print("[Test data scores]\n")
# for key, value in scores.items():    
#     print('{}:   {:.3f}'.format(key, value))
    
# # Plot outputs
# %matplotlib inline
# test_pred = plt.scatter(df_all[target_column_name], df_all['predicted'], color='b')
# test_test = plt.scatter(df_all[target_column_name], df_all[target_column_name], color='g')
# plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)
# plt.show()


SDK version: 1.20.0
       total_cases  new_cases  new_cases_smoothed  total_deaths  new_deaths  \
count     55664.00   55657.00            54791.00      47982.00    47981.00   
mean     317581.98    3175.08             3158.88      10168.30       80.05   
std     3102752.79   27972.62            27467.01      80867.91      592.13   
min           1.00  -46076.00             -838.71          1.00    -1918.00   
25%         509.00       1.00                4.57         28.00        0.00   
50%        5375.50      44.00               53.14        164.00        1.00   
75%       54945.50     497.00              528.64       1495.75       11.00   
max    99722272.00  857852.00           738158.86    2140085.00    17817.00   

       reproduction_rate  stringency_index    population  population_density  \
count           45213.00          50521.00      55875.00            54709.00   
mean                1.03             59.12   90976911.54              324.16   
std                 0.37    

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Run Timing: --- 1647.0 seconds needed for running the whole Remote AutoML Experiment ---
Run(Experiment: classif-automl-remote-01-27-2021-14,
Id: AutoML_60f83bdf-ea76-4639-b753-dfe57851b363_5,
Type: azureml.scriptrun,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                        

AttributeError: 'TabularDataset' object has no attribute 'pop'