In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter(action='ignore')

#### Dev Intro to Data Science

* 👨🏻‍🏫 [youtube playlist](https://www.youtube.com/playlist?list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE)
* 👨🏻‍💻 [github](https://github.com/microsoft/c9-dev-intro-data-science/?WT.mc_id=DevIntroDS-Ch9-Lazzeri)

Videos

1. [Intro to the Developer's Intro to DS Video Series](https://www.youtube.com/watch?v=cUHXjTdMdYc&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=2)
1. [What is the DS Lifecycle?](https://www.youtube.com/watch?v=LeEj3S4Okao&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=3)
1. [How do you define your business goal and scope of your data science solution?](https://www.youtube.com/watch?v=mft6VuRv8q8&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=3)
    * predict how many bikes will be rented in the next hour
1. [What is ML?](https://www.youtube.com/watch?v=jX2hSjiI5Y0&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=4)
1. [Which ML Algorithm Should You Use?](https://www.youtube.com/watch?v=iFyRujSaEmw&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=5)
    * [cheatsheet](https://www.aka.ms/AlgorithmCheatSheet)
1. [What is AutoML?](https://www.youtube.com/watch?v=lrv9btrzDY8&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=6)
    * [AutomatedML](https://www.aka.ms/AutomatedML)
    * [AutoMLConfig-Class](https://www.aka.ms/AutoMLConfig-Class)
1. [How do you create a machine learning resource in Azure?](https://www.youtube.com/watch?v=c1MIP4zbdto&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=7)
1. [How do you setup your local environment for data exploration?](https://www.youtube.com/watch?v=5E3WMb8_T3s&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=8)
1. [How do Jupyter notebooks work in Visual Studio Code?](https://www.youtube.com/watch?v=ilFYqD2SR4k&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=9)
1. [Connect your ML resources to your local VSCode environment?](https://www.youtube.com/watch?v=tgz3uxxbj4I&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=10)
    * this is where the code really starts
1. [How do you prepare your data for time series forecast?](https://www.youtube.com/watch?v=N55wemye7z0&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=11)
1. [Why do you split data into testing and training data in DS?](https://www.youtube.com/watch?v=_vdMKioCXqQ&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=12)
1. [What is an AutoML Config file?](https://www.youtube.com/watch?v=ghzWYLdwW6c&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=13)
1. [What should your parameters be when creating an AutoML Config file?](https://www.youtube.com/watch?v=g5daLl3w0Tk&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=14)
    * Primary metric: normalized RMSE
    * Blacklisted models: Extreme Random Trees (`ExtremeRandomTrees`)
1. [How do you create an AutoML Config file & run your DS experiments on the cloud?](https://www.youtube.com/watch?v=JbFM8N4gLOg&list=PLlrxD0HtieHjDop2DtiCmwTTcrlwKAVHE&index=15)

In [11]:
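# One-time environment setup: uncomment and run these lines to install or upgrade the Azure ML SDK.
# !pip install --user azureml-sdk
# !pip install --user --upgrade azureml-sdk[automl,explain]
# !pip install --upgrade --no-deps --user azureml-train-automl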

In [3]:
# Standard library
import logging

# Third-party
import azureml.core
import numpy as np
import pandas as pd

# Report the installed SDK version so results can be tied to a specific release.
print("AzureML SDK Version:", azureml.core.VERSION)

AzureML SDK Version: 1.18.0


In [17]:
from azureml.core import Experiment, Workspace

# Load the workspace via Workspace.from_config(), then create the experiment
# object under which the forecasting runs will be grouped.
experiment_name = "automl_bikeshare_forecast"
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

In [22]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

amlcompute_cluster_name = "cpu-cluster"

# Cluster shape used only when the cluster has to be created.
provisioning_config = AmlCompute.provisioning_configuration(
    vm_size="STANDARD_D2_V2",
    max_nodes=4,
)

# Reuse the cluster when it already exists in the workspace so that re-running
# this cell (e.g. Restart & Run All) does not issue a second provisioning request.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
except ComputeTargetException:
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)

# Block until the cluster is ready, giving up after 20 minutes.
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [35]:
# Source CSV from the Azure MachineLearningNotebooks forecasting sample.
github_csv_url = (
    "https://raw.githubusercontent.com/Azure/MachineLearningNotebooks/master/"
    "how-to-use-azureml/automated-machine-learning/forecasting-bike-share/bike-no.csv"
)
csv_name = "bike-no.csv"

# Download the data and write a local copy without the pandas index column.
bike_no_df = pd.read_csv(github_csv_url)
bike_no_df.to_csv(csv_name, index=False)

# Push the local copy to the workspace's default datastore under dataset/.
datastore = ws.get_default_datastore()
datastore.upload_files(
    files=[csv_name],
    target_path="dataset/",
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading bike-no.csv
Uploaded bike-no.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_9bbc3f0101dd4db4a367d57db6644681

In [7]:
from datetime import datetime

from azureml.core import Dataset

# Column holding the observation date, and the column to forecast.
time_column_name = "date"
target_column_name = "cnt"

# Expose the uploaded CSV as a tabular dataset and mark which column carries
# the timestamp.
dataset = (
    Dataset.Tabular
    .from_delimited_files(path=[(datastore, "dataset/" + csv_name)])
    .with_timestamp_columns(fine_grain_timestamp=time_column_name)
)

# Preview the first five rows.
dataset.take(5).to_pandas_dataframe().reset_index(drop=True)

Unnamed: 0,instant,date,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,6,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,2,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,3,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [8]:
# Training split: every row dated on or before 2012-08-31
# (the original note targets roughly 75% of the data for training).
train = dataset.time_before(
    end_time=datetime(year=2012, month=8, day=31),
    include_boundary=True,
)

# Show the last five training rows to confirm where the split ends.
train.to_pandas_dataframe().tail(5).reset_index(drop=True)

Unnamed: 0,instant,date,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,605,2012-08-27,3,1,8,1,1,0.703333,0.654688,0.730417,0.128733,989,5928,6917
1,606,2012-08-28,3,1,8,2,1,0.728333,0.66605,0.62,0.190925,935,6105,7040
2,607,2012-08-29,3,1,8,3,1,0.685,0.635733,0.552083,0.112562,1177,6520,7697
3,608,2012-08-30,3,1,8,4,1,0.706667,0.652779,0.590417,0.077117,1172,6541,7713
4,609,2012-08-31,3,1,8,5,1,0.764167,0.6894,0.5875,0.168533,1433,5917,7350


In [9]:
# Test split: every row dated on or after 2012-09-01.
test = dataset.time_after(
    start_time=datetime(year=2012, month=9, day=1),
    include_boundary=True,
)

# Show the first five test rows to confirm where the split begins.
test.to_pandas_dataframe().head(5).reset_index(drop=True)

Unnamed: 0,instant,date,season,yr,mnth,weekday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,610,2012-09-01,3,1,9,6,2,0.753333,0.702654,0.638333,0.113187,2352,3788,6140
1,611,2012-09-02,3,1,9,0,2,0.696667,0.649,0.815,0.064071,2613,3197,5810
2,612,2012-09-03,3,1,9,1,1,0.7075,0.661629,0.790833,0.151121,1965,4069,6034
3,613,2012-09-04,3,1,9,2,1,0.725833,0.686888,0.755,0.236321,867,5997,6864
4,614,2012-09-05,3,1,9,3,1,0.736667,0.708983,0.74125,0.187808,832,6280,7112


In [36]:
from azureml.train.automl import AutoMLConfig

# Forecasting-specific settings, forwarded to AutoMLConfig as keyword arguments.
time_series_settings = {
    'time_column_name': time_column_name,
    'max_horizon': 14,  # forecast 14 periods ahead
    'country_or_region': 'US',  # presumably enables US holiday features; confirm in the SDK docs
    'target_lags': 'auto',
    # 'casual' + 'registered' add up to the 'cnt' target in the previewed rows,
    # so keeping them as features would leak the label.
    'drop_column_names': ['casual', 'registered'],
}

# AutoML forecasting experiment trained on the `train` split, run on the
# remote compute target and scored by normalized RMSE.
automl_config = AutoMLConfig(
    task='forecasting',
    primary_metric='normalized_root_mean_squared_error',
    # `blocked_models` replaces the deprecated `blacklist_models` parameter.
    blocked_models=['ExtremeRandomTrees'],
    experiment_timeout_minutes=30,
    training_data=train,
    label_column_name=target_column_name,
    compute_target=compute_target,
    enable_early_stopping=True,
    n_cross_validations=3,
    max_concurrent_iterations=4,  # matches max_nodes=4 on the cluster
    max_cores_per_iteration=-1,
    verbosity=logging.INFO,
    **time_series_settings
)



In [38]:
# Uncomment to submit the AutoML configuration as a run of the experiment.
# remote_run = experiment.submit(automl_config,show_output=False)
# remote_run