In [1]:
import logging

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig

In [2]:
import azureml
from azureml.core import Workspace

# Identity of the Azure ML workspace this notebook works against.
subscription_id = "57c0a109-5ef9-4713-aaf0-415f299efa76"
resource_group = "AutoMLRG2"
workspace_name = "AutoMLPoc"
workspace_region = "eastus"

# Create the workspace, or reuse it when it already exists (exist_ok=True).
# Workspace.create returns the Workspace handle in both cases, so no second
# lookup through the Workspace(...) constructor is needed afterwards.
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

# Persist the subscription id, resource group name, and workspace name to the
# local workspace config file so that Workspace.from_config() can load it.
ws.write_config()

In [3]:
# Load the workspace described by the locally persisted config file.
ws = Workspace.from_config()

# Choose a name for the experiment and specify the project folder.
experiment_name = 'automl-local-classification'
project_folder = './sample_projects/automl-local-classification'

experiment = Experiment(ws, experiment_name)

# Collect the run context (SDK version, workspace identity, experiment) so it
# can be displayed as a small one-column table below.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace Name': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
    'Experiment Name': experiment.name,
}

# Disable column-width truncation so long values are shown in full.
# pandas >= 1.0 expects None for "no limit" (a negative value is deprecated
# and rejected by pandas 2.x); releases before 1.0 only accept an int, so
# fall back to the legacy -1 sentinel there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

outputDf = pd.DataFrame(data=output, index=[''])
# Transpose so each setting is rendered on its own row.
outputDf.T

In [4]:
# Opt in to Azure ML diagnostics collection by calling
# set_diagnostics_collection with send_diagnostics=True.
# NOTE(review): this sends diagnostic data to the service - confirm that is
# acceptable for the environment this notebook runs in.
from azureml.telemetry import set_diagnostics_collection
set_diagnostics_collection(send_diagnostics = True)

In [5]:
import os
import time
import datetime
import sys
import io
from pyspark.sql import *
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.types import DecimalType, StringType, DateType, DoubleType, IntegerType
from pyspark.sql.functions import udf
# NOTE: importing `round` from pyspark.sql.functions shadows the Python builtin
# `round` for the rest of the notebook.
from pyspark.sql.functions import lit, col, expr, when, round, concat
from pyspark.sql.functions import from_unixtime, unix_timestamp
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Public location of Bunch; the private `sklearn.datasets.base` module is not
# available in newer scikit-learn releases.
from sklearn.utils import Bunch

# The Cosmos DB master key must never be stored in the notebook itself.
# It is read from the COSMOSDB_MASTER_KEY environment variable instead.
# NOTE(review): the key that used to be embedded in this cell should be
# regenerated, since it has been exposed wherever the notebook was shared.
cosmosdb_master_key = os.environ.get("COSMOSDB_MASTER_KEY")
if not cosmosdb_master_key:
    raise RuntimeError(
        "Set the COSMOSDB_MASTER_KEY environment variable to the Cosmos DB "
        "account key before running this cell."
    )

# Options passed to the azure-cosmosdb-spark reader; the custom query limits
# the read to documents whose `datatype` is 'training'.
readConfig = {
  "Endpoint" : "https://automlpoc.documents.azure.com:443/",
  "Masterkey" : cosmosdb_master_key,
  "Database" : "AutoMlPoc",
  "Collection" : "AutoMLData",
  "query_pagesize" : "2147483647",
  "query_custom" : "select * from AutoMLData where AutoMLData.datatype = 'training'"
}

# Connect via azure-cosmosdb-spark to create Spark DataFrame.
# `spark` is expected to be provided by the notebook environment.
flights = spark.read.format("com.microsoft.azure.cosmosdb.spark").options(**readConfig).load()
data = flights.select('data').collect()
target = flights.select('target').collect()

# Fail with a clear message instead of an IndexError when nothing matched.
if not data or not target:
    raise ValueError(
        "The Cosmos DB query returned no documents; cannot build the training set."
    )

# Only the first returned document is used: its `data` field becomes the
# feature array and its `target` field the label array.
cosmosdbData = Bunch(data=np.asarray(data[0][0]), target=np.asarray(target[0][0]))

# Train on rows from index 100 onwards (the feature array must be 2-D).
# NOTE(review): presumably the first 100 rows are held out for evaluation - confirm.
X_train = cosmosdbData.data[100:,:]
y_train = cosmosdbData.target[100:]


In [6]:
# Settings for the AutoML classification experiment, gathered in one mapping
# and unpacked into AutoMLConfig as keyword arguments.
automl_settings = {
    'task': 'classification',
    'debug_log': 'automl_errors.log',
    'primary_metric': 'AUC_weighted',
    'iteration_timeout_minutes': 60,
    'iterations': 20,
    'n_cross_validations': 3,
    'verbosity': logging.INFO,
    # Training features and labels prepared in the data-loading cell.
    'X': X_train,
    'y': y_train,
    'path': project_folder,
}

automl_config = AutoMLConfig(**automl_settings)

In [7]:
# Submit the AutoML configuration to the experiment and keep the returned run
# handle; show_output=True is passed so progress is reported while it runs.
local_run = experiment.submit(automl_config, show_output = True)
# Bare expression so the notebook displays the run object as the cell output.
local_run