# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [2]:
import logging
import os 
import csv 

import numpy as np 
import pandas as pd 
import pkg_resources
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory

from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import Webservice, AciWebservice
from azureml.core.model import Model

from azureml.pipeline.steps import AutoMLStep
from azureml.widgets import RunDetails

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.44.0


## Dataset

### Overview
The dataset I will be using for this project is the Top Hits Spotify from 2000-2019 dataset taken from kaggle. The goal of this project will be to predict song popularity given the available features. Popularity is measured on a continuous scale, thus this will be a **regression** problem. 


The code below will download the dataset from my github repository and create a data asset. 

In [3]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'udacity-capstone'

experiment=Experiment(ws, experiment_name)

In [7]:
found = False
key = "spotify-songs-dataset"
description_text = "Top Hits Spotify from 2000-2019"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key] 

if not found:
        # Create AML Dataset and register it into Workspace
        data = 'https://raw.githubusercontent.com/ash-mohan/azureMLCapstone/main/starter_file/data/songs_normalize.csv'
        dataset = Dataset.Tabular.from_delimited_files(data)        
        #Register Dataset in Workspace
        dataset = dataset.register(workspace=ws,
                                   name=key,
                                   description=description_text)


df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,228748.1245,2009.494,59.8725,0.667437,0.720366,5.378,-5.512435,0.5535,0.103568,0.128955,0.015226,0.181216,0.55169,120.122558
std,39136.569008,5.85996,21.335577,0.140416,0.152745,3.615059,1.933482,0.497254,0.096159,0.173346,0.087771,0.140669,0.220864,26.967112
min,113000.0,1998.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,203580.0,2004.0,56.0,0.581,0.622,2.0,-6.49025,0.0,0.0396,0.014,0.0,0.0881,0.38675,98.98575
50%,223279.5,2010.0,65.5,0.676,0.736,6.0,-5.285,1.0,0.05985,0.0557,0.0,0.124,0.5575,120.0215
75%,248133.0,2015.0,73.0,0.764,0.839,8.0,-4.16775,1.0,0.129,0.17625,6.8e-05,0.241,0.73,134.2655
max,484146.0,2020.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851


In [12]:
df.info()

print(f"\n\nRows: {df.shape[0]} \nColumns: {df.shape[1]}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist            2000 non-null   object 
 1   song              2000 non-null   object 
 2   duration_ms       2000 non-null   int64  
 3   explicit          2000 non-null   bool   
 4   year              2000 non-null   int64  
 5   popularity        2000 non-null   int64  
 6   danceability      2000 non-null   float64
 7   energy            2000 non-null   float64
 8   key               2000 non-null   int64  
 9   loudness          2000 non-null   float64
 10  mode              2000 non-null   int64  
 11  speechiness       2000 non-null   float64
 12  acousticness      2000 non-null   float64
 13  instrumentalness  2000 non-null   float64
 14  liveness          2000 non-null   float64
 15  valence           2000 non-null   float64
 16  tempo             2000 non-null   float64


We can see that there are no missing values , so we will not have to drop nulls. However, this data is meant to be from 2000-2019 and we can see from the output above there are some outliers outside of this range. We will drop these rows along with any duplicate rows. 

In [44]:
new_df = df[df['year'].between(2000, 2019)].drop_duplicates()
new_df.describe()

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0,1899.0
mean,228123.52554,2009.720379,59.558715,0.667637,0.721256,5.379147,-5.503538,0.553976,0.104413,0.127722,0.015525,0.181934,0.552579,120.115425
std,39116.687604,5.725938,21.683909,0.140547,0.152511,3.610882,1.931067,0.497209,0.096523,0.172951,0.089036,0.140827,0.220361,26.986107
min,113000.0,2000.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,203273.0,2005.0,56.0,0.581,0.6245,2.0,-6.48,0.0,0.0399,0.01355,0.0,0.08875,0.3895,98.605
50%,222920.0,2010.0,65.0,0.676,0.738,6.0,-5.279,1.0,0.0613,0.0553,0.0,0.125,0.559,120.028
75%,247086.0,2015.0,73.0,0.765,0.839,8.0,-4.167,1.0,0.1305,0.175,6.4e-05,0.242,0.7305,134.1245
max,484146.0,2019.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851


In [20]:
print(f"Rows: {new_df.shape[0]} \nColumns: {new_df.shape[1]}")

Rows: 1899 
Columns: 18


In [9]:
# only 2000 data points so test size will be small 
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

print(f"Training Data: {train_data.shape[0]} entries")
print(f"Testing Data: {test_data.shape[0]} entries")

Training Data: 1800 entries
Testing Data: 200 entries


In [11]:
# save csv file and upload to datastore
pd.DataFrame(train_data).to_csv("data/train_dataset.csv", index=False)
pd.DataFrame(test_data).to_csv("data/test_dataset.csv", index=False)
datastore = ws.get_default_datastore()
datastore.upload(src_dir='./data', target_path='spotify_exp', overwrite=True, show_progress=True)

training_data = TabularDatasetFactory.from_delimited_files(path=datastore.path('spotify_exp/train_dataset.csv'))
testing_data = TabularDatasetFactory.from_delimited_files(path=datastore.path('spotify_exp/test_dataset.csv'))


Uploading an estimated of 5 files
Uploading ./data/.amlignore
Uploaded ./data/.amlignore, 1 files out of an estimated total of 5
Uploading ./data/.amlignore.amltmp
Uploaded ./data/.amlignore.amltmp, 2 files out of an estimated total of 5
Uploading ./data/songs_normalize.csv
Uploaded ./data/songs_normalize.csv, 3 files out of an estimated total of 5
Uploading ./data/test_dataset.csv
Uploaded ./data/test_dataset.csv, 4 files out of an estimated total of 5
Uploading ./data/train_dataset.csv
Uploaded ./data/train_dataset.csv, 5 files out of an estimated total of 5
Uploaded 5 files
Uploading an estimated of 5 files
Uploading ./data/.amlignore
Uploaded ./data/.amlignore, 1 files out of an estimated total of 5
Uploading ./data/.amlignore.amltmp
Uploaded ./data/.amlignore.amltmp, 2 files out of an estimated total of 5
Uploading ./data/songs_normalize.csv
Uploaded ./data/songs_normalize.csv, 3 files out of an estimated total of 5
Uploading ./data/test_dataset.csv
Uploaded ./data/test_dataset.cs

## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In order to ensure a reasonable training time, I set experiment timeout to 15 minutes. The dataset I am using is relatively small, so longer training times may not give better results. Additonally, I set the maximum current iterations to 5 to take advantage of the hardware and stay within the 15 minute bound.  Early stopping is also enabled to make the training process much more efficient. Given the small size of my dataset, I chose a larger value for cross validations to ensure the best possible results. Featurization is set to auto to take advantage of auto ML capabilities. 

In [16]:
from azureml.train.automl.utilities import get_primary_metrics

get_primary_metrics("regression")

['spearman_correlation',
 'normalized_root_mean_squared_error',
 'normalized_mean_absolute_error',
 'r2_score']

In [17]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

amlcompute_cluster_name = "auto-ml"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS3_v2',# for GPU, use "STANDARD_NC6"
                                                           #vm_priority = 'lowpriority', # optional
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count = 1, timeout_in_minutes = 3)

InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded....................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


In [18]:
# TODO: Put your automl settings here
automl_settings = {
    "n_cross_validations": 5,
    "experiment_timeout_hours": .25,
    "max_concurrent_iterations": 5,
    "primary_metric" : 'normalized_root_mean_squared_error'
}

# TODO: Put your automl config here
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "regression",
                             training_data=training_data,
                             label_column_name="popularity",
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

In [19]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-capstone,AutoML_b151aac2-26f8-4636-95a9-681b5fdd4658,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [20]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [21]:
# Retrieve best model from Pipeline Run
best_run, fitted_model = remote_run.get_output()

Package:azureml-automl-runtime, training version:1.46.1, current version:1.44.0
Package:azureml-core, training version:1.46.0, current version:1.44.0
Package:azureml-dataprep, training version:4.5.7, current version:4.2.2
Package:azureml-dataprep-rslex, training version:2.11.4, current version:2.8.1
Package:azureml-dataset-runtime, training version:1.46.0, current version:1.44.0
Package:azureml-defaults, training version:1.46.0, current version:1.44.0
Package:azureml-interpret, training version:1.46.0, current version:1.44.0
Package:azureml-mlflow, training version:1.46.0, current version:1.44.0
Package:azureml-pipeline-core, training version:1.46.0, current version:1.44.0
Package:azureml-responsibleai, training version:1.46.0, current version:1.44.0
Package:azureml-telemetry, training version:1.46.0, current version:1.44.0
Package:azureml-train-automl-client, training version:1.46.0, current version:1.44.0
Package:azureml-train-automl-runtime, training version:1.46.1, current version:

In [22]:
print(best_run)
print(best_run.properties["score"])

Run(Experiment: udacity-capstone,
Id: AutoML_b151aac2-26f8-4636-95a9-681b5fdd4658_21,
Type: azureml.scriptrun,
Status: Completed)
0.23376243594049892


## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [23]:
# Save the best model
best_model = remote_run.register_model(model_name='spotify-popularity-model')

best_model.download(target_dir="outputs", exist_ok=True)

'outputs/model.pkl'

In [24]:
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=2,
    description="Get popularity score for spotify song",
)

# save the environment
myenv = Environment.get(workspace=ws, name="AzureML-AutoML")
myenv.save_to_directory('env', overwrite=True)

inference_config = InferenceConfig(entry_script="score.py", environment=myenv)

In [25]:
# Use configs and models generated above
service=Model.deploy(workspace=ws,
                    name="modelwebservice",
                    models=[best_model],
                    inference_config=inference_config,
                    deployment_config=aciconfig)
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-11-07 04:40:20+00:00 Creating Container Registry if not exists.
2022-11-07 04:40:20+00:00 Registering the environment.
2022-11-07 04:40:21+00:00 Use the existing image.
2022-11-07 04:40:21+00:00 Submitting deployment to compute.
2022-11-07 04:40:25+00:00 Checking the status of deployment modelwebservice..
2022-11-07 04:42:39+00:00 Checking the status of inference endpoint modelwebservice.
Succeeded
ACI service creation operation finished, operation "Succeeded"


TODO: In the cell below, send a request to the web service you deployed to test it.

In [26]:
if service.state == "Healthy":
    X_test = test_data.drop("popularity", axis=1)
    # Serialize the first row of the test data into json
    X_test_json = X_test[:1].to_json(orient="records")
    print(f"Data: {X_test_json}")
    # Call the service to get the predictions and the engineered and raw explanations
    output = service.run(X_test_json)
    # Print the predicted value
    print(f"\nPrediction: {output['predictions']}")

Data: [{"artist":"Charlie Puth","song":"How Long","duration_ms":200853,"explicit":false,"year":2018,"danceability":0.845,"energy":0.561,"key":1,"loudness":-5.253,"mode":0,"speechiness":0.0778,"acousticness":0.211,"instrumentalness":0.00000349,"liveness":0.0383,"valence":0.811,"tempo":109.974,"genre":"pop"}]

Prediction: [65.6844329837312]


In [46]:
import json 

test_data = testing_data.to_pandas_dataframe().drop("popularity", axis=1) 
samples = test_data[:2].to_json(orient="records")
samples

'[{"artist":"Charlie Puth","song":"How Long","duration_ms":200853,"explicit":false,"year":2018,"danceability":0.845,"energy":0.561,"key":1,"loudness":-5.253,"mode":0,"speechiness":0.0778,"acousticness":0.211,"instrumentalness":0.00000349,"liveness":0.0383,"valence":0.811,"tempo":109.974,"genre":"pop"},{"artist":"50 Cent","song":"21 Questions","duration_ms":224440,"explicit":true,"year":2003,"danceability":0.646,"energy":0.813,"key":6,"loudness":-3.846,"mode":0,"speechiness":0.299,"acousticness":0.349,"instrumentalness":0.0000937,"liveness":0.0427,"valence":0.895,"tempo":92.729,"genre":"hip hop, pop"}]'

In [48]:
import requests
scoring_uri = service.scoring_uri

# create request 
headers = {'Content-Type':'application/json'}

# uncomment if auth is enabled 
#headers["Authorization"] = f"Bearer {key}"

response = requests.post(scoring_uri,samples,headers=headers)
print(response.text)

{"predictions": [65.6844329837312, 63.63655269426005]}


TODO: In the cell below, print the logs of the web service and delete the service

In [129]:
# view service logs 
service.get_logs()



In [140]:
service.delete()
compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"



**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
