### Data Preparation 

In [1]:
!apt-get install unzip
!wget -O house-pricing.zip https://tinyurl.com/yyn4z2xo 
!unzip -o house-pricing.zip

/bin/sh: apt-get: command not found
--2020-09-24 13:24:14--  https://tinyurl.com/yyn4z2xo
Resolving tinyurl.com (tinyurl.com)... 104.20.138.65, 104.20.139.65, 172.67.1.225, ...
Connecting to tinyurl.com (tinyurl.com)|104.20.138.65|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://vpon-workshop-yianc.s3.us-east-1.amazonaws.com/house-prices-advanced-regression-techniques.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATLORAEYMTX7JY4ER%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T022500Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=b34071b5632bc111755b1930607ae483d5560fa2dcc4b97e458d2659e0fee343 [following]
--2020-09-24 13:24:15--  https://vpon-workshop-yianc.s3.us-east-1.amazonaws.com/house-prices-advanced-regression-techniques.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIATLORAEYMTX7JY4ER%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T022500Z&X-Amz-Expires=604800&X-A

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# create training and testing data set

In [4]:
train_data = df.sample(frac=0.8,random_state=200)
test_data = df.drop(train_data.index)
test_data_no_target = test_data.drop(columns=['SalePrice'])

In [5]:
print('total:{} training:{} testing:{}'.format(df.shape[0], train_data.shape[0], test_data.shape[0]))

total:1460 training:1168 testing:292


# Upload data to s3

In [6]:
import sagemaker
import boto3
from sagemaker import get_execution_role

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-housepricing-prediction'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [7]:
train_data.to_csv('automl-train.csv', index=False, header=True) # Make sure features are comma-separated

In [8]:
train_data_url = session.upload_data(path='automl-train.csv', bucket=bucket, key_prefix=prefix + '/input')
train_data_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/input/automl-train.csv'

In [9]:
test_data.to_csv('test_data.csv', index=False, header=True)
test_data_no_target.to_csv('test_data_no_target.csv', index=False, header=True)

In [10]:
test_data_no_target_url = session.upload_data(path='test_data_no_target.csv', bucket=bucket, key_prefix=prefix + '/test')
test_data_no_target_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/test/test_data_no_target.csv'

In [11]:

!aws s3 ls {train_data_url}

2020-09-24 13:24:37     363963 automl-train.csv


# Configure data location and artifacts

* Select target attribute "SalePrice" to predict fraud or not fraud

In [12]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'SalePrice'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }

input_data_config

[{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
    'S3Uri': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/input'}},
  'TargetAttributeName': 'SalePrice'}]

In [13]:
output_data_config

{'S3OutputPath': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/output'}

In [14]:
automl_job_config = {
#   "CompletionCriteria": {
#     "MaxCandidates": 10,
#     "MaxRuntimePerTrainingJobInSeconds": 10*60
#       "MaxAutoMLJobRuntimeInSeconds": 60*120
#   }
}
automl_job_config

{}

# Create SageMaker Auto-pilot job

In [15]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-housepricing-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=automl_job_config,
                      RoleArn=role)

AutoMLJobName: automl-housepricing-24-13-25-14


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:automl-job/automl-housepricing-24-13-25-14',
 'ResponseMetadata': {'RequestId': 'bf76fb30-5d5d-4e9d-b05c-53e3236bdafa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bf76fb30-5d5d-4e9d-b05c-53e3236bdafa',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '102',
   'date': 'Thu, 24 Sep 2020 13:25:16 GMT'},
  'RetryAttempts': 0}}

# Track Autopolit Job Status

In [16]:
# AnalysingData / FeatureEngineering / ModelTuning
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    job_secondary_status = describe_response['AutoMLJobSecondaryStatus']
    print (job_run_status + " - " + job_secondary_status)
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProg

# Result

In [17]:
import pprint
import json
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']

best_candidate_name = best_candidate['CandidateName']
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

{   'CandidateName': 'tuning-job-1-cc80cb25384d4caba5-240-352a7df8',
    'CandidateStatus': 'Completed',
    'CandidateSteps': [   {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:processing-job/db-1-d802bac7f0af4cb18ea5831c973ea2bad675bcabcf6d46ca81ff31ee55',
                              'CandidateStepName': 'db-1-d802bac7f0af4cb18ea5831c973ea2bad675bcabcf6d46ca81ff31ee55',
                              'CandidateStepType': 'AWS::SageMaker::ProcessingJob'},
                          {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:training-job/automl-hou-dpp0-1-e827a0df47ad4f91bbbea016156d8c368b30931f6e934',
                              'CandidateStepName': 'automl-hou-dpp0-1-e827a0df47ad4f91bbbea016156d8c368b30931f6e934',
                              'CandidateStepType': 'AWS::SageMaker::TrainingJob'},
                          {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-hou-dpp0-csv-1-3d1f498190b4462d9cb

# Create Model for best candidates

In [18]:
model_name = 'automl-housepricing-model-' + timestamp_suffix

model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))


Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-1:230755935769:model/automl-housepricing-model-24-13-25-14


# Use Transformation Job to test the model

In [19]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
transform_job_name = 'automl-housepricing-transform-' + timestamp_suffix

transform_input = {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': test_data_no_target_url
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    }

transform_output = {
        'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix),
    }

transform_resources = {
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    }

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-housepricing-transform-24-14-47-30',
 'ResponseMetadata': {'RequestId': '1ed3769b-fcf5-486d-b1c7-27b54bf139a8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1ed3769b-fcf5-486d-b1c7-27b54bf139a8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Thu, 24 Sep 2020 14:47:30 GMT'},
  'RetryAttempts': 0}}

# Get Batch Transform Job Status

In [20]:
print ('JobStatus')
print('----------')


describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)
    sleep(30)

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


# View Results of Transform Job

In [21]:
s3_output_path = sm.describe_transform_job(TransformJobName = transform_job_name)['TransformOutput']['S3OutputPath']
s3_output_key = s3_output_path + '/{}'.format('test_data_no_target.csv.out')
s3_output_key

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/inference-results/test_data_no_target.csv.out'

In [22]:
local_inference_results_path = 'inference_results.csv'
!aws s3 cp {s3_output_key} {local_inference_results_path}

Completed 3.8 KiB/3.8 KiB (46.2 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/inference-results/test_data_no_target.csv.out to ./inference_results.csv


In [23]:
data = pd.read_csv(local_inference_results_path, sep=';')
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data

Unnamed: 0,11163.083984375
0,185773.890625
1,124176.718750
2,140397.421875
3,104733.578125
4,148430.656250
...,...
287,243847.234375
288,177781.390625
289,135253.187500
290,106124.765625


In [25]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
10,11,20,RL,70.0,11200,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,129500
15,16,45,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,7,2007,WD,Normal,132000
17,18,90,RL,72.0,10791,Pave,,Reg,Lvl,AllPub,...,0,,,Shed,500,10,2006,WD,Normal,90000
18,19,20,RL,66.0,13695,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,159000


In [26]:
df_test_data_label = test_data['SalePrice']
test_labels = df_test_data_label
test_labels.size

292

In [27]:
df_preds = pd.read_csv(local_inference_results_path, sep=';', header=0, names=['SalePrice'])

test_preds = df_preds['SalePrice']
test_preds.size

292

In [28]:
from sklearn.metrics import mean_squared_error 

import math
y_True = test_labels.to_numpy()
predictions = test_preds.to_numpy()
rmse = math.sqrt(mean_squared_error(y_True, predictions)) 
rmse 


29082.718071506493

# View other candidates explored by SageMaker Autopilot

In [29]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  tuning-job-1-cc80cb25384d4caba5-027-2451ff8c  34905227264.0
2  tuning-job-1-cc80cb25384d4caba5-018-52c4bfc6  34676015104.0
3  tuning-job-1-cc80cb25384d4caba5-026-7ad6bcc2  34587873280.0
4  tuning-job-1-cc80cb25384d4caba5-013-21da4113  34256414720.0
5  tuning-job-1-cc80cb25384d4caba5-012-5e67d73d  34229196800.0
6  tuning-job-1-cc80cb25384d4caba5-011-3981f148  34214594560.0
7  tuning-job-1-cc80cb25384d4caba5-045-c37a8f54  34021953536.0
8  tuning-job-1-cc80cb25384d4caba5-057-ca24dd16  33300133888.0
9  tuning-job-1-cc80cb25384d4caba5-112-77d2a71d  32476653568.0
10  tuning-job-1-cc80cb25384d4caba5-008-3b0eab39  31165061120.0


# Candidate Generation Notebook

In [30]:
nb_data_explore_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['DataExplorationNotebookLocation']
nb_data_explore_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/output/automl-housepricing-24-13-25-14/sagemaker-automl-candidates/pr-1-f788e98482834e999a200e714cc1ad337b835aa9c0c7443a988ec7b9fc/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

In [31]:
nb_training_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
nb_training_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/output/automl-housepricing-24-13-25-14/sagemaker-automl-candidates/pr-1-f788e98482834e999a200e714cc1ad337b835aa9c0c7443a988ec7b9fc/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'

In [32]:
!aws s3 cp {nb_data_explore_url} ./data_explore.ipynb
!aws s3 cp {nb_training_url} ./autopilot_training.ipynb

download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/output/automl-housepricing-24-13-25-14/sagemaker-automl-candidates/pr-1-f788e98482834e999a200e714cc1ad337b835aa9c0c7443a988ec7b9fc/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./data_explore.ipynb
download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-housepricing-prediction/output/automl-housepricing-24-13-25-14/sagemaker-automl-candidates/pr-1-f788e98482834e999a200e714cc1ad337b835aa9c0c7443a988ec7b9fc/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./autopilot_training.ipynb
