In [1]:
import numpy as np
import pandas as pd

# Define IAM Roles
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\CHETAN\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
bucket_name= 'ani-sagemaker-edu' # create an S3 bucket
training_file_key = 'biketrain/bike_train.csv'
validation_file_key = 'biketrain/bike_val.csv'
test_file_key = 'biketrain/bike_test.csv'

In [3]:
s3_model_output_location = r's3://{0}/biketrain/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name, training_file_key)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_file_key)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name, test_file_key)


In [4]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

s3://ani-sagemaker-edu/biketrain/model
s3://ani-sagemaker-edu/biketrain/bike_train.csv
s3://ani-sagemaker-edu/biketrain/bike_val.csv
s3://ani-sagemaker-edu/biketrain/bike_test.csv


In [5]:
# S3 naming Conventions
# files = objects in S3
# filename = key name in S3

def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f: # Read in binary mode
            return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [6]:
write_to_s3('bike_train.csv', bucket_name, training_file_key)
write_to_s3('bike_val.csv', bucket_name, validation_file_key)
write_to_s3('bike_test.csv', bucket_name, test_file_key)

### Training Algorithm Docker Image


AWS maintains a separate image for every region and algorithm

In [7]:
# Registry patch for algorithms provided by SageMaker
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

containers = {'us-east-1':'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'} # N.virginia

In [8]:
role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::205653679088:role/service-role/AmazonSageMaker-ExecutionRole-20231012T130147


### Build Model

In [10]:
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [12]:
# Access appropriate algorithm container image
# Specify how many instances to use for distributed training and what type of machine to use
# finally specify where the trained model artifacts needs to be stored
# reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
# Optionally, give a name to the training job using the base_job_name

estimator = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                         role,
                                         instance_count = 1,
                                         instance_type = 'ml.m4.xlarge',
                                         output_path = s3_model_output_location,
                                         sagemaker_session = sess,
                                         base_job_name = 'xgboost-biketrain-vl')

In [13]:
# Specify hyperparameters that appropriate for the training algorithm
# XGBoost Training Parameters

#max_depth = 5, eta = 0.1, subsample = 0.7, num_round = 150
estimator.set_hyperparameters(max_depth = 5, objective = "reg:linear", eta = 0.1, subsample = 0.7, num_round = 150)
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'reg:linear',
 'eta': 0.1,
 'subsample': 0.7,
 'num_round': 150}

In [None]:
# not for execution
# How to specify hyperparameters for autotuning

# hyperparameter_ranges_linear = {
# 'alpha' : ContinuousParameter(0.01, 10, scaling_type = "Linear")
# 'lambda': ContinuousParameter(0.01, 10, scaling_type = "Linear")
# 'num_round' : ContinuousParameter(1, 200, scaling_type = "Linear")
#}

# tuner_linear = HyperparameterTuner(
# xgb,
# objective_metric_name,
# hyperparameter_ranges_linear,
# max_jobs = 20,
# max_parallel_jobs = 10,
# strategy= 'Random')


#tuner_linear.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata = False)

### Specify Training Data Location and Optionally, Validation Data Location

In [15]:
# content type can be libsvm or csv for XGBoost
#training_input_config = sagemaker.session.s3_input(s3_data = s3_training_file_location, content_type = "csv")
#validation_input_config = sagemaker.session.s3_input(s3_data = s3_validation_file_location, content_type = "csv")

training_input_config = sagemaker.inputs.TrainingInput(s3_data = s3_training_file_location, content_type = "csv")
validation_input_config = sagemaker.inputs.TrainingInput(s3_data = s3_validation_file_location, content_type = "csv")

In [16]:
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ani-sagemaker-edu/biketrain/bike_train.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ani-sagemaker-edu/biketrain/bike_val.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model

In [17]:
# XGBoost supports "train", "validation" channels
estimator.fit({'train': training_input_config, "validation": validation_input_config})

INFO:sagemaker:Creating training-job with name: xgboost-biketrain-vl-2023-10-18-22-00-02-676


2023-10-18 22:00:02 Starting - Starting the training job......
2023-10-18 22:00:40 Starting - Preparing the instances for training......
2023-10-18 22:01:56 Downloading - Downloading input data...
2023-10-18 22:02:26 Training - Downloading the training image......
2023-10-18 22:03:17 Training - Training image download completed. Training in progress..[34m[2023-10-18 22:03:26.729 ip-10-0-179-47.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-10-18 22:03:26.763 ip-10-0-179-47.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-10-18:22:03:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-10-18:22:03:27:INFO] Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34m[2023-10-18:22:03:27:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-10-18:22:03:27:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2


2023-10-18 22:03:48 Uploading - Uploading generated training model
2023-10-18 22:03:48 Completed - Training job completed
Training seconds: 112
Billable seconds: 112


In [None]:
eval_result = estimator.evals_result()

In [None]:
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error')
plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error')
plt.grid(True)
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.title('Training Vs Validation Error')
plt.legend()
plt.show()

In [None]:
estimator.plot_importance(regressor)
plt.show()

In [None]:
# Updated - Changed to validation dataset
# Compare actual vs predicted performance with dataset not seen by the model before
df_v = pd.read_csv('bike_val.csv',names=columns)

In [None]:
df_v.head()

In [None]:
X_test = df.iloc[:,1:]
print(X_test[:5])

In [None]:
result = estimator.predict(X_test)

In [None]:
result[:5]

In [None]:
df_v['count_predicted'] = result

In [None]:
df_v.head()

In [None]:
# Negative Values are predicted
df_v['count_predicted'].describe()

In [None]:
df_v[df_v['count_predicted'] < 0]

In [None]:
def adjust_count(x):
    if x < 0:
        return 0
    else:
        return x

In [None]:
df_v['count_predicted'] = df_v['count_predicted'].map(adjust_count)

In [None]:
df_v[df_v['count_predicted'] < 0]

In [None]:
df_v['count'] = df_v['count'].map(np.expm1)
df_v['count_predicted'] = df_v['count_predicted'].map(np.expm1)

In [None]:
# Actual Vs Predicted
plt.plot(df_v['count'], label='Actual')
plt.plot(df_v['count_predicted'],label='Predicted')
plt.xlabel('Sample')
plt.ylabel('Count')
plt.xlim([100,150])
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()

In [None]:
# Over prediction and Under Prediction needs to be balanced
# Training Data Residuals
residuals = (df_v['count'] - df_v['count_predicted'])

plt.hist(residuals)
plt.grid(True)
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.title('Residuals Distribution')
plt.axvline(color='r')
plt.show()

In [None]:
value_counts = (residuals > 0).value_counts(sort=False)
print(' Under Estimation: {0:.2f}'.format(value_counts[True]/len(residuals)))
print(' Over  Estimation: {0:.2f}'.format(value_counts[False]/len(residuals)))

In [None]:
import sklearn.metrics as metrics
print("RMSE: {0:.2f}".format(metrics.mean_squared_error(df_v['count'],
                                                    df_v['count_predicted'])**.5))

### Deploy Model

In [18]:
predictor= estimator.deploy(initial_instance_count =1, 
                           instance_type = 'ml.m4.xlarge',
                           endpoint_name = 'xgboost-biketrain-vl')

INFO:sagemaker:Creating model with name: xgboost-biketrain-vl-2023-10-18-22-05-44-169
INFO:sagemaker:Creating endpoint-config with name xgboost-biketrain-vl
INFO:sagemaker:Creating endpoint with name xgboost-biketrain-vl


-------!

### Run predictions

In [27]:
#from sagemaker.predictor import csv_serializer, json_deserializer


# predictor.content_type = 'text/csv'
# predictor.serializer = CSVSerializer
# predictor.deserializer = CSVDeserializer

In [None]:
from sagemaker.serializers import CSVSerializer
predictor.serializer = CSVSerializer()


In [None]:
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])

In [46]:
from sagemaker.predictor import Predictorq
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor('xgboost-biketrain-vl')
predictor.content_type = CONTENT_TYPE_CSV
predictor.accept = CONTENT_TYPE_JSON
predictor.serializer = CSVSerializer()

#payload = '3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3'
predictor.deserializer = JSONDeserializer()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [47]:
print(predictor.predict([[3,0,1,1,32.8,34.85,33,7.0015,2012,8,13,0,14]]))

{'predictions': [{'score': 272.4788513183594}]}
